In [81]:
import pandas as pd
import numpy as np
import json
import ast
from sklearn.preprocessing import MultiLabelBinarizer, StandardScaler
from sentence_transformers import SentenceTransformer
import re

  from .autonotebook import tqdm as notebook_tqdm


In [61]:
# Load the LeetCode metadata table. The path is relative, so it resolves
# against the directory the notebook kernel was started in.
df = pd.read_csv('../datasets/leetcode_full_metadata.csv')

In [62]:
# Show the first rows to inspect the raw columns before any cleaning.
df.head()

Unnamed: 0,title,slug,frontendQuestionId,acRate,difficulty,paidOnly_basic,isFavor,status,likes,dislikes,isPaidOnly_detail,sampleTestCase,topicTags_json,stats_parsed_json,similarQuestions_parsed_json,content_preview,num_discussions,first_discussion_title,first_discussion_url,fetched_at
0,Convert Number Words to Digits,convert-number-words-to-digits,4148,,{'level': 2},True,False,,3,1,True,"""onefourthree""",[],"{""totalAccepted"": ""170"", ""totalSubmission"": ""2...",[],,0,,,2025-11-25T14:57:13.678988Z
1,Evaluate Valid Expressions,evaluate-valid-expressions,4140,,{'level': 3},True,False,,3,1,True,"""add(2,3)""","[{""name"": ""Hash Table"", ""slug"": ""hash-table"", ...","{""totalAccepted"": ""241"", ""totalSubmission"": ""3...","[{""title"": ""Basic Calculator II"", ""titleSlug"":...",,0,,,2025-11-25T14:57:13.678988Z
2,Concatenate Non-Zero Digits and Multiply by Su...,concatenate-non-zero-digits-and-multiply-by-su...,4136,,{'level': 2},False,False,,29,7,False,"""10203004""\n[[0,7],[1,3],[4,6]]",[],"{""totalAccepted"": ""6.6K"", ""totalSubmission"": ""...",[],You are given a string s of length m consistin...,0,,,2025-11-25T14:57:13.678988Z
3,Concatenate Non-Zero Digits and Multiply by Sum I,concatenate-non-zero-digits-and-multiply-by-sum-i,4135,,{'level': 1},False,False,,13,0,False,10203004,[],"{""totalAccepted"": ""27K"", ""totalSubmission"": ""4...",[],You are given an integer n. Form a new integer...,0,,,2025-11-25T14:57:13.678988Z
4,Number of Effective Subsequences,number-of-effective-subsequences,4134,,{'level': 3},False,False,,11,2,False,"[1,2,3]",[],"{""totalAccepted"": ""1.5K"", ""totalSubmission"": ""...",[],You are given an integer array nums. The stren...,0,,,2025-11-25T14:57:13.678988Z


In [None]:
# Columns that are not carried forward as features.
# Note: "acRate" is removed here and a column of the same name is added back
# later from the parsed stats payload, so dropping it avoids a duplicate column.
cols_to_drop = [
    "title", "acRate", "paidOnly_basic", "isFavor", "status",
    "isPaidOnly_detail", "sampleTestCase", "similarQuestions_parsed_json",
    "first_discussion_title", "first_discussion_url", "fetched_at",
    "content_preview",
]

# errors="ignore" keeps this cell re-runnable after the columns are gone.
df = df.drop(columns=cols_to_drop, errors="ignore")
df.head()

Unnamed: 0,slug,frontendQuestionId,difficulty,likes,dislikes,topicTags_json,stats_parsed_json,num_discussions
0,convert-number-words-to-digits,4148,{'level': 2},3,1,[],"{""totalAccepted"": ""170"", ""totalSubmission"": ""2...",0
1,evaluate-valid-expressions,4140,{'level': 3},3,1,"[{""name"": ""Hash Table"", ""slug"": ""hash-table"", ...","{""totalAccepted"": ""241"", ""totalSubmission"": ""3...",0
2,concatenate-non-zero-digits-and-multiply-by-su...,4136,{'level': 2},29,7,[],"{""totalAccepted"": ""6.6K"", ""totalSubmission"": ""...",0
3,concatenate-non-zero-digits-and-multiply-by-sum-i,4135,{'level': 1},13,0,[],"{""totalAccepted"": ""27K"", ""totalSubmission"": ""4...",0
4,number-of-effective-subsequences,4134,{'level': 3},11,2,[],"{""totalAccepted"": ""1.5K"", ""totalSubmission"": ""...",0


In [None]:
def extract_slugs(val):
    """Return the topic-tag slugs contained in one ``topicTags_json`` cell.

    Parameters
    ----------
    val : list, dict, str, or any scalar
        A list of tag dicts, a single tag dict, or a string holding either a
        JSON document or a Python literal describing one of those. Anything
        else (NaN, None, numbers, ...) is treated as "no tags".

    Returns
    -------
    list
        The value of the ``"slug"`` key of every dict entry that has one, in
        input order. An empty list is returned for missing, empty or
        unparseable input; this function never raises on bad cell content.
    """
    # Dispatch on the container type first. Calling ``pd.isna`` on a list
    # returns an element-wise array, and using that array in an ``if`` raises
    # ValueError as soon as the list has more than one element, so the
    # missing-value test must not run before the list branch. Missing scalars
    # (NaN, None, pd.NA) are neither list, dict nor str and land in ``else``.
    if isinstance(val, list):
        parsed = val
    elif isinstance(val, dict):
        parsed = [val]
    elif isinstance(val, str):
        s = val.strip()
        if s.lower() in {"", "nan", "none", "null"}:
            return []

        # Try strict JSON first, then a Python literal (single-quoted dicts).
        parsed = None
        for parser in (json.loads, ast.literal_eval):
            try:
                parsed = parser(s)
                break
            except Exception:
                # Best effort: a failed parser just means "try the next one".
                parsed = None

        if parsed is None:
            # Last resort: naive quote normalisation, then JSON again.
            try:
                parsed = json.loads(s.replace("'", '"'))
            except Exception:
                return []
    else:
        return []

    if isinstance(parsed, dict):
        parsed = [parsed]
    if not isinstance(parsed, list):
        return []

    # Entries that are not dicts, or lack a "slug" key, are skipped silently.
    return [d["slug"] for d in parsed if isinstance(d, dict) and "slug" in d]

# One list of tag slugs per row; rows without tags get an empty list.
df['topicTags_slugs'] = df['topicTags_json'].apply(extract_slugs)


In [65]:
# Preview the frame to check the new topicTags_slugs column.
df.head()

Unnamed: 0,slug,frontendQuestionId,difficulty,likes,dislikes,topicTags_json,stats_parsed_json,num_discussions,topicTags_slugs
0,convert-number-words-to-digits,4148,{'level': 2},3,1,[],"{""totalAccepted"": ""170"", ""totalSubmission"": ""2...",0,[]
1,evaluate-valid-expressions,4140,{'level': 3},3,1,"[{""name"": ""Hash Table"", ""slug"": ""hash-table"", ...","{""totalAccepted"": ""241"", ""totalSubmission"": ""3...",0,"[hash-table, math, string, divide-and-conquer,..."
2,concatenate-non-zero-digits-and-multiply-by-su...,4136,{'level': 2},29,7,[],"{""totalAccepted"": ""6.6K"", ""totalSubmission"": ""...",0,[]
3,concatenate-non-zero-digits-and-multiply-by-sum-i,4135,{'level': 1},13,0,[],"{""totalAccepted"": ""27K"", ""totalSubmission"": ""4...",0,[]
4,number-of-effective-subsequences,4134,{'level': 3},11,2,[],"{""totalAccepted"": ""1.5K"", ""totalSubmission"": ""...",0,[]


In [68]:
# Guard against non-list cells so the binarizer always receives a list of slugs.
valid_lists = df['topicTags_slugs'].apply(
    lambda cell: cell if isinstance(cell, list) else []
)

# Learn the tag vocabulary and build the 0/1 indicator matrix in one call.
mlb = MultiLabelBinarizer()
tag_matrix = mlb.fit_transform(valid_lists)

# Column names: "tag_" prefix, hyphens in the slug replaced by underscores.
tag_cols = [f"tag_{slug.replace('-', '_')}" for slug in mlb.classes_]

# Align the indicator columns on df's index before appending them.
tags_df = pd.DataFrame(tag_matrix, index=df.index, columns=tag_cols)
df = pd.concat([df, tags_df], axis=1)
df[tag_cols] = df[tag_cols].astype(int)

print(f"Multihot encoded {len(tag_cols)} tags. Example columns: {tag_cols[:10]}")

Multihot encoded 72 tags. Example columns: ['tag_array', 'tag_backtracking', 'tag_biconnected_component', 'tag_binary_indexed_tree', 'tag_binary_search', 'tag_binary_search_tree', 'tag_binary_tree', 'tag_bit_manipulation', 'tag_bitmask', 'tag_brainteaser']


In [69]:
# Preview the frame with the multi-hot tag_* columns appended.
df.head()

Unnamed: 0,slug,frontendQuestionId,difficulty,likes,dislikes,topicTags_json,stats_parsed_json,num_discussions,topicTags_slugs,tag_array,...,tag_stack,tag_string,tag_string_matching,tag_strongly_connected_component,tag_suffix_array,tag_topological_sort,tag_tree,tag_trie,tag_two_pointers,tag_union_find
0,convert-number-words-to-digits,4148,{'level': 2},3,1,[],"{""totalAccepted"": ""170"", ""totalSubmission"": ""2...",0,[],0,...,0,0,0,0,0,0,0,0,0,0
1,evaluate-valid-expressions,4140,{'level': 3},3,1,"[{""name"": ""Hash Table"", ""slug"": ""hash-table"", ...","{""totalAccepted"": ""241"", ""totalSubmission"": ""3...",0,"[hash-table, math, string, divide-and-conquer,...",0,...,1,1,0,0,0,0,0,0,0,0
2,concatenate-non-zero-digits-and-multiply-by-su...,4136,{'level': 2},29,7,[],"{""totalAccepted"": ""6.6K"", ""totalSubmission"": ""...",0,[],0,...,0,0,0,0,0,0,0,0,0,0
3,concatenate-non-zero-digits-and-multiply-by-sum-i,4135,{'level': 1},13,0,[],"{""totalAccepted"": ""27K"", ""totalSubmission"": ""4...",0,[],0,...,0,0,0,0,0,0,0,0,0,0
4,number-of-effective-subsequences,4134,{'level': 3},11,2,[],"{""totalAccepted"": ""1.5K"", ""totalSubmission"": ""...",0,[],0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
def parse_stats(val):
    """Extract submission statistics from one ``stats_parsed_json`` cell.

    Parameters
    ----------
    val : dict, str, or any scalar
        Either an already-decoded stats dict, or text holding a JSON document
        or Python literal of one. Missing values (None / NaN) and the strings
        "", "nan", "none", "null" (any case) mean "no stats".

    Returns
    -------
    dict
        Always has exactly the keys ``'totalAccepted'``, ``'totalSubmission'``
        (int or None) and ``'acRate'`` (float percentage or None).

    Notes
    -----
    * The ``...Raw`` integer fields are preferred over the display fields
      (which can look like ``"6.6K"`` and do not convert with ``int``).
    * A payload that decodes to something other than a dict (a list, a number,
      a bare string) yields all-None instead of raising AttributeError.
    * An acceptance rate of numeric 0 is kept as 0.0 rather than being treated
      as missing.
    * A rate without a ``%`` sign that lies in (0, 1] is interpreted as a
      fraction and multiplied by 100.
    """

    def _empty():
        # Fresh dict on every call so callers can never share/mutate one result.
        return {'totalAccepted': None, 'totalSubmission': None, 'acRate': None}

    def _first_present(mapping, *keys):
        # First value that is not None among the given keys (0 and "" count as present).
        for key in keys:
            candidate = mapping.get(key)
            if candidate is not None:
                return candidate
        return None

    def _to_int(raw):
        # Best-effort integer conversion; anything int() rejects becomes None.
        if raw is None:
            return None
        try:
            if isinstance(raw, str):
                raw = raw.strip()
            return int(raw)
        except Exception:
            return None

    if isinstance(val, dict):
        parsed = val
    else:
        # pd.isna is only used on scalars: on list-likes it returns an array
        # whose truth value is ambiguous.
        if val is None or (pd.api.types.is_scalar(val) and pd.isna(val)):
            return _empty()

        s = str(val).strip()
        if s.lower() in {"", "nan", "none", "null"}:
            return _empty()

        # Strict JSON first, then a Python literal (single-quoted dicts).
        parsed = None
        for parser in (json.loads, ast.literal_eval):
            try:
                parsed = parser(s)
                break
            except Exception:
                parsed = None

        if parsed is None:
            # Neither parser accepted the text (e.g. truncated JSON): pull the
            # numbers out with regexes instead.
            ta = re.search(r'totalAcceptedRaw"\s*[:=]\s*([0-9]+)|totalAccepted"\s*[:=]\s*["\']?([0-9]+)', s)
            ts = re.search(r'totalSubmissionRaw"\s*[:=]\s*([0-9]+)|totalSubmission"\s*[:=]\s*["\']?([0-9]+)', s)
            ac = re.search(r'acRate"\s*[:=]\s*["\']?([0-9.]+)%?', s)
            totalAccepted = int(ta.group(1) or ta.group(2)) if ta else None
            totalSubmission = int(ts.group(1) or ts.group(2)) if ts else None
            ac_val = float(ac.group(1)) if ac else None
            return {'totalAccepted': totalAccepted, 'totalSubmission': totalSubmission, 'acRate': ac_val}

        if not isinstance(parsed, dict):
            # Valid JSON/literal, but not a stats object: nothing to read.
            return _empty()

    totalAccepted = _to_int(_first_present(parsed, 'totalAcceptedRaw', 'totalAccepted'))
    totalSubmission = _to_int(_first_present(parsed, 'totalSubmissionRaw', 'totalSubmission'))

    ac_val = _first_present(parsed, 'acRate', 'acrate', 'ac')
    ac_num = None
    if ac_val is not None:
        try:
            ac_s = str(ac_val).strip()
            if ac_s.endswith('%'):
                ac_num = float(ac_s.rstrip('%'))
            else:
                ac_num = float(ac_s)
                if 0 < ac_num <= 1:
                    # No percent sign and at most 1: treat as a fraction.
                    ac_num = ac_num * 100
        except Exception:
            ac_num = None

    return {'totalAccepted': totalAccepted, 'totalSubmission': totalSubmission, 'acRate': ac_num}


# Parse every stats cell once and build the three-column frame with a single
# DataFrame constructor call. The previous `.apply(pd.Series)` created one
# Series per row, which is slow and upcasts the integer counts to float row by
# row. Naming the columns explicitly also keeps them present when df is empty.
stats_df = pd.DataFrame(
    df['stats_parsed_json'].apply(parse_stats).tolist(),
    index=df.index,
    columns=['totalAccepted', 'totalSubmission', 'acRate'],
)
df = pd.concat([df, stats_df], axis=1)

# Normalise dtypes: nullable Int64 for the counts (None -> <NA>), float for the rate.
for count_col in ('totalAccepted', 'totalSubmission'):
    df[count_col] = pd.to_numeric(df[count_col], errors='coerce').astype('Int64')
df['acRate'] = pd.to_numeric(df['acRate'], errors='coerce')


In [71]:
# Preview the frame with the parsed totalAccepted / totalSubmission / acRate columns.
df.head()

Unnamed: 0,slug,frontendQuestionId,difficulty,likes,dislikes,topicTags_json,stats_parsed_json,num_discussions,topicTags_slugs,tag_array,...,tag_strongly_connected_component,tag_suffix_array,tag_topological_sort,tag_tree,tag_trie,tag_two_pointers,tag_union_find,totalAccepted,totalSubmission,acRate
0,convert-number-words-to-digits,4148,{'level': 2},3,1,[],"{""totalAccepted"": ""170"", ""totalSubmission"": ""2...",0,[],0,...,0,0,0,0,0,0,0,170,213,79.8
1,evaluate-valid-expressions,4140,{'level': 3},3,1,"[{""name"": ""Hash Table"", ""slug"": ""hash-table"", ...","{""totalAccepted"": ""241"", ""totalSubmission"": ""3...",0,"[hash-table, math, string, divide-and-conquer,...",0,...,0,0,0,0,0,0,0,241,312,77.2
2,concatenate-non-zero-digits-and-multiply-by-su...,4136,{'level': 2},29,7,[],"{""totalAccepted"": ""6.6K"", ""totalSubmission"": ""...",0,[],0,...,0,0,0,0,0,0,0,6604,30071,22.0
3,concatenate-non-zero-digits-and-multiply-by-sum-i,4135,{'level': 1},13,0,[],"{""totalAccepted"": ""27K"", ""totalSubmission"": ""4...",0,[],0,...,0,0,0,0,0,0,0,27032,49464,54.6
4,number-of-effective-subsequences,4134,{'level': 3},11,2,[],"{""totalAccepted"": ""1.5K"", ""totalSubmission"": ""...",0,[],0,...,0,0,0,0,0,0,0,1506,5585,27.0


In [72]:
# The raw JSON columns have been expanded into tag_* and stats columns, so drop them.
# No errors="ignore" here: re-running this cell after the columns are gone raises KeyError.
df.drop(columns=['topicTags_json', 'stats_parsed_json'], inplace=True)

In [73]:
# Coerce the question id to a nullable integer (unparseable ids become <NA>),
# order the rows by it with missing ids at the end, and renumber the row index.
df = (
    df.assign(
        frontendQuestionId=pd.to_numeric(df['frontendQuestionId'], errors='coerce').astype('Int64')
    )
    .sort_values('frontendQuestionId', na_position='last')
    .reset_index(drop=True)
)

df.head()

Unnamed: 0,slug,frontendQuestionId,difficulty,likes,dislikes,num_discussions,topicTags_slugs,tag_array,tag_backtracking,tag_biconnected_component,...,tag_strongly_connected_component,tag_suffix_array,tag_topological_sort,tag_tree,tag_trie,tag_two_pointers,tag_union_find,totalAccepted,totalSubmission,acRate
0,two-sum,1,{'level': 1},65658,2439,0,"[array, hash-table]",1,0,0,...,0,0,0,0,0,0,0,19615209,34663654,56.6
1,add-two-numbers,2,{'level': 2},35413,7052,0,"[linked-list, math, recursion]",0,0,0,...,0,0,0,0,0,0,0,6444446,13607857,47.4
2,longest-substring-without-repeating-characters,3,{'level': 2},43655,2136,0,"[hash-table, string, sliding-window]",0,0,0,...,0,0,0,0,0,0,0,8510573,22434952,37.9
3,median-of-two-sorted-arrays,4,{'level': 3},31279,3511,0,"[array, binary-search, divide-and-conquer]",1,0,0,...,0,0,0,0,0,0,0,3853209,8518993,45.2
4,longest-palindromic-substring,5,{'level': 2},31858,1966,0,"[two-pointers, string, dynamic-programming]",0,0,0,...,0,0,0,0,0,1,0,4321306,11746175,36.8


In [74]:
# topicTags_slugs only fed the multi-hot encoding; the tag_* columns carry the same information.
df.drop(columns=['topicTags_slugs'], inplace=True)

In [75]:
# Split the label off into its own file, keyed by question id.
# Rows whose id could not be parsed are dropped so the id can be a plain int.
# NOTE(review): 'difficulty' is written as-is; in the previews it still looks
# like the text "{'level': 2}" rather than a number -- confirm that whatever
# reads validation.csv expects that format.
df_validation = (
    df[['frontendQuestionId', 'difficulty']]
    .assign(
        frontendQuestionId=lambda frame: pd.to_numeric(frame['frontendQuestionId'], errors='coerce')
    )
    .dropna(subset=['frontendQuestionId'])
    .astype({'frontendQuestionId': int})
)

df_validation.to_csv('../datasets/validation.csv', index=False)

# From here on the feature frame is indexed by question id and carries no label.
df = df.set_index('frontendQuestionId').drop(columns=['difficulty'])

df.head()

Unnamed: 0_level_0,slug,likes,dislikes,num_discussions,tag_array,tag_backtracking,tag_biconnected_component,tag_binary_indexed_tree,tag_binary_search,tag_binary_search_tree,...,tag_strongly_connected_component,tag_suffix_array,tag_topological_sort,tag_tree,tag_trie,tag_two_pointers,tag_union_find,totalAccepted,totalSubmission,acRate
frontendQuestionId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,two-sum,65658,2439,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,19615209,34663654,56.6
2,add-two-numbers,35413,7052,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,6444446,13607857,47.4
3,longest-substring-without-repeating-characters,43655,2136,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,8510573,22434952,37.9
4,median-of-two-sorted-arrays,31279,3511,0,1,0,0,0,1,0,...,0,0,0,0,0,0,0,3853209,8518993,45.2
5,longest-palindromic-substring,31858,1966,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,4321306,11746175,36.8


In [78]:
# Summary statistics for the numeric columns (the text column 'slug' is not included).
df.describe()

Unnamed: 0,likes,dislikes,num_discussions,tag_array,tag_backtracking,tag_biconnected_component,tag_binary_indexed_tree,tag_binary_search,tag_binary_search_tree,tag_binary_tree,...,tag_strongly_connected_component,tag_suffix_array,tag_topological_sort,tag_tree,tag_trie,tag_two_pointers,tag_union_find,totalAccepted,totalSubmission,acRate
count,3758.0,3758.0,3758.0,3758.0,3758.0,3758.0,3758.0,3758.0,3758.0,3758.0,...,3758.0,3758.0,3758.0,3758.0,3758.0,3758.0,3758.0,3758.0,3758.0,3758.0
mean,2102.453167,307.819585,0.0,0.544439,0.029537,0.000266,0.011442,0.086482,0.010644,0.0471,...,0.000798,0.002129,0.009846,0.067589,0.0157,0.061203,0.025812,278419.02049,488489.02049,56.821953
std,4024.63088,879.988531,0.0,0.498088,0.169329,0.016313,0.106369,0.281112,0.102633,0.21188,...,0.028247,0.046096,0.098749,0.251073,0.124328,0.239734,0.158594,734960.07897,1373250.80136,16.979137
min,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,170.0,213.0,9.9
25%,172.0,22.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,19056.0,43502.0,44.8
50%,673.0,79.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,68315.5,117891.0,57.1
75%,2086.75,261.75,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,212253.0,336931.75,69.2
max,65658.0,20264.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,19615209.0,34663654.0,94.8


In [None]:
# Numeric (non-indicator) columns to standardise to zero mean / unit variance.
# num_discussions is constant (all zeros) in the summary statistics above.
continuous_cols = [
    "likes",
    "dislikes",
    "num_discussions",
    "totalAccepted",
    "totalSubmission",
    "acRate",
]

# NOTE(review): the scaler is fit on every row; if a train/test split is made
# downstream, confirm that fitting on the full table is intended.
scaler = StandardScaler()
scaler.fit(df[continuous_cols])
df[continuous_cols] = scaler.transform(df[continuous_cols])

In [82]:
# Load the pretrained all-mpnet-base-v2 sentence encoder
# (presumably fetched from the model hub on first use if not cached -- TODO confirm).
model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')

def preprocess_slug(text):
    """Turn a URL slug into a short text prompt for the sentence encoder.

    Every hyphen and underscore in ``text`` becomes a space, and the result is
    prefixed with ``"leetcode problem: "``.
    """
    separators_to_spaces = str.maketrans("-_", "  ")
    return "leetcode problem: " + text.translate(separators_to_spaces)

# Text fed to the encoder: one "leetcode problem: ..." string per row.
# preprocess_slug calls .replace on each value, so a missing (NaN) slug would raise here.
df['slug_clean'] = df['slug'].apply(preprocess_slug)

In [83]:
# Embed every cleaned slug, 32 texts per forward pass (118 batches in the run
# recorded below). The result is later wrapped in a DataFrame with one row per
# entry of df, so it is presumably a 2-D array of shape
# (n_rows, embedding_dim) -- TODO confirm.
slug_embeddings = model.encode(
    df['slug_clean'].tolist(),
    batch_size=32,
    show_progress_bar=True
)

Batches: 100%|██████████| 118/118 [00:08<00:00, 14.38it/s]


In [84]:
# One row per question (same index as df), one "mpnet_<i>" column per embedding dimension.
embed_df = pd.DataFrame(slug_embeddings, index=df.index).add_prefix("mpnet_")

In [85]:
# Re-derive the indicator column names from the frame itself (every column whose
# name starts with "tag_"), rather than relying on the earlier tag_cols list.
tag_columns = [col for col in df.columns if col.startswith("tag_")]

In [87]:
# Final feature matrix: slug embeddings + multi-hot tags + standardised acRate.
# NOTE(review): the other standardised columns (likes, dislikes, totals) are
# not included here -- confirm that is intentional.
X = pd.concat([embed_df, df[tag_columns + ['acRate']]], axis=1)

print("Feature matrix shape:", X.shape)

Feature matrix shape: (3758, 841)


In [89]:
# Persist the feature matrix.
# NOTE(review): index=False discards the frontendQuestionId index, so the file
# has no key column. validation.csv drops rows with a missing id while X keeps
# them, so the two files only line up row-for-row when no id was missing --
# confirm, or keep the id in the output.
X.to_csv("../datasets/final_dataset.csv",index=False)