In [1]:
import requests

In [2]:
url = "https://raw.githubusercontent.com/alexeygrigorev/data-science-interviews/refs/heads/master/theory.md"

In [3]:
response = requests.get(url)

In [5]:
content = response.text

In [6]:
import re

In [7]:
pattern = r'(?m)^##\s.*(?:\n(?!##).*)*'
sections = re.findall(pattern, content)

In [8]:
sections

['## Table of contents\n\n* [Supervised machine learning](#supervised-machinelearning)\n* [Linear regression](#linear-regression)\n* [Validation](#validation)\n* [Classification](#classification)\n* [Regularization](#regularization)\n* [Feature selection](#feature-selection)\n* [Decision trees](#decision-trees)\n* [Random forest](#random-forest)\n* [Gradient boosting](#gradient-boosting)\n* [Parameter tuning](#parameter-tuning)\n* [Neural networks](#neural-networks)\n* [Optimization in neural networks](#optimization-in-neuralnetworks)\n* [Neural networks for computer vision](#neural-networks-for-computervision)\n* [Text classification](#text-classification)\n* [Clustering](#clustering)\n* [Dimensionality reduction](#dimensionality-reduction)\n* [Ranking and search](#ranking-andsearch)\n* [Recommender systems](#recommender-systems)\n* [Time series](#time-series)\n\n<br/>\n',
 '## Supervised machine\xa0learning\n\n**What is supervised machine learning? üë∂**\n\nSupervised learning is a 

In [9]:
qa_pattern = r'\*\*(.*?)\*\*\n\n([\s\S]*?)(?=\n<br/>\n|\Z)'

In [14]:
qa = []

In [15]:
for section in sections:
    qa_pairs = re.findall(qa_pattern, section)
    qa.extend(qa_pairs)


[('What is supervised machine learning? üë∂', 'Supervised learning is a type of machine learning in which our algorithms are trained using well-labeled training data, and machines predict the output based on that data. Labeled data indicates that the\xa0input data has already been tagged with the appropriate output. Basically, it is the task of learning a function that maps the input set and returns an output. Some of its examples are: Linear Regression, Logistic Regression, KNN, etc.\n\nk-Nearest Neighbors(KNN):Looking at the k closest labeled data points \n'), ('What is regression? Which models can you use to solve a regression problem? üë∂', 'Regression is a part of supervised ML. Regression models investigate the relationship between a dependent (target) and independent variable (s) (predictor).\nHere are some common regression models\n\n- *Linear Regression* establishes a linear relationship between target and predictor (s). It predicts a numeric value and has a shape of a strai

In [17]:
SECTION_PATTERN = r'(?m)^##\s(.*)\n([\s\S]*?)(?=^##|\Z)'
QA_PATTERN = r'\*\*(.*?)\*\*\n\n([\s\S]*?)(?=\n<br/>\n|\Z)'

def infer_difficulty(question: str) -> str:
    if "üë∂" in question:
        return "easy"
    if "‚≠ê" in question:
        return "hard"
    return "medium"

results = []

for section_title, section_body in re.findall(SECTION_PATTERN, content):
    topic = section_title.strip().lower().replace(" ", "_")

    for question, answer in re.findall(QA_PATTERN, section_body):
        results.append({
            "text": question.strip(),
            "reference_answer": answer.strip(),
            "topic": topic,
            "difficulty": infer_difficulty(question)
        })

In [18]:
results

[{'text': 'What is supervised machine learning? üë∂',
  'reference_answer': 'Supervised learning is a type of machine learning in which our algorithms are trained using well-labeled training data, and machines predict the output based on that data. Labeled data indicates that the\xa0input data has already been tagged with the appropriate output. Basically, it is the task of learning a function that maps the input set and returns an output. Some of its examples are: Linear Regression, Logistic Regression, KNN, etc.\n\nk-Nearest Neighbors(KNN):Looking at the k closest labeled data points',
  'topic': 'supervised_machine\xa0learning',
  'difficulty': 'easy'},
 {'text': 'What is regression? Which models can you use to solve a regression problem? üë∂',
  'reference_answer': 'Regression is a part of supervised ML. Regression models investigate the relationship between a dependent (target) and independent variable (s) (predictor).\nHere are some common regression models\n\n- *Linear Regress

In [19]:
len(results)

166

In [23]:
import re

def normalize_question(question: str):
    difficulty = "medium"
    if "üë∂" in question:
        difficulty = "easy"
    elif "‚≠ê" in question:
        difficulty = "hard"

    clean_question = re.sub(r"[üë∂‚≠ê]", "", question).strip()
    return clean_question, difficulty


In [24]:
def infer_question_type(question: str) -> str:
    q = question.lower()

    if any(k in q for k in ["implement", "code", "write", "python"]):
        return "coding"
    if any(k in q for k in ["derive", "prove", "formula", "equation"]):
        return "math"
    if any(k in q for k in ["how would you", "what would you do", "design"]):
        return "scenario"

    return "conceptual"


In [25]:
STOPWORDS = {
    "what", "is", "are", "the", "of", "to", "and", "when", "do", "we",
    "you", "which", "can", "use", "main"
}

def generate_tags(question: str, topic: str):
    words = re.findall(r"[a-zA-Z_]+", question.lower())
    keywords = [
        w for w in words
        if w not in STOPWORDS and len(w) > 3
    ]

    return list(set([topic] + keywords[:5]))


In [26]:
def extract_key_concepts(answer: str):
    bold_terms = re.findall(r"\*\*(.*?)\*\*", answer)
    italic_terms = re.findall(r"\*(.*?)\*", answer)

    candidates = set(bold_terms + italic_terms)

    # Light cleanup
    cleaned = [
        c.strip().lower().replace(" ", "_")
        for c in candidates
        if len(c) > 2
    ]

    return list(set(cleaned))


In [27]:
import re

SECTION_PATTERN = r'(?m)^##\s(.*)\n([\s\S]*?)(?=^##|\Z)'
QA_PATTERN = r'\*\*(.*?)\*\*\n\n([\s\S]*?)(?=\n<br/>\n|\Z)'

results = []

for section_title, section_body in re.findall(SECTION_PATTERN, content):
    topic = section_title.strip().lower().replace(" ", "_")

    for raw_question, answer in re.findall(QA_PATTERN, section_body):
        question, difficulty = normalize_question(raw_question)

        question_type = infer_question_type(question)
        tags = generate_tags(question, topic)
        key_concepts = extract_key_concepts(answer)

        results.append({
            "text": question,
            "reference_answer": answer.strip(),
            "topic": topic,
            "difficulty": difficulty,
            "question_type": question_type,
            "tags": tags,
            "key_concepts": key_concepts
        })


In [28]:
results

[{'text': 'What is supervised machine learning?',
  'reference_answer': 'Supervised learning is a type of machine learning in which our algorithms are trained using well-labeled training data, and machines predict the output based on that data. Labeled data indicates that the\xa0input data has already been tagged with the appropriate output. Basically, it is the task of learning a function that maps the input set and returns an output. Some of its examples are: Linear Regression, Logistic Regression, KNN, etc.\n\nk-Nearest Neighbors(KNN):Looking at the k closest labeled data points',
  'topic': 'supervised_machine\xa0learning',
  'difficulty': 'easy',
  'question_type': 'conceptual',
  'tags': ['supervised_machine\xa0learning',
   'supervised',
   'machine',
   'learning'],
  'key_concepts': []},
 {'text': 'What is regression? Which models can you use to solve a regression problem?',
  'reference_answer': 'Regression is a part of supervised ML. Regression models investigate the relatio

In [30]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("sandy1811/data-science-interview-questions")

print("Path to dataset files:", path)

  from .autonotebook import tqdm as notebook_tqdm


Downloading to /home/alokpadhi/.cache/kagglehub/datasets/sandy1811/data-science-interview-questions/1036.archive...


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1.03M/1.03M [00:01<00:00, 669kB/s]

Extracting files...
Path to dataset files: /home/alokpadhi/.cache/kagglehub/datasets/sandy1811/data-science-interview-questions/versions/1036





In [31]:
import pandas as pd

In [32]:
gpt = pd.read_csv("/home/alokpadhi/ai-interview-system/data/datasets/kaggle/chatbot_conversations.csv")

In [33]:
gpt.shape

(12996120, 5)

In [34]:
gpt.columns

Index(['conversation_id', 'turn', 'role', 'intent', 'message'], dtype='object')

In [35]:
import numpy as np

In [36]:
np.unique(gpt["intent"])

array(['ai', 'android', 'books', 'business', 'career', 'cloud', 'coding',
       'coding_errors', 'college', 'communication', 'datascience', 'dl',
       'education', 'emotions', 'entertainment', 'farewell', 'finance',
       'fitness', 'food', 'gaming', 'general', 'geography', 'greeting',
       'habits', 'health', 'history', 'interview', 'ios', 'life',
       'marketing', 'math', 'ml', 'motivation', 'motivation_daily',
       'motivation_strong', 'music', 'networking', 'news', 'philosophy',
       'productivity', 'projects', 'psychology', 'quotes', 'relationship',
       'resume', 'salary', 'science', 'security', 'shopping', 'sleep',
       'sports', 'study', 'technology', 'travel', 'weather'], dtype=object)

In [39]:
filtered_gpt = gpt[gpt['intent'].isin(["coding", "coding_errors", "datascience", "interview", "ml"])]

In [40]:
filtered_gpt.shape

(1183174, 5)

In [41]:
filtered_gpt.to_csv("gpt_data.csv")

In [10]:
str1 = "Vector Embeddings, Semantic Search, Cosine Similarity, HNSW Index, Chunking Strategies (Fixed-size, Semantic), Re-ranking (Cross-Encoders), Dense vs. Sparse Retrieval (BM25 vs. Embeddings)"

In [11]:
str1.split(",")

['Vector Embeddings',
 ' Semantic Search',
 ' Cosine Similarity',
 ' HNSW Index',
 ' Chunking Strategies (Fixed-size',
 ' Semantic)',
 ' Re-ranking (Cross-Encoders)',
 ' Dense vs. Sparse Retrieval (BM25 vs. Embeddings)']

In [1]:
import pandas as pd

In [2]:
d = pd.read_csv("/home/alokpadhi/ai-interview-system/data/datasets/raw/interview_questions/deeplearning_questions.csv")

In [3]:
d.head()

Unnamed: 0,ID,DESCRIPTION
0,1,What is padding
1,2,Sigmoid Vs Softmax
2,3,What is PoS Tagging
3,4,What is tokenization
4,5,What is topic modeling


In [4]:
d.columns

Index(['ID', 'DESCRIPTION'], dtype='object')

In [1]:
import json

In [3]:
with open("/home/alokpadhi/ai-interview-system/data/datasets/processed/interview_questions/kaggle_datascience_interview_questions.json", "r") as f:
    d = json.load(f)

In [14]:
r = []

In [12]:
import re

In [15]:
for i in d:
    i["id"] = re.sub(r"github", "kaggle", i["id"])
    r.append(i)

In [16]:
r[0]

{'text': ' What is padding',
 'difficulty': 'easy',
 'topic': 'padding',
 'question_type': 'conceptual',
 'estimated_time_minutes': 3,
 'tags': ['data-padding', 'padding', 'text-padding', 'attribute-padding'],
 'reference_answer': 'Padding refers to the process of adding extra characters or spaces to a string, often to make it a certain length or to align with other strings.',
 'key_concepts': ['space character', 'null character'],
 'source': 'kaggle_data_science_interviews',
 'id': 'kaggle_ds_interviews_0000'}

In [17]:
len(r)

1111

In [19]:
with open("/home/alokpadhi/ai-interview-system/data/datasets/processed/interview_questions/kaggle_datascience_interview_questions.json", "w") as fp:
    json.dump(r, fp, indent=2)

In [13]:
from detoxify import Detoxify

# each model takes in either a string or a list of strings

results = Detoxify('original', device='cuda').predict('example text')

In [14]:
round(float(results['toxicity']), 2)

0.0

In [12]:
results['toxicity']

np.float32(0.0006478309)

In [1]:
import json

In [2]:
with open("/home/alokpadhi/ai-interview-system/data/datasets/processed/interview_questions/github_datascience_interview_questions.json", "r") as fp:
    github_data =  json.load(fp)

In [3]:
with open("/home/alokpadhi/ai-interview-system/data/datasets/processed/interview_questions/kaggle_datascience_interview_questions.json", "r") as fp:
    kaggle_data = json.load(fp)

In [4]:
final_data = github_data + kaggle_data

In [5]:
len(final_data)

1277

In [6]:
final_data[-1]

{'text': 'What are the key differences between classification and regression? (Sample 1000)',
 'difficulty': 'easy',
 'topic': 'Machine Learning',
 'question_type': 'conceptual',
 'estimated_time_minutes': 3,
 'tags': ['machine_learning',
  'machine learning',
  'between',
  'supervised learning',
  'regression',
  'differences',
  'sample',
  'classification',
  'unsupervised learning'],
 'reference_answer': {'classification': 'Classification is used for predicting categorical outcomes, whereas regression is used for predicting continuous outcomes.',
  'regression': 'Regression is a type of supervised learning algorithm that predicts continuous outcomes.'},
 'key_concepts': ['target variable',
  'prediction task',
  'continuous outcome',
  'categorical outcome'],
 'source': 'kaggle_data_science_interviews',
 'id': 'kaggle_ds_interviews_1110'}

In [7]:
with open("/home/alokpadhi/ai-interview-system/data/validation_output/questions_to_remove.json", "r") as fp:
    q_to_remove = json.load(fp)

In [32]:
q_to_remove[0]

{'id': 'github_ds_interviews_0004',
 'text': 'What‚Äôs the normal distribution? Why do we care about it?',
 'reasons': ['Irrelevant content']}

In [45]:
# required = {
#     "Duplicate question",
#     "Invalid difficulty 'NA'"
# }

# excluded = {
#     "Irrelevant content"
# }

# ids = [
#     d["id"]
#     for d in q_to_remove
#     if required.intersection(d.get("reasons", []))
#     and excluded.isdisjoint(d.get("reasons", []))
# ]

ids = [
    d["id"]
    for d in q_to_remove
    if set(d.get("reasons", [])) != {"Irrelevant content"}
]


In [46]:
len(ids)

1016

In [43]:
irrelevant_ids = [i for i in q_to_remove if "Irrelevant content" in i["reasons"]]

In [44]:
irrelevant_ids

[{'id': 'github_ds_interviews_0004',
  'text': 'What‚Äôs the normal distribution? Why do we care about it?',
  'reasons': ['Irrelevant content']},
 {'id': 'github_ds_interviews_0005',
  'text': 'How do we check if a variable follows the normal distribution? \u200dÔ∏è',
  'reasons': ['Irrelevant content']},
 {'id': 'github_ds_interviews_0009',
  'text': 'What is the normal equation? \u200dÔ∏è',
  'reasons': ['Irrelevant content']},
 {'id': 'github_ds_interviews_0016',
  'text': 'Why do we need to split our data into three parts: train, validation, and test?',
  'reasons': ['Irrelevant content']},
 {'id': 'github_ds_interviews_0017',
  'text': 'Can you explain how cross-validation works?',
  'reasons': ['Irrelevant content', 'Duplicate question']},
 {'id': 'github_ds_interviews_0018',
  'text': 'What is K-fold cross-validation?',
  'reasons': ['Irrelevant content']},
 {'id': 'github_ds_interviews_0019',
  'text': 'How do we choose K in K-fold cross-validation? What‚Äôs your favorite K?',

In [47]:
filtered_github_kaggle_data = []

In [48]:
for que in final_data:
    if que["id"] not in ids:
        filtered_github_kaggle_data.append(que)

In [49]:
len(filtered_github_kaggle_data)

261

In [50]:
filtered_github_kaggle_data[0]

{'text': 'What is supervised machine learning?',
 'difficulty': 'easy',
 'topic': 'supervised_machine\xa0learning',
 'question_type': 'conceptual',
 'estimated_time_minutes': 2,
 'tags': ['learning',
  'supervised_machine\xa0learning',
  'supervised',
  'machine'],
 'reference_answer': 'Supervised learning is a type of machine learning in which our algorithms are trained using well-labeled training data, and machines predict the output based on that data. Labeled data indicates that the\xa0input data has already been tagged with the appropriate output. Basically, it is the task of learning a function that maps the input set and returns an output. Some of its examples are: Linear Regression, Logistic Regression, KNN, etc.\n\nk-Nearest Neighbors(KNN):Looking at the k closest labeled data points \n',
 'key_concepts': [],
 'source': 'github_data_science_interviews',
 'id': 'github_ds_interviews_0000'}

In [51]:
with open("/home/alokpadhi/ai-interview-system/data/datasets/processed/interview_questions/filtered_github_kaggle_iqs.json", "w") as fp:
    json.dump(filtered_github_kaggle_data, fp, indent=2)

In [52]:
with open("/home/alokpadhi/ai-interview-system/data/validation_recheck_output/quantity/insufficient_categories.json",  'r') as fp:
    topics_to_cover = json.load(fp)

In [53]:
topics_to_cover

[{'category': 'topic',
  'name': 'supervised_machine\xa0learning',
  'current': 1,
  'required': 30,
  'deficit': 29},
 {'category': 'topic',
  'name': 'linear_regression',
  'current': 10,
  'required': 30,
  'deficit': 20},
 {'category': 'topic',
  'name': 'validation',
  'current': 4,
  'required': 30,
  'deficit': 26},
 {'category': 'topic',
  'name': 'classification',
  'current': 18,
  'required': 30,
  'deficit': 12},
 {'category': 'topic',
  'name': 'regularization',
  'current': 13,
  'required': 30,
  'deficit': 17},
 {'category': 'topic',
  'name': 'feature_selection',
  'current': 4,
  'required': 30,
  'deficit': 26},
 {'category': 'topic',
  'name': 'decision_trees',
  'current': 6,
  'required': 30,
  'deficit': 24},
 {'category': 'topic',
  'name': 'random_forest',
  'current': 8,
  'required': 30,
  'deficit': 22},
 {'category': 'topic',
  'name': 'gradient_boosting',
  'current': 6,
  'required': 30,
  'deficit': 24},
 {'category': 'topic',
  'name': 'parameter_tuning

In [54]:
topics = [{"topic": topic["name"], "questions_needed": topic["deficit"]}for topic in topics_to_cover]

In [55]:
topics

[{'topic': 'supervised_machine\xa0learning', 'questions_needed': 29},
 {'topic': 'linear_regression', 'questions_needed': 20},
 {'topic': 'validation', 'questions_needed': 26},
 {'topic': 'classification', 'questions_needed': 12},
 {'topic': 'regularization', 'questions_needed': 17},
 {'topic': 'feature_selection', 'questions_needed': 26},
 {'topic': 'decision_trees', 'questions_needed': 24},
 {'topic': 'random_forest', 'questions_needed': 22},
 {'topic': 'gradient_boosting', 'questions_needed': 24},
 {'topic': 'parameter_tuning', 'questions_needed': 28},
 {'topic': 'neural_networks', 'questions_needed': 22},
 {'topic': 'optimization_in_neural\xa0networks', 'questions_needed': 18},
 {'topic': 'neural_networks_for_computer\xa0vision', 'questions_needed': 23},
 {'topic': 'text_classification', 'questions_needed': 20},
 {'topic': 'clustering', 'questions_needed': 23},
 {'topic': 'dimensionality_reduction', 'questions_needed': 27},
 {'topic': 'ranking_and\xa0search', 'questions_needed': 21

In [56]:
average_needed = sum(i["questions_needed"] for i in topics) / len(topics)
print(f"Average questions needed: {average_needed}")

Average questions needed: 26.520408163265305


In [60]:
total_topics = [i["topic"] for i in topics]

In [61]:
total_topics

['supervised_machine\xa0learning',
 'linear_regression',
 'validation',
 'classification',
 'regularization',
 'feature_selection',
 'decision_trees',
 'random_forest',
 'gradient_boosting',
 'parameter_tuning',
 'neural_networks',
 'optimization_in_neural\xa0networks',
 'neural_networks_for_computer\xa0vision',
 'text_classification',
 'clustering',
 'dimensionality_reduction',
 'ranking_and\xa0search',
 'recommender_systems',
 'time_series',
 'padding',
 'Activation Functions',
 'Natural Language Processing',
 'NLP',
 'Topic Modeling',
 'Artificial Intelligence/Neural Networks',
 'General',
 'Machine Learning',
 'Analysis Techniques',
 'Optimization Techniques',
 'Convex Optimization',
 'Information Retrieval',
 'Deep Learning Fundamentals',
 'Optimization Techniques for Online Learning',
 'Neural Networks',
 'Linear Algebra and Matrix Operations',
 'Word Embeddings',
 'Batch Size Selection',
 'Optimization Algorithms',
 'Deep Learning Optimization',
 'ReLU Activation Function and Dy

In [59]:
98*26

2548

In [84]:
files = [
        "filtered_github_kaggle_iqs.json",
        "interview_questions_chip.json",
        "interview_questions.json",
        "leetcode_questions.json",
        "llm_generated_iqs.json",
        "system_design_iqs.json"
    ]

In [85]:
from pathlib import Path

In [86]:
base_dir = Path(Path.cwd()).parent

In [87]:
base_dir

PosixPath('/home/alokpadhi/ai-interview-system')

In [88]:
datasets_path = base_dir / "data/datasets/processed/interview_questions"

In [89]:
datasets_path

PosixPath('/home/alokpadhi/ai-interview-system/data/datasets/processed/interview_questions')

In [90]:
final_data = []

In [91]:
def load_json(file):
    with open(file, "r") as fp:
        data = json.load(fp)

    return data

In [92]:
for file in files:
    print(f"  - Loading {file}...")
    data = load_json(datasets_path / file)
    final_data.extend(data)

  - Loading filtered_github_kaggle_iqs.json...
  - Loading interview_questions_chip.json...
  - Loading interview_questions.json...
  - Loading leetcode_questions.json...
  - Loading llm_generated_iqs.json...
  - Loading system_design_iqs.json...


In [93]:
len(final_data)

717

In [94]:
ids = [data["id"] for data in final_data]

In [95]:
len(ids) == len(set(ids))

True

In [96]:
len(ids)

717

In [100]:
len([data["mistakes"] for data in final_data if "mistakes" in data])

0

In [103]:
with open("/home/alokpadhi/ai-interview-system/data/rubrics/all_rubrics.json", "r") as f:
    rubrics = json.load(f)

In [110]:
assert list(rubrics.keys()) == ids

In [111]:
with open("/home/alokpadhi/ai-interview-system/data/rubrics/manual_review_needed.json", "r") as fp:
    reviews = json.load(fp)

In [112]:
set(review['reason'] for review in reviews)

{'Hard question - needs detailed rubric',
 'Math question - verify key steps',
 'No reference answer',
 'System design - needs custom criteria'}

In [1]:
files = ["ml_concepts.json", "wikipedia_ml_concepts_part2.json", "wikipedia_ml_concepts.json"]

In [2]:
concepts = []

In [3]:
import json

In [4]:
for file in files:
    with open("/home/alokpadhi/ai-interview-system/data/datasets/processed/concepts/"+file, "r") as f:
        data = json.load(f)
        concepts.extend(data)

In [5]:
len(concepts)

125