In [25]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Nearest-misconception baseline: every misconception is represented by the
# concatenated text of its training examples, embedded together with the test
# inputs using word- and character-level TF-IDF, and the misconceptions are
# ranked for each test row by a weighted sum of the two cosine similarities.

TRAIN_PATH = "/kaggle/input/map-charting-student-math-misunderstandings/train.csv"
TEST_PATH = "/kaggle/input/map-charting-student-math-misunderstandings/test.csv"
SUBMISSION_PATH = "submission.csv"

# Labels written per test row. The competition is scored with MAP@3 and its
# submission format allows at most three space-separated labels per row, so
# anything beyond the three best-ranked labels is never evaluated.
TOP_K = 3

# Category prefix attached to every predicted misconception name.
LABEL_PREFIX = "False_Misconception"

# Blend weights for the word-level and char-level similarities (tune as needed).
WORD_WEIGHT = 0.7
CHAR_WEIGHT = 0.3


def format_top_k(similarity, labels, k=TOP_K, prefix=LABEL_PREFIX):
    """Return one space-separated prediction string per similarity row.

    Args:
        similarity: 2-D array-like; ``similarity[r][c]`` is the score of
            ``labels[c]`` for row ``r``.
        labels: Sequence of label names, one per column of ``similarity``.
        k: Maximum number of labels per row, highest score first. A row with
            fewer than ``k`` columns yields all of its labels.
        prefix: Text placed before each label, separated from it by ``:``.

    Returns:
        A list of strings, one per row, e.g. ``"P:a P:b P:c"``.

    Raises:
        ValueError: If ``k`` is not positive (a ``[-0:]`` slice would
            otherwise silently select every label).
    """
    if k <= 0:
        raise ValueError("k must be a positive integer")

    formatted = []
    for scores in np.asarray(similarity):
        # argsort is ascending: take the last k indices, then reverse them so
        # the best-scoring label comes first.
        best = scores.argsort()[-k:][::-1]
        formatted.append(" ".join(f"{prefix}:{labels[i]}" for i in best))
    return formatted


# In a notebook cell or a script run __name__ is "__main__", so the pipeline
# below executes as usual; the guard only keeps an import of this file from
# trying to read the Kaggle inputs. The statements are deliberately not wrapped
# in a function so train, test, sim_total and submission stay module globals.
if __name__ == "__main__":
    # Load data
    train = pd.read_csv(TRAIN_PATH)
    test = pd.read_csv(TEST_PATH)

    # Keep rows labelled True_Misconception that also name a misconception.
    # NOTE(review): the corpus comes from True_Misconception rows while every
    # prediction is emitted with the False_Misconception prefix -- confirm
    # that this mismatch is intended.
    train_miscon = train[train['Category'] == "True_Misconception"].copy()
    train_miscon = train_miscon.dropna(subset=['Misconception'])

    # Concatenate question and explanation (missing values become '')
    train_miscon['text'] = (
        train_miscon['QuestionText'].fillna('')
        + " "
        + train_miscon['StudentExplanation'].fillna('')
    )
    test['input_text'] = (
        test['QuestionText'].fillna('')
        + " "
        + test['StudentExplanation'].fillna('')
    )

    # Create misconception-level text corpus: one joined document per name
    miscon_dict = (
        train_miscon.groupby('Misconception')['text'].apply(" ".join).to_dict()
    )
    miscon_ids = list(miscon_dict.keys())
    miscon_texts = list(miscon_dict.values())

    # Misconception documents first, test inputs after, so both share one
    # fitted vocabulary and can be separated again by position.
    all_texts = miscon_texts + test['input_text'].tolist()

    # TF-IDF vectorizers
    word_vectorizer = TfidfVectorizer(
        ngram_range=(1, 3), max_features=20000, stop_words='english'
    )
    char_vectorizer = TfidfVectorizer(
        analyzer='char_wb', ngram_range=(3, 5), max_features=10000
    )

    word_matrix = word_vectorizer.fit_transform(all_texts)
    char_matrix = char_vectorizer.fit_transform(all_texts)

    # Split the stacked matrices back into misconception rows and test rows
    num_miscon = len(miscon_texts)
    miscon_word = word_matrix[:num_miscon]
    test_word = word_matrix[num_miscon:]
    miscon_char = char_matrix[:num_miscon]
    test_char = char_matrix[num_miscon:]

    # Similarities: rows are test inputs, columns are misconceptions
    sim_word = cosine_similarity(test_word, miscon_word)
    sim_char = cosine_similarity(test_char, miscon_char)

    # Combine similarity matrices
    sim_total = WORD_WEIGHT * sim_word + CHAR_WEIGHT * sim_char

    # Build predictions
    predictions = format_top_k(sim_total, miscon_ids)

    # Create submission
    submission = pd.DataFrame({
        'row_id': test['row_id'],
        'Category:Misconception': predictions
    })

    submission.to_csv(SUBMISSION_PATH, index=False)
    print(submission)

   row_id                             Category:Misconception
0   36696  False_Misconception:WNB False_Misconception:In...
1   36697  False_Misconception:WNB False_Misconception:In...
2   36698  False_Misconception:Shorter_is_bigger False_Mi...
