In [76]:
import json
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix, classification_report
from gensim.models import Word2Vec

In [77]:
# Load dataset
def load_json(file_path, key="questions"):
    """Load a JSON file and return its records.

    Args:
        file_path: Path of the UTF-8 encoded JSON file to read.
        key: When the top-level JSON value is an object (dict), the records
            are read from this key. Defaults to ``"questions"``.

    Returns:
        The parsed top-level value when it is not a dict; otherwise the value
        stored under ``key``, or an empty list when ``key`` is absent.
    """
    with open(file_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    # The file handle is no longer needed once parsing has finished.
    if isinstance(data, dict):
        return data.get(key, [])
    return data

In [78]:
# Read the train and test splits from the Kaggle input directory.
train_data, test_data = (
    load_json(split_path)
    for split_path in (
        r"/kaggle/input/book1-dtaset/train.json",
        r"/kaggle/input/book1-dtaset/test.json",
    )
)


In [79]:
# Split each record into its input text ("question_latex") and its
# target label ("chapter"), for both the train and the test split.
train_questions = list(map(lambda rec: rec["question_latex"], train_data))
train_labels = list(map(lambda rec: rec["chapter"], train_data))
test_questions = list(map(lambda rec: rec["question_latex"], test_data))
test_labels = list(map(lambda rec: rec["chapter"], test_data))

In [80]:
# Baseline: TF-IDF features fed into a depth-limited decision tree.
# random_state is pinned so the tree (and every metric derived from it)
# is identical on each Restart & Run All.
model = make_pipeline(
    TfidfVectorizer(),
    DecisionTreeClassifier(max_depth=10, random_state=42),
)

In [85]:
# Fit the baseline pipeline on the raw training questions and their labels.
model.fit(X=train_questions, y=train_labels)

In [86]:
# Predict a chapter label for every held-out test question.
predictions = model.predict(X=test_questions)

In [87]:
# Score the baseline predictions against the test labels.
conf_matrix = confusion_matrix(y_true=test_labels, y_pred=predictions)
precision, recall, f1, _ = precision_recall_fscore_support(
    y_true=test_labels,
    y_pred=predictions,
    average="weighted",  # per-class scores weighted by class support
)
accuracy = accuracy_score(y_true=test_labels, y_pred=predictions)


In [88]:
# Report the headline metrics, then the per-class breakdown.
summary_lines = [
    f"Accuracy: {accuracy:.4f}",
    f"Precision: {precision:.4f}",
    f"Recall: {recall:.4f}",
    f"F1 Score: {f1:.4f}",
]
print(*summary_lines, sep="\n")

report_text = classification_report(test_labels, predictions)
print("\nClassification Report:\n", report_text)

Accuracy: 0.4486
Precision: 0.7587
Recall: 0.4486
F1 Score: 0.4744

Classification Report:
                                     precision    recall  f1-score   support

        Computations with Matrices       0.54      0.31      0.39        65
                      Determinants       0.97      0.77      0.86        74
      Eigenvalues and Eigenvectors       0.76      0.38      0.51        91
Linear Programming and Game Theory       1.00      0.32      0.49        65
 Matrices and Gaussian Elimination       0.21      0.89      0.34        96
                     Orthogonality       0.82      0.20      0.32        71
        Positive Definite Matrices       0.94      0.45      0.61        71
                     Vector Spaces       0.92      0.22      0.36       109

                          accuracy                           0.45       642
                         macro avg       0.77      0.44      0.48       642
                      weighted avg       0.76      0.45      0.47     

In [89]:
# Show the baseline confusion matrix under its heading.
print("\nConfusion Matrix:", conf_matrix, sep="\n")


Confusion Matrix:
[[20  0  3  0 42  0  0  0]
 [ 0 57  0  0 17  0  0  0]
 [ 4  1 35  0 49  1  1  0]
 [ 4  0  1 21 39  0  0  0]
 [ 1  0  6  0 85  2  0  2]
 [ 0  0  1  0 56 14  0  0]
 [ 8  0  0  0 31  0 32  0]
 [ 0  1  0  0 83  0  1 24]]


In [90]:
# Registry of text-vectorization techniques, keyed by display name.
vectorizers = {}
vectorizers["BoW"] = CountVectorizer()
vectorizers["TF-IDF"] = TfidfVectorizer()

In [91]:
# Train a Word2Vec model on the whitespace-tokenised training questions.
sentences = [q.split() for q in train_questions]
# seed + a single worker thread keep the learned embeddings repeatable
# between runs; with several workers the training order varies from run to
# run, so the downstream metrics would change on every execution.
word2vec = Word2Vec(
    sentences,
    vector_size=100,
    window=5,
    min_count=1,
    workers=1,
    seed=42,
)

def vectorize_w2v(texts, model=None):
    """Embed each text as the mean of its in-vocabulary word vectors.

    Args:
        texts: Iterable of strings; each one is tokenised with ``str.split()``.
        model: Object exposing a ``wv`` lookup that supports ``in``, ``[]``
            and a ``vector_size`` attribute. Defaults to the notebook-level
            ``word2vec`` model.

    Returns:
        A 2-D ``np.ndarray`` with one row per text and ``vector_size``
        columns. A text with no known words becomes an all-zero row, and an
        empty ``texts`` yields an array of shape ``(0, vector_size)``.
    """
    keyed_vectors = (word2vec if model is None else model).wv
    # Take the dimensionality from the model so the zero-vector fallback
    # always matches the real embeddings instead of a hard-coded 100.
    dim = keyed_vectors.vector_size

    vectors = []
    for text in texts:
        word_vectors = [
            keyed_vectors[word] for word in text.split() if word in keyed_vectors
        ]
        vectors.append(np.mean(word_vectors, axis=0) if word_vectors else np.zeros(dim))

    if not vectors:
        # Keep the result 2-D even when there is nothing to embed.
        return np.zeros((0, dim))
    return np.array(vectors)

# Register the Word2Vec embedding function alongside the sklearn vectorizers.
vectorizers.update({"Word2Vec": vectorize_w2v})


In [92]:
# Candidate classifiers, each paired with the hyperparameter grid to search.
model_params = {
    "Naive Bayes": (MultinomialNB(),
                    {"alpha": [0.1, 1, 10]}),
    "KNN": (KNeighborsClassifier(),
            {"n_neighbors": [3, 5, 10], "weights": ["uniform", "distance"]}),
    # random_state is pinned so the grid search picks the same tree, and
    # reports the same scores, on every run.
    "Decision Tree": (DecisionTreeClassifier(random_state=42),
                      {"max_depth": [5, 10, 20]}),
}

In [None]:
# Evaluate every (vectorizer, classifier) pair with grid-searched hyperparameters.
# NOTE(review): the features are fitted on the full training set before
# GridSearchCV splits it, so the CV folds share vocabulary/IDF/scaling
# statistics; the held-out test scores below are unaffected by this.
for vec_name, vectorizer in vectorizers.items():
    print(f"\n=== Using {vec_name} Vectorization ===")

    if vec_name == "Word2Vec":
        # The Word2Vec entry is a plain function returning dense embeddings,
        # which are then standardised with statistics from the training set.
        X_train = vectorizer(train_questions)
        X_test = vectorizer(test_questions)
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)
    else:
        X_train = vectorizer.fit_transform(train_questions)
        X_test = vectorizer.transform(test_questions)

    # The loop variable is `estimator`, not `model`, so the fitted baseline
    # pipeline bound to `model` in the earlier cell is not overwritten by an
    # unfitted classifier template.
    for model_name, (estimator, param_grid) in model_params.items():

        # Skip Naive Bayes if using Word2Vec
        if vec_name == "Word2Vec" and model_name == "Naive Bayes":
            print(f"Skipping Naive Bayes for {vec_name} (requires non-negative features)")
            continue

        print(f"\nModel: {model_name}")
        grid_search = GridSearchCV(estimator, param_grid, cv=3, scoring='accuracy', n_jobs=-1)
        grid_search.fit(X_train, train_labels)
        best_model = grid_search.best_estimator_
        predictions = best_model.predict(X_test)

        accuracy = accuracy_score(test_labels, predictions)
        precision, recall, f1, _ = precision_recall_fscore_support(test_labels, predictions, average="weighted")
        conf_matrix = confusion_matrix(test_labels, predictions)

        print(f"Best Params: {grid_search.best_params_}")
        print(f"Accuracy: {accuracy:.4f}")
        print(f"Precision: {precision:.4f}")
        print(f"Recall: {recall:.4f}")
        print(f"F1 Score: {f1:.4f}")
        print("Confusion Matrix:")
        print(conf_matrix)


=== Using BoW Vectorization ===

Model: Naive Bayes
Best Params: {'alpha': 0.1}
Accuracy: 0.7882
Precision: 0.7937
Recall: 0.7882
F1 Score: 0.7897
Confusion Matrix:
[[53  2  3  0  2  3  1  1]
 [ 2 58  3  2  8  1  0  0]
 [ 2  3 64  0  9  3  0 10]
 [ 0  1  0 62  0  0  0  2]
 [ 1  4  5  0 66  5  1 14]
 [ 3  0  2  0  6 55  0  5]
 [ 2  3  3  0  0  1 61  1]
 [ 0  5  0  0 11  6  0 87]]

Model: KNN
Best Params: {'n_neighbors': 5, 'weights': 'distance'}
Accuracy: 0.5483
Precision: 0.6083
Recall: 0.5483
F1 Score: 0.5423
Confusion Matrix:
[[ 9  5 13  0  9  3  4 22]
 [ 0 47  6  1 10  0  2  8]
 [ 0  6 60  0  8  3  2 12]
 [ 0  2  3 52  0  0  0  8]
 [ 0  5 13  1 41  1  3 32]
 [ 1  5  5  2  6 35  2 15]
 [ 0  3  8  1  2  1 48  8]
 [ 0 10 12  0 18  7  2 60]]

Model: Decision Tree
Best Params: {'max_depth': 20}
Accuracy: 0.5872
Precision: 0.7019
Recall: 0.5872
F1 Score: 0.5991
Confusion Matrix:
[[28  0  4  1 15 16  0  1]
 [ 0 53  0  1 13  6  1  0]
 [ 3  2 45  0 24 13  2  2]
 [ 2  0  1 50  9  3  0  0]
 [