In [1]:
import joblib
data=joblib.load("training_data.pkl")

In [13]:
import pandas as pd
x=data["user_input"]
y_class=data["problem_class"]
y_score=data["problem_score"]

In [4]:
from sklearn.model_selection import train_test_split

# splitting that dataset to two parts - first for training (~80%), second for testing (~20%) (about 800 problems)
# random state keeps the randomness of the split fixed
# stratify keeps the proportion of problems (easy,med,hard) equal in train and test set to ensure that training and test set dont lack any kind of class

x_train, x_test, y_class_train, y_class_test, y_score_train, y_score_test = train_test_split(
    x,
    y_class,
    y_score,
    test_size=0.20,
    random_state=42,
    stratify=y_class
)

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF is used to mark words acc to their frequency
tfidf = TfidfVectorizer(
    max_features=5000,
    ngram_range=(1,2),
    stop_words="english"
)

x_train_tfidf = tfidf.fit_transform(x_train)
x_test_tfidf = tfidf.transform(x_test)
joblib.dump(tfidf , "tfidf.pkl")

['tfidf.pkl']

In [6]:
keywords = ["greedy","dp","dynamic programming","tree","graph","dfs","bfs","two pointers","binary search","bitmasks","dfs","combinatorics"]
def extra_features(text):
    return [
        len(text),
        len(text.split()),
        sum(text.count(k) for k in keywords),
        sum(1 for c in text if c in "+-*/%")
    ]

In [7]:
from scipy.sparse import hstack
import numpy as np

x_train_extra = np.array(x_train.apply(extra_features).tolist())
x_test_extra = np.array(x_test.apply(extra_features).tolist())

x_train_final = hstack([x_train_tfidf, x_train_extra])
x_test_final = hstack([x_test_tfidf, x_test_extra])

In [8]:
# Classification through logistic regression
from sklearn.linear_model import LogisticRegression

clf_lr = LogisticRegression(max_iter=5000, class_weight="balanced")
clf_lr.fit(x_train_final, y_class_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [9]:
clf_lr

In [10]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

test_lr = clf_lr.predict(x_test_final)
print("Logistic Regression Accuracy:", 100*accuracy_score(y_class_test, test_lr), "%")

Logistic Regression Accuracy: 46.65856622114216 %


In [11]:
print("Logistic Regression Report:\n", classification_report(y_class_test, test_lr))

Logistic Regression Report:
               precision    recall  f1-score   support

        easy       0.38      0.60      0.46       153
        hard       0.61      0.52      0.56       389
      medium       0.37      0.32      0.35       281

    accuracy                           0.47       823
   macro avg       0.45      0.48      0.46       823
weighted avg       0.48      0.47      0.47       823



In [14]:
# Confusion Matrix for Logistic Regression
cm_lr=pd.DataFrame(confusion_matrix(y_class_test, test_lr))
cm_lr.columns=["Easy","Med.","Hard"]
cm_lr.index=["Easy","Med.","Hard"]
cm_lr

Unnamed: 0,Easy,Med.,Hard
Easy,92,20,41
Med.,74,201,114
Hard,79,111,91


In [15]:
# # Classification through Linear SVM

from sklearn.svm import LinearSVC
clf_svm = LinearSVC(max_iter=5000, random_state=42, class_weight="balanced")
clf_svm.fit(x_train_final, y_class_train)



In [16]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

test_svm = clf_svm.predict(x_test_final)
print("Linear SVM Accuracy:", 100*accuracy_score(y_class_test, test_svm), "%")

Linear SVM Accuracy: 21.020656136087485 %


In [17]:
print("Linear SVM Report:\n", classification_report(y_class_test, test_svm))

Linear SVM Report:
               precision    recall  f1-score   support

        easy       0.19      0.97      0.32       153
        hard       0.63      0.06      0.11       389
      medium       0.00      0.00      0.00       281

    accuracy                           0.21       823
   macro avg       0.27      0.35      0.14       823
weighted avg       0.33      0.21      0.11       823



In [18]:
# Confusion Matrix for Linear SVM
cm_svm=pd.DataFrame(confusion_matrix(y_class_test, test_svm))
cm_svm.columns=["Easy","Med.","Hard"]
cm_svm.index=["Easy","Med.","Hard"]
cm_svm

Unnamed: 0,Easy,Med.,Hard
Easy,149,3,1
Med.,365,24,0
Hard,270,11,0


In [19]:
# Classification through Multinomial Naive Bayes

from sklearn.naive_bayes import MultinomialNB
clf_mnb = MultinomialNB()
clf_mnb.fit(x_train_final, y_class_train)

In [20]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

test_mnb = clf_mnb.predict(x_test_final)
print("Multinomial Naive Bayes Accuracy:", 100*accuracy_score(y_class_test, test_mnb), "%")

Multinomial Naive Bayes Accuracy: 48.23815309842041 %


In [21]:
print("Multinomial Naive Bayes Report:\n", classification_report(y_class_test, test_mnb))

Multinomial Naive Bayes Report:
               precision    recall  f1-score   support

        easy       0.53      0.05      0.10       153
        hard       0.50      0.90      0.65       389
      medium       0.35      0.14      0.19       281

    accuracy                           0.48       823
   macro avg       0.46      0.36      0.31       823
weighted avg       0.45      0.48      0.39       823



In [22]:
# Confusion Matrix for MNB
cm_nb=pd.DataFrame(confusion_matrix(y_class_test, test_mnb))
cm_nb.columns=["Easy","Med.","Hard"]
cm_nb.index=["Easy","Med.","Hard"]
cm_nb

Unnamed: 0,Easy,Med.,Hard
Easy,8,109,36
Med.,2,351,36
Hard,5,238,38


In [23]:
# Classification through Random Forest Classifier

from sklearn.ensemble import RandomForestClassifier

clf_rf = RandomForestClassifier(
    n_estimators=500,
    max_depth=None,
    class_weight="balanced",
    random_state=42,
    n_jobs=-1
)

clf_rf.fit(x_train_final, y_class_train)


In [24]:
test_rf=clf_rf.predict(x_test_final)
print("Random Forest Accuracy:", 100*accuracy_score(y_class_test, test_rf), "%")

Random Forest Accuracy: 52.61239368165249 %


In [25]:
print("Random Forest Classifier Report:\n", classification_report(y_class_test, test_rf))

Random Forest Classifier Report:
               precision    recall  f1-score   support

        easy       0.54      0.38      0.44       153
        hard       0.54      0.85      0.66       389
      medium       0.43      0.16      0.24       281

    accuracy                           0.53       823
   macro avg       0.50      0.46      0.45       823
weighted avg       0.50      0.53      0.48       823



In [26]:
# Confusion Matrix for Random Forest
cm_rf=pd.DataFrame(confusion_matrix(y_class_test, test_rf))
cm_rf.columns=["Easy","Med.","Hard"]
cm_rf.index=["Easy","Med.","Hard"]
cm_rf

Unnamed: 0,Easy,Med.,Hard
Easy,58,73,22
Med.,22,329,38
Hard,28,207,46
