In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV,train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
import seaborn as sns
from tqdm import tqdm
from sklearn.utils import Bunch
import joblib

In [None]:
# Load the raw quiz-question dataset from the notebook's working directory.
data = pd.read_csv(filepath_or_buffer='Quiz_Question_Data_2.csv')

In [80]:
# Encode the target label as an integer class id.
skill_mapping = {'Beginner': 0, 'Intermediate': 1, 'Advanced': 2}
data['Skill_Level_encoded'] = data['Skill Level'].map(skill_mapping)

# Convert the time column from minutes to seconds and replace the original.
# The guard keeps this cell idempotent: on a re-run the minutes column has
# already been dropped, so the conversion is skipped instead of raising.
minutes_col = 'Time Taken (QNS/Minutes)'
seconds_col = 'Time Taken (QNS/Seconds)'
if minutes_col in data.columns:
    data[seconds_col] = (data[minutes_col] * 60).round(4)
    data = data.drop(columns=[minutes_col])
elif seconds_col not in data.columns:
    # Neither form of the time column is present: the input schema is wrong.
    raise KeyError(minutes_col)

# Strip any '%' sign from the score and store it as a float.
data['Score'] = data['Score'].replace('%', '', regex=True).astype(float)

# Save the modified data to a new CSV file (reloaded later by the training cell).
data.to_csv('modified_file.csv', index=False)

In [82]:
# Count missing values per column (computed once and reused for the check below).
missing_data = data.isnull().sum()
print(missing_data)

# Report whether any column contains missing values.
if missing_data.any():
    print("There are missing values in the dataset.")
else:
    print("No missing values in the dataset.")

# Drop incomplete rows, then re-save the cleaned frame. The training cell
# reloads 'modified_file.csv', so without this write the dropna() above would
# only affect the in-memory frame and never reach the model.
data = data.dropna()
data.to_csv('modified_file.csv', index=False)

Question Difficulty         0
Topic Difficulty            0
Score                       0
Skill Level                 0
Skill_Level_encoded         0
Time Taken (QNS/Seconds)    0
dtype: int64
No missing values in the dataset.


In [26]:
# Reload the preprocessed dataset from disk.
data = pd.read_csv(filepath_or_buffer='modified_file.csv')

# Feature matrix (four predictor columns) and the encoded skill-level target.
X = data.loc[:, ['Time Taken (QNS/Seconds)', 'Question Difficulty', 'Topic Difficulty', 'Score']]
Y = data.loc[:, 'Skill_Level_encoded']

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

print("Training set size:", X_train.shape)
print("Testing set size:", X_test.shape)

# Learn the scaling statistics from the training split only, then apply the
# same transform to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

Training set size: (24000, 4)
Testing set size: (6000, 4)


In [28]:
# Base estimator; the fixed seed keeps forest construction repeatable.
rf = RandomForestClassifier(random_state=42)

# Candidate hyper-parameter values the randomized search samples from.
param_grid = dict(
    n_estimators=[450, 500, 550],
    max_depth=[10, 12, 14],
    max_features=['log2', None, 'sqrt'],
    min_samples_split=[30, 35, 40],
    min_samples_leaf=[15, 18, 20],
    bootstrap=[True, False],
)

# Try 100 sampled combinations with the default cross-validation and scoring,
# running the fits in parallel (n_jobs=-1).
random_search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_grid,
    n_iter=100,
    random_state=42,
    n_jobs=-1,
    verbose=2,
)
random_search.fit(X_train_scaled, y_train)
print("Best parameters:", random_search.best_params_)

# Keep the best estimator found by the search and predict on the held-out split.
model = random_search.best_estimator_
y_pred = model.predict(X_test_scaled)

Fitting 5 folds for each of 100 candidates, totalling 500 fits
Best parameters: {'n_estimators': 500, 'min_samples_split': 30, 'min_samples_leaf': 15, 'max_features': 'log2', 'max_depth': 14, 'bootstrap': False}


In [33]:
# Overall accuracy on the held-out test split, printed as a percentage.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")

# Per-class precision / recall / F1; target_names are listed in the order of
# the encoded class ids 0, 1, 2.
print(
    "Classification Report:",
    classification_report(y_test, y_pred, target_names=["Beginner", "Intermediate", "Advanced"]),
    sep="\n",
)

Accuracy: 98.83%
Classification Report:
              precision    recall  f1-score   support

    Beginner       0.99      0.97      0.98      1591
Intermediate       0.98      0.99      0.99      2487
    Advanced       1.00      1.00      1.00      1922

    accuracy                           0.99      6000
   macro avg       0.99      0.99      0.99      6000
weighted avg       0.99      0.99      0.99      6000



In [35]:
# Persist the fitted scaler alongside the model. The model was trained on
# standardized features, so inference needs exactly the same transform;
# saving it means consumers do not have to refit a scaler on the training data.
joblib.dump(scaler, 'skill_level_scaler.joblib')

# Persist the tuned classifier. Kept as the last expression so the cell still
# displays the list of written file names for the model.
joblib.dump(model, 'skill_level_model.joblib')

['skill_level_model.joblib']

In [37]:
# Reload the persisted classifier from disk.
model = joblib.load(filename='skill_level_model.joblib')

# Rebuild the feature scaler by fitting it on the training features.
# NOTE(review): this relies on X_train from the training cell still being in memory.
scaler = StandardScaler()
scaler.fit(X=X_train)

In [39]:
# Two hand-written samples that differ only in the time taken; the columns
# use the same names and order as the training features.
new_data = pd.DataFrame.from_dict({
    'Time Taken (QNS/Seconds)': [4.998, 19.2],
    'Question Difficulty': [5.2, 5.2],
    'Topic Difficulty': [7, 7],
    'Score': [50, 50],
})

# Apply the fitted scaler, then classify with the trained model.
new_data_scaled = scaler.transform(new_data)
predictions = model.predict(new_data_scaled)

# Translate encoded class ids back to skill-level names
# (an unknown id raises KeyError).
label_mapping = dict(enumerate(["Beginner", "Intermediate", "Advanced"]))
decoded_predictions = list(map(label_mapping.__getitem__, predictions))

print("Predictions:", decoded_predictions)

Predictions: ['Intermediate', 'Beginner']
