<a href="https://colab.research.google.com/github/namankansal000/AutoJudge/blob/main/MainCode.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load the dataset: one JSON object per line (JSON Lines format).
df = pd.read_json('/content/problems_data.jsonl', lines=True)

print("Dataset Info:")
# DataFrame.info() writes its report straight to stdout and returns None,
# so wrapping it in print() would append a stray "None" line to the output.
df.info()
print("\nFirst 5 rows:")
print(df.head())
print("\nClass distribution:")
# Computed once and reused for both the printed table and the bar chart.
class_counts = df['problem_class'].value_counts()
print(class_counts)
print("\nScore statistics:")
print(df['problem_score'].describe())

# Visualize the class and score distributions side by side.
plt.figure(figsize=(10, 5))
plt.subplot(1, 2, 1)
class_counts.plot(kind='bar')
plt.title('Class Distribution')
plt.xlabel('problem_class')
plt.ylabel('Number of problems')

plt.subplot(1, 2, 2)
df['problem_score'].hist(bins=30)
plt.title('Score Distribution')
plt.xlabel('problem_score')
plt.ylabel('Number of problems')
plt.tight_layout()
plt.show()

In [None]:

# ------------------------------
# 1. Imports
# ------------------------------
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from imblearn.over_sampling import SMOTE
from collections import Counter

# Download stopwords (first time only)
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# ------------------------------
# 2. Text preprocessing functions
# ------------------------------
def preprocess_text(text):
    """Normalise one raw text value before vectorisation.

    A value that ``pd.isna`` reports as missing yields an empty string.
    Any other value is converted to ``str``, lower-cased, has every run of
    whitespace collapsed to a single space, and has each token found in the
    module-level ``stop_words`` set removed.

    Returns:
        The cleaned text as a single space-separated string.
    """
    if pd.isna(text):
        return ""
    normalised = re.sub(r'\s+', ' ', str(text).lower()).strip()
    # Lazily filter tokens; the stop-word set is only consulted per token.
    kept_tokens = (token for token in normalised.split() if token not in stop_words)
    return ' '.join(kept_tokens)

def combine_text_fields(row):
    """Merge a problem's four text fields into one preprocessed string.

    Reads ``title``, ``description``, ``input_description`` and
    ``output_description`` from ``row`` and joins them with single spaces
    before handing the result to ``preprocess_text``.

    Missing fields (``None`` / NaN) contribute an empty string. Formatting
    them directly into an f-string would instead inject the literal tokens
    "nan" / "none" into the text, polluting both the TF-IDF vocabulary and
    the derived text length.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) holding the four fields.

    Returns:
        The cleaned, combined text.
    """
    field_names = ('title', 'description', 'input_description', 'output_description')
    parts = []
    for field in field_names:
        value = row[field]
        # Only scalars are tested with pd.isna; for a non-scalar it would
        # return an array, which cannot be used as a truth value.
        is_missing = value is None or (pd.api.types.is_scalar(value) and pd.isna(value))
        parts.append("" if is_missing else str(value))
    # Extra spaces left by empty parts are collapsed by preprocess_text.
    return preprocess_text(' '.join(parts))

# ------------------------------
# 3. Apply preprocessing to dataframe
# ------------------------------
# Build the cleaned, merged text column row by row, then record how many
# characters each cleaned text contains.
combined_series = df.apply(combine_text_fields, axis=1)
df['combined_text'] = combined_series
df['text_length'] = df['combined_text'].map(len)

# Encode the difficulty names as integers (hard=0, medium=1, easy=2).
class_mapping = dict(hard=0, medium=1, easy=2)
df['class_label'] = df['problem_class'].map(class_mapping)

n_samples = len(df)
mean_length = df['text_length'].mean()
print(f"Processed {n_samples} samples")
print(f"Average text length: {mean_length:.0f} characters")
print("Class distribution before split:")
print(df['class_label'].value_counts())

# ------------------------------
# 4. Train/test split (stratified) on the raw text
# ------------------------------
# The split happens BEFORE the vectorizer is fitted. Fitting TF-IDF on every
# row would let test documents shape the vocabulary and IDF weights, which
# leaks test information into training and inflates the reported scores.
y = df['class_label']
text_train, text_test, y_train, y_test = train_test_split(
    df['combined_text'], y, test_size=0.2, random_state=42, stratify=y
)

# ------------------------------
# 5. Convert text to numeric features (TF-IDF, fitted on training text only)
# ------------------------------
vectorizer = TfidfVectorizer(max_features=5000)  # limit vocab size
X_train = vectorizer.fit_transform(text_train)
X_test = vectorizer.transform(text_test)
# Full-dataset matrix kept under its original name for any later use; it is
# produced with the training-fitted vocabulary rather than refitted.
X = vectorizer.transform(df['combined_text'])

print("\nClass distribution in training set (before SMOTE):")
print(Counter(y_train))

# ------------------------------
# 6. Handle class imbalance with SMOTE (training split only)
# ------------------------------
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

print("\nClass distribution in training set (after SMOTE):")
print(Counter(y_train_res))

# ------------------------------
# Now X_train_res, y_train_res and X_test, y_test are ready for training classifiers
# ------------------------------


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from scipy.sparse import hstack, csr_matrix
from imblearn.over_sampling import SMOTE
from collections import Counter
import numpy as np

# ------------------------------
# 1. Prepare labels
# ------------------------------
y_class = df['class_label'].values
y_score = df['problem_score'].values  # regression target (problem score)

# ------------------------------
# 2. Pick the train/test rows BEFORE fitting any transformer
# ------------------------------
# Fitting TF-IDF or the scaler on every row would leak test-set statistics
# (vocabulary, IDF weights, mean/std) into training. The split depends only on
# the number of rows, `stratify` and `random_state`, so a later
# train_test_split on X_features with these same arguments selects the same rows.
row_positions = np.arange(len(df))
train_idx, test_idx = train_test_split(
    row_positions, test_size=0.2, random_state=42, stratify=y_class
)

# ------------------------------
# 3. TF-IDF features (fitted on training rows only)
# ------------------------------
tfidf = TfidfVectorizer(
    max_features=5000,   # Limit number of features
    min_df=5,             # Ignore terms appearing in <5 training documents
    max_df=0.7,           # Ignore terms appearing in >70% of training documents
    stop_words='english',
    ngram_range=(1, 2)    # Unigrams + Bigrams
)
tfidf.fit(df['combined_text'].iloc[train_idx])
X_tfidf = tfidf.transform(df['combined_text'])

# ------------------------------
# 4. Scale numeric features (fitted on training rows only)
# ------------------------------
scaler = StandardScaler()
text_len = df['text_length'].values.reshape(-1, 1)
scaler.fit(text_len[train_idx])
text_len_scaled = scaler.transform(text_len)

# Combine TF-IDF + numeric features; CSR format allows row indexing below.
X_features = hstack([X_tfidf, csr_matrix(text_len_scaled)]).tocsr()

# ------------------------------
# 5. Materialise the stratified train/test split
# ------------------------------
X_train, X_test = X_features[train_idx], X_features[test_idx]
y_class_train, y_class_test = y_class[train_idx], y_class[test_idx]

print("Class distribution in training set (before SMOTE):", Counter(y_class_train))

# ------------------------------
# 6. Handle class imbalance with SMOTE (training split only)
# ------------------------------
smote = SMOTE(random_state=42)
X_train_res, y_class_train_res = smote.fit_resample(X_train, y_class_train)

print("Class distribution in training set (after SMOTE):", Counter(y_class_train_res))

# ------------------------------
# Now ready for classifier training
# X_train_res, y_class_train_res -> train
# X_test, y_class_test -> test
# ------------------------------
print(f"Training samples: {X_train_res.shape[0]}")
print(f"Test samples: {X_test.shape[0]}")
print(f"Number of features: {X_train_res.shape[1]}")


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import joblib
from sklearn.svm import LinearSVC

# Candidate classifiers, keyed by the name used in the printed report.
# Each one compensates for class imbalance through class_weight='balanced'.
classifiers = {}
classifiers['Logistic Regression'] = LogisticRegression(
    max_iter=3000, random_state=42, class_weight='balanced'
)
classifiers['Random Forest'] = RandomForestClassifier(
    n_estimators=300,
    max_depth=30,
    min_samples_split=5,
    min_samples_leaf=2,
    max_features='sqrt',
    class_weight='balanced',
    n_jobs=-1,
    random_state=42,
)
classifiers['svm'] = LinearSVC(class_weight='balanced', max_iter=5000)

# Display names in label order 0, 1, 2.
label_names = ['hard', 'medium', 'easy']

results = {}
for name, clf in classifiers.items():
    print(f"\nTraining {name}...")
    # Fitted on X_train / y_class_train, not on the SMOTE-resampled arrays.
    clf.fit(X_train, y_class_train)

    # Score the held-out split.
    y_pred = clf.predict(X_test)
    accuracy = accuracy_score(y_class_test, y_pred)
    results[name] = dict(model=clf, accuracy=accuracy)

    print(f"Accuracy: {accuracy:.4f}")
    print("Classification Report:")
    report = classification_report(y_class_test, y_pred, target_names=label_names)
    print(report)

    cm = confusion_matrix(y_class_test, y_pred)
    print("Confusion Matrix:")
    print(cm)

# Keep the model with the highest test accuracy (first one wins on ties).
best_clf_name, best_entry = max(results.items(), key=lambda item: item[1]['accuracy'])
best_clf = best_entry['model']
print(f"\nBest classifier: {best_clf_name} with accuracy: {best_entry['accuracy']:.4f}")

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
import numpy as np
import joblib
from sklearn.model_selection import train_test_split
import os

# Make sure the output directory for persisted models exists.
os.makedirs("models", exist_ok=True)

# Split again with the same settings, this time carrying the score target
# alongside the features and class labels.
(X_train, X_test,
 y_class_train, y_class_test,
 y_score_train, y_score_test) = train_test_split(
    X_features, y_class, y_score,
    test_size=0.2, random_state=42, stratify=y_class,
)


def _fit_and_report(model, heading):
    """Fit ``model`` on the training split and report its test-set errors.

    Prints ``heading`` followed by MAE and RMSE computed on the test split.

    Returns:
        Tuple of (test predictions, MAE, RMSE).
    """
    model.fit(X_train, y_score_train)
    predictions = model.predict(X_test)
    mae = mean_absolute_error(y_score_test, predictions)
    rmse = np.sqrt(mean_squared_error(y_score_test, predictions))
    print(heading)
    print(f"MAE: {mae:.3f}")
    print(f"RMSE: {rmse:.3f}")
    return predictions, mae, rmse


lr = LinearRegression()
y_pred_lr, mae_lr, rmse_lr = _fit_and_report(lr, "Linear Regression")

rf_reg = RandomForestRegressor(
    n_estimators=300,
    max_depth=30,
    min_samples_split=5,
    min_samples_leaf=2,
    n_jobs=-1,
    random_state=42,
)
y_pred_rf, mae_rf, rmse_rf = _fit_and_report(rf_reg, "\nRandom Forest Regressor")

gbr = GradientBoostingRegressor(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=5,
    random_state=42,
)
y_pred_gb, mae_gb, rmse_gb = _fit_and_report(gbr, "\nGradient Boosting Regressor")

# Rank the candidates by test MAE (lower is better; first one wins on ties).
results_reg = {
    "Linear Regression": mae_lr,
    "Random Forest": mae_rf,
    "Gradient Boosting": mae_gb,
}
best_reg_name = min(results_reg, key=results_reg.get)
print(f"\nBest Regression Model: {best_reg_name}")

fitted_regressors = {
    "Linear Regression": lr,
    "Random Forest": rf_reg,
    "Gradient Boosting": gbr,
}
best_regressor = fitted_regressors[best_reg_name]

joblib.dump(best_regressor, "models/regressor.pkl")

In [None]:
import os
import joblib

# Create the output directory if needed, then persist everything the app loads.
os.makedirs("models", exist_ok=True)

artifacts_to_save = (
    (best_clf, "models/classifier.pkl"),
    (best_regressor, "models/regressor.pkl"),
    (tfidf, "models/tfidf.pkl"),
)
for artifact, target_path in artifacts_to_save:
    joblib.dump(artifact, target_path)

# Left as the cell's final expression so the written path is echoed as output.
joblib.dump(scaler, "models/scaler.pkl")


In [None]:
%%writefile app.py
import streamlit as st
import joblib
import re
import numpy as np
from scipy.sparse import hstack, csr_matrix

# -------------------------
# Load Models
# -------------------------
# Streamlit re-executes this script on every widget interaction. Caching the
# loaded artifacts avoids re-reading all four pickles on each rerun.
# show_spinner=False keeps this call from emitting a UI element before
# st.set_page_config() runs further down.
@st.cache_resource(show_spinner=False)
def _load_artifacts():
    """Load the persisted classifier, regressor, TF-IDF vectorizer and scaler.

    Returns:
        Tuple ``(classifier, regressor, tfidf, scaler)``.

    NOTE: joblib.load unpickles the files and can execute arbitrary code;
    only load model files from a trusted source.
    """
    return (
        joblib.load("models/classifier.pkl"),
        joblib.load("models/regressor.pkl"),
        joblib.load("models/tfidf.pkl"),
        joblib.load("models/scaler.pkl"),
    )


classifier, regressor, tfidf, scaler = _load_artifacts()

# -------------------------
# Text Preprocessing
# -------------------------
# Per-run cache so the stop-word corpus is read at most once per script run.
_STOP_WORDS_CACHE = {}


def _english_stop_words():
    """Return NLTK's English stop words, downloading the corpus if it is missing."""
    if "english" not in _STOP_WORDS_CACHE:
        # Imported here so the dependency sits next to its only use.
        import nltk
        from nltk.corpus import stopwords
        try:
            words = stopwords.words("english")
        except LookupError:
            nltk.download("stopwords", quiet=True)
            words = stopwords.words("english")
        _STOP_WORDS_CACHE["english"] = frozenset(words)
    return _STOP_WORDS_CACHE["english"]


def preprocess_text(text, stop_words=None):
    """Clean ``text`` the same way the training notebook did.

    The models were fitted on text that was lower-cased, whitespace-collapsed
    and stripped of NLTK English stop words, with non-ASCII characters left
    in place. The scaled text-length feature is computed from that cleaned
    text, so inference must apply the identical steps; otherwise the length
    feature and the tokens differ from what the models saw during training.

    Args:
        text: Raw input text; ``None`` is treated as empty.
        stop_words: Optional collection of tokens to drop. Defaults to
            NLTK's English stop-word list.

    Returns:
        The cleaned text as a single space-separated string.
    """
    if text is None:
        return ""
    if stop_words is None:
        stop_words = _english_stop_words()
    text = str(text).lower()
    text = re.sub(r'\s+', ' ', text).strip()
    return ' '.join(word for word in text.split() if word not in stop_words)

# -------------------------
# Streamlit UI
# -------------------------
st.set_page_config(page_title="AutoJudge", layout="centered")

# Emoji below are written as real Unicode characters; the previous text held
# mis-decoded byte sequences that rendered as garbage in the page.
st.title("🤖 AutoJudge")
st.subheader("Predict Programming Problem Difficulty")

st.markdown("Enter the problem details below:")

title = st.text_input("Problem Title")
description = st.text_area("Problem Description", height=200)
input_desc = st.text_area("Input Description", height=150)
output_desc = st.text_area("Output Description", height=150)

if st.button("Predict Difficulty"):
    if not description.strip():
        st.warning("Please enter at least the problem description.")
    else:
        # Build the feature row: TF-IDF columns followed by the scaled length
        # of the cleaned text.
        combined_text = f"{title} {description} {input_desc} {output_desc}"
        combined_text = preprocess_text(combined_text)

        text_tfidf = tfidf.transform([combined_text])
        text_len = scaler.transform([[len(combined_text)]])
        X = hstack([text_tfidf, csr_matrix(text_len)])

        # Cast to native Python types so the dict lookup and the format spec
        # do not depend on NumPy scalar behaviour.
        class_pred = int(classifier.predict(X)[0])
        score_pred = float(regressor.predict(X)[0])

        # Inverse of the label encoding used in training (hard=0, medium=1, easy=2).
        class_map = {0: "Hard", 1: "Medium", 2: "Easy"}

        st.success("Prediction Complete ✅")
        st.markdown(f"### 🧠 Predicted Difficulty: **{class_map[class_pred]}**")
        st.markdown(f"### ⭐ Predicted Score: **{score_pred:.2f} / 10**")
