# Preprocessing and Feature Engineering

In [None]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from textstat import textstat
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import shap
import re

# Fetch the NLTK resources used below. Newer NLTK releases load the tokenizer
# models from 'punkt_tab' instead of 'punkt', so request both; a resource that
# is unknown to the installed version is reported by nltk.download (returns
# False) rather than raising.
for _resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(_resource)

# Load and clean data

In [None]:
# Load the essay dataset and derive a normalised text column.
df = pd.read_csv('processed_essays.csv')

# Missing essays are read as NaN (a float), which has no .lower(); coerce the
# column to plain strings so every row can be cleaned.
df['essay'] = df['essay'].fillna('').astype(str)

# Lower-case the text and strip every character that is neither a word
# character nor whitespace (i.e. punctuation and symbols).
_non_word_re = re.compile(r'[^\w\s]')
df['cleaned_essay'] = df['essay'].apply(lambda text: _non_word_re.sub('', text.lower()))

# Feature extraction


In [None]:
def _vocab_richness(text):
    """Return the unique-token / total-token ratio, or 0.0 when there are no tokens."""
    tokens = word_tokenize(text)  # tokenise once and reuse for both counts
    return len(set(tokens)) / len(tokens) if tokens else 0.0


def _avg_sentence_length(text):
    """Return the mean number of whitespace-separated words per sentence (0.0 if none)."""
    lengths = [len(sentence.split()) for sentence in nltk.sent_tokenize(text)]
    return float(np.mean(lengths)) if lengths else 0.0


# Word-level features are computed on the cleaned (lower-cased, punctuation-free) text.
df['word_count'] = df['cleaned_essay'].apply(lambda text: len(text.split()))
df['vocab_richness'] = df['cleaned_essay'].apply(_vocab_richness)

# Sentence-level features must use the original text: 'cleaned_essay' has had
# its punctuation removed, so it no longer contains any sentence boundaries and
# every essay would be scored as a single sentence.
df['flesch_reading_ease'] = df['essay'].apply(textstat.flesch_reading_ease)
df['avg_sentence_length'] = df['essay'].apply(_avg_sentence_length)

# TF-IDF feature extraction


In [None]:
# Represent each cleaned essay by the TF-IDF weights of the 100 highest-frequency terms.
# NOTE(review): the vectorizer is fit on every row before the train/test split
# performed later, so the IDF weights are influenced by test essays.
tfidf = TfidfVectorizer(max_features=100)
tfidf_matrix = tfidf.fit_transform(df['cleaned_essay'])
tfidf_df = pd.DataFrame(
    tfidf_matrix.toarray(),
    # Prefix the term names so a vocabulary word such as 'essay' or 'grade'
    # cannot duplicate an existing column of df.
    columns=[f'tfidf_{term}' for term in tfidf.get_feature_names_out()],
    # Reuse df's index so the column-wise concat aligns row-for-row.
    index=df.index,
)
df = pd.concat([df, tfidf_df], axis=1)

# Model training with RandomizedSearchCV


In [None]:
# Features are every column except the raw/cleaned text and the target.
X = df.drop(['essay', 'cleaned_essay', 'grade'], axis=1)
y = df['grade']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

rf = RandomForestClassifier(random_state=42)
param_dist = {
    'n_estimators': [100, 200, 500, 1000],
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    # 'auto' was an alias of 'sqrt' for classifiers and is no longer accepted
    # by current scikit-learn releases, so it is omitted from the search space.
    'max_features': ['sqrt', 'log2'],
}

# Sample 50 parameter combinations, scoring each with 5-fold cross-validation
# on the training split and using all available cores.
random_search = RandomizedSearchCV(
    rf,
    param_distributions=param_dist,
    n_iter=50,
    cv=5,
    random_state=42,
    n_jobs=-1,
)
random_search.fit(X_train, y_train)

# Model evaluation


In [None]:
# Take the best estimator found by the search and score it on the test split.
best_rf = random_search.best_estimator_
y_pred = best_rf.predict(X_test)

print("Best Parameters:", random_search.best_params_)

test_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", test_accuracy)

per_class_report = classification_report(y_test, y_pred)
print("Classification Report:\n", per_class_report)

# Interpret model with SHAP


In [None]:
# Explain the selected forest with SHAP's tree explainer and draw a bar-type
# summary plot of the SHAP values computed on the test features.
explainer = shap.TreeExplainer(best_rf)
shap_values = explainer.shap_values(X_test)
shap.summary_plot(
    shap_values,
    X_test,
    plot_type="bar",
    feature_names=X.columns,
)

# Confusion Matrix

In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Use one explicit, sorted label list for both the matrix and the axis ticks.
# Without it, confusion_matrix only covers labels present in y_test/y_pred,
# which can differ from best_rf.classes_ and leave the ticks misaligned.
grade_labels = np.union1d(best_rf.classes_, y_test)
conf_matrix = confusion_matrix(y_test, y_pred, labels=grade_labels)

# Visualize heatmap (rows = actual grades, columns = predicted grades)
plt.figure(figsize=(8, 6))
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=grade_labels,
    yticklabels=grade_labels,
)
plt.title('Confusion Matrix')
plt.xlabel('Predicted Grades')
plt.ylabel('Actual Grades')
plt.show()
