In [None]:
import numpy as np
import pandas as pd
from gensim.models import Word2Vec
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
import nltk
from nltk.corpus import stopwords
import string

# Fetch the NLTK 'stopwords' corpus so nltk.corpus.stopwords can be read later in the notebook.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# Colab-only step: open a file-upload prompt and keep the result in `uploaded`.
# NOTE(review): the file chosen here is expected to be dataset.tsv, which the
# loading cell reads by that name — confirm when running outside Colab.
from google.colab import files
uploaded = files.upload()

Saving dataset.tsv to dataset.tsv


In [None]:
# Load data: dataset.tsv is read as a tab-separated file decoded as ISO-8859-1.
# Later cells use its 'essay' column (text) and 'domain1_score' column (target).
data = pd.read_csv("dataset.tsv", sep='\t', encoding='ISO-8859-1')

In [None]:
# Text preprocessing function: lowercase, strip punctuation, split on whitespace
def preprocess_text(text):
    """Normalize an essay and split it into word tokens.

    The text is lower-cased, every character listed in ``string.punctuation``
    is deleted, and the remainder is split on whitespace. Stop words are kept
    and no spelling correction is performed.

    Args:
        text: Raw essay text (``str``).

    Returns:
        list[str]: Tokens in their original order; an empty list when the
        text is empty or contains only whitespace/punctuation.
    """
    # No stop-word set is built here: the tokens were never filtered against
    # one, so constructing it on every call was pure overhead.
    punctuation_remover = str.maketrans('', '', string.punctuation)
    return text.lower().translate(punctuation_remover).split()

# Apply preprocessing to essays: store each essay's token list in a new column
data['processed_essay'] = data['essay'].apply(preprocess_text)

# Train Word2Vec model on the token lists (100-dimensional vectors, context
# window of 5, min_count=2, 4 worker threads).
# NOTE(review): the embeddings are fit on every essay, including those that
# end up in the validation split created further down — confirm this is intended.
# NOTE(review): gensim documents that multi-worker training is not
# deterministic run-to-run; use workers=1 if exact reproducibility is required.
w2v_model = Word2Vec(sentences=data['processed_essay'], vector_size=100, window=5, min_count=2, workers=4)

# Function to convert essays to Word2Vec vectors
def essay_to_vectors(essays, model):
    """Turn tokenized essays into fixed-length feature vectors.

    Each essay is represented by the element-wise mean of the embeddings of
    its tokens that are present in ``model.wv``. An essay with no known
    tokens is represented by an all-zero vector.

    Args:
        essays: Iterable of token lists, one per essay.
        model: Trained embedding model exposing ``wv`` (supports ``in`` and
            list indexing) and ``vector_size``.

    Returns:
        numpy.ndarray: Array of shape ``(number_of_essays, model.vector_size)``.
        The result is 2-D even when ``essays`` is empty, so it can always be
        passed straight to an estimator's ``predict``.
    """
    # Loop-invariant lookups hoisted out of the per-essay loop.
    keyed_vectors = model.wv
    dim = model.vector_size

    essay_vectors = []
    for essay in essays:
        known_words = [word for word in essay if word in keyed_vectors]
        if known_words:
            essay_vector = np.mean(keyed_vectors[known_words], axis=0)
        else:
            # Nothing in the vocabulary: fall back to a neutral zero vector.
            essay_vector = np.zeros(dim)
        essay_vectors.append(essay_vector)

    # reshape keeps the (n, dim) layout for n == 0, where np.array([]) alone
    # would collapse to shape (0,).
    return np.array(essay_vectors).reshape(-1, dim)

# Convert essays to Word2Vec vectors: X holds one averaged embedding per essay
X = essay_to_vectors(data['processed_essay'], w2v_model)
# Regression target: the 'domain1_score' column as an array
y = data['domain1_score'].values

# Split data into training and validation sets: 20% is held out for
# validation, with a fixed random_state so the split is repeatable
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Function to predict score for a new essay
def predict_grade(new_essay, model, w2v_model):
    """Predict the score of a single raw essay.

    Args:
        new_essay: Raw essay text.
        model: Fitted regressor exposing ``predict``.
        w2v_model: Trained embedding model used to vectorize the essay.

    Returns:
        The first (and only) element of the regressor's prediction.
    """
    tokens = preprocess_text(new_essay)
    # essay_to_vectors works on a batch, so wrap the single essay in a list.
    features = essay_to_vectors([tokens], w2v_model)
    predictions = model.predict(features)
    return predictions[0]

In [None]:
# Fit an RBF-kernel support vector regressor on the training vectors
svr_model = SVR(kernel='rbf', C=1.0, epsilon=0.2).fit(X_train, y_train)

# Score the held-out validation vectors
y_pred1 = svr_model.predict(X_val)
mae1 = mean_absolute_error(y_val, y_pred1)
mse1 = mean_squared_error(y_val, y_pred1)

# Report the predictions, both error metrics, and the estimator's own score
print(y_pred1)
print(f'Validation Mean Absolute Error: {mae1}')
print(f'Validation Mean Squared Error: {mse1}')
print("Score:", svr_model.score(X_val, y_val))

[ 1.21074769  1.44545444  9.03673208 ...  8.58722421  6.8270422
 20.24633846]
Validation Mean Absolute Error: 2.0700621407559545
Validation Mean Squared Error: 22.84099037125046
Score: 0.7066322665234345


In [None]:
# Fit a 5-nearest-neighbours regressor on the training vectors
knn_model = KNeighborsRegressor(n_neighbors=5).fit(X_train, y_train)

# Score the held-out validation vectors
y_pred2 = knn_model.predict(X_val)
mae2 = mean_absolute_error(y_val, y_pred2)
mse2 = mean_squared_error(y_val, y_pred2)

# Report the predictions, both error metrics, and the estimator's own score
print(y_pred2)
print(f'Validation Mean Absolute Error: {mae2}')
print(f'Validation Mean Squared Error: {mse2}')
print("Score:", knn_model.score(X_val, y_val))

[ 0.8  2.4  9.4 ...  9.2 26.8 34.8]
Validation Mean Absolute Error: 1.879275808936826
Validation Mean Squared Error: 16.018412942989215
Score: 0.7942608694896465


In [None]:
# Fit a 100-tree random forest regressor (fixed seed) on the training vectors
rf_model = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)

# Score the held-out validation vectors
y_pred3 = rf_model.predict(X_val)
mae3 = mean_absolute_error(y_val, y_pred3)
mse3 = mean_squared_error(y_val, y_pred3)

# Report the predictions, both error metrics, and the estimator's own score
print(y_pred3)
print(f'Validation Mean Absolute Error: {mae3}')
print(f'Validation Mean Squared Error: {mse3}')
print("Score:", rf_model.score(X_val, y_val))

[ 1.15  1.32  8.74 ...  9.1  11.85 35.95]
Validation Mean Absolute Error: 1.5975950179763738
Validation Mean Squared Error: 10.37741604228728
Score: 0.8667133528719009
