In [None]:
import numpy as np
import pandas as pd
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import LSTM, Dense, Embedding, SpatialDropout1D
from keras.optimizers import Adam
from nltk.corpus import stopwords
import string
import nltk

# Fetch the NLTK stop-word corpus; preprocess_text reads it through
# stopwords.words('english') and fails if the corpus is not installed.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# Colab-specific upload step: google.colab is imported here because it is only
# needed for this cell. The recorded output shows the chosen file being saved as
# dataset.tsv, which is the path the loading cell reads.
from google.colab import files
uploaded = files.upload()

Saving dataset.tsv to dataset.tsv


In [None]:
# Read the tab-separated essay file into a DataFrame, decoding it as ISO-8859-1
data = pd.read_csv(
    "dataset.tsv",
    sep='\t',
    encoding='ISO-8859-1',
)

In [None]:
# Text preprocessing
# Lookup tables shared by every preprocess_text call. The stop-word set is
# filled in lazily so the NLTK corpus is read once instead of once per essay.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)
_STOP_WORDS = None


def _english_stop_words():
    """Return the English stop-word set, loading it from NLTK only once.

    Each stop word is stored both verbatim and with punctuation removed
    (e.g. "don't" and "dont"). preprocess_text strips punctuation before it
    filters tokens, so without the stripped variants contracted stop words
    would never match and would stay in the text.
    """
    global _STOP_WORDS
    if _STOP_WORDS is None:
        words = stopwords.words('english')
        _STOP_WORDS = frozenset(words) | frozenset(
            word.translate(_PUNCTUATION_TABLE) for word in words
        )
    return _STOP_WORDS


def preprocess_text(text):
    """Normalise one essay for tokenisation.

    Steps: lower-case, delete every character in string.punctuation, split on
    whitespace, drop English stop words, and re-join with single spaces.

    Args:
        text: Raw essay string. None or a float NaN (how a missing cell shows
            up in a pandas column) is treated as an empty essay.

    Returns:
        The cleaned essay as a single space-separated string ('' when nothing
        is left or the input was missing).
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    stop_words = _english_stop_words()
    cleaned = text.lower().translate(_PUNCTUATION_TABLE)
    return ' '.join(word for word in cleaned.split() if word not in stop_words)

# Sizes shared by padding, Word2Vec and the embedding matrix. They are named
# once here so the three places that must agree cannot drift apart.
MAX_SEQUENCE_LENGTH = 500  # tokens kept per essay after padding/truncation
EMBEDDING_DIM = 100        # length of each Word2Vec word vector

# Apply preprocessing to essays
data['processed_essay'] = data['essay'].apply(preprocess_text)

# Tokenization: fit the word -> integer-id vocabulary, then encode every essay
tokenizer = Tokenizer()
tokenizer.fit_on_texts(data['processed_essay'].values)
X = tokenizer.texts_to_sequences(data['processed_essay'].values)
# pad_sequences pads AND truncates at the front by default, so essays longer
# than MAX_SEQUENCE_LENGTH keep only their last MAX_SEQUENCE_LENGTH tokens.
X = pad_sequences(X, maxlen=MAX_SEQUENCE_LENGTH)

# Prepare target variable
# NOTE(review): scores are used as-is with no normalisation — confirm that all
# essays in the file share one scoring scale.
y = data['domain1_score'].values

# Train Word2Vec model on the whitespace-split essays (one token list per essay)
tokenized_essays = [essay.split() for essay in data['processed_essay']]
w2v_model = Word2Vec(tokenized_essays, vector_size=EMBEDDING_DIM, window=5, min_count=2, workers=4)

# +1 because the tokenizer's ids start at 1; row 0 stays all-zero and lines up
# with the 0 used for padding.
vocab_size = len(tokenizer.word_index) + 1
embedding_matrix = np.zeros((vocab_size, EMBEDDING_DIM))

# Copy the learned vector for every tokenizer word that Word2Vec kept; words it
# dropped (min_count=2 removes words seen only once) keep a zero row.
for word, i in tokenizer.word_index.items():
    if word in w2v_model.wv:
        embedding_matrix[i] = w2v_model.wv[word]

# Split data into training and validation sets (80/20, fixed seed)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Define LSTM model
# Read the Embedding layer's sizes from the arrays it has to match instead of
# repeating the literals: the weight matrix fixes (vocabulary rows, vector
# length) and the padded training matrix fixes the sequence length.
num_embeddings, embedding_dim = embedding_matrix.shape
sequence_length = X_train.shape[1]

model = Sequential([
    # Lookup table initialised from embedding_matrix and frozen (trainable=False).
    Embedding(num_embeddings, embedding_dim, weights=[embedding_matrix],
              input_length=sequence_length, trainable=False),
    SpatialDropout1D(0.2),
    LSTM(100, dropout=0.2, recurrent_dropout=0.2),
    # Single linear unit: the network regresses the score directly.
    Dense(1, activation='linear'),
])



In [None]:
# Configure training: mean squared error is minimised, mean absolute error is
# reported alongside it.
model.compile(
    optimizer=Adam(),
    loss='mean_squared_error',
    metrics=['mean_absolute_error'],
)

# Fit for 10 epochs in batches of 64; the validation split is scored after each
# epoch and verbose=2 prints one summary line per epoch.
history = model.fit(
    x=X_train,
    y=y_train,
    batch_size=64,
    epochs=10,
    verbose=2,
    validation_data=(X_val, y_val),
)

Epoch 1/10
163/163 - 237s - loss: 54.0424 - mean_absolute_error: 4.0966 - val_loss: 39.1500 - val_mean_absolute_error: 4.1028 - 237s/epoch - 1s/step
Epoch 2/10
163/163 - 225s - loss: 24.0149 - mean_absolute_error: 2.6713 - val_loss: 20.7836 - val_mean_absolute_error: 2.6140 - 225s/epoch - 1s/step
Epoch 3/10
163/163 - 229s - loss: 13.2234 - mean_absolute_error: 2.1048 - val_loss: 8.8128 - val_mean_absolute_error: 1.7431 - 229s/epoch - 1s/step
Epoch 4/10
163/163 - 225s - loss: 7.9400 - mean_absolute_error: 1.6900 - val_loss: 5.7120 - val_mean_absolute_error: 1.4645 - 225s/epoch - 1s/step
Epoch 5/10
163/163 - 225s - loss: 5.8486 - mean_absolute_error: 1.5110 - val_loss: 5.7358 - val_mean_absolute_error: 1.4275 - 225s/epoch - 1s/step
Epoch 6/10
163/163 - 224s - loss: 5.1179 - mean_absolute_error: 1.4239 - val_loss: 5.3586 - val_mean_absolute_error: 1.3411 - 224s/epoch - 1s/step
Epoch 7/10
163/163 - 231s - loss: 4.8573 - mean_absolute_error: 1.3781 - val_loss: 4.2798 - val_mean_absolute_err

In [None]:
# Score the validation split and show the raw predictions
y_pred = model.predict(X_val)
print(y_pred)

# Regression metrics comparing the predictions with the true validation scores
r2 = r2_score(y_val, y_pred)
mae = mean_absolute_error(y_val, y_pred)
mse = mean_squared_error(y_val, y_pred)

print(f'R^2 Score: {r2}')
print(f'Validation Mean Absolute Error: {mae}')
print(f'Validation Mean Squared Error: {mse}')

[[ 1.8452257]
 [ 1.4090451]
 [ 9.590812 ]
 ...
 [ 9.577797 ]
 [ 2.6839232]
 [36.394894 ]]
R^2 Score: 0.949312421821623
Validation Mean Absolute Error: 1.1945075444670505
Validation Mean Squared Error: 3.9464274799218693


In [None]:
# Function to predict score for a new essay
def predict_score(new_essay, model, tokenizer, max_length=500):
    """Predict the score of a single raw essay.

    The essay is cleaned with preprocess_text, encoded with
    tokenizer.texts_to_sequences, padded to max_length with pad_sequences,
    and passed to model.predict as a batch of one.

    Args:
        new_essay: Raw essay text.
        model: Object whose predict(batch, verbose=0) returns a 2-D result.
        tokenizer: Object providing texts_to_sequences.
        max_length: Sequence length handed to pad_sequences (default 500).

    Returns:
        The element at [0][0] of the prediction output.
    """
    batch = pad_sequences(
        tokenizer.texts_to_sequences([preprocess_text(new_essay)]),
        maxlen=max_length,
    )
    return model.predict(batch, verbose=0)[0][0]

# Predict score for a new essay
new_essay = "The computer blinked to life and an image of a blonde haired girl filled the screen. It was easy to find out how life was in thanks to the actual girl explaining it. Going to the library wouldn't have filled one with this priceless information and human interection. Computers are a nessessity of life if soceity wishes to grow and expand. They should be supported because they teach hand eye coordination, give people the ability to learn about faraway places, and allow people to talk to others online. Firstly, computers help teach hand eye coordination. Hand-eye coordination is a useful ability that is usod to excel in sports. In a recent survey, of kids felt their hand eye coordination improves after computer use. Even a simple thing like tying can build up this skill. Famous neurologist stated in an article last week that, @CAPS3 and computer strength the When on the computer, you automatically process what the eyes see into a command for your hands. hand eye coordination can improve people in sports such as baseball and basketball. If someone wan't to become better in these sports, all they'd need to do was turn on the computer. Once people become better at sports, they're more likely to play them and become more healthy. In reality, computers can help with exercising instead of decreasing it. Additionaly, computers allow people to access information about faraway places and people. If someone wanted to reasearch all they'd need to do was type in a search would be presented to them in it would link forever to search through countless things. Also, having the ability to learn about cultures can make peole peole and their cultures, they understand others something. Increase tolerance people are. Computers are a resourceful tool that they can help people in every different aspect of life. Lastly, computer and in technology can allow people to chat. Computer chat and video chat can help the all different nations. 
Bring on good terms places other than can help us understand story comes out about something that happend in people can just go on their computer and ask an actual citizen their take on the matter. Also, video chat and online conversation can cut down on expensive phone bills. No one wants to pay more than they have to in this economy. Another good point is that you can acess family members you scaresly visit. It can help you connect within your own family more. Oviously, computers are a useful aid in todays era. their advancements push the world foreward to a better place. Computers can help people because they help teach handeye coordination, give people the bility to learn about faraway places and people, and allow people to talk online with others. Think of a world with no computers or technologicall advancements. The world would be sectored and unified, contact between people scare, and information even. The internet is like thousands or librarys put together. Nobody would know much about other nations and news would travel slower. Is that the kind of palce you want people to live in? "
# Score the sample essay with the trained model and the tokenizer fitted
# earlier in the notebook, then show the result.
predicted_score = predict_score(new_essay, model, tokenizer)
print(f"Predicted Score: {predicted_score}")

Predicted Score: 9.258330345153809
