In [5]:
import numpy as np
import pandas as pd
import nltk
import re
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize,word_tokenize
from gensim.models import Word2Vec
from keras.layers import Embedding, LSTM, Dense, Dropout, Lambda, Flatten
from keras.models import Sequential, load_model
import keras.backend as K
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import cohen_kappa_score

**Preparing Dataset**

In [6]:
df = pd.read_csv("Dataset/training_set_rel3.tsv", sep='\t', encoding='ISO-8859-1');
df.dropna(axis=1,inplace=True)
df.drop(columns=['domain1_score','rater1_domain1','rater2_domain1'],inplace=True,axis=1)
df.head()
temp = pd.read_csv("Processed_data.csv")
temp.drop("Unnamed: 0",inplace=True,axis=1)

In [7]:
df['domain1_score']=temp['final_score']
df.head()

Unnamed: 0,essay_id,essay_set,essay,domain1_score
0,1,1,"Dear local newspaper, I think effects computer...",6
1,2,1,"Dear @CAPS1 @CAPS2, I believe that using compu...",7
2,3,1,"Dear, @CAPS1 @CAPS2 @CAPS3 More and more peopl...",5
3,4,1,"Dear Local Newspaper, @CAPS1 I have found that...",8
4,5,1,"Dear @LOCATION1, I know having computers has a...",6


In [8]:
df['essay'][0]

"Dear local newspaper, I think effects computers have on people are great learning skills/affects because they give us time to chat with friends/new people, helps us learn about the globe(astronomy) and keeps us out of troble! Thing about! Dont you think so? How would you feel if your teenager is always on the phone with friends! Do you ever time to chat with your friends or buisness partner about things. Well now - there's a new way to chat the computer, theirs plenty of sites on the internet to do so: @ORGANIZATION1, @ORGANIZATION2, @CAPS1, facebook, myspace ect. Just think now while your setting up meeting with your boss on the computer, your teenager is having fun on the phone not rushing to get off cause you want to use it. How did you learn about other countrys/states outside of yours? Well I have by computer/internet, it's a new way to learn about what going on in our time! You might think your child spends a lot of time on the computer, but ask them so question about the econom

In [9]:
temp.head(1)

Unnamed: 0,essay_id,essay_set,essay,final_score,clean_essay,char_count,word_count,sent_count,avg_word_len,spell_err_count,noun_count,adj_count,verb_count,adv_count
0,1,1,"Dear local newspaper, I think effects computer...",6,Dear local newspaper I think effects computer...,1441,344,16,4.188953,11,76,75,18,24


In [10]:
y = df['domain1_score']
df.drop('domain1_score',inplace=True,axis=1)
X=df

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [12]:
X_train.shape

(9083, 3)

**PREPROCESSING**

In [13]:
train_e = X_train['essay'].tolist()
test_e = X_test['essay'].tolist()

In [14]:
train_sents=[]
test_sents=[]

stop_words = set(stopwords.words('english')) 
def sent2word(x):
    x=re.sub("[^A-Za-z]"," ",x)
    x.lower()
    filtered_sentence = [] 
    words=x.split()
    for w in words:
        if w not in stop_words: 
            filtered_sentence.append(w)
    return filtered_sentence

def essay2word(essay):
    essay = essay.strip()
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    raw = tokenizer.tokenize(essay)
    final_words=[]
    for i in raw:
        if(len(i)>0):
            final_words.append(sent2word(i))
    return final_words

for i in train_e:
    train_sents+=essay2word(i)

for i in test_e:
    test_sents+=essay2word(i)

In [15]:
len(train_sents)

115924

In [16]:
train_sents[0]

['It',
 'first',
 'day',
 'high',
 'school',
 'gut',
 'full',
 'butterflies',
 'make',
 'want',
 'run',
 'bathrooms',
 'hide',
 'world']

**Preparing WORD2VEC and LSTM Model**

In [34]:
from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout, Bidirectional, BatchNormalization
from keras.regularizers import l2
from keras.callbacks import EarlyStopping

def get_model():
    model = Sequential()

    model.add(Bidirectional(LSTM(300, dropout=0.4, recurrent_dropout=0.4, return_sequences=True, kernel_regularizer=l2(0.01)), input_shape=[1, 300]))
    
    model.add(Bidirectional(LSTM(256, dropout=0.4, recurrent_dropout=0.4, return_sequences=True)))
    
    model.add(Bidirectional(LSTM(128, dropout=0.4, recurrent_dropout=0.4, return_sequences=True)))
    
    model.add(Bidirectional(LSTM(64, dropout=0.4, recurrent_dropout=0.4, return_sequences=True)))

    model.add(Bidirectional(LSTM(32, dropout=0.4, recurrent_dropout=0.4)))

    model.add(Dense(128, activation='relu'))
    model.add(Dropout(0.5))
    model.add(BatchNormalization())

    model.add(Dense(64, activation='relu'))
    model.add(Dropout(0.5))
    model.add(BatchNormalization())

    model.add(Dense(1))  # Linear activation for regression

    model.compile(loss='mean_squared_error', optimizer='adam', metrics=['mae'])

    model.summary()

    return model


In [35]:
from gensim.models import Word2Vec

num_features = 300  
min_word_count = 40  
num_workers = 4  
context = 10 
downsampling = 1e-3 

model = Word2Vec(train_sents, 
                 workers=num_workers, 
                 vector_size=num_features, 
                 min_count=min_word_count, 
                 window=context, 
                 sample=downsampling)

model.wv.save_word2vec_format('word2vecmodel.bin', binary=True)



In [36]:

def makeVec(words, model, num_features):
    vec = np.zeros((num_features,), dtype="float32")
    noOfWords = 0.
    index2word_set = set(model.wv.index_to_key) 
    for i in words:
        if i in index2word_set:
            noOfWords += 1
            vec = np.add(vec, model.wv[i]) 
    if noOfWords > 0:  
        vec = np.divide(vec, noOfWords)
    return vec

def getVecs(essays, model, num_features):
    c = 0
    essay_vecs = np.zeros((len(essays), num_features), dtype="float32")
    for i in essays:
        essay_vecs[c] = makeVec(i, model, num_features)
        c += 1
    return essay_vecs

clean_train = []
for i in train_e:
    clean_train.append(sent2word(i))
training_vectors = getVecs(clean_train, model, num_features)

clean_test = []
for i in test_e:
    clean_test.append(sent2word(i))
testing_vectors = getVecs(clean_test, model, num_features)


In [37]:
training_vectors.shape

(9083, 300)

In [38]:
training_vectors = np.array(training_vectors)
testing_vectors = np.array(testing_vectors)

training_vectors = np.reshape(training_vectors, (training_vectors.shape[0], 1, training_vectors.shape[1]))
testing_vectors = np.reshape(testing_vectors, (testing_vectors.shape[0], 1, testing_vectors.shape[1]))
lstm_model = get_model()

  super().__init__(**kwargs)


In [39]:
training_vectors.shape

(9083, 1, 300)


**TRAINING AND PREDICTION**







In [40]:
lstm_model.fit(training_vectors, y_train, batch_size=64, epochs=150, validation_data=(testing_vectors, y_test))

Epoch 1/150
[1m142/142[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 53ms/step - loss: 41.4972 - mae: 5.4480
Epoch 2/150
[1m142/142[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 53ms/step - loss: 16.5636 - mae: 3.2562
Epoch 3/150
[1m142/142[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 53ms/step - loss: 8.9006 - mae: 2.2474
Epoch 4/150
[1m142/142[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 55ms/step - loss: 7.7376 - mae: 2.0953
Epoch 5/150
[1m142/142[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 55ms/step - loss: 7.1546 - mae: 2.0092
Epoch 6/150
[1m142/142[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 60ms/step - loss: 6.5765 - mae: 1.9467
Epoch 7/150
[1m142/142[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 58ms/step - loss: 6.2666 - mae: 1.9074
Epoch 8/150
[1m142/142[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 57ms/step - loss: 6.0562 - mae: 1.8587
Epoch 9/150
[1m142/142[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37

<keras.src.callbacks.history.History at 0x23d40112410>

In [41]:
lstm_model.save('final_lstm.h5')
y_pred = lstm_model.predict(testing_vectors)
y_pred = np.around(y_pred)
y_pred



[1m122/122[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 20ms/step


array([[3.],
       [5.],
       [7.],
       ...,
       [7.],
       [7.],
       [9.]], dtype=float32)

In [42]:
from sklearn.metrics import mean_absolute_error, mean_squared_error
import numpy as np

mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print(f"MAE: {mae}")
print(f"MSE: {mse}")
print(f"RMSE: {rmse}")

MAE: 1.416902132031852
MSE: 3.526329309016183
RMSE: 1.8778523128873001
