# **Automated Essay Scoring System**
*  Description
 The Automated Essay Scoring System leverages advanced natural language processing (NLP) techniques to evaluate and grade essays based on various criteria, including content, coherence, grammar, and more. Using deep learning models, specifically Long Short-Term Memory (LSTM) networks, this system aims to provide accurate and consistent essay scores, mimicking human grading.

# **Tools**
* TensorFlow: Framework for building and training the LSTM model.
* Keras: High-level API for TensorFlow to facilitate model creation.
* PyTorch: Optional for any additional modeling or experimentation.
* NLTK: Library for text processing and feature extraction.
* SpaCy: Library for advanced NLP tasks, including tokenization and lemmatization.
# **Features**
* Automated Scoring: Grades essays based on multiple criteria.
* Content Analysis: Evaluates the relevance and richness of content.
* Coherence Assessment: Measures the logical flow and structure of the essay.
* Grammar Checking: Identifies and scores grammatical accuracy.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [1]:
import re
import nltk
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
import tensorflow as tf
from tensorflow import keras
from keras.layers import Dense,Dropout,LSTM,GRU,Embedding
from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from sklearn.model_selection import train_test_split

In [None]:
# Use the ggplot theme for every figure drawn in this notebook.
plt.style.use('ggplot')
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline

# **Data Preparation**
* Collect and preprocess your essay dataset

In [None]:
# Load the essay dataset from the Kaggle input directory and preview its first three rows.
df = pd.read_csv('/kaggle/input/autoscoredetect/Processed_data.csv')
df.head(3)

In [None]:
# List the available columns (the cells below use 'essay' and 'final_score').
df.columns

In [None]:
# Keep only the two columns used by the rest of the notebook: essay text and its score.
data = df[['essay','final_score']]

In [None]:
# Bar chart of how many essays received each distinct final score.
plt.title('Number of essay getting Star')
data['final_score'].value_counts().plot(kind='bar')

In [None]:
# Display the two-column working frame.
data

In [2]:
# Use NLP text cleaning to remove unwanted text

In [None]:
import re
# English stop-word list from NLTK, stored as a set.
stopwords_set = set(stopwords.words('english'))
# Matches simple ASCII emoticons such as :) ;-( =D :P (eyes, optional nose, mouth).
# Written as a raw string so that \) and \( reach the regex engine as-is instead
# of relying on Python's deprecated handling of unknown string escapes.
emoji = re.compile(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)')

def Clean_text(text):
    """Strip social-media artefacts, punctuation and non-ASCII characters.

    The steps run in this order:

    1. remove URLs (anything starting with ``http`` up to the next whitespace);
    2. remove the standalone tokens ``RT`` and ``cc`` (whole words only, so
       ordinary words such as "success" or "account" are left intact);
    3. remove hashtags (``#...``) and mentions / ``@``-prefixed tokens;
    4. replace ASCII punctuation with a space;
    5. replace every non-ASCII character with a space;
    6. collapse each run of whitespace into a single space.

    Args:
        text: The raw text to clean.

    Returns:
        The cleaned text. Leading/trailing whitespace is collapsed to a single
        space but not stripped.
    """
    # No trailing-whitespace requirement, so a URL at the very end is removed too.
    cleanText = re.sub(r'http\S+', ' ', text)
    # Word boundaries keep "cc"/"RT" inside longer words from being cut out.
    cleanText = re.sub(r'\b(?:RT|cc)\b', ' ', cleanText)
    cleanText = re.sub(r'#\S+', ' ', cleanText)
    cleanText = re.sub(r'@\S+', ' ', cleanText)
    cleanText = re.sub('[%s]' % re.escape(r"""!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~"""), ' ', cleanText)
    cleanText = re.sub(r'[^\x00-\x7f]', ' ', cleanText)
    cleanText = re.sub(r'\s+', ' ', cleanText)
    return cleanText


In [None]:
# Model inputs: every essay passed through Clean_text.
X = data['essay'].apply(Clean_text)
# Targets: the final score of each essay.
y = data['final_score']

# **Data Preprocessing**
* Text Cleaning: Remove URLs, hashtags, mentions, punctuation, and non-ASCII characters.
* Tokenization: Convert text into a sequence of tokens (words).
* Padding: Ensure all sequences have the same length by padding shorter sequences.

In [None]:
# Learn the word index from the cleaned essays, then encode each essay as a list of word ids.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X)
sequences = tokenizer.texts_to_sequences(X)
# The longest encoded essay defines the padded length used further down.
max_len = max(map(len, sequences))
print(max_len)

In [None]:
# Size of the embedding vocabulary: one slot per word in the tokenizer's index plus one
# extra (Keras word indices start at 1; index 0 is the padding value).
vocab_size = len(tokenizer.word_index) +1

In [None]:
# Bring every encoded essay to length max_len; padding='pre' places the padding
# in front of shorter sequences. The last line displays the resulting array.
sequences_data = pad_sequences(sequences,maxlen=max_len,padding='pre')
sequences_data

In [None]:
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Rescale the raw scores onto the 0-10 range and convert them to integer class
# ids, since the model ends in an 11-way softmax trained with
# sparse_categorical_crossentropy, which expects integer labels.
scalar = MinMaxScaler(feature_range=(0, 10))
reshaped = data['final_score'].values.reshape(-1, 1)
# MinMaxScaler returns floats; round to the nearest class instead of leaving the
# cast to the loss function, where a value such as 5.9999 would be truncated to 5.
y = np.rint(scalar.fit_transform(reshaped)).astype(int).flatten()

In [3]:
# Train 

In [None]:
# Hold out 33% of the samples as a test split; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(sequences_data, y, test_size=0.33, random_state=42)

In [None]:
# Sanity-check the split sizes: train inputs, train labels, test inputs, test labels.
for split in (X_train, y_train, X_test, y_test):
    print(split.shape)

In [None]:
# Stacked-LSTM classifier:
# embedding -> LSTM(128, full sequence) -> dropout -> LSTM(64) -> 11-way softmax.
model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=128, input_length=max_len),
    LSTM(128, return_sequences=True),
    Dropout(0.2),
    LSTM(64),
    Dense(11, activation='softmax'),
])
# Build with a fixed sequence length so the summary can report concrete shapes.
model.build((None, max_len))

In [None]:
# Configure training: Adam optimizer with sparse categorical cross-entropy,
# i.e. integer class labels scored against the softmax output.
# NOTE(review): the metric is spelled 'Accuracy' (capital A); the plotting cells
# read the history keys 'Accuracy' / 'val_Accuracy', so the spelling used here
# and there has to stay in sync.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['Accuracy']
)
# Print the layer-by-layer summary.
model.summary()

In [None]:
# Train for 20 epochs, validating on (X_test, y_test) after every epoch.
# validation_split is intentionally not passed: Keras ignores it whenever
# validation_data is supplied, so giving both only hides which data is used.
# NOTE(review): the test split doubles as the validation set here, so the later
# evaluate() call is not measured on unseen data - consider a separate split.
history = model.fit(
    X_train,
    y_train,
    epochs=20,
    validation_data=(X_test, y_test),
)

In [None]:
# Accuracy per epoch: training curve first, then the validation curve.
accuracy_curves = model.history.history
for curve_key in ('Accuracy', 'val_Accuracy'):
    plt.plot(accuracy_curves[curve_key])
plt.title('model accuracy')
plt.xlabel('epoch')
plt.ylabel('accuracy')
plt.legend(['train','test'],loc='upper left')

In [None]:
# Loss per epoch: training curve first, then the validation curve.
loss_curves = model.history.history
for curve_key in ('loss', 'val_loss'):
    plt.plot(loss_curves[curve_key])
plt.title('model loss')
plt.xlabel('epoch')
plt.ylabel('loss')
plt.legend(['train','test'],loc='upper left')

In [None]:
# Run the model on the test inputs; the final layer is a softmax, so each
# prediction row holds one probability per score class.
y_pred = model.predict(X_test)

In [None]:
# Loss and the compiled metric measured on the test split.
loss,accuracy = model.evaluate(X_test,y_test)

In [None]:
# Wrap the test-set predictions in a NumPy array and display them.
predicted = np.array(y_pred)
predicted

In [None]:
def predict(text):
    """Predict the score class of a single essay.

    The essay goes through the same pipeline as the training data: it is
    cleaned with ``Clean_text``, encoded with the fitted ``tokenizer`` and
    pre-padded to ``max_len`` (the sequence length the model was built with)
    before being passed to ``model``.

    Args:
        text: Raw essay text.

    Returns:
        A NumPy array with one element: the index of the most probable
        class in the model's softmax output. The value is also printed.
    """
    cleaned = Clean_text(text)
    encoded = tokenizer.texts_to_sequences([cleaned])
    # Pad to max_len, not max_len - 1: the model input was built for max_len tokens.
    token_list = pad_sequences(encoded, maxlen=max_len, padding='pre')
    probabilities = model.predict(token_list, verbose=0)
    predicted = np.argmax(probabilities, axis=1)
    print(predicted)
    return predicted

In [None]:
# Try the prediction helper on one essay from the dataset (the row labelled 2).
text = data['essay'][2]
predict(text)

In [None]:
import pickle
# Serialize the fitted tokenizer to tokenizer.pkl in the working directory.
with open('tokenizer.pkl', 'wb') as file:
    pickle.dump(tokenizer, file)

In [None]:
# Write the trained model to model.h5 in the working directory.
# NOTE(review): the .h5 suffix selects the HDF5 format, which newer Keras releases
# treat as legacy - confirm what the consumer of this file expects before changing it.
model.save('model.h5')