In [1]:
# prompt: open Processed_data.csv\

import pandas as pd

try:
  df = pd.read_csv('Processed_data.csv')
  print(df.head()) # Print first few rows to verify
except FileNotFoundError:
  print("Error: 'Processed_data.csv' not found.")
except pd.errors.EmptyDataError:
  print("Error: 'Processed_data.csv' is empty.")
except pd.errors.ParserError:
  print("Error: Unable to parse 'Processed_data.csv'. Check the file format.")
except Exception as e:
  print(f"An unexpected error occurred: {e}")


   Unnamed: 0  essay_id  essay_set  \
0           0         1          1   
1           1         2          1   
2           2         3          1   
3           3         4          1   
4           4         5          1   

                                               essay  final_score  \
0  Dear local newspaper, I think effects computer...            6   
1  Dear I believe that using computers will benef...            7   
2  Dear, More and more people use computers, but ...            5   
3  Dear Local Newspaper, I have found that many e...            8   
4  Dear I know having computers has a positive ef...            6   

                                         clean_essay  char_count  word_count  \
0  Dear local newspaper  I think effects computer...        1441         344   
1  Dear I believe using computers benefit us many...        1765         413   
2  Dear  More people use computers  everyone agre...        1185         276   
3  Dear Local Newspaper  I found man

In [2]:
import pandas as pd
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from sklearn.model_selection import train_test_split

In [3]:
max_words = 10000  # Vocabulary size
max_len = 200      # Maximum length of sequences
embedding_dim = 128

# Text Tokenization and Padding
tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
tokenizer.fit_on_texts(df['essay'])
sequences = tokenizer.texts_to_sequences(df['essay'])
padded_sequences = pad_sequences(sequences, maxlen=max_len, padding='post', truncating='post')

In [4]:
scores = df['final_score'].values

# Train-test split
X_train, X_val, y_train, y_val = train_test_split(padded_sequences, scores, test_size=0.2, random_state=42)


In [5]:
y_train = y_train / 10.0  # Assuming scores are out of 10
y_val = y_val / 10.0


In [6]:
model = Sequential([
    Embedding(input_dim=max_words, output_dim=embedding_dim, input_length=max_len),
    LSTM(128, return_sequences=False),
    Dropout(0.3),
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(1, activation='sigmoid')  # Use 'sigmoid' for normalized scores
])



In [7]:
model.compile(optimizer='adam', loss='mean_squared_error', metrics=['mae'])

In [8]:
history = model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=10,
    batch_size=32,
    verbose=1
)


Epoch 1/10
[1m325/325[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m136s[0m 408ms/step - loss: 0.0576 - mae: 0.1934 - val_loss: 0.0537 - val_mae: 0.1871
Epoch 2/10
[1m325/325[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m141s[0m 405ms/step - loss: 0.0463 - mae: 0.1718 - val_loss: 0.0341 - val_mae: 0.1458
Epoch 3/10
[1m325/325[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m139s[0m 394ms/step - loss: 0.0360 - mae: 0.1486 - val_loss: 0.0300 - val_mae: 0.1364
Epoch 4/10
[1m325/325[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m141s[0m 393ms/step - loss: 0.0333 - mae: 0.1433 - val_loss: 0.0255 - val_mae: 0.1235
Epoch 5/10
[1m325/325[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m142s[0m 394ms/step - loss: 0.0255 - mae: 0.1233 - val_loss: 0.0240 - val_mae: 0.1208
Epoch 6/10
[1m325/325[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m140s[0m 389ms/step - loss: 0.0222 - mae: 0.1155 - val_loss: 0.0241 - val_mae: 0.1207
Epoch 7/10
[1m325/325[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0

In [9]:
loss, mae = model.evaluate(X_val, y_val, verbose=1)
print(f"Validation MAE: {mae}")

[1m82/82[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 127ms/step - loss: 0.0281 - mae: 0.1280
Validation MAE: 0.12565644085407257


In [10]:
def predict_score(essay, tokenizer, model, max_len=200, scale_factor=10.0):
    """
    Predicts the final score for a given essay.

    Parameters:
    - essay (str): The essay text to score.
    - tokenizer (Tokenizer): The trained tokenizer used during model training.
    - model (Sequential): The trained LSTM model.
    - max_len (int): Maximum length of sequences (used for padding). Default is 200.
    - scale_factor (float): The factor to scale the normalized score. Default is 10.0.

    Returns:
    - float: The predicted final score.
    """
    # Preprocess the input essay
    sequence = tokenizer.texts_to_sequences([essay])  # Tokenize
    padded_sequence = pad_sequences(sequence, maxlen=max_len, padding='post', truncating='post')

    # Predict using the model
    normalized_score = model.predict(padded_sequence)[0][0]  # Model predicts a normalized score
    final_score = normalized_score * scale_factor  # Scale back to original range (e.g., 0-10)

    return final_score


In [16]:
if __name__ == "__main__":
  sample_essay = """
    AI is in education now. It personalizes learning and helps disabled students with tools like screen readers. Teachers don’t have to grade as much because AI does it. But there are problems like privacy and less human interaction. Still, AI will probably stay in education.
"""

  score = predict_score(sample_essay, tokenizer, model)
  print(f"Predicted Score: {score}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 90ms/step
Predicted Score: 4.370096921920776


In [14]:

model.save('my_model.keras')