In [1]:
!pip install spacy
!python -m spacy download en_core_web_sm
!pip install nltk
!pip install textblob
!pip install textstat


Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m69.8 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.
Collecting textstat
  Downloading textstat-0.7.4-py3-none-any.whl.metadata (14 kB)
Collecting pyphen (from textstat)
  Downloading pyphen-0.17.0-py3-none-any.whl.metadata (3.2 kB)
Downloading textstat-0.7.4-py3-none-any.whl (105 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m105.1/105.1 kB[0m [31m4.2 MB/s[0m eta [36m

In [10]:
import pandas as pd
import numpy as np
import spacy
import nltk
from textstat import textstat
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
from textblob import TextBlob
import re
from tqdm.notebook import tqdm
import concurrent.futures

nltk.download('punkt_tab')
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
nlp = spacy.load('en_core_web_sm')



[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [11]:
dataset = pd.read_csv('/content/Processed_data.csv')

def calculate_readability(text):
    return textstat.flesch_reading_ease(text)

def calculate_punctuation_score(text):
    punctuation_count = sum([1 for char in text if char in "!?.;"])
    return punctuation_count / len(text.split())

def calculate_vocabulary_richness(text):
    words = word_tokenize(text)
    unique_words = set(words)
    return len(unique_words) / len(words) if words else 0

def calculate_complex_sentence_ratio(text):
    doc = nlp(text)
    complex_sentences = sum(1 for sent in doc.sents if sum(1 for token in sent if token.dep_ != 'punct') > 10)
    return complex_sentences / len(list(doc.sents)) if len(list(doc.sents)) > 0 else 0

def calculate_clause_density(text):
    doc = nlp(text)
    clauses = sum(len(list(token.subtree)) for token in doc if token.dep_ in ('csubj', 'advcl', 'acl', 'relcl'))
    return clauses / len(list(doc.sents)) if len(list(doc.sents)) > 0 else 0

def calculate_semantic_coherence(text):
    sentences = sent_tokenize(text)
    if len(sentences) < 2:
        return 0
    embeddings = [nlp(sent).vector for sent in sentences]
    cosine_similarities = [
        (embeddings[i] @ embeddings[i+1].T) / (np.linalg.norm(embeddings[i]) * np.linalg.norm(embeddings[i+1]))
        for i in range(len(embeddings) - 1)
    ]
    return sum(cosine_similarities) / len(cosine_similarities)

def calculate_sentiment_subjectivity(text):
    blob = TextBlob(text)
    return blob.sentiment.subjectivity

def calculate_transitional_phrase_use(text):
    transitional_phrases = ["however", "therefore", "moreover", "furthermore", "nevertheless"]
    words = word_tokenize(text.lower())
    return sum(1 for word in words if word in transitional_phrases) / len(words)

def calculate_figurative_language_use(text):
    # Placeholder: Identify figurative language based on patterns or specific phrases
    return len(re.findall(r"like|as if|seems|metaphorically", text.lower())) / len(text.split())

def calculate_question_usage(text):
    return text.count('?') / len(sent_tokenize(text)) if text else 0

# Wrapper for processing rows faster
def process_row(row):
    text = row['essay']
    return {
        'readability_score': calculate_readability(text),
        'punctuation_score': calculate_punctuation_score(text),
        'vocabulary_richness': calculate_vocabulary_richness(text),
        'complex_sentence_ratio': calculate_complex_sentence_ratio(text),
        'clause_density': calculate_clause_density(text),
        'semantic_coherence': calculate_semantic_coherence(text),
        'sentiment_subjectivity': calculate_sentiment_subjectivity(text),
        'transitional_phrase_use': calculate_transitional_phrase_use(text),
        'figurative_language_use': calculate_figurative_language_use(text),
        'question_usage': calculate_question_usage(text)
    }

# Apply functions with a progress bar
results = []
with concurrent.futures.ThreadPoolExecutor() as executor:
    for result in tqdm(executor.map(process_row, dataset.to_dict('records')), total=len(dataset)):
        results.append(result)

# Convert results to a DataFrame and merge with the original dataset
results_df = pd.DataFrame(results)
dataset = pd.concat([dataset, results_df], axis=1)

# Save the updated dataset to a new file
dataset.to_csv('/mnt/data/Updated_Processed_Data.csv', index=False)

print("Feature extraction complete. The updated dataset has been saved.")


  0%|          | 0/12976 [00:00<?, ?it/s]

OSError: Cannot save file into a non-existent directory: '/mnt/data'

In [12]:
# Save the updated dataset to a new file
dataset.to_csv('Updated_Processed_Data.csv', index=False)

print("Feature extraction complete. The updated dataset has been saved.")

Feature extraction complete. The updated dataset has been saved.


In [15]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, cohen_kappa_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, SimpleRNN
from tensorflow.keras.callbacks import ModelCheckpoint
import joblib

# Load the dataset
dataset = pd.read_csv('Updated_Processed_Data.csv')

# Features and target
X = dataset.drop(columns=['final_score', 'essay', 'essay_id', 'clean_essay'])  # Drop non-predictive or target columns
y = dataset['final_score']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Save the scaler
joblib.dump(scaler, 'scaler.pkl')

# Evaluation function
def evaluate_model(y_true, y_pred, model_name):
    mse = mean_squared_error(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)
    qwk = cohen_kappa_score(y_true, np.round(y_pred), weights='quadratic')
    print(f"{model_name} Evaluation:")
    print(f"MSE: {mse}, MAE: {mae}, QWK: {qwk}\n")
    return mse, mae, qwk

# Linear Regression
lr = LinearRegression()
lr.fit(X_train_scaled, y_train)
lr_pred = lr.predict(X_test_scaled)
evaluate_model(y_test, lr_pred, "Linear Regression")

# Save Linear Regression model
joblib.dump(lr, 'linear_regression_model.pkl')

# ANN Model
ann = Sequential([
    Dense(64, activation='relu', input_dim=X_train_scaled.shape[1]),
    Dense(32, activation='relu'),
    Dense(1, activation='linear')
])
ann.compile(optimizer='adam', loss='mse', metrics=['mae'])
ann_checkpoint = ModelCheckpoint('ann_model.keras', save_best_only=True)
ann.fit(X_train_scaled, y_train, validation_split=0.2, epochs=50, batch_size=32, callbacks=[ann_checkpoint], verbose=1)
ann_pred = ann.predict(X_test_scaled).flatten()
evaluate_model(y_test, ann_pred, "ANN")

# LSTM Model
X_train_lstm = X_train_scaled.reshape(X_train_scaled.shape[0], 1, X_train_scaled.shape[1])
X_test_lstm = X_test_scaled.reshape(X_test_scaled.shape[0], 1, X_test_scaled.shape[1])

lstm = Sequential([
    LSTM(64, activation='tanh', input_shape=(1, X_train_scaled.shape[1]), return_sequences=False),
    Dense(32, activation='relu'),
    Dense(1, activation='linear')
])
lstm.compile(optimizer='adam', loss='mse', metrics=['mae'])
lstm_checkpoint = ModelCheckpoint('lstm_model.h5', save_best_only=True)
lstm.fit(X_train_lstm, y_train, validation_split=0.2, epochs=50, batch_size=32, callbacks=[lstm_checkpoint], verbose=1)
lstm_pred = lstm.predict(X_test_lstm).flatten()
evaluate_model(y_test, lstm_pred, "LSTM")

# RNN Model
rnn = Sequential([
    SimpleRNN(64, activation='tanh', input_shape=(1, X_train_scaled.shape[1]), return_sequences=False),
    Dense(32, activation='relu'),
    Dense(1, activation='linear')
])
rnn.compile(optimizer='adam', loss='mse', metrics=['mae'])
rnn_checkpoint = ModelCheckpoint('rnn_model.h5', save_best_only=True)
rnn.fit(X_train_lstm, y_train, validation_split=0.2, epochs=50, batch_size=32, callbacks=[rnn_checkpoint], verbose=1)
rnn_pred = rnn.predict(X_test_lstm).flatten()
evaluate_model(y_test, rnn_pred, "RNN")

# Function to predict essay scores using trained models
def predict_essay_score(features):
    # Load scaler and models
    scaler = joblib.load('scaler.pkl')
    lr_model = joblib.load('linear_regression_model.pkl')
    ann_model = Sequential([
        Dense(64, activation='relu', input_dim=len(features)),
        Dense(32, activation='relu'),
        Dense(1, activation='linear')
    ])
    ann_model.load_weights('ann_model.h5')
    lstm_model = Sequential([
        LSTM(64, activation='tanh', input_shape=(1, len(features)), return_sequences=False),
        Dense(32, activation='relu'),
        Dense(1, activation='linear')
    ])
    lstm_model.load_weights('lstm_model.h5')
    rnn_model = Sequential([
        SimpleRNN(64, activation='tanh', input_shape=(1, len(features)), return_sequences=False),
        Dense(32, activation='relu'),
        Dense(1, activation='linear')
    ])
    rnn_model.load_weights('rnn_model.h5')

    # Scale features
    features_scaled = scaler.transform([features])
    features_scaled_lstm = features_scaled.reshape(1, 1, len(features))

    # Predict using each model
    predictions = {
        'Linear Regression': lr_model.predict(features_scaled)[0],
        'ANN': ann_model.predict(features_scaled)[0][0],
        'LSTM': lstm_model.predict(features_scaled_lstm)[0][0],
        'RNN': rnn_model.predict(features_scaled_lstm)[0][0]
    }
    return predictions

# Function to score example essays
def score_example_essays(example_features):
    print("Scoring example essays with all trained models:\n")
    for idx, features in enumerate(example_features):
        print(f"Example Essay {idx + 1}:")
        predictions = predict_essay_score(features)
        for model_name, score in predictions.items():
            print(f"{model_name}: {score:.2f}")
        print("\n")

print("All models trained and saved.")


Linear Regression Evaluation:
MSE: 4.2607014105933905, MAE: 1.6412571645379062, QWK: 0.45173544773977703



  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/50
[1m260/260[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 8ms/step - loss: 21.3923 - mae: 3.7622 - val_loss: 9.0758 - val_mae: 1.9014
Epoch 2/50
[1m260/260[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - loss: 5.3840 - mae: 1.7859 - val_loss: 5.2321 - val_mae: 1.6715
Epoch 3/50
[1m260/260[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 5ms/step - loss: 4.2724 - mae: 1.5977 - val_loss: 3.6788 - val_mae: 1.4740
Epoch 4/50
[1m260/260[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 6ms/step - loss: 3.4276 - mae: 1.4408 - val_loss: 3.2968 - val_mae: 1.3659
Epoch 5/50
[1m260/260[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - loss: 3.0384 - mae: 1.3420 - val_loss: 2.9846 - val_mae: 1.2906
Epoch 6/50
[1m260/260[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - loss: 2.8025 - mae: 1.2818 - val_loss: 3.3848 - val_mae: 1.2681
Epoch 7/50
[1m260/260[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step -

  super().__init__(**kwargs)


ValueError: The filepath provided must end in `.keras` (Keras model format). Received: filepath=lstm_model.h5