In [None]:
# === Setup Java 17 and Install Required Libraries ===
!apt-get remove openjdk-11-jdk -y
!apt-get install openjdk-17-jdk -y
!update-alternatives --install /usr/bin/java java /usr/lib/jvm/java-17-openjdk-amd64/bin/java 1
!update-alternatives --set java /usr/lib/jvm/java-17-openjdk-amd64/bin/java
!java -version

# === Install Python Packages ===
!pip install -U language-tool-python textstat nltk gradio
!wget --no-check-certificate https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
!unzip -q glove.6B.zip

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
Package 'openjdk-11-jdk' is not installed, so not removed
0 upgraded, 0 newly installed, 0 to remove and 35 not upgraded.
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  fonts-dejavu-core fonts-dejavu-extra libatk-wrapper-java
  libatk-wrapper-java-jni libxt-dev libxtst6 libxxf86dga1
  openjdk-17-jdk-headless openjdk-17-jre openjdk-17-jre-headless x11-utils
Suggested packages:
  libxt-doc openjdk-17-demo openjdk-17-source visualvm libnss-mdns
  fonts-ipafont-gothic fonts-ipafont-mincho fonts-wqy-microhei
  | fonts-wqy-zenhei fonts-indic mesa-utils
The following NEW packages will be installed:
  fonts-dejavu-core fonts-dejavu-extra libatk-wrapper-java
  libatk-wrapper-java-jni libxt-dev libxtst6 libxxf86dga1 openjdk-17-jdk
  openjdk-17-jdk-headless openjdk-17-jre openjdk-17-jre-headless x1

In [None]:
import nltk
# Download the 'punkt' tokenizer package into the local NLTK data directory.
nltk.download('punkt')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
import nltk
from nltk.tokenize import word_tokenize

# Smoke test for the tokenizer: download the 'punkt_tab' resource, then
# tokenize a short sentence and print the resulting tokens.
nltk.download('punkt_tab')
sample_text = "This is random text"
tokens = word_tokenize(sample_text)

print(tokens)

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


['This', 'is', 'random', 'text']


In [None]:
import pandas as pd
import numpy as np
import re
import pickle
import nltk
import textstat
import gradio as gr
import language_tool_python
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, cohen_kappa_score

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping



# Download the 'punkt' tokenizer package; an earlier cell already fetched it,
# so this call repeats that download request.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
# Load the essay dataset from Google Drive and drop every row that contains a
# missing value.
# NOTE(review): assumes Drive is already mounted at /content/drive — no mount
# call is visible in this notebook; the absolute path is also hardcoded.
df = pd.read_csv("/content/drive/MyDrive/Data/essay_llama3_8B_groq.csv").dropna()
# Preview the first five rows.
df.head(5)

Unnamed: 0.1,Unnamed: 0,essay_id,full_text,score,cleaned_text
0,0,000d118,Many people have car where they live. The thin...,3,Many people have a car where they live. The th...
1,1,000fe60,I am a scientist at NASA that is discussing th...,3,"I am a scientist at NASA, discussing the ""face..."
2,2,001ab80,People always wish they had the same technolog...,4,People always wish they had the same technolog...
3,3,001bdc0,"We all heard about Venus, the planet without a...",4,"We all heard about Venus, the planet with almo..."
4,4,002ba53,"Dear, State Senator\n\nThis is a letter to arg...",3,"Dear State Senator,\n\nThis is a letter to arg..."


In [None]:
# Reads the same CSV as the preview cell again and drops rows with missing
# values; `df` is rebuilt here rather than reused from that cell.
df = pd.read_csv("/content/drive/MyDrive/Data/essay_llama3_8B_groq.csv").dropna()

def clean_text(text):
    """Normalise raw essay text for the tokenizer.

    The text is lowercased, every character that is neither a word character
    nor whitespace is removed, and finally all digit runs are removed.
    Whitespace is left exactly as it was.

    Args:
        text: The essay as a string.

    Returns:
        The normalised string.
    """
    lowered = text.lower()
    without_punctuation = re.sub(r'[^\w\s]', '', lowered)
    return re.sub(r'\d+', '', without_punctuation)

# Normalise the `cleaned_text` column into a dedicated modelling column.
df['cleaned_essay'] = df['cleaned_text'].apply(clean_text)

# Features are the normalised essays; targets are the scores.
X, y = df['cleaned_essay'], df['score']

# Hold out 20% of the essays for evaluation; the fixed random_state keeps the
# split the same across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Vocabulary cap for the tokenizer and fixed sequence length for padding.
max_words = 10000
max_len = 300

# Fit the vocabulary on the training essays only.
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(X_train)

# Encode each split as integer sequences, then pad/truncate them to max_len.
train_sequences = tokenizer.texts_to_sequences(X_train)
test_sequences = tokenizer.texts_to_sequences(X_test)
X_train_seq = pad_sequences(train_sequences, maxlen=max_len)
X_test_seq = pad_sequences(test_sequences, maxlen=max_len)


In [None]:
# Parse the GloVe text file into a {token: float32 vector} lookup. Each line
# is a token followed by its whitespace-separated coefficients.
embedding_index = {}
with open("glove.6B.100d.txt", encoding='utf8') as glove_file:
    for entry in glove_file:
        fields = entry.split()
        embedding_index[fields[0]] = np.asarray(fields[1:], dtype='float32')

embedding_dim = 100
word_index = tokenizer.word_index

# Row r holds the GloVe vector of the token whose tokenizer index is r. Rows
# of tokens missing from GloVe, and rows never assigned, stay all-zero.
embedding_matrix = np.zeros((max_words, embedding_dim))
for token, row in word_index.items():
    if row >= max_words:
        continue  # index falls outside the rows allocated in the matrix
    vector = embedding_index.get(token)
    if vector is None:
        continue
    embedding_matrix[row] = vector

In [None]:
# Regression model: frozen pretrained embeddings -> LSTM -> dense head with a
# single linear output unit.
# NOTE(review): no random seed is set anywhere in the visible notebook, so
# training results may differ between runs — confirm whether that matters.
model = Sequential([
    # Embedding rows are initialised from embedding_matrix and, with
    # trainable=False, are not updated during training.
    # NOTE(review): `input_length` is reported as deprecated by recent Keras
    # releases — confirm against the installed version.
    Embedding(input_dim=max_words, output_dim=embedding_dim, input_length=max_len,
              weights=[embedding_matrix], trainable=False),
    LSTM(128),
    Dropout(0.3),
    Dense(64, activation='relu'),
    Dropout(0.2),
    Dense(1, activation='linear')
])

# Mean squared error treats the essay score as a continuous target.
model.compile(optimizer='adam', loss='mean_squared_error')
# Stop once val_loss has not improved for 2 epochs and restore the best weights.
early_stop = EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True)

# validation_split=0.1 holds out part of the training data to compute the
# val_loss monitored by early_stop.
history = model.fit(X_train_seq, y_train, epochs=5, batch_size=32,
                    validation_split=0.1, callbacks=[early_stop])




Epoch 1/5
[1m390/390[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m181s[0m 456ms/step - loss: 1.5409 - val_loss: 0.9440
Epoch 2/5
[1m390/390[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m202s[0m 456ms/step - loss: 1.0655 - val_loss: 0.6047
Epoch 3/5
[1m390/390[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m215s[0m 489ms/step - loss: 0.7092 - val_loss: 0.5828
Epoch 4/5
[1m390/390[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m194s[0m 469ms/step - loss: 0.6805 - val_loss: 0.5735
Epoch 5/5
[1m390/390[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m202s[0m 469ms/step - loss: 0.6680 - val_loss: 0.5404


In [None]:
# The model ends in Dense(1), so predict() yields one column per sample;
# flatten it to 1-D so it lines up with the 1-D y_test Series explicitly
# instead of relying on implicit reshaping inside the metric functions.
y_pred = model.predict(X_test_seq).ravel()
# Round to the nearest integer score and clamp to the 0-6 range.
y_pred_rounded = np.clip(np.rint(y_pred), 0, 6)

# MSE is computed on the raw (unrounded) predictions.
print("MSE:", mean_squared_error(y_test, y_pred))
# Quadratic weighted kappa is computed on the rounded integer scores.
print("QWK:", cohen_kappa_score(y_test.astype(int), y_pred_rounded.astype(int), weights='quadratic'))

[1m109/109[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 363ms/step
MSE: 0.5900195837020874
QWK: 0.5964576548226079


In [None]:
# Persist the trained model and the fitted tokenizer to Google Drive.
model.save("/content/drive/MyDrive/aes_lstm_glove_model.keras")

with open("/content/drive/MyDrive/aes_tokenizer_glove.pkl", "wb") as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file)

In [None]:
def extract_keywords(text, top_n=20):
    """Return the highest-weighted TF-IDF terms of a single text.

    A TfidfVectorizer with English stop words removed is fitted on `text`
    alone, and the terms are ranked by their weight in that one document.

    Args:
        text: The document to extract terms from.
        top_n: Maximum number of terms to return.

    Returns:
        A NumPy array with at most `top_n` terms, highest weight first.
    """
    tfidf = TfidfVectorizer(stop_words='english')
    weights = tfidf.fit_transform([text]).toarray()[0]
    vocabulary = np.array(tfidf.get_feature_names_out())
    ranked = np.argsort(weights)[::-1]  # indices by descending weight
    return vocabulary[ranked[:top_n]]

def content_score(essay_text, keywords):
    """Score how many keywords occur in an essay, on a 0-4 scale.

    Matching is case-insensitive and on whole terms, so a keyword such as
    "art" is not counted when it only occurs inside "party".

    Args:
        essay_text: The essay to inspect.
        keywords: Sequence of keyword strings (list or array).

    Returns:
        A ``(score, matched_keywords)`` tuple. ``score`` is the matched
        fraction of keywords scaled to 4 and rounded to two decimals;
        ``matched_keywords`` lists the keywords that were found, in their
        original order. With no keywords the result is ``(0.0, [])``.
    """
    total = len(keywords)
    if total == 0:
        # Nothing to match against; avoids dividing by zero below.
        return 0.0, []

    essay_lower = essay_text.lower()

    def _occurs(keyword):
        # Lookarounds act as word boundaries that also behave correctly for
        # keywords that start or end with a non-word character.
        pattern = r'(?<!\w)' + re.escape(str(keyword).lower()) + r'(?!\w)'
        return re.search(pattern, essay_lower) is not None

    matched_keywords = [word for word in keywords if _occurs(word)]
    score = len(matched_keywords) / total * 4  # Max 4 points for content
    return round(score, 2), matched_keywords

# Shared LanguageTool checker, created on first use by _get_language_tool().
_language_tool = None


def _get_language_tool():
    """Return the shared en-US LanguageTool checker, creating it on first use."""
    global _language_tool
    if _language_tool is None:
        _language_tool = language_tool_python.LanguageTool('en-US')
    return _language_tool


def generate_feedback(pred_score, essay_text, content_keywords):
    """Build a textual feedback report for a scored essay.

    The report starts with a summary sentence chosen from `pred_score`,
    followed by suggestions derived from word and sentence counts, the Flesch
    reading-ease value, the presence of transition phrases, the number of
    LanguageTool matches, and the keyword-based content score.

    Args:
        pred_score: Predicted essay score used to pick the summary sentence.
        essay_text: The essay as entered by the user.
        content_keywords: Keywords passed on to content_score().

    Returns:
        A string with the summary, a blank line, and a "Suggestions:" section.
    """
    # Reuse one checker across calls instead of constructing a new
    # LanguageTool instance for every essay.
    tool = _get_language_tool()

    if pred_score >= 5:
        summary = "Excellent structure and clear arguments. Well done!"
    elif pred_score >= 4:
        summary = "Good essay overall. You can improve argument depth and cohesion."
    elif pred_score >= 3:
        summary = "Fair effort. Focus on clearer structure and stronger evidence."
    elif pred_score >= 2:
        summary = "Needs improvement. Clarify your position and organize ideas better."
    else:
        summary = "Work on developing your thesis and supporting it with clear reasons."

    feedback_points = []
    words = word_tokenize(essay_text)
    sentences = sent_tokenize(essay_text)
    essay_lower = essay_text.lower()

    if len(words) < 150:
        feedback_points.append("Essay is too short — expand your arguments.")
    if len(sentences) < 5:
        feedback_points.append("Use more sentence variety and elaboration.")
    if textstat.flesch_reading_ease(essay_text) < 40:
        feedback_points.append("Simplify sentence structure for better clarity.")
    if "however" not in essay_lower and "on the other hand" not in essay_lower:
        feedback_points.append("Use transition words to improve logical flow.")

    matches = tool.check(essay_text)
    if len(matches) > 0:
        feedback_points.append(f"Grammar issues detected: {len(matches)} potential problems.")

    content_marks, matched_keywords = content_score(essay_text, content_keywords)
    feedback_points.append(f"Content Score: {content_marks}/4. Keywords matched: {', '.join(matched_keywords)}")

    # The content line above is always appended, so the 'None. Keep it up!'
    # fallback below cannot trigger; it is kept to leave the output unchanged.
    suggestions = " ".join(feedback_points)
    return f"{summary}\n\nSuggestions:\n{suggestions if suggestions else 'None. Keep it up!'}"

In [None]:
def score_essay(essay):
    """Gradio callback: predict a 0-6 score for an essay and build feedback.

    Args:
        essay: The essay text entered by the user.

    Returns:
        A ``(score, feedback)`` tuple. ``score`` is the rounded, clamped model
        prediction as an int. When the input is blank or any step fails,
        ``score`` is ``None`` and ``feedback`` carries the message, so the
        numeric output never receives a non-numeric string.
    """
    if not essay or not essay.strip():
        # A blank input would otherwise fail inside keyword extraction.
        return None, "Please enter an essay to score."

    try:
        # NOTE(review): the keywords are extracted from the same essay they
        # are later matched against inside generate_feedback, so the content
        # score measures self-overlap rather than agreement with a reference
        # text — confirm this is intended.
        extracted_keywords = extract_keywords(essay)
        cleaned = clean_text(essay)
        seq = tokenizer.texts_to_sequences([cleaned])
        padded = pad_sequences(seq, maxlen=max_len)
        pred = model.predict(padded)
        score = int(np.clip(np.rint(pred), 0, 6).item())
        feedback = generate_feedback(score, essay, extracted_keywords)
        return score, feedback
    except Exception as e:
        # Broad catch is deliberate at the UI boundary: report the problem in
        # the feedback box and leave the numeric score empty.
        return None, f"Exception occurred: {str(e)}"

# UI components: one essay input, a numeric score output and a text feedback
# output.
essay_input = gr.Textbox(lines=20, label="Enter Essay")
score_output = gr.Number(label="Predicted Score (0–6)")
feedback_output = gr.Textbox(label="Feedback")

# Sample essays offered as examples (one input value per row).
example_essays = [
    ["Technology has significantly improved our lives in many ways. From faster communication..."],
    ["Wales had been integrated into the Kingdom of England by the Laws in Wales Acts 1535..."],
]

interface = gr.Interface(
    fn=score_essay,
    inputs=essay_input,
    outputs=[score_output, feedback_output],
    title="Automated Essay Scoring (AES) with LSTM + GloVe",
    description="Enter an argumentative essay to receive an automated score and feedback, including content-based evaluation.",
    examples=example_essays,
)

# share=True also serves the app through a temporary public URL.
interface.launch(share=True)



Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://cbd452f81a47fe703f.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


