In [122]:
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string
import nltk

# Download necessary NLTK resources
nltk.download('punkt') # This downloads the 'punkt' resource needed for word_tokenize
nltk.download('punkt_tab')
nltk.download('stopwords') # This downloads the missing 'stopwords' resource
nltk.download('wordnet') # This downloads the 'wordnet' resource needed for lemmatization

# Load dataset
essays = pd.read_csv("/content/Processed_data.csv", usecols=[3])  # Read only the 4th column (index 3)
lemmatizer = WordNetLemmatizer()

# Preprocessing function
def preprocess(text):
    tokens = word_tokenize(text.lower())
    tokens = [word for word in tokens if word not in stopwords.words('english') and word not in string.punctuation]
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return tokens

essays['cleaned_text'] = essays['essay'].apply(preprocess)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [137]:
import pandas as pd
import spacy
import nltk
from textblob import TextBlob
from collections import Counter
import numpy as np
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.tag import pos_tag
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

class EssayDatasetAnalyzer:
    def __init__(self):
        # Initialize required NLTK resources
        nltk.download('punkt', quiet=True)
        nltk.download('averaged_perceptron_tagger', quiet=True)
        nltk.download('stopwords', quiet=True)
        nltk.download('vader_lexicon', quiet=True)

        # Load models and resources
        self.nlp = spacy.load('en_core_web_sm')
        self.sia = SentimentIntensityAnalyzer()
        self.stop_words = set(stopwords.words('english'))

    def extract_all_features(self, df):
        """Extract all features from the dataset"""
        print("Extracting features from essays...")

        # Initialize lists to store features
        all_features = []

        # Process each essay with progress bar
        for idx, row in tqdm(df.iterrows(), total=len(df)):
            essay_text = str(row['essay'])  # Convert to string to handle any non-string entries

            # Extract all features for this essay
            features = {
                'essay_id': idx,
                **self.get_basic_stats(essay_text),
                **self.get_vocabulary_features(essay_text),
                **self.get_grammar_features(essay_text),
                **self.get_sentence_complexity(essay_text),
                **self.get_coherence_features(essay_text),
                **self.get_style_features(essay_text),
                **self.get_argumentation_features(essay_text)
            }

            all_features.append(features)

        # Create features DataFrame
        features_df = pd.DataFrame(all_features)

        # Add original essay text if needed
        features_df['original_essay'] = df['essay']

        return features_df

    def get_basic_stats(self, text):
        """Extract basic statistical features"""
        words = word_tokenize(text)
        sentences = sent_tokenize(text)
        characters = len(text)

        return {
            'word_count': len(words),
            'sentence_count': len(sentences),
            'character_count': characters,
            'avg_word_per_sentence': len(words) / max(len(sentences), 1),
            'avg_char_per_word': characters / max(len(words), 1),
            'paragraph_count': len([p for p in text.split('\n') if p.strip()]),
        }

    def get_vocabulary_features(self, text):
        """Extract vocabulary-related features"""
        words = word_tokenize(text.lower())
        content_words = [w for w in words if w.isalnum() and w not in self.stop_words]

        if not content_words:
            return {
                'unique_words': 0,
                'lexical_diversity': 0,
                'avg_word_length': 0,
                'long_words_ratio': 0,
                'stopwords_ratio': 0
            }

        return {
            'unique_words': len(set(content_words)),
            'lexical_diversity': len(set(content_words)) / len(content_words),
            'avg_word_length': np.mean([len(word) for word in content_words]),
            'long_words_ratio': sum(1 for word in content_words if len(word) > 6) / len(content_words),
            'stopwords_ratio': len([w for w in words if w in self.stop_words]) / len(words)
        }

    def get_grammar_features(self, text):
        """Extract grammar-related features"""
        doc = self.nlp(text)
        total_tokens = len(doc)

        if total_tokens == 0:
            return {
                'noun_ratio': 0,
                'verb_ratio': 0,
                'adj_ratio': 0,
                'adv_ratio': 0,
                'pronoun_ratio': 0,
                'conjunction_ratio': 0
            }

        pos_counts = Counter([token.pos_ for token in doc])

        return {
            'noun_ratio': pos_counts['NOUN'] / total_tokens,
            'verb_ratio': pos_counts['VERB'] / total_tokens,
            'adj_ratio': pos_counts['ADJ'] / total_tokens,
            'adv_ratio': pos_counts['ADV'] / total_tokens,
            'pronoun_ratio': pos_counts['PRON'] / total_tokens,
            'conjunction_ratio': pos_counts['CCONJ'] / total_tokens
        }

    def get_sentence_complexity(self, text):
        """Extract sentence complexity features"""
        doc = self.nlp(text)
        sentences = list(doc.sents)

        if not sentences:
            return {
                'avg_sentence_length': 0,
                'avg_clauses_per_sentence': 0,
                'compound_sentence_ratio': 0,
                'complex_sentence_ratio': 0
            }

        clause_counts = []
        compound_sentences = 0
        complex_sentences = 0

        for sent in sentences:
            # Count clauses
            clause_count = 1 + len([token for token in sent
                                  if token.dep_ in ['advcl', 'ccomp', 'xcomp']])
            clause_counts.append(clause_count)

            # Check for compound sentences
            if any(token.dep_ == 'conj' for token in sent):
                compound_sentences += 1

            # Check for complex sentences
            if clause_count > 1:
                complex_sentences += 1

        return {
            'avg_sentence_length': np.mean([len(sent) for sent in sentences]),
            'avg_clauses_per_sentence': np.mean(clause_counts),
            'compound_sentence_ratio': compound_sentences / len(sentences),
            'complex_sentence_ratio': complex_sentences / len(sentences)
        }

    def get_coherence_features(self, text):
        """Extract coherence-related features"""
        sentences = sent_tokenize(text)

        if len(sentences) < 2:
            return {
                'avg_similarity': 0,
                'similarity_variance': 0,
                'transition_words_ratio': 0
            }

        # Analyze sentence embeddings similarity
        sent_embeddings = [self.nlp(sent).vector for sent in sentences]
        similarities = []
        for i in range(len(sent_embeddings)-1):
            similarity = np.dot(sent_embeddings[i], sent_embeddings[i+1])
            similarities.append(float(similarity))

        # Count transition words
        transition_words = set(['however', 'therefore', 'furthermore', 'moreover',
                              'nevertheless', 'thus', 'meanwhile', 'consequently',
                              'similarly', 'in contrast', 'additionally'])

        words = word_tokenize(text.lower())
        transition_count = sum(1 for word in words if word in transition_words)

        return {
            'avg_similarity': np.mean(similarities),
            'similarity_variance': np.var(similarities),
            'transition_words_ratio': transition_count / len(words)
        }

    def get_style_features(self, text):
        """Extract style-related features"""
        sentiment = self.sia.polarity_scores(text)
        doc = self.nlp(text)

        # Calculate formality
        formal_indicators = len([token for token in doc
                               if token.pos_ in ['NOUN', 'ADJ', 'NUM']])
        informal_indicators = len([token for token in doc
                                 if token.pos_ in ['INTJ', 'PART']])

        return {
            'sentiment_positive': sentiment['pos'],
            'sentiment_negative': sentiment['neg'],
            'sentiment_neutral': sentiment['neu'],
            'sentiment_compound': sentiment['compound'],
            'formality_score': formal_indicators / (informal_indicators + 1)
        }

    def get_argumentation_features(self, text):
        """Extract argumentation-related features"""
        text_lower = text.lower()
        words = word_tokenize(text_lower)
        total_words = len(words)

        if total_words == 0:
            return {
                'claims_ratio': 0,
                'evidence_ratio': 0,
                'counter_argument_ratio': 0
            }

        # Count argument markers
        argument_markers = {
            'claims': ['argue', 'claim', 'believe', 'suggest', 'think', 'conclude'],
            'evidence': ['because', 'since', 'therefore', 'consequently', 'research', 'study', 'evidence'],
            'counter_arguments': ['however', 'although', 'nevertheless', 'despite', 'contrary', 'whereas']
        }

        counts = {
            f'{key}_ratio': sum(1 for marker in markers
                               if marker in text_lower) / total_words
            for key, markers in argument_markers.items()
        }

        return counts

def analyze_essay_dataset(input_df):
    """Main function to analyze the essay dataset"""
    # Create analyzer instance
    analyzer = EssayDatasetAnalyzer()

    # Extract features
    features_df = analyzer.extract_all_features(input_df)

    # Basic statistics of features
    feature_stats = features_df.describe()

    return features_df, feature_stats

# Example usage
# Read your dataset
df = pd.read_csv('/content/Processed_data.csv')

# Analyze essays
features_df, feature_stats = analyze_essay_dataset(df)

# Save results
features_df.to_csv('essay_features.csv', index=False)
feature_stats.to_csv('feature_statistics.csv')


Extracting features from essays...


100%|██████████| 12976/12976 [56:49<00:00,  3.81it/s]


In [11]:
import pandas as pd

# Load the dataframe
df2 = pd.read_csv('/content/Processed_data.csv')
features_df = pd.read_csv('/content/essay_features.csv')

# Print some info
print(df2.head())
print(df2.info())

   Unnamed: 0  essay_id  essay_set  \
0           0         1          1   
1           1         2          1   
2           2         3          1   
3           3         4          1   
4           4         5          1   

                                               essay  final_score  \
0  Dear local newspaper, I think effects computer...            6   
1  Dear I believe that using computers will benef...            7   
2  Dear, More and more people use computers, but ...            5   
3  Dear Local Newspaper, I have found that many e...            8   
4  Dear I know having computers has a positive ef...            6   

                                         clean_essay  char_count  word_count  \
0  Dear local newspaper  I think effects computer...        1441         344   
1  Dear I believe using computers benefit us many...        1765         413   
2  Dear  More people use computers  everyone agre...        1185         276   
3  Dear Local Newspaper  I found man

In [12]:
# Assuming 'final_score' column exists in df2 and has the same number of rows as df
features_df['final_score'] = df2['final_score']

In [13]:
features_df.head()

Unnamed: 0,essay_id,word_count,sentence_count,character_count,avg_word_per_sentence,avg_char_per_word,paragraph_count,unique_words,lexical_diversity,avg_word_length,...,sentiment_positive,sentiment_negative,sentiment_neutral,sentiment_compound,formality_score,claims_ratio,evidence_ratio,counter_arguments_ratio,original_essay,final_score
0,0,372,16,1819,23.25,4.889785,1,100,0.666667,5.526667,...,0.172,0.0,0.828,0.9954,5.166667,0.005376,0.002688,0.0,"Dear local newspaper, I think effects computer...",6
1,1,440,17,2205,25.882353,5.011364,1,126,0.591549,5.661972,...,0.224,0.015,0.761,0.9983,10.333333,0.002273,0.002273,0.0,Dear I believe that using computers will benef...,7
2,2,298,14,1482,21.285714,4.973154,1,90,0.697674,6.069767,...,0.201,0.046,0.753,0.9947,8.363636,0.006711,0.0,0.0,"Dear, More and more people use computers, but ...",5
3,3,530,26,2817,20.384615,5.315094,1,170,0.658915,6.244186,...,0.163,0.008,0.829,0.998,7.304348,0.0,0.003774,0.0,"Dear Local Newspaper, I have found that many e...",8
4,4,508,30,2533,16.933333,4.98622,1,129,0.59447,5.917051,...,0.097,0.026,0.878,0.9776,7.1,0.0,0.001969,0.0,Dear I know having computers has a positive ef...,6


Traiing the model

In [14]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
import numpy as np

In [15]:
X = features_df.drop(columns=['essay_id','original_essay',	'final_score']).values
y = features_df['final_score'].values

In [16]:
X.shape

(12976, 32)

In [17]:
# Split data into features (X) and target (y)

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [18]:
import numpy as np

# Check for NaNs
print(np.any(np.isnan(X_train)), np.any(np.isnan(y_train)))

# Check for infinite values
print(np.any(np.isinf(X_train)), np.any(np.isinf(y_train)))


False False
False False


In [20]:
# Replace NaNs with 0 or the mean of the column
X_train = np.nan_to_num(X_train, nan=0.0)
y_train = np.nan_to_num(y_train, nan=0.0)


In [21]:
X_train[0]

array([ 1.17840347,  1.37396598,  1.26483133, -0.40920389,  0.27063426,
        0.        ,  0.29760402, -2.26959366,  0.23002019,  0.16870968,
        1.09444117,  0.47008388, -0.111486  ,  0.16092803, -1.32506706,
        0.10251776, -0.43632261, -0.39865283, -0.39430581, -0.24035032,
       -0.37122466,  0.04083269, -0.08499248, -0.24941606, -0.28848892,
        0.27352331,  0.11177191,  0.09697399, -0.49678403, -0.16392111,
       -0.32664153, -0.2538463 ])

In [22]:
print("y_train min:", y_train.min())
print("y_train max:", y_train.max())


y_train min: 0
y_train max: 10


In [23]:
from tensorflow.keras.layers import Dropout

# Build the neural network model
model = Sequential([
    Dense(128, input_dim=X_train.shape[1], activation='relu'),
    Dropout(0.1),
    Dense(64, activation='sigmoid'),
    Dropout(0.1),
    Dense(32, activation='relu'),
    Dense(1)
])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [24]:
from tensorflow.keras.models import load_model
from tensorflow.keras.callbacks import ModelCheckpoint

# Compile the model
model.compile(optimizer=Adam(learning_rate=0.0005), loss='mse', metrics=['mae', 'mse'])

# Set up ModelCheckpoint to save the best model during training
checkpoint = ModelCheckpoint(
    'best_model.keras',  # Filepath to save the model
    monitor='val_loss',  # Metric to monitor
    save_best_only=True,  # Save only the best model
    mode='min',  # Minimize the monitored metric
    verbose=1
)
# Train the model
history = model.fit(X_train, y_train, validation_split=0.2, epochs=50, batch_size=16, verbose=1, callbacks=[checkpoint])

Epoch 1/50
[1m500/519[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 2ms/step - loss: 14.2112 - mae: 2.9766 - mse: 14.2112
Epoch 1: val_loss improved from inf to 4.29307, saving model to best_model.keras
[1m519/519[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - loss: 13.9589 - mae: 2.9442 - mse: 13.9589 - val_loss: 4.2931 - val_mae: 1.6389 - val_mse: 4.2931
Epoch 2/50
[1m509/519[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 2ms/step - loss: 4.0623 - mae: 1.5727 - mse: 4.0623
Epoch 2: val_loss improved from 4.29307 to 3.42081, saving model to best_model.keras
[1m519/519[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 4.0584 - mae: 1.5721 - mse: 4.0584 - val_loss: 3.4208 - val_mae: 1.4536 - val_mse: 3.4208
Epoch 3/50
[1m517/519[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 2ms/step - loss: 3.5774 - mae: 1.4767 - mse: 3.5774
Epoch 3: val_loss improved from 3.42081 to 3.12990, saving model to best_model.keras
[1m519/519

In [25]:
# Replace NaNs with 0 or the mean of the column
X_test = np.nan_to_num(X_test, nan=0.0)
y_test = np.nan_to_num(y_test, nan=0.0)


In [26]:
# Evaluate the best saved model
from tensorflow.keras.models import load_model

best_model = load_model('best_model.keras')
loss, mae, mse = best_model.evaluate(X_test, y_test)
print(f"Best Model Test Loss: {loss}, Test MAE: {mae}, Test MSE: {mse}")

[1m82/82[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 2.7515 - mae: 1.2591 - mse: 2.7515
Best Model Test Loss: 2.594965696334839, Test MAE: 1.2272478342056274, Test MSE: 2.594965696334839


In [27]:
import spacy
import nltk
from textblob import TextBlob
from collections import Counter
import numpy as np
import pandas as pd
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.tag import pos_tag
import warnings
warnings.filterwarnings('ignore')

class SingleEssayAnalyzer:
    def __init__(self):
        # Initialize required NLTK resources
        nltk.download('punkt', quiet=True)
        nltk.download('averaged_perceptron_tagger', quiet=True)
        nltk.download('stopwords', quiet=True)
        nltk.download('vader_lexicon', quiet=True)

        # Load models and resources
        self.nlp = spacy.load('en_core_web_sm')
        self.sia = SentimentIntensityAnalyzer()
        self.stop_words = set(stopwords.words('english'))

    def analyze_essay(self, essay_text):
        """Analyze a single essay and return all features"""
        # Convert to string to handle any non-string input
        essay_text = str(essay_text)

        # Extract all features
        features = {
            **self.get_basic_stats(essay_text),
            **self.get_vocabulary_features(essay_text),
            **self.get_grammar_features(essay_text),
            **self.get_sentence_complexity(essay_text),
            **self.get_coherence_features(essay_text),
            **self.get_style_features(essay_text),
            **self.get_argumentation_features(essay_text)
        }

        # Convert to DataFrame for better visualization
        features_df = pd.DataFrame([features])
        return features_df

    def get_basic_stats(self, text):
        """Get basic statistical features"""
        words = word_tokenize(text)
        sentences = sent_tokenize(text)
        characters = len(text)

        return {
            'word_count': len(words),
            'sentence_count': len(sentences),
            'character_count': characters,
            'avg_word_per_sentence': len(words) / max(len(sentences), 1),
            'avg_char_per_word': characters / max(len(words), 1),
            'paragraph_count': len([p for p in text.split('\n') if p.strip()])
        }

    def get_vocabulary_features(self, text):
        """Get vocabulary-related features"""
        words = word_tokenize(text.lower())
        content_words = [w for w in words if w.isalnum() and w not in self.stop_words]

        if not content_words:
            return {
                'unique_words': 0,
                'lexical_diversity': 0,
                'avg_word_length': 0,
                'long_words_ratio': 0,
                'stopwords_ratio': 0
            }

        return {
            'unique_words': len(set(content_words)),
            'lexical_diversity': len(set(content_words)) / len(content_words),
            'avg_word_length': np.mean([len(word) for word in content_words]),
            'long_words_ratio': sum(1 for word in content_words if len(word) > 6) / len(content_words),
            'stopwords_ratio': len([w for w in words if w in self.stop_words]) / len(words)
        }

    def get_grammar_features(self, text):
        """Get grammar-related features"""
        doc = self.nlp(text)
        total_tokens = len(doc)

        if total_tokens == 0:
            return {
                'noun_ratio': 0,
                'verb_ratio': 0,
                'adj_ratio': 0,
                'adv_ratio': 0,
                'pronoun_ratio': 0,
                'conjunction_ratio': 0
            }

        pos_counts = Counter([token.pos_ for token in doc])

        return {
            'noun_ratio': pos_counts['NOUN'] / total_tokens,
            'verb_ratio': pos_counts['VERB'] / total_tokens,
            'adj_ratio': pos_counts['ADJ'] / total_tokens,
            'adv_ratio': pos_counts['ADV'] / total_tokens,
            'pronoun_ratio': pos_counts['PRON'] / total_tokens,
            'conjunction_ratio': pos_counts['CCONJ'] / total_tokens
        }

    def get_sentence_complexity(self, text):
        """Get sentence complexity features"""
        doc = self.nlp(text)
        sentences = list(doc.sents)

        if not sentences:
            return {
                'avg_sentence_length': 0,
                'avg_clauses_per_sentence': 0,
                'compound_sentence_ratio': 0,
                'complex_sentence_ratio': 0
            }

        clause_counts = []
        compound_sentences = 0
        complex_sentences = 0

        for sent in sentences:
            clause_count = 1 + len([token for token in sent
                                  if token.dep_ in ['advcl', 'ccomp', 'xcomp']])
            clause_counts.append(clause_count)

            if any(token.dep_ == 'conj' for token in sent):
                compound_sentences += 1

            if clause_count > 1:
                complex_sentences += 1

        return {
            'avg_sentence_length': np.mean([len(sent) for sent in sentences]),
            'avg_clauses_per_sentence': np.mean(clause_counts),
            'compound_sentence_ratio': compound_sentences / len(sentences),
            'complex_sentence_ratio': complex_sentences / len(sentences)
        }

    def get_coherence_features(self, text):
        """Get coherence-related features"""
        sentences = sent_tokenize(text)

        if len(sentences) < 2:
            return {
                'avg_similarity': 0,
                'similarity_variance': 0,
                'transition_words_ratio': 0
            }

        sent_embeddings = [self.nlp(sent).vector for sent in sentences]
        similarities = []
        for i in range(len(sent_embeddings)-1):
            similarity = np.dot(sent_embeddings[i], sent_embeddings[i+1])
            similarities.append(float(similarity))

        transition_words = set(['however', 'therefore', 'furthermore', 'moreover',
                              'nevertheless', 'thus', 'meanwhile', 'consequently',
                              'similarly', 'in contrast', 'additionally'])

        words = word_tokenize(text.lower())
        transition_count = sum(1 for word in words if word in transition_words)

        return {
            'avg_similarity': np.mean(similarities),
            'similarity_variance': np.var(similarities),
            'transition_words_ratio': transition_count / len(words)
        }

    def get_style_features(self, text):
        """Get style-related features"""
        sentiment = self.sia.polarity_scores(text)
        doc = self.nlp(text)

        formal_indicators = len([token for token in doc
                               if token.pos_ in ['NOUN', 'ADJ', 'NUM']])
        informal_indicators = len([token for token in doc
                                 if token.pos_ in ['INTJ', 'PART']])

        return {
            'sentiment_positive': sentiment['pos'],
            'sentiment_negative': sentiment['neg'],
            'sentiment_neutral': sentiment['neu'],
            'sentiment_compound': sentiment['compound'],
            'formality_score': formal_indicators / (informal_indicators + 1)
        }

    def get_argumentation_features(self, text):
        """Get argumentation-related features"""
        text_lower = text.lower()
        words = word_tokenize(text_lower)
        total_words = len(words)

        if total_words == 0:
            return {
                'claims_ratio': 0,
                'evidence_ratio': 0,
                'counter_argument_ratio': 0
            }

        argument_markers = {
            'claims': ['argue', 'claim', 'believe', 'suggest', 'think', 'conclude'],
            'evidence': ['because', 'since', 'therefore', 'consequently', 'research', 'study', 'evidence'],
            'counter_arguments': ['however', 'although', 'nevertheless', 'despite', 'contrary', 'whereas']
        }

        counts = {
            f'{key}_ratio': sum(1 for marker in markers
                               if marker in text_lower) / total_words
            for key, markers in argument_markers.items()
        }

        return counts

def analyze_single_essay(essay_text):
    """Function to analyze a single essay"""
    analyzer = SingleEssayAnalyzer()
    features = analyzer.analyze_essay(essay_text)
    return features



In [37]:
import nltk
nltk.download('punkt_tab')

# Predict a final score for a new essay
def predict_final_score(essay_text, model_path):
    # Assume `extract_features` is a function that computes all features for a given essay
    # Load the pre-trained model and scaler
    model = load_model(model_path)
    analyzer = SingleEssayAnalyzer()
    essay_features = analyzer.analyze_essay(essay_text).values
    print(essay_features)
    essay_features_scaled = scaler.transform([essay_features[0]])
    return best_model.predict(essay_features_scaled)[0][0]

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [38]:
# Example usage
if __name__ == "__main__":
    # Example essay
    sample_essay = """
    Artificial intelligence (AI) is becoming a part of education. It personalizes learning by making content fit students’ needs and speeds up tasks like grading. This means teachers don’t have to spend as much time doing boring things, so they can focus on teaching. AI also helps students with disabilities by providing tools like text-to-speech or translation apps. However, there are some issues. Privacy is a concern because AI collects data about students. Also, not everyone can afford AI tools, which might make things unfair. Some people think AI could reduce human interaction, which is important in learning. Overall, AI is useful in education but comes with some problems. It can help a lot if used carefully.
    """
    # Specify paths to the model and scaler
    model_file = "/content/best_model.keras"  # Replace with your actual model path


    # Predict score
    score = predict_final_score(sample_essay, model_file)

    # Display results
    print("\nEssay Analysis Results:")
    print(score)

[[1.37000000e+02 1.00000000e+01 7.27000000e+02 1.37000000e+01
  5.30656934e+00 1.00000000e+00 5.90000000e+01 8.19444444e-01
  6.11111111e+00 3.88888889e-01 3.21167883e-01 2.11267606e-01
  1.61971831e-01 4.92957746e-02 4.22535211e-02 5.63380282e-02
  2.11267606e-02 1.42000000e+01 1.90000000e+00 3.00000000e-01
  6.00000000e-01 2.96075467e+00 1.15026277e+00 7.29927007e-03
  1.41000000e-01 5.80000000e-02 8.02000000e-01 7.57900000e-01
  7.40000000e+00 7.29927007e-03 7.29927007e-03 7.29927007e-03]]
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step

Essay Analysis Results:
7.454677
