In [None]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from textstat import textstat
import re
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Fetch the NLTK resources used by the cells below:
#   - 'punkt' / 'punkt_tab': sentence/word tokenizer models used by word_tokenize and
#     sent_tokenize. Recent NLTK releases load 'punkt_tab' instead of the pickled
#     'punkt' models, so both are requested to work across versions.
#   - 'stopwords': the English stop-word list used by clean_text.
for nltk_resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(nltk_resource)

In [None]:
# Load the essay table; the cells below read its 'essay' column.
ESSAYS_CSV = 'processed_essays.csv'
df = pd.read_csv(ESSAYS_CSV)


# Text cleaning

In [None]:
def _english_stopwords():
    """Return NLTK's English stop-word list as a frozenset, loading it only once."""
    cached = getattr(_english_stopwords, '_cache', None)
    if cached is None:
        # stopwords.words() rebuilds the list on every call, so keep a single
        # frozenset around for fast membership tests.
        cached = frozenset(stopwords.words('english'))
        _english_stopwords._cache = cached
    return cached


def clean_text(text):
    """Normalise an essay into a space-separated string of content tokens.

    Steps: lowercase, delete digit runs, delete every character that is neither
    a word character nor whitespace, tokenize with NLTK's ``word_tokenize``,
    and drop English stop words.

    Args:
        text: The raw essay text. ``None`` or a float NaN (how pandas represents
            a missing cell) is treated as an empty essay.

    Returns:
        The remaining tokens joined by single spaces; ``''`` for missing input.
    """
    # Missing values would otherwise fail on .lower() with an AttributeError.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    text = text.lower()
    text = re.sub(r'\d+', '', text)  # Remove digits
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    stop_words = _english_stopwords()
    return ' '.join(word for word in word_tokenize(text) if word not in stop_words)

In [None]:
# Run every raw essay through clean_text and keep the result next to the original.
df['cleaned_essay'] = df['essay'].map(clean_text)

In [None]:
def _mean_sentence_length(text):
    """Return the mean number of whitespace-separated words per sentence.

    Sentences are found with ``nltk.sent_tokenize``. Text with no sentences
    yields NaN (the value ``np.mean([])`` would produce, without its warning).
    """
    lengths = [len(sentence.split()) for sentence in nltk.sent_tokenize(text)]
    return float(np.mean(lengths)) if lengths else float('nan')


# Readability metrics depend on sentence boundaries and the full vocabulary, so they
# are computed on the raw essay text. 'cleaned_essay' has had punctuation and stop
# words removed, which leaves the sentence tokenizer nothing to split on.
raw_essays = df['essay'].fillna('')  # treat missing essays as empty text
df['flesch_reading_ease'] = raw_essays.apply(textstat.flesch_reading_ease)
df['gunning_fog'] = raw_essays.apply(textstat.gunning_fog)
df['avg_sentence_length'] = raw_essays.apply(_mean_sentence_length)

# TF-IDF features


In [None]:
# Size of the TF-IDF vocabulary kept by the vectorizer.
TFIDF_MAX_FEATURES = 100

tfidf = TfidfVectorizer(max_features=TFIDF_MAX_FEATURES)
tfidf_matrix = tfidf.fit_transform(df['cleaned_essay'])
# Reuse df's index so a later column-wise concat lines rows up by label even if
# df does not have the default 0..n-1 index.
tfidf_df = pd.DataFrame(
    tfidf_matrix.toarray(),
    index=df.index,
    columns=tfidf.get_feature_names_out(),
)

In [None]:
# Append the TF-IDF term columns to the right of the existing feature columns.
# NOTE(review): term columns are not prefixed, so a term equal to an existing column
# name (e.g. 'essay') would produce duplicate column labels; confirm this cannot occur.
df = pd.concat(objs=[df, tfidf_df], axis='columns')
