In [1]:
import pandas as pd
import spacy.cli
import numpy as np

In [2]:
import gensim.downloader as api

wv = api.load('word2vec-google-news-300')

wv.init_sims(replace=True)
wv.save('wv_model.model')

  wv.init_sims(replace=True)


In [3]:
df = pd.read_csv("data/kaggle_train_diff_essays.csv")
df.shape

(6020, 5)

In [4]:
df = df[['essay_id', 'essay_set', 'essay', 'domain1_score']]
df.head()

Unnamed: 0,essay_id,essay_set,essay,domain1_score
0,5978,3,The features of the setting affect the cyclist...,1
1,5979,3,The features of the setting affected the cycli...,2
2,5980,3,Everyone travels to unfamiliar places. Sometim...,1
3,5981,3,I believe the features of the cyclist affected...,1
4,5982,3,The setting effects the cyclist because of the...,2


Preprocess and Vectorize Text reference: https://github.com/codebasics/nlp-tutorials/blob/main/16_word_vectors_gensim_text_classification/gensim_w2v_google.ipynb

In [5]:
import joblib

spacy.cli.download("en_core_web_sm")
nlp = spacy.load("en_core_web_sm")

nlp_file = open("nlp_file_25_nov_2023.pkl", "wb")
joblib.dump(nlp, nlp_file)
nlp_file.close()

def preprocess_and_vectorize(text):
    doc = nlp(text)

    filtered_tokens = []
    for token in doc:
        if token.is_stop or token.is_punct:
            continue
        filtered_tokens.append(token.lemma_)

    vectors = []
    for token in filtered_tokens:
        try:
            vectors.append(wv[token])
        except KeyError:
            continue

    # get mean vector of all words in essay

    return wv.get_mean_vector(vectors)

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m26.1 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [6]:
# Available essay sets: 3, 4, 6, and 8
# use essay set 3 and 4

df_essay = df[df['essay_set'].isin([3,4])]
df_essay.shape

(3497, 4)

In [7]:
df_essay['vector'] = df_essay['essay'].apply(lambda text: preprocess_and_vectorize(text))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_essay['vector'] = df_essay['essay'].apply(lambda text: preprocess_and_vectorize(text))


In [8]:
df_essay.head()

Unnamed: 0,essay_id,essay_set,essay,domain1_score,vector
0,5978,3,The features of the setting affect the cyclist...,1,"[0.0033076948, 0.017976768, -0.004153249, -0.0..."
1,5979,3,The features of the setting affected the cycli...,2,"[0.013694342, 0.021794971, -0.0032590167, 0.01..."
2,5980,3,Everyone travels to unfamiliar places. Sometim...,1,"[0.030827984, 0.008283077, 0.01512673, 0.02562..."
3,5981,3,I believe the features of the cyclist affected...,1,"[0.007179262, 0.021053521, -0.003954915, 0.020..."
4,5982,3,The setting effects the cyclist because of the...,2,"[0.022492502, 0.027344149, -0.005746396, 0.026..."


In [13]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    df_essay.vector.values,
    df_essay.domain1_score,
    test_size=0.2,
    random_state=123,
    stratify=df_essay.domain1_score
)

Convert array of arrays to 2d array

In [14]:
X_train_2d = np.stack(X_train)
X_test_2d = np.stack(X_test)

Gradient Boosting Classifier

In [15]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report
from sklearn import metrics
from sklearn.metrics import cohen_kappa_score

clf = GradientBoostingClassifier()

clf.fit(X_train_2d, y_train)

y_pred = clf.predict(X_test_2d)

#save model
model_file = open("essay_classifier_25_nov_2023.pkl", "wb")
joblib.dump(clf, model_file)
model_file.close()

# Evaluate the model on the evaluation metric. "Quadratic mean averaged Kappa"
result = cohen_kappa_score(y_test,y_pred,weights='quadratic')
print("Kappa Score: {}".format(result))

print(classification_report(y_test, y_pred))

metrics.confusion_matrix(y_test,y_pred)
pd.crosstab(y_test, y_pred, rownames = ['Actual'], colnames =['Predicted'], margins = True)

Kappa Score: 0.655245673671199
              precision    recall  f1-score   support

           0       0.65      0.37      0.47        70
           1       0.64      0.65      0.65       249
           2       0.54      0.64      0.59       246
           3       0.57      0.49      0.53       135

    accuracy                           0.59       700
   macro avg       0.60      0.54      0.56       700
weighted avg       0.59      0.59      0.59       700



Predicted,0,1,2,3,All
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,26,37,7,0,70
1,14,162,63,10,249
2,0,48,158,40,246
3,0,6,63,66,135
All,40,253,291,116,700
