In [1]:
from main import *

Loading GloVe model
GloVe model loaded successfully


In [2]:
import pandas as pd
import numpy as np

import xgboost as xgb
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn import svm
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import cross_val_score

In [3]:
# Load the essay dataset; the path is relative to the notebook's working directory.
# NOTE(review): the preview below shows an "Unnamed: 0" column, which suggests the
# CSV was written with its index -- consider index_col=0 if that column is unwanted.
df = pd.read_csv("datasets/essays.csv")

In [8]:
# Preview the first rows to check the columns used later (text_lemmatized, label).
df.head()

Unnamed: 0,text,filename,text_original,sentence_length,label,text_lemmatized
0,asking students evaluate teachers useful tool ...,104.txt,Asking students to evaluate their teachers can...,15,1,asking student evaluate teacher useful tool im...
1,parents often considered best teachers first r...,3.txt,Parents are often considered the best teachers...,17,1,parent often considered best teacher first rol...
2,many reasons people work beyond need money liv...,50.txt,There are many reasons why people work beyond ...,14,1,many reason people work beyond need money live...
3,success life come combination taking risks car...,53.txt,Success in life can come from a combination of...,15,1,success life come combination taking risk care...
4,attending live performance really enjoyable wa...,36.txt,Attending a live performance is really more en...,28,0,attending live performance really enjoyable wa...


In [9]:
# Seed for SVC's probability estimates. With probability=True, SVC calibrates
# its probabilities using an internal shuffled cross-validation, so
# predict_proba -- and therefore the soft-voting ensemble below -- can vary
# from run to run unless random_state is fixed.
SVC_RANDOM_STATE = 42

# Text-to-image transformers from `main`, one per embedding. They feed the
# "persistence images" step of the SVC pipelines below.
# NOTE(review): the meaning of algorithm/dimension/sigma/grid_size is defined by
# text_to_image_transformer in main -- confirm there before tuning.
tfidf_transformer = text_to_image_transformer(embedding="tfidf",
                                              algorithm="SIFTS",
                                              dimension=1,
                                              sigma=0.001,
                                              grid_size=100)
glove_transformer = text_to_image_transformer(embedding="glove",
                                              algorithm="SIF",
                                              dimension=0,
                                              sigma=0.001,
                                              grid_size=100)

# SVC on top of each transformer. probability=True is needed so the TF-IDF
# pipeline can take part in soft voting (which requires predict_proba).
tfidf_model = Pipeline([
    ("persistence images", tfidf_transformer),
    ("classifier", svm.SVC(probability=True, random_state=SVC_RANDOM_STATE)),
])

glove_model = Pipeline([
    ("persistence images", glove_transformer),
    ("classifier", svm.SVC(probability=True, random_state=SVC_RANDOM_STATE)),
])

# Baseline: unigram + bigram TF-IDF features (terms must appear in at least
# 5 documents) fed to a gradient-boosted classifier with default settings.
xgboost_model = Pipeline([
    ("tfidf features", TfidfVectorizer(ngram_range=(1, 2), min_df=5)),
    ("classifier", xgb.XGBClassifier()),
])

# Soft-voting ensemble of the TF-IDF/SVC pipeline and the XGBoost baseline.
# Pipeline steps are given as (name, estimator) tuples, as the Pipeline API
# documents.
tfidf_xgboost_ensemble = Pipeline([
    ("ensemble",
     VotingClassifier(voting="soft",
                      estimators=[("tfidf", tfidf_model),
                                  ("xgboost", xgboost_model)])),
])

In [10]:
# Model inputs: the lemmatized essay text as raw strings (every pipeline starts
# with its own text transformer/vectorizer) and the `label` column as target.
X, y = (df[column].values for column in ("text_lemmatized", "label"))

In [11]:
# Report label paired with its estimator, in the order results are printed.
named_models = [
    ("TF-IDF", tfidf_model),
    ("GloVe", glove_model),
    ("XGBoost", xgboost_model),
    ("TF-IDF/XGBoost Ensemble", tfidf_xgboost_ensemble),
]

# Parallel lists consumed by the evaluation loop.
models = [estimator for _label, estimator in named_models]
names = [label for label, _estimator in named_models]

In [12]:
# Evaluate every model with the same 5-fold cross-validation and report the
# per-fold scores plus their mean and standard deviation.
for name, model in zip(names, models):
    print(name)
    # The metric is stated explicitly so the printed "accuracy" label is
    # guaranteed to match what is actually measured.
    scores = cross_val_score(model, X=X, y=y, cv=5, scoring="accuracy")
    print("Scores:", scores.round(3))
    print('Cross Validation accuracy: %.3f +/- %.3f \n'
          % (np.mean(scores), np.std(scores)))

TF-IDF
Scores: [0.804 0.863 0.88  0.8   0.88 ]
Cross Validation accuracy: 0.845 +/- 0.036 

GloVe
Scores: [0.863 0.824 0.68  0.82  0.84 ]
Cross Validation accuracy: 0.805 +/- 0.064 

XGBoost
Scores: [0.902 0.941 0.9   0.88  0.92 ]
Cross Validation accuracy: 0.909 +/- 0.021 

TF-IDF/XGBoost Ensemble
Scores: [0.902 0.961 0.94  0.9   0.98 ]
Cross Validation accuracy: 0.937 +/- 0.032 

