In [39]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

dfL = pd.read_csv("data/trainLemmatized.csv")
dfS = pd.read_csv("data/trainStemmed.csv")

In [40]:
seed = 50
xTrainL, xTestL, yTrainL, yTestL = train_test_split(dfL["data"], dfL['labels'], test_size = 0.2, random_state = seed)
xTrainS, xTestS, yTrainS, yTestS = train_test_split(dfS["data"], dfS['labels'], test_size = 0.2, random_state = seed)

In [41]:
xTrainL

301827                                     good brush price
104581    expected larger size took month receive spacer...
16272     superb customer service ever reorder recommend...
258216    work great clasp various jewelry project origi...
131419    looking nice blank book journaling definitely ...
                                ...                        
317510                              item exactly advertised
321502        nice sturdy dauber worked well pleased result
153709                                           work great
239499    template cut beautiful tree intricate tree orn...
103904    ok looking forward mold really wanted start ma...
Name: data, Length: 296368, dtype: object

In [42]:
xTrainL.reset_index(drop=True, inplace=True)
xTestL.reset_index(drop=True, inplace=True)
xTrainS.reset_index(drop=True, inplace=True)
xTestS.reset_index(drop=True, inplace=True)
yTrainL.reset_index(drop=True, inplace=True)
yTestL.reset_index(drop=True, inplace=True)
yTrainS.reset_index(drop=True, inplace=True)
yTestS.reset_index(drop=True, inplace=True)

In [43]:
import math
def replaceNAN(txt):
    try:
        if math.isnan(txt):
            return ""
        else:
            return txt
    except:
        return txt

In [44]:
for i in range(len(xTrainL)):
    xTrainL[i] = replaceNAN(xTrainL[i])
for i in range(len(xTrainS)):
    xTrainS[i] = replaceNAN(xTrainS[i])
for i in range(len(xTestL)):
    xTestL[i] = replaceNAN(xTestL[i])
for i in range(len(xTestS)):
    xTestS[i] = replaceNAN(xTestS[i])

In [47]:
import warnings
warnings.filterwarnings('ignore')
def identity(x):
    return x

unigram = Pipeline([
    ('vectorizer', CountVectorizer(ngram_range = (1, 1), tokenizer = identity,token_pattern=None, preprocessor = identity)),
    ('classifier', LogisticRegression())
])

# uni-gram representation with LogisticRegression for lemmatized documents
print("Unigram")
print("Lemmatized Logistic Regression")
unigram.fit(xTrainL, yTrainL)
pred = unigram.predict(xTestL)
print(classification_report(yTestL, pred))
print("")
# uni-gram representation with SGDClasLogisticRegressionsifier for stemmed documents
print("Stemmed Logistic Regression")
unigram.fit(xTrainS, yTrainS)
pred = unigram.predict(xTestS)
print(classification_report(yTestS, pred))
print("")

Unigram
Lemmatized Logistic Regression
              precision    recall  f1-score   support

           1       0.12      0.00      0.01      2161
           2       0.00      0.00      0.00      1916
           3       0.15      0.01      0.01      4284
           4       0.25      0.01      0.02      9069
           5       0.77      0.99      0.87     56662

    accuracy                           0.76     74092
   macro avg       0.26      0.20      0.18     74092
weighted avg       0.63      0.76      0.67     74092


Stemmed Logistic Regression
              precision    recall  f1-score   support

           1       0.13      0.01      0.01      2150
           2       0.16      0.00      0.00      1931
           3       0.21      0.00      0.01      4255
           4       0.24      0.01      0.02      9224
           5       0.77      0.99      0.86     56532

    accuracy                           0.76     74092
   macro avg       0.30      0.20      0.18     74092
weighted 

In [48]:
import warnings
warnings.filterwarnings('ignore')
def identity(x):
    return x

bigram = Pipeline([
    ('vectorizer', CountVectorizer(ngram_range = (2, 2), tokenizer = identity,token_pattern=None, preprocessor = identity)),
    ('classifier', LogisticRegression())
])

# bi-gram representation with LogisticRegression for lemmatized documents
print("Bigram")
print("Lemmatized Logistic Regression")
bigram.fit(xTrainL, yTrainL)
pred = bigram.predict(xTestL)
print(classification_report(yTestL, pred))
print("")
# bi-gram representation with SGDClasLogisticRegressionsifier for stemmed documents
print("Stemmed Logistic Regression")
bigram.fit(xTrainS, yTrainS)
pred = bigram.predict(xTestS)
print(classification_report(yTestS, pred))
print("")

Bigram
Lemmatized Logistic Regression
              precision    recall  f1-score   support

           1       0.41      0.16      0.23      2161
           2       0.19      0.03      0.05      1916
           3       0.32      0.07      0.12      4284
           4       0.35      0.05      0.09      9069
           5       0.79      0.98      0.87     56662

    accuracy                           0.77     74092
   macro avg       0.41      0.26      0.27     74092
weighted avg       0.68      0.77      0.69     74092


Stemmed Logistic Regression
              precision    recall  f1-score   support

           1       0.42      0.14      0.21      2150
           2       0.17      0.02      0.03      1931
           3       0.32      0.08      0.13      4255
           4       0.35      0.05      0.08      9224
           5       0.78      0.98      0.87     56532

    accuracy                           0.77     74092
   macro avg       0.41      0.25      0.26     74092
weighted a

In [49]:
import warnings
warnings.filterwarnings('ignore')
def identity(x):
    return x

trigram = Pipeline([
    ('vectorizer', CountVectorizer(ngram_range = (3, 3), tokenizer = identity,token_pattern=None, preprocessor = identity)),
    ('classifier', LogisticRegression())
])

# tri-gram representation with LogisticRegression for lemmatized documents
print("Trigram")
print("Lemmatized Logistic Regression")
trigram.fit(xTrainL, yTrainL)
pred = trigram.predict(xTestL)
print(classification_report(yTestL, pred))
print("")
# tri-gram representation with SGDClasLogisticRegressionsifier for stemmed documents
print("Stemmed Logistic Regression")
trigram.fit(xTrainS, yTrainS)
pred = trigram.predict(xTestS)
print(classification_report(yTestS, pred))
print("")

Trigram
Lemmatized Logistic Regression
              precision    recall  f1-score   support

           1       0.55      0.40      0.46      2161
           2       0.28      0.09      0.13      1916
           3       0.40      0.20      0.27      4284
           4       0.46      0.13      0.20      9069
           5       0.82      0.98      0.89     56662

    accuracy                           0.79     74092
   macro avg       0.50      0.36      0.39     74092
weighted avg       0.73      0.79      0.74     74092


Stemmed Logistic Regression
              precision    recall  f1-score   support

           1       0.55      0.38      0.45      2150
           2       0.28      0.09      0.14      1931
           3       0.40      0.20      0.27      4255
           4       0.46      0.12      0.19      9224
           5       0.82      0.98      0.89     56532

    accuracy                           0.79     74092
   macro avg       0.50      0.35      0.39     74092
weighted 