In [None]:
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score, classification_report
import pandas as pd
import numpy as np
from seaborn import heatmap
import yellowbrick
from yellowbrick.classifier import ROCAUC
from yellowbrick.classifier import ConfusionMatrix
from yellowbrick.classifier import ClassPredictionError
from sklearn.linear_model import LogisticRegression
%matplotlib inline
import re
import matplotlib
import matplotlib.pyplot as plt
from sklearn.naive_bayes import MultinomialNB
from sklearn.multiclass import OneVsRestClassifier
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
import seaborn as sns
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer 
from sacremoses import MosesDetokenizer
from num2words import num2words

In [None]:
# function for cleaning text

# REMOVED TAGS and STOPWORDS
# REMOVED PUNCTUATION
# LEMMATIZE
# Used num2words module to CHANGE DIGITS INTO STRINGS

def clean_text(text_raw):
    """Clean an iterable of raw documents and return them as a list of strings.

    Every item is converted with ``str`` and then processed in this order:

    1. each backslash is padded with spaces so it separates its neighbours
       once punctuation is stripped;
    2. each single digit is passed to ``num2words`` and replaced by the
       result (one digit at a time, so adjacent digits are converted
       independently and their words are concatenated);
    3. every character that is neither a word character nor whitespace is
       removed;
    4. the text is lower-cased and tokenized with ``nltk.word_tokenize``;
    5. tokens found in NLTK's English stopword list are dropped and the
       remaining tokens are lemmatized with ``WordNetLemmatizer``;
    6. the tokens are joined back into one string with ``MosesDetokenizer``.

    Parameters
    ----------
    text_raw : iterable
        The documents to clean. Items that are not strings are converted
        with ``str`` first.

    Returns
    -------
    list of str
        One cleaned string per input item, in input order.
    """
    dt = MosesDetokenizer()
    # A set gives constant-time membership tests; the stopword list would
    # otherwise be scanned linearly once per token.
    stops = set(stopwords.words('english'))
    lem = WordNetLemmatizer()

    # Compile the loop-invariant patterns once instead of per document.
    backslash_re = re.compile(r'[\\]')
    digit_re = re.compile(r'[\d]')
    non_word_re = re.compile(r'[^\w\s]')

    text_ready = []
    for t in text_raw:
        text = backslash_re.sub(' \\ ', str(t))
        # NOTE(review): digits are converted one by one ("12" -> words for
        # "1" and "2" back to back) - confirm this is the intended encoding.
        text = digit_re.sub(lambda x: num2words(x.group()), text)
        text = non_word_re.sub('', text)
        tokens = nltk.word_tokenize(text.lower())

        cleaned = []
        for word in tokens:
            word = word.strip()
            if word not in stops:
                cleaned.append(lem.lemmatize(word))
        text_ready.append(dt.detokenize(cleaned, return_str=True))

    return text_ready

In [None]:
# Taken (modified) from https://stackoverflow.com/questions/28200786/how-to-plot-scikit-learn-classification-report
def plot_CR(prediction, ytest=None):
    """Plot precision, recall and F1 per report row as a grouped bar chart.

    Parameters
    ----------
    prediction : array-like
        Predicted labels, passed to ``classification_report`` as ``y_pred``.
    ytest : array-like, optional
        True labels, passed to ``classification_report`` as ``y_true``.
        When omitted, the notebook-level ``y_test`` is looked up at call
        time, so this function can be defined before the train/test split
        cell has run.

    Returns
    -------
    None
        The chart is drawn as a side effect of ``DataFrame.plot``.
    """
    if ytest is None:
        # Resolved when the function is called, not when it is defined;
        # a default of ``ytest=y_test`` fails on a fresh kernel because the
        # split has not been executed yet.
        ytest = y_test

    report = classification_report(ytest, prediction, output_dict=True)

    # Only dict-valued entries carry precision/recall/f1/support. Scalar
    # entries (such as the overall 'accuracy') cannot be turned into a row
    # and are skipped.
    report_data = [
        {'label': label, **metrics}
        for label, metrics in report.items()
        if isinstance(metrics, dict)
    ]

    report_df = pd.DataFrame(
        report_data,
        columns=['label', 'precision', 'recall', 'f1-score', 'support']
    )

    report_df['labelsupport'] = [f'{label} (n={support})'
                                 for label, support in zip(report_df.label, report_df.support)]

    # Use `labelsupport` on the x-axis so each bar group also shows its support.
    report_df.plot(y=['precision', 'recall', 'f1-score'], x='labelsupport', kind='bar')

In [None]:
# Open sample file
# The encoding is given explicitly so decoding does not depend on the
# platform's default locale; undecodable bytes are still dropped.

with open('./2sample_50000_1.json', 'r', encoding='utf-8', errors='ignore') as json_file:
    sample = pd.read_json(json_file)

In [None]:
# Run every raw review through clean_text and keep the result alongside the
# original text as a new 'clean_text' column of the DataFrame.

sample['clean_text'] = clean_text(text_raw=sample['text'])

In [None]:
# Persist the sample, including the cleaned column, next to the input file.
sample.to_json(path_or_buf=r'./clean_2sample_50000_1.json')

In [None]:
# Show the review labelled 0 before and after cleaning, one per line.

print(sample['text'][0], sample['clean_text'][0], sep='\n')

In [None]:
# Split the cleaned reviews 60/40 into training and testing sets,
# stratified on the star rating and seeded for reproducibility.

X_train_text, X_test_text, y_train, y_test = train_test_split(
    sample['clean_text'],
    sample['stars'],
    train_size=0.6,
    test_size=0.4,
    stratify=sample['stars'],
    random_state=0,
)

In [None]:
# Vectorizing using Count-Vectorizer - unigram and bigram counts.
# The vocabulary is learned from the training split only and then applied
# to both splits.

vect = CountVectorizer(ngram_range=(1, 2), min_df=5).fit(X_train_text)
X_train = vect.transform(X_train_text)
X_test = vect.transform(X_test_text)

# Newer scikit-learn releases expose the vocabulary through
# get_feature_names_out(); fall back to get_feature_names() on older ones.
if hasattr(vect, 'get_feature_names_out'):
    feature_names = np.asarray(vect.get_feature_names_out())
else:
    feature_names = np.array(vect.get_feature_names())

In [None]:
# Report how many features were extracted, followed by the first 20 of them.

print(len(feature_names), feature_names[:20], sep='\n')

In [None]:
def classifyVisualize(classifr, xtrain, ytrain, xtest, ytest):
    """Draw three yellowbrick diagnostics for one classifier.

    For each of ROC/AUC curves, the classification report and the class
    prediction error, a visualizer wrapping ``classifr`` is fitted on the
    training data, scored on the test data and then drawn.

    Parameters
    ----------
    classifr : estimator
        The classifier handed to each yellowbrick visualizer.
    xtrain, ytrain
        Training features and labels used by ``fit``.
    xtest, ytest
        Test features and labels used by ``score``.

    Returns
    -------
    None
        The figures are drawn as a side effect.
    """
    # Visualizing ROCAUC Curves
    ROCvisualizer = ROCAUC(classifr)
    ROCvisualizer.fit(xtrain, ytrain)  # Fit the training data to the visualizer
    ROCvisualizer.score(xtest, ytest)  # Evaluate the model on the test data
    ROCvisualizer.poof()               # Draw/show/poof the data

    # Visualizing classification report
    cr = yellowbrick.classifier.ClassificationReport(classifr, classes=[1, 2, 3, 4, 5])
    cr.fit(xtrain, ytrain)
    cr.score(xtest, ytest)
    cr.poof()

    # Visualizing class prediction error
    CPEvisualizer = ClassPredictionError(classifr, classes=[1, 2, 3, 4, 5])
    CPEvisualizer.fit(xtrain, ytrain)
    # Score on the test features passed in by the caller. Using the
    # notebook-level X_test here would evaluate against the wrong feature
    # matrix whenever a different vectorization is supplied.
    CPEvisualizer.score(xtest, ytest)
    CPEvisualizer.poof()

In [None]:
## Using LogisticRegression as classifier - multinomial, since the target
## has more than two classes (multiclass classification).
# GridSearchCV tries several values of the regularization parameter C
# with 5-fold cross-validation.

LRclassifier = LogisticRegression(
    solver='newton-cg',
    multi_class='multinomial',
    C=0.1,
)

gs = GridSearchCV(
    estimator=LRclassifier,
    param_grid={'C': [0.001, 0.01, 0.1, 1]},
    cv=5,
)

In [None]:
# Run the grid search on the count-vectorized training data.
gs.fit(X_train, y=y_train)

In [None]:
# Best parameter setting found by the grid search and its mean CV score.
print(gs.best_params_, gs.best_score_, sep='\n')

In [None]:
# Fit the logistic regression on the count features and predict the test set.
# NOTE(review): this uses LRclassifier as configured above (C=0.1), not
# gs.best_estimator_ - confirm that is intended.

LR_model = LRclassifier.fit(X=X_train, y=y_train)
LR_predict = LR_model.predict(X=X_test)

In [None]:
# Diagnostics for logistic regression on count features.
classifyVisualize(LRclassifier, X_train, y_train, X_test, y_test)

In [None]:
# Plotting Classification Report
# classification_report expects (y_true, y_pred); passing them the other way
# round swaps precision with recall and counts support over predictions.

plot_CR(LR_predict, ytest=y_test)
print(classification_report(y_test, LR_predict))

In [None]:
# Using Dummy Classifier to compare results
# random_state pins the stratified random guesses so the baseline numbers
# are reproducible across runs.

from sklearn.dummy import DummyClassifier
dc = DummyClassifier(strategy='stratified', random_state=0)
D_model = dc.fit(X_train, y_train)
D_predict = D_model.predict(X_test)

In [None]:
# Diagnostics for the dummy baseline on count features.
classifyVisualize(dc, X_train, y_train, X_test, y_test)

In [None]:
# Plotting Classification Report
# classification_report expects (y_true, y_pred); passing them the other way
# round swaps precision with recall and counts support over predictions.

plot_CR(D_predict, ytest=y_test)
print(classification_report(y_test, D_predict))

In [None]:
# %pip install yellowbrick

In [None]:
# Using RandomForest Classifier
# random_state makes the fitted forest, and therefore the reported scores,
# reproducible across runs.

from sklearn.ensemble import RandomForestClassifier

rfc = RandomForestClassifier(random_state=0)
rfc.fit(X_train, y_train)

# Testing model
rfcpredict = rfc.predict(X_test)

In [None]:
# Diagnostics for the random forest on count features.
classifyVisualize(rfc, X_train, y_train, X_test, y_test)

In [None]:
# classification_report expects (y_true, y_pred); passing them the other way
# round swaps precision with recall and counts support over predictions.
plot_CR(rfcpredict, ytest=y_test)
print(classification_report(y_test, rfcpredict))

In [None]:
# Using Tfidf Vectorizer to compare
# Same n-gram range and min_df as the count vectorizer; the vocabulary is
# learned from the training split only.

tvect = TfidfVectorizer(ngram_range=(1, 2), min_df=5).fit(X_train_text)
X_trainT = tvect.transform(X_train_text)
X_testT = tvect.transform(X_test_text)

# Newer scikit-learn releases expose the vocabulary through
# get_feature_names_out(); fall back to get_feature_names() on older ones.
if hasattr(tvect, 'get_feature_names_out'):
    feature_namesT = np.asarray(tvect.get_feature_names_out())
else:
    feature_namesT = np.array(tvect.get_feature_names())

In [None]:
# Diagnostics for logistic regression on TF-IDF features.
classifyVisualize(LRclassifier, X_trainT, y_train, X_testT, y_test)

In [None]:
# Training Logistic Regression model with Tfidf
# Predict with the model fitted in this cell rather than the earlier
# `LR_model` name, so the cell does not depend on both names happening to
# point at the same estimator.

LR_modelT = LRclassifier.fit(X_trainT, y_train)
LR_predictT = LR_modelT.predict(X_testT)


In [None]:
# Plotting Classification Report
# classification_report expects (y_true, y_pred); passing them the other way
# round swaps precision with recall and counts support over predictions.

plot_CR(LR_predictT, ytest=y_test)
print(classification_report(y_test, LR_predictT))


In [None]:
# Diagnostics for one-vs-rest MultinomialNB on the TF-IDF features.

classifyVisualize(
    OneVsRestClassifier(MultinomialNB(fit_prior=True, class_prior=None)),
    X_trainT,
    y_train,
    X_testT,
    y_test,
)

In [None]:
# Testing using OneVsRestClassifier( with MultinomialNB and TfidfVectorizer)
# The pipeline vectorizes the cleaned text itself, with TfidfVectorizer's
# default n-gram range and min_df rather than the settings used for `tvect`.

# Inspiration and code for pipeline parameters taken from amalgamation of StackOverflow articles
NB_pipeline = Pipeline([
    # stop_words is passed as a list: newer scikit-learn releases validate
    # this parameter and reject a set.
    ('tfidf', TfidfVectorizer(stop_words=sorted(stop_words))),
    ('clf', OneVsRestClassifier(MultinomialNB(
        fit_prior=True, class_prior=None))),
])
NB_pipeline.fit(X_train_text, y_train)
NBprediction = NB_pipeline.predict(X_test_text)

# classification_report expects (y_true, y_pred); passing them the other way
# round swaps precision with recall and counts support over predictions.
plot_CR(NBprediction, ytest=y_test)
print(classification_report(y_test, NBprediction))

In [None]:
# Diagnostics for one-vs-rest LinearSVC on the TF-IDF features.

classifyVisualize(
    OneVsRestClassifier(LinearSVC(), n_jobs=1),
    X_trainT,
    y_train,
    X_testT,
    y_test,
)

In [None]:
# Testing using OneVsRestClassifier( with LinearSVC() and TfidfVectorizer)
# The pipeline vectorizes the cleaned text itself, with TfidfVectorizer's
# default n-gram range and min_df rather than the settings used for `tvect`.

# Inspiration and code for pipeline parameters taken from amalgamation of StackOverflow articles
SVC_pipeline = Pipeline([
    # stop_words is passed as a list: newer scikit-learn releases validate
    # this parameter and reject a set.
    ('tfidf', TfidfVectorizer(stop_words=sorted(stop_words))),
    ('clf', OneVsRestClassifier(LinearSVC(), n_jobs=1)),
])

SVC_pipeline.fit(X_train_text, y_train)
# Predict the held-out reviews
SVCprediction = SVC_pipeline.predict(X_test_text)

# classification_report expects (y_true, y_pred); passing them the other way
# round swaps precision with recall and counts support over predictions.
plot_CR(SVCprediction, ytest=y_test)
print(classification_report(y_test, SVCprediction))