In [1]:
# Reading training data: the review text and the review metadata come from
# two separate CSV files; the "rating" column of the metadata is appended to
# the text frame as an extra column.
import pandas as pd

train_text = pd.read_csv("review_text_train.csv")
meta_data = pd.read_csv("review_meta_train.csv")

labels = meta_data["rating"]
df_train_text = pd.concat([train_text, labels.to_frame()], axis=1)

In [2]:
import numpy as np
import re
import nltk

In [3]:
# Raw review text (features) and ratings (labels) as plain arrays.
X_train = df_train_text['review'].values
y_train = df_train_text['rating'].values

In [4]:
# WordNet lemmatizer instance. The cleaning cells shown in this notebook
# build their own instance (named `stemmer`) rather than using this one.
wordnet = nltk.stem.WordNetLemmatizer()

In [5]:
# Cleaning training data.
# Every review is reduced to lower-case, lemmatized tokens joined by single
# spaces; the cleaned strings are collected in `documents`.
from nltk.stem import WordNetLemmatizer

# Despite its name this is a WordNet lemmatizer, not a stemmer; the name is
# kept because it is a notebook-level variable.
stemmer = WordNetLemmatizer()

documents = []
for review in X_train:
    # Replace every non-word character (punctuation, symbols, ...) by a space.
    document = re.sub(r'\W', ' ', str(review))
    # Drop a single letter at the very start of the review. The pattern used
    # to be r'\^[a-zA-Z]\s+', which looks for a literal '^' character; none
    # can be left after the substitution above, so that step never matched.
    document = re.sub(r'^[a-zA-Z]\s+', ' ', document)
    # Collapse runs of whitespace into one space (case flags are irrelevant
    # for '\s+', so none are passed).
    document = re.sub(r'\s+', ' ', document)
    # Drop single letters that stand alone between whitespace.
    document = re.sub(r'\s+[a-zA-Z]\s+', ' ', document)
    # Lower-case, tokenize on whitespace, lemmatize each token and re-join.
    tokens = document.lower().split()
    document = ' '.join([stemmer.lemmatize(word) for word in tokens])
    documents.append(document)

In [6]:
from sklearn.feature_extraction.text import CountVectorizer

In [7]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

In [8]:
# Reading testing data: only the 'review' text column is used further on.
df_test = pd.read_csv("review_text_test.csv")
X_test = df_test.loc[:, 'review']

In [9]:
# Cleaning testing data.
# Applies the same sequence of steps as the training-data cleaning cell and
# collects the cleaned strings in `docs`.
# (Yes, this should have been a shared function - sorry :( )
from nltk.stem import WordNetLemmatizer

# Despite its name this is a WordNet lemmatizer, not a stemmer; the name is
# kept because it is a notebook-level variable.
stemmer = WordNetLemmatizer()

docs = []
# Iterate over the values directly: X_test is a pandas Series, and looking
# rows up as X_test[0], X_test[1], ... only works while its index happens to
# be the default 0..n-1 range.
for review in X_test:
    # Replace every non-word character (punctuation, symbols, ...) by a space.
    doc = re.sub(r'\W', ' ', str(review))
    # Drop a single letter at the very start of the review. The pattern used
    # to be r'\^[a-zA-Z]\s+', which looks for a literal '^' character; none
    # can be left after the substitution above, so that step never matched.
    doc = re.sub(r'^[a-zA-Z]\s+', ' ', doc)
    # Collapse runs of whitespace into one space (case flags are irrelevant
    # for '\s+', so none are passed).
    doc = re.sub(r'\s+', ' ', doc)
    # Drop single letters that stand alone between whitespace.
    doc = re.sub(r'\s+[a-zA-Z]\s+', ' ', doc)
    # Lower-case, tokenize on whitespace, lemmatize each token and re-join.
    tokens = doc.lower().split()
    doc = ' '.join([stemmer.lemmatize(word) for word in tokens])
    docs.append(doc)

In [10]:
# Learn the vocabulary from the cleaned training reviews only, then encode
# both the training and the test reviews as term-count matrices over it.
vectorizer = CountVectorizer()
vectorizer.fit(documents)
# fit() returns the estimator itself, so both names refer to the same object.
vectorizers = vectorizer

X_train_freq = vectorizers.transform(documents)
X_test_freq = vectorizers.transform(docs)

In [11]:
# Transforming term frequencies to tf-idf weights.
from sklearn.feature_extraction.text import TfidfTransformer
tfidf = TfidfTransformer()
# Learn the IDF weights from the training counts and apply them.
X_train_tfidf = tfidf.fit_transform(X_train_freq)
# Re-use the IDF weights learned on the training data. Calling fit_transform
# here would re-estimate them from the test set, so the test features would
# be weighted differently from the features the classifiers were trained on.
X_test_tfidf = tfidf.transform(X_test_freq)

In [12]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression 
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split

In [13]:
# Splitting data (75/25)
# This is used just for evaluation of classifiers.
#
# The labels are read straight from df_train_text instead of from y_train:
# this cell rebinds y_train, so feeding y_train back in would pass the
# already-split labels on a second run and fail with a sample-count mismatch
# against X_train_tfidf. The first-run result is unchanged.
#
# NOTE: X_train and X_test are rebound here too and no longer hold the raw
# review text afterwards, so the cleaning cells cannot be re-run after this
# cell without re-running the data-loading cells first.
# NOTE(review): the vocabulary and IDF weights were fitted on all training
# reviews, including the 25% held out here, so the scores below may be
# slightly optimistic.
X_train, X_test, y_train, y_test = train_test_split(
    X_train_tfidf,
    df_train_text['rating'].values,
    test_size=0.25,
    random_state=0,
)

In [14]:
# Naive Bayes classifier, scored on the 25% hold-out split.
# To predict the actual test data instead, fit on X_train_freq (with the
# labels of all training reviews) and predict on X_test_freq.
classifier_1 = MultinomialNB()
classifier_1.fit(X_train, y_train)
classifier_1_y_pred = classifier_1.predict(X_test)

# Per-class report, confusion matrix and overall accuracy, in that order.
for metric in (classification_report, confusion_matrix, accuracy_score):
    print(metric(y_test, classifier_1_y_pred))

              precision    recall  f1-score   support

           1       0.00      0.00      0.00       581
           3       0.43      0.00      0.00      1639
           5       0.68      1.00      0.81      4797

    accuracy                           0.68      7017
   macro avg       0.37      0.33      0.27      7017
weighted avg       0.57      0.68      0.56      7017

[[   0    4  577]
 [   0    3 1636]
 [   0    0 4797]]
0.6840530141085934


  _warn_prf(average, modifier, msg_start, len(result))


In [15]:
# Logistic Regression classifier, scored on the 25% hold-out split.
# To predict the actual test data instead, fit on X_train_freq (with the
# labels of all training reviews) and predict on X_test_freq.
classifier_2 = LogisticRegression(max_iter=500)
classifier_2.fit(X_train, y_train)
classifier_2_y_pred = classifier_2.predict(X_test)

# Per-class report, confusion matrix and overall accuracy, in that order.
print(
    classification_report(y_test, classifier_2_y_pred),
    confusion_matrix(y_test, classifier_2_y_pred),
    accuracy_score(y_test, classifier_2_y_pred),
    sep='\n',
)

              precision    recall  f1-score   support

           1       0.90      0.65      0.75       581
           3       0.80      0.68      0.74      1639
           5       0.89      0.96      0.93      4797

    accuracy                           0.87      7017
   macro avg       0.86      0.76      0.80      7017
weighted avg       0.87      0.87      0.87      7017

[[ 375  121   85]
 [  31 1117  491]
 [   9  160 4628]]
0.8721675929884566


In [16]:
from mlxtend.classifier import StackingCVClassifier
from sklearn.ensemble import RandomForestClassifier

In [17]:
# Stacking classifier
# Three base learners (random forest, multinomial Naive Bayes, multinomial
# logistic regression) combined by a logistic-regression meta classifier.
# With use_probas=True the meta classifier is given the base learners'
# predicted probabilities.
stacked_classifier_1 = RandomForestClassifier(
    n_estimators=100,
    min_samples_split=50,
    min_samples_leaf=1,
    criterion='gini',
    n_jobs=-1,
    # Seed the forest: without it the stacked results change from run to run
    # even though the stacking wrapper below is seeded.
    random_state=0,
)
stacked_classifier_2 = MultinomialNB()
stacked_classifier_3 = LogisticRegression(solver='lbfgs', multi_class='multinomial', max_iter=500, n_jobs=-1)
# Meta classifier that combines the base learners' outputs.
logistic_regression = LogisticRegression(solver='lbfgs', multi_class='multinomial', max_iter=500, n_jobs=-1)
stacked_classifier = StackingCVClassifier(
    classifiers=[stacked_classifier_1, stacked_classifier_2, stacked_classifier_3],
    meta_classifier=logistic_regression,
    random_state=0,
    use_probas=True,
)

In [18]:
                                                   #This is if we want to predict the actual test data
stacked_classifier.fit(X_train, y_train)             # alternative: fit on X_train_tfidf with the labels of all training reviews
stacked_y_pred = stacked_classifier.predict(X_test)  # alternative: predict on X_test_tfidf

# Per-class report, confusion matrix and overall accuracy, in that order.
stacked_report = classification_report(y_test, stacked_y_pred)
stacked_confusion = confusion_matrix(y_test, stacked_y_pred)
stacked_accuracy = accuracy_score(y_test, stacked_y_pred)
print(stacked_report)
print(stacked_confusion)
print(stacked_accuracy)

              precision    recall  f1-score   support

           1       0.77      0.79      0.78       581
           3       0.76      0.72      0.74      1639
           5       0.92      0.93      0.93      4797

    accuracy                           0.87      7017
   macro avg       0.82      0.82      0.82      7017
weighted avg       0.87      0.87      0.87      7017

[[ 459   87   35]
 [ 100 1184  355]
 [  36  277 4484]]
0.8731651703006983


In [19]:
# Reference: https://stackabuse.com/text-classification-with-python-and-scikit-learn/
# The data-cleaning steps in this notebook are similar to the ones in that article.