In [None]:
#import necessary libraries
import pandas as pd
from ast import literal_eval
import nltk
import sys # needed this for certain print options during debugging
import numpy as np
from nltk.tokenize import word_tokenize
nltk.download('stopwords')
from nltk.corpus import stopwords
import string
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score 
from sklearn.metrics import ConfusionMatrixDisplay
import pickle




In [None]:
# Load the dataset.
# Show full cell contents when displaying frames (no column-width truncation).
pd.set_option('display.max_colwidth', None)

# 'ProcessTokens' is parsed with literal_eval while the CSV is being read.
token_converters = {'ProcessTokens': literal_eval}
df = pd.read_csv('data/final_dataset.csv', converters=token_converters)
df.head(1)

In [None]:
# Convert the 'ProcessTokens' column to pandas' "string" dtype so that the
# text vectorizer receives one string per row.
token_strings = df['ProcessTokens'].astype("string")
df['ProcessTokens'] = token_strings

In [None]:
#train_test_split before normalization
from sklearn.model_selection import train_test_split

# Fixed seed so the split -- and therefore every accuracy reported below --
# is identical after a kernel restart. Without it each run shuffles differently.
SPLIT_SEED = 42

# Features are every column except the label and the two dropped columns.
X = df.drop(columns=['user_suggestion','review_id', 'year']).copy()
y = df['user_suggestion']

# First split: 80% train, 20% held out ...
X_train, X_rem, y_train, y_rem = train_test_split(
    X, y, train_size=0.8, random_state=SPLIT_SEED
)

# ... then the held-out part is halved into validation and test (10% / 10%).
X_valid, X_test, y_valid, y_test = train_test_split(
    X_rem, y_rem, test_size=0.5, random_state=SPLIT_SEED
)

#print(X_train.shape), print(y_train.shape)
#print(X_valid.shape), print(y_valid.shape)
#print(X_test.shape), print(y_test.shape)

In [None]:
#Check for NaN-Values
#print(pd.isna(X_train).values)
#print(pd.isna(y_train).values)

In [None]:
#define text vectorizer
# A single CountVectorizer instance with scikit-learn's default settings.
# It is fitted on the training text and then reused to transform the test
# split and the ad-hoc user input further down in the notebook.
count_vectorizer = CountVectorizer()

In [None]:
#text vectorization with count_vectorizer
#vectorization on train_dataset
# Debugging aid: raise NumPy's summarisation threshold so arrays are printed
# in full instead of being abbreviated with "...".
# NOTE(review): this is a process-wide setting that affects every later array
# print in the notebook -- consider removing it once debugging is finished.
np.set_printoptions(threshold=sys.maxsize)
def create_vectorized_dataset_count(data, vectorizer=None):
    """Fit a count vectorizer on ``data`` and return the document-term matrix.

    The vectorizer is fitted as a side effect (its vocabulary is learned from
    ``data``), so this should be called on the training text only; other
    splits are afterwards passed through the same vectorizer's ``transform``.

    Parameters
    ----------
    data : pandas.Series, numpy.ndarray or iterable of str
        One text document per element. Objects exposing ``tolist()`` are
        converted with it; any other iterable is materialised with ``list()``.
    vectorizer : object, optional
        Any object providing ``fit_transform(documents)``. When omitted, the
        notebook-level ``count_vectorizer`` is used, which matches the
        original behaviour of this function.

    Returns
    -------
    The value returned by ``vectorizer.fit_transform`` (for scikit-learn's
    ``CountVectorizer`` this is a SciPy sparse matrix).
    """
    if vectorizer is None:
        vectorizer = count_vectorizer

    documents = data.tolist() if hasattr(data, "tolist") else list(data)

    # The matrix is returned as-is. The former dense ``toarray()`` copy and the
    # throwaway DataFrame built from ``get_feature_names()`` were never used,
    # cost a full dense allocation, and ``get_feature_names()`` no longer
    # exists in scikit-learn >= 1.2 (use ``get_feature_names_out()`` if the
    # vocabulary is needed).
    return vectorizer.fit_transform(documents)

# Fit count_vectorizer on the training text; the returned matrix is the
# training feature matrix passed to the classifiers' fit() calls below.
count_vector = create_vectorized_dataset_count(X_train['ProcessTokens']) # this becomes our count vectorization vector.


In [None]:
# Transform the test split with the vectorizer that was already fitted on the
# training text (transform only -- no refitting here).

test_count_vector = count_vectorizer.transform(X_test['ProcessTokens'])
print("n_samples: {:d}, n_features: {:d}".format(*test_count_vector.shape))

In [None]:
# Multinomial Naive Bayes trained on the count vectors.
# fit() returns the estimator itself, so construction and fitting are chained.
naive_bayes_classifier = MultinomialNB().fit(count_vector, y_train)
y_prediction = naive_bayes_classifier.predict(test_count_vector)

# Accuracy on the test split, expressed as a percentage.
nb_accuracy_pct = metrics.accuracy_score(y_test, y_prediction)*100
print("Multinomial Naive Bayes model accuracy(in %):", nb_accuracy_pct)

In [None]:
# k-nearest-neighbours classifier (k = 7) trained on the count vectors.
knn = KNeighborsClassifier(n_neighbors=7)
knn.fit(count_vector, y_train)
knn_clf = knn  # fit() returns the estimator itself, so both names refer to the same fitted model
knn_y_pred = knn_clf.predict(test_count_vector)

# Accuracy on the test split, expressed as a percentage.
knn_accuracy_pct = metrics.accuracy_score(y_test, knn_y_pred)*100
print("kNN model accuracy(in %):", knn_accuracy_pct)


In [None]:
# Logistic regression (default hyper-parameters) trained on the count vectors.
# fit() returns the estimator itself, so construction and fitting are chained.
lr_clf = LogisticRegression().fit(count_vector, y_train)
lr_y_pred = lr_clf.predict(test_count_vector)

# Accuracy on the test split, expressed as a percentage.
lr_accuracy_pct = metrics.accuracy_score(y_test, lr_y_pred)*100
print("LR model accuracy(in %):", lr_accuracy_pct)

In [None]:
# Confusion matrix and accuracy (as a fraction) for the logistic-regression
# predictions on the test split.
lr_accuracy = accuracy_score(y_test, lr_y_pred)
cm = confusion_matrix(y_test, lr_y_pred)
print("Accuracy : ", lr_accuracy)

In [None]:
# Plot the confusion matrix, labelling the axes with the classes the
# logistic-regression model learned.
class_labels = lr_clf.classes_
disp = ConfusionMatrixDisplay(
    confusion_matrix=cm,
    display_labels=class_labels,
)
disp.plot()
plt.show()

In [None]:
# Persist the trained logistic-regression model. The context manager makes
# sure the file is flushed and closed even if pickling fails; the previous
# bare open() left the handle dangling.
# NOTE(review): only the classifier is saved here. Predicting in a fresh
# session also needs the fitted count_vectorizer -- consider persisting it too.
with open('lr_model.pkl', 'wb') as model_file:
    pickle.dump(lr_clf, model_file)

In [None]:
# Example review used to try out the saved model.
user_input = "I'm scared and hearing creepy voices. So I'll pause for a moment and write a review while I wait for my heart beat to return to atleast somewhat calmer times. This game is adorable and creepy like my happy tree friends but with the graphics sceme of my childhood (but more bubble and 'clean'). Hello 1990's.What charactes there are (that isnot trying to kill me) were likable and a bit odd. I did do a few noob things though, such as:Oh look a class room full of ghosts from dead children, lets shine my flashlight on them and stand there staring at them..Or, hmm creepy music, I'll turn around and see if I can see what's chasing me.Never before in a game have I been this afraid of finding a locked door."

# The vectorizer expects a collection of documents, so the single review is
# wrapped in a one-element list before being transformed.
# NOTE(review): the vectorizer was fitted on the 'ProcessTokens' column while
# this is raw review text -- confirm whether the same preprocessing should be
# applied to user input first.
user_documents = [user_input]
user_input_count_vector = count_vectorizer.transform(user_documents)

In [None]:
# Reload the persisted model and classify the vectorised user review.
# The context manager closes the file handle that the bare open() used to leak.
# NOTE: pickle.load can execute arbitrary code -- only load files that this
# notebook (or another trusted source) produced.
with open('lr_model.pkl', 'rb') as model_file:
    pickled_model = pickle.load(model_file)
pickled_model.predict(user_input_count_vector)