In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Load the raw file. Column names are supplied explicitly, so the first line
# of the file is read as data rather than as a header row.
df = pd.read_csv(
    'data',
    header=None,
    names=['number', 'content', 'feedback', 'review'],
)

In [None]:
# Summarize column dtypes and non-null counts of the freshly loaded frame.
df.info()

In [None]:
# Count how many rows carry each distinct value of the 'feedback' column.
df['feedback'].value_counts()

In [None]:
# Integer code for each feedback label. The codes are plain class ids, not an
# ordinal scale, despite the 'stars' column name.
rating_dict = {
    'Negative': 0,
    'Positive': 1,
    'Neutral': 2,
    'Irrelevant': 3,
}
# Direct dict lookup per row: a label missing from rating_dict raises KeyError.
df['stars'] = df['feedback'].apply(rating_dict.__getitem__)

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# Discard every row that has a missing value in any column, then re-inspect.
# The surviving rows keep their original index labels (the index is not reset).
df = df.dropna(axis=0, how='any')
df.info()

In [None]:
# Character count of each review, followed by its distribution in 100 bins.
df['length'] = df['review'].str.len()
df['length'].plot.hist(bins=100)

In [None]:
# One long space-separated string holding every review, used for the word cloud.
review_as_one = " ".join(df['review'].astype(str))

In [None]:
from wordcloud import WordCloud

# Render a word cloud of the concatenated reviews on a 10x10 inch figure.
fig, ax = plt.subplots(figsize=(10, 10))
ax.imshow(WordCloud().generate(review_as_one))

In [None]:
# Working copy for modelling: drop the columns that are not used from here on.
reviews_df = df.drop(columns=['number', 'feedback', 'length'])

In [None]:
# Number of distinct values in the 'content' column.
reviews_df['content'].nunique()

In [None]:
# Frequency-encode 'content': each row receives the share of rows that have
# the same 'content' value.
content_column = reviews_df['content']
freq = content_column.value_counts(normalize=True)
reviews_df['freq'] = content_column.map(freq)

In [None]:
# 'content' is now represented by the numeric 'freq' column, so drop the original.
reviews_df = reviews_df.drop(columns='content')

In [None]:
import string
import nltk
from nltk.corpus import stopwords

# Translation table that deletes every ASCII punctuation character.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

# Lazily filled cache for the NLTK English stop-word list.
_ENGLISH_STOPWORDS = None


def _english_stopwords():
    """Return NLTK's English stop words as a frozenset, loading them only once."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        # Loaded on first use rather than at definition time: the corpus is
        # downloaded in a later cell and may not exist yet when this one runs.
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOPWORDS


def review_cleaning(review, stop_words=None):
    """Remove punctuation and stop words from a review and split it into tokens.

    Parameters
    ----------
    review : str
        Raw review text.
    stop_words : collection of str, optional
        Lower-case words to discard. When omitted, NLTK's English stop-word
        list is used; it is read once and cached instead of being re-read
        for every token.

    Returns
    -------
    list of str
        Whitespace-separated tokens in their original casing, with all
        characters from ``string.punctuation`` removed and every token whose
        lower-cased form is a stop word dropped.
    """
    if stop_words is None:
        stop_words = _english_stopwords()
    without_punctuation = review.translate(_PUNCTUATION_TABLE)
    return [
        word
        for word in without_punctuation.split()
        if word.lower() not in stop_words
    ]

In [None]:
# Fetch the NLTK 'stopwords' corpus that review_cleaning reads via
# stopwords.words('english'). This must run before review_cleaning is called.
nltk.download('stopwords')

In [None]:
# Debug aid (disabled): preview the token lists that review_cleaning produces.
# df_clean = reviews_df['review'].astype(str).apply(review_cleaning)
# df_clean

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words counts; review_cleaning serves as the analyzer, so it alone
# decides how each review is split into tokens.
review_texts = reviews_df['review'].astype(str)
vectorizer = CountVectorizer(analyzer=review_cleaning)
reviews_countvectorizer = vectorizer.fit_transform(review_texts)

In [None]:
# The raw text is no longer needed once it has been vectorized.
reviews_df = reviews_df.drop(columns=['review'])
# Densify the count matrix into a DataFrame; it receives a fresh 0..n-1 index
# and integer column labels.
reviews_vector = pd.DataFrame(data=reviews_countvectorizer.toarray())

In [None]:
# Display what is left of reviews_df after the text columns were dropped.
reviews_df

In [None]:
# Attach the label and the frequency feature to the count matrix.
#
# Series assignment aligns on index labels. reviews_df still carries the gappy
# index left behind by the earlier dropna(), while reviews_vector was built
# with a fresh 0..n-1 RangeIndex. Assigning the Series directly would pair
# rows with the wrong labels and leave NaN wherever an index label is missing.
# Both frames hold the same rows in the same order, so copy the values
# positionally instead.
reviews_vector['stars'] = reviews_df['stars'].to_numpy()
reviews_vector['freq'] = reviews_df['freq'].to_numpy()

In [None]:
# Disabled alternative: join the metadata and the count matrix side by side.
# reviews_df_new = pd.concat([reviews_df, reviews_vector], axis=1)

In [None]:
# train_test_split was used without ever being imported (NameError).
from sklearn.model_selection import train_test_split

# Features are every column except the target.
X = reviews_vector.drop(columns=['stars'])
# The token columns are labelled with integers while 'freq' is a string.
# Recent scikit-learn releases reject DataFrames whose column labels mix
# types, so normalise them all to strings.
X.columns = X.columns.astype(str)
y = reviews_vector['stars']

# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Fit a multinomial Naive Bayes model on the training split; fit() returns the
# estimator itself, so construction and training can be chained.
classifier = MultinomialNB().fit(X_train, y_train)
classifier

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Evaluate on the held-out split.
y_pred = classifier.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
# fmt='d' prints the integer counts in full; the default annotation format
# ('.2g') would show larger counts in scientific notation.
ax = sns.heatmap(cm, annot=True, fmt='d')
# confusion_matrix puts true classes on the rows and predictions on the columns.
ax.set(xlabel='Predicted label', ylabel='True label')

In [None]:
import pickle

# Persist the trained classifier. The context manager guarantees the file is
# flushed and closed even if pickling fails part-way.
# NOTE(review): "data" is also the path the CSV is read from at the top of the
# notebook, so this write replaces the input file. Confirm the intended
# output path.
with open("data", "wb") as model_file:
    pickle.dump(classifier, model_file)