In [3]:
import pandas as pd

In [4]:
data = pd.read_csv("cleaned-text-after-preprocessing.csv")

In [5]:
data.head()

Unnamed: 0,reviewText,overall,rating
0,enjoy vintage book movie enjoyed reading book ...,5,1
1,book reissue old one author born 1910 era say ...,4,1
2,fairly interesting read old style terminology ...,4,1
3,never read amy brewster mystery one really hooked,5,1
4,like period piece clothing lingo enjoy mystery...,4,1


In [6]:
data.isnull().sum()

reviewText    10
overall        0
rating         0
dtype: int64

In [7]:
data.dropna(inplace=True, axis=0)

In [8]:
data.shape

(982587, 3)

In [9]:
# Train test split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(data['reviewText'], data['rating'], test_size=0.30, random_state=42)

In [10]:
X_train.shape, X_test.shape

((687810,), (294777,))

In [11]:
# Bow, tfidf, word2vec
from sklearn.feature_extraction.text import CountVectorizer

bow = CountVectorizer(
    max_df=0.8,
    min_df=5,
    ngram_range=(1,2)
)

X_train_bow = bow.fit_transform(X_train)
X_test_bow = bow.transform(X_test)

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(
    max_df=0.8,
    min_df=5,
    ngram_range=(1,2),
    sublinear_tf=True
)

X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

In [13]:
X_train_bow

<687810x952335 sparse matrix of type '<class 'numpy.int64'>'
	with 55038016 stored elements in Compressed Sparse Row format>

In [14]:
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
multi_nb_bow = MultinomialNB()
multi_nb_bow.fit(X_train_bow,y_train)

0,1,2
,alpha,1.0
,force_alpha,True
,fit_prior,True
,class_prior,


In [15]:
multi_nb_tfidf = MultinomialNB()
multi_nb_tfidf.fit(X_train_tfidf, y_train)

0,1,2
,alpha,1.0
,force_alpha,True
,fit_prior,True
,class_prior,


In [16]:
y_pred_bow = multi_nb_bow.predict(X_test_bow)

In [17]:
y_pred_tfidf = multi_nb_tfidf.predict(X_test_tfidf)

In [18]:
# Model evaluation
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

print(f'Accuracy Score : {accuracy_score(y_pred_bow,y_test)}')
print(f'Classification Report : \n{classification_report(y_pred_bow,y_test)}')
print(f'Confusion Matrix : \n{confusion_matrix(y_pred_bow,y_test)}')


Accuracy Score : 0.940144583871876
Classification Report : 
              precision    recall  f1-score   support

           0       0.72      0.49      0.58     25215
           1       0.95      0.98      0.97    269562

    accuracy                           0.94    294777
   macro avg       0.84      0.73      0.77    294777
weighted avg       0.93      0.94      0.93    294777

Confusion Matrix : 
[[ 12292  12923]
 [  4721 264841]]


In [19]:
print(f'Accuracy Score : {accuracy_score(y_pred_tfidf,y_test)}')
print(f'Classification Report : \n{classification_report(y_pred_tfidf,y_test)}')
print(f'Confusion Matrix : \n{confusion_matrix(y_pred_tfidf,y_test)}')

Accuracy Score : 0.9426074625903649
Classification Report : 
              precision    recall  f1-score   support

           0       0.01      0.97      0.01       101
           1       1.00      0.94      0.97    294676

    accuracy                           0.94    294777
   macro avg       0.50      0.96      0.49    294777
weighted avg       1.00      0.94      0.97    294777

Confusion Matrix : 
[[    98      3]
 [ 16915 277761]]


In [None]:
new_data = ['this was the best i have ever had', 'didnt like it much. very boring']
new_data = [item.lower() for item in new_data]
X_new_bow = bow.transform(new_data)
X_new_tfidf = tfidf.transform(new_data)

In [21]:
predictions_bow = multi_nb_bow.predict(X_new_bow)
predictions_tfidf = multi_nb_tfidf.predict(X_new_tfidf)

In [22]:
predictions_bow, predictions_tfidf

(array([1, 0], dtype=int64), array([1, 1], dtype=int64))

In [23]:
# Try with other models
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()

In [25]:
model.fit(X_train_bow,y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=100).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,100


In [26]:
y_pred_logistic = model.predict(X_test_bow)

In [27]:
print(f'Accuracy Score : {accuracy_score(y_pred_logistic,y_test)}')
print(f'Classification Report : \n{classification_report(y_pred_logistic,y_test)}')
print(f'Confusion Matrix : \n{confusion_matrix(y_pred_logistic,y_test)}')

Accuracy Score : 0.9606923199571201
Classification Report : 
              precision    recall  f1-score   support

           0       0.53      0.71      0.61     12770
           1       0.99      0.97      0.98    282007

    accuracy                           0.96    294777
   macro avg       0.76      0.84      0.80    294777
weighted avg       0.97      0.96      0.96    294777

Confusion Matrix : 
[[  9098   3672]
 [  7915 274092]]


In [45]:
# Word2vec
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
from nltk.corpus import stopwords
import numpy as np

In [32]:
# Chatgpt suggestion
stop_words = set(stopwords.words('english'))
negations = {'not','no','never',"n't"}
stop_words = stop_words - negations

In [33]:
def preprocess(text):
    filtered_words = []
    words = simple_preprocess(text)
    for w in words:
        if w not in stop_words:
            filtered_words.append(w)
    return filtered_words


In [34]:
data['tokens'] = data['reviewText'].apply(preprocess)

In [38]:
data.tokens[30]

['love',
 'anything',
 'chewbacca',
 'family',
 'bonus',
 'kind',
 'nice',
 'see',
 'problem',
 'son']

In [39]:
preprocessed_data = data['reviewText'].apply(preprocess)

In [43]:
model_word2vec = Word2Vec(preprocessed_data, 
                window=5,
                min_count=3,
                workers=4,
                epochs=5,
                sg=0)

In [66]:
model_word2vec.build_vocab(preprocessed_data)

In [70]:
model_word2vec.train(preprocessed_data, total_examples=model.corpus_count, epochs=model.epochs)

(238564833, 264161260)

In [71]:
model_word2vec.wv.most_similar('hello')

[('yeah', 0.7400813102722168),
 ('oh', 0.6740877628326416),
 ('effing', 0.6655706167221069),
 ('um', 0.6565766334533691),
 ('hehe', 0.6532528400421143),
 ('ing', 0.6439530849456787),
 ('damn', 0.6388458013534546),
 ('mmmm', 0.6358838677406311),
 ('hey', 0.6325036883354187),
 ('eff', 0.6320636868476868)]

In [72]:
def avg_vector(tokens):
    vector = []
    for word in tokens:
        if word in model_word2vec.wv:
            vector.append(word)
    if len(vector) > 0:
        return np.mean(model_word2vec.wv[vector], axis=0)
    else:
        return np.zeros(model_word2vec.vector_size, dtype=float)  # fixed-size zero vectorize)

In [73]:
X_vector = np.array([avg_vector(word) for word in preprocessed_data])

In [74]:
y = np.array(data['rating'])

In [75]:
# Train test split
X_train, X_test, y_train, y_test = train_test_split(X_vector, y, test_size=0.30, random_state=42)

In [76]:
classifier = LogisticRegression(max_iter=1000)
classifier.fit(X_train, y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,1000


In [77]:
y_pred = classifier.predict(X_test)

In [85]:
print("Score for word2vec model")
print(f'Accuracy Score : {accuracy_score(y_pred,y_test)}')
print(f'Classification Report : \n{classification_report(y_pred,y_test)}')
print(f'Confusion Matrix : \n{confusion_matrix(y_pred,y_test)}')

Score for word2vec model
Accuracy Score : 0.9507220712606479
Classification Report : 
              precision    recall  f1-score   support

           0       0.30      0.66      0.42      7879
           1       0.99      0.96      0.97    286898

    accuracy                           0.95    294777
   macro avg       0.65      0.81      0.70    294777
weighted avg       0.97      0.95      0.96    294777

Confusion Matrix : 
[[  5183   2696]
 [ 11830 275068]]


In [82]:
# For new data
def predict_new_data(new_data):
    tokens = preprocess(new_data)
    vector = avg_vector(tokens).reshape(1,-1)
    return classifier.predict(vector)[0]

In [84]:
print(predict_new_data("this is the worst i have ever bought"))

0
