In [2]:
import pandas as pd 

data = pd.read_csv(r"C:\Users\AMITESH\Python_code\Company_Projects\dataset\Reviews.csv") 
data.head()

Unnamed: 0,Review,feedback
0,Wow... Loved this place.,like
1,Crust is not good.,dislike
2,Not tasty and the texture was just nasty.,dislike
3,Stopped by during the late May bank holiday of...,like
4,The selection on the menu was great and so wer...,like


In [3]:
data['feedback'].value_counts()

like       501
dislike    499
Name: feedback, dtype: int64

#Data seems to be balanced dataset 

# Data cleaning 

In [6]:
import spacy
import re
from spacy.lang.en.stop_words import STOP_WORDS as stopwords

stopwords.remove('not')
data['Review'] = data['Review'].apply(lambda x:' '.join([t for t in x.split() if t not in set(stopwords)]))
data['Review'] = data['Review'].apply(lambda x : re.sub(r'[^\w]', " ", x))
data

Unnamed: 0,Review,feedback
0,Wow Loved place,like
1,Crust not good,dislike
2,Not tasty texture nasty,dislike
3,Stopped late May bank holiday Rick Steve recom...,like
4,The selection menu great prices,like
...,...,...
995,I think food flavor texture lacking,dislike
996,Appetite instantly gone,dislike
997,Overall I not impressed not,dislike
998,The experience underwhelming I think ll Ninja ...,dislike


# Word2Vec

In [7]:
import spacy 
nlp = spacy.load('en_core_web_lg')   # loading en_core_web_lg trained pipeline 

In [9]:
def get_vec(x):
    doc = nlp(x)
    vec = doc.vector
    return vec

In [12]:
data['vec'] = data['Review'].apply(lambda x: get_vec(x))
data

Unnamed: 0,Review,feedback,vec
0,Wow Loved place,like,"[0.047326665, 0.28252366, -0.20889999, -0.1025..."
1,Crust not good,dislike,"[-0.15670057, 0.10054334, -0.23732133, -0.0582..."
2,Not tasty texture nasty,dislike,"[-0.3673005, -0.102475, -0.047201503, -0.08875..."
3,Stopped late May bank holiday Rick Steve recom...,like,"[-0.05520178, 0.23588835, -0.12829132, -0.1005..."
4,The selection menu great prices,like,"[-0.07097539, 0.156196, 0.23880783, -0.0426675..."
...,...,...,...
995,I think food flavor texture lacking,dislike,"[-0.13820933, 0.23190516, -0.01509251, -0.2692..."
996,Appetite instantly gone,dislike,"[-0.10598, 0.18677, -0.03493534, -0.06977666, ..."
997,Overall I not impressed not,dislike,"[-0.070323996, 0.3305, -0.24875002, -0.1819105..."
998,The experience underwhelming I think ll Ninja ...,dislike,"[0.039792, -0.044649985, -0.04275078, -0.18238..."


# Splitting the dataset 

In [13]:
data.shape

(1000, 3)

In [14]:
X = data['vec'].to_numpy()
X = X.reshape(-1, 1)

In [15]:
X.shape

(1000, 1)

In [16]:
import numpy as np 

X = np.concatenate(np.concatenate(X, axis = 0), axis = 0).reshape(-1,300)
X.shape

(1000, 300)

In [17]:
from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split( X, data['feedback'], 
                                                    test_size = 0.2, random_state = 0)

# Logistic regression 

In [42]:
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression(C = 1.0)
clf.fit(X_train, Y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [43]:
Y_predict = clf.predict(X_test)

# Validation Process 

In [44]:
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

print(classification_report(Y_test, Y_predict))

              precision    recall  f1-score   support

     dislike       0.84      0.82      0.83        97
        like       0.84      0.85      0.85       103

    accuracy                           0.84       200
   macro avg       0.84      0.84      0.84       200
weighted avg       0.84      0.84      0.84       200



In [45]:
confusion_matrix(Y_predict, Y_test)

array([[80, 15],
       [17, 88]], dtype=int64)

#80 people were found to have positive feedback and model predicted it correctly 
#88 people were found to have negative feedback and model predicted it correctly 
#15 people were found to have negative feedback but model predicted it has positive feedback 
#17 people were found to have positive feedback but model predicted it has negative feedback 

In [46]:
print("Accuracy of logistic regression is = ", accuracy_score(Y_predict, Y_test)*100,"%")

Accuracy of logistic regression is =  84.0 %
