# Logistic Regression

### Importing Libraries

In [1]:
import numpy as np
import pandas as pd

### Loading Data

In [2]:
dataset = pd.read_csv('Sentiments.csv', delimiter='\t')

### Exploring Data

In [3]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Labels   1000 non-null   int64 
 1   Reviews  1000 non-null   object
dtypes: int64(1), object(1)
memory usage: 15.8+ KB


In [4]:
dataset.Labels.unique()

array([1, 0])

In [5]:
dataset.Labels.value_counts()

1    500
0    500
Name: Labels, dtype: int64

So this is a balanced dataset. 

In [6]:
dataset[:10]

Unnamed: 0,Labels,Reviews
0,1,Wow... Loved this place.
1,0,Crust is not good.
2,0,Not tasty and the texture was just nasty.
3,1,Stopped by during the late May bank holiday of...
4,1,The selection on the menu was great and so wer...
5,0,Now I am getting angry and I want my damn pho.
6,0,Honeslty it didn't taste THAT fresh.)
7,0,The potatoes were like rubber and you could te...
8,1,The fries were great too.
9,1,A great touch.


In [7]:
all_reviews = ' '.join(dataset.Reviews)
from nltk import FreqDist
from nltk.tokenize import word_tokenize
words = word_tokenize(all_reviews.lower())

In [8]:
Word_by_Freq = FreqDist(words)
len(Word_by_Freq)

2081

In [9]:
Word_by_Freq.most_common(25)

[('.', 821),
 ('the', 585),
 ('and', 392),
 (',', 366),
 ('i', 356),
 ('was', 308),
 ('!', 251),
 ('a', 237),
 ('to', 219),
 ('is', 174),
 ('it', 153),
 ('this', 143),
 ('of', 127),
 ('food', 125),
 ('not', 118),
 ('for', 110),
 ('in', 107),
 ('place', 106),
 ("n't", 96),
 ('good', 95),
 ('we', 88),
 ('service', 83),
 ('very', 76),
 ('my', 72),
 ('with', 72)]

### Preprocessing Data 

In [10]:
import re
from tqdm import tqdm
from nltk.corpus import stopwords

In [11]:
stop_words = stopwords.words('english')
stop_words.remove("not")
stop_words

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [12]:
processed_reviews = []
for review in tqdm(dataset.Reviews):
    tokenize_review = re.sub('[^a-zA-Z]',' ',review.lower())
    tokenize_review = [word for word in word_tokenize(tokenize_review) 
                       if (word not in stop_words and word.isalpha())
                      ]
    processed_reviews.append(' '.join(tokenize_review))

100%|██████████| 1000/1000 [00:00<00:00, 6290.80it/s]


In [13]:
processed_reviews

['wow loved place',
 'crust not good',
 'not tasty texture nasty',
 'stopped late may bank holiday rick steve recommendation loved',
 'selection menu great prices',
 'getting angry want damn pho',
 'honeslty taste fresh',
 'potatoes like rubber could tell made ahead time kept warmer',
 'fries great',
 'great touch',
 'service prompt',
 'would not go back',
 'cashier care ever say still ended wayyy overpriced',
 'tried cape cod ravoli chicken cranberry mmmm',
 'disgusted pretty sure human hair',
 'shocked signs indicate cash',
 'highly recommended',
 'waitress little slow service',
 'place not worth time let alone vegas',
 'not like',
 'burrittos blah',
 'food amazing',
 'service also cute',
 'could care less interior beautiful',
 'performed',
 'right red velvet cake ohhh stuff good',
 'never brought salad asked',
 'hole wall great mexican street tacos friendly staff',
 'took hour get food tables restaurant food luke warm sever running around like totally overwhelmed',
 'worst salmon sa

### Model Creation

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
reviews = vectorizer.fit_transform(processed_reviews).toarray()
reviews = pd.DataFrame(reviews)

In [15]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(reviews, dataset.Labels, train_size=0.8)

In [16]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
lr.fit(x_train, y_train)

LogisticRegression()

### Model Prediction and Evaluation

In [17]:
y_pred_lr = lr.predict(x_test)

In [18]:
from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_test, y_pred_lr))

[[86 14]
 [24 76]]


In [19]:
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test, y_pred_lr))

0.81


### Predicting Review

In [20]:
text = "Food was good and service was fast"
text = text.lower()
vect_text = vectorizer.transform([text]).toarray()
lr.predict(vect_text)

array([1])

In [21]:
text = "Bad service"
text = text.lower()
vect_text = vectorizer.transform([text]).toarray()
lr.predict(vect_text)

array([0])

### Exploring Model

In [22]:
text = "Food was not good"
text = text.lower()
vect_text = vectorizer.transform([text]).toarray()
lr.predict(vect_text)

array([0])

here, we can see that the word not is removed when filtering from stopwords. Thus after preprocessing text:"food good"<br>
which is not the review that customer has provided.
To counter this, we can create customized set of stopwords.

### Following things we can infer from the module
    1. Even with the small dataset we created a model with 77% accuracy which is not very good but decent.
    2. Removing the negation word impact the review so carefully while creating stopwords dataset