# SENTIMENT ANALYSIS with AMAZON & IMDB data

In [72]:
import pandas as pd
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report,confusion_matrix
from spacy import displacy  #for vidualizing
import os

In [14]:
nlp = spacy.load('en_core_web_sm')  #Loding English model

In [73]:
stopwords = list(STOP_WORDS)

### yelp_data

In [27]:
data_yelp = pd.read_csv(r"F:\Bobby\DATA SET\NLP\yelp_labelled.txt",sep = '\t' , header = None)

- use tab as a seprator bcz that it is not a csv file
- header = None bcz there is no header

In [28]:
data_yelp.head()

Unnamed: 0,0,1
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [29]:
columns_name = ['Review','Sentiment']
data_yelp.columns = columns_name

- name the columns

In [30]:
data_yelp.columns  #now we have columns

Index(['Review', 'Sentiment'], dtype='object')

In [31]:
data_yelp.head()

Unnamed: 0,Review,Sentiment
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [32]:
data_yelp.shape

(1000, 2)

### Amazon_data

In [37]:
data_amazon =  pd.read_csv(r"F:\Bobby\DATA SET\NLP\amazon_cells_labelled.txt",sep = '\t',header = None)

In [38]:
data_amazon.columns = columns_name

In [39]:
data_amazon.columns

Index(['Review', 'Sentiment'], dtype='object')

In [40]:
data_amazon.head()

Unnamed: 0,Review,Sentiment
0,So there is no way for me to plug it in here i...,0
1,"Good case, Excellent value.",1
2,Great for the jawbone.,1
3,Tied to charger for conversations lasting more...,0
4,The mic is great.,1


### IMDB Data

In [41]:
data_imdb =  pd.read_csv(r"F:\Bobby\DATA SET\NLP\imdb_labelled.txt",sep = '\t',header = None)

In [42]:
data_imdb.columns = columns_name

In [43]:
data_imdb

Unnamed: 0,Review,Sentiment
0,"A very, very, very slow-moving, aimless movie ...",0
1,Not sure who was more lost - the flat characte...,0
2,Attempting artiness with black & white and cle...,0
3,Very little music or anything to speak of.,0
4,The best scene in the movie was when Gerardo i...,1
...,...,...
743,I just got bored watching Jessice Lange take h...,0
744,"Unfortunately, any virtue in this film's produ...",0
745,"In a word, it is embarrassing.",0
746,Exceptionally bad!,0


In [45]:
data_imdb.shape

(748, 2)

##### we will combined all the data of all 3 companies

In [49]:
data = data_imdb.append([data_amazon,data_yelp])

In [50]:
data.shape

(2748, 2)

In [52]:
data.tail()

Unnamed: 0,Review,Sentiment
995,I think food should have flavor and texture an...,0
996,Appetite instantly gone.,0
997,Overall I was not impressed and would not go b...,0
998,"The whole experience was underwhelming, and I ...",0
999,"Then, as if I hadn't wasted enough of my life ...",0


In [57]:
data.Sentiment.nunique()   # we have only 2 unique values 0 and 1

2

##### - counting the Sentiment Values that how much positive values and how much negative

In [58]:
data['Sentiment'].value_counts()

1    1386
0    1362
Name: Sentiment, dtype: int64

###### - checking the null or missing values

In [65]:
data.isnull().sum()  # there is no null values

Review       0
Sentiment    0
dtype: int64

###### Finding punctuation

In [66]:
import string

In [67]:
punct = string.punctuation

In [68]:
punct  

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

##### DATA CLEANING STEPS - 

 - 1 - Convert doc into NLP doc 
 - 2 - if tokens base word (Lemma) is not a pronoun convert that in lower case and strip and pass to temp
 - - - Else only covert in lower case and pass to temp 
 - 3 - append all the temp in and empty list tokens
 - - - 
 - 4 - add all tokens one by one into cleaned tokens list , only if that is neither stop word nor Punctuation

In [86]:
def data_cleaning(sentence):
    doc = nlp(sentence) #convert sentence into nlp doc
    
    tokens = []
    for token in doc:
        if token.lemma_ != "-PRON-":
            temp = token.lemma_.lower().strip()   #strip extra charecter
        else: 
            temp = token.lower_
        tokens.append(temp)
        
    cleaned_tokens = []
    for token in tokens:
        if token not in stopwords and token not in punct:
            cleaned_tokens.append(token)
    return cleaned_tokens

###### Exapmle if i pass a sentence here....

In [88]:
data_cleaning("Hello how are you. Love to Learn")

['hello', 'love', 'learn']

### Vectorization Feature Engineering(TF-IDF)

In [89]:
from sklearn.svm import LinearSVC

In [94]:
tfidf = TfidfVectorizer(tokenizer = data_cleaning)  #vectorised Clean data tokens

In [95]:
classifier = LinearSVC()

In [99]:
X = data['Review']
y = data['Sentiment']

In [100]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.2 , random_state = 42)

In [102]:
X_train.shape

(2198,)

In [103]:
X_test.shape

(550,)

In [104]:
clf =  Pipeline([('tfidf',tfidf),('clf',classifier)])

In [105]:
clf.fit(X_train,y_train)

Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=<function data_cleaning at 0x0000020420873288>,
                                 use_idf=True, vocabulary=None)),
                ('clf',
                 LinearSVC(C=1.0, class_weight=None, dual=True,
                           fit_intercept=

In [106]:
y_pred = clf.predict(X_test)

In [108]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.79      0.79      0.79       286
           1       0.77      0.77      0.77       264

    accuracy                           0.78       550
   macro avg       0.78      0.78      0.78       550
weighted avg       0.78      0.78      0.78       550



In [109]:
clf.predict(['Wow, SpaCy is amazing'])

array([1], dtype=int64)

In [110]:
clf.predict(['Wow, Java sucks'])

array([0], dtype=int64)

In [112]:
accuracy_score(y_test,y_pred)

0.7781818181818182

 - - Our Model is 77% Accurate

# 77%