# Importing Libraries and Dataset

In [40]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 

In [41]:
# Load the labelled reviews: the file is tab-separated and has no header row,
# so pandas assigns the integer column labels 0 and 1.
# NOTE(review): this is an absolute local Windows path; it must be adjusted to run elsewhere.
data_amazon = pd.read_csv(r'D:\Machine learning\Amazon\amazon_cells_labelled.txt', sep ='\t',header = None)


In [42]:
data_amazon.head()

Unnamed: 0,0,1
0,So there is no way for me to plug it in here i...,0
1,"Good case, Excellent value.",1
2,Great for the jawbone.,1
3,Tied to charger for conversations lasting more...,0
4,The mic is great.,1


In [43]:
data_amazon.shape


(1000, 2)

In [44]:
# Replace the default integer column labels (0, 1) with descriptive names:
# the review text and its 0/1 sentiment label.
column_name = ['Review','Sentiment']
data_amazon.columns = column_name

In [45]:
data_amazon.head()

Unnamed: 0,Review,Sentiment
0,So there is no way for me to plug it in here i...,0
1,"Good case, Excellent value.",1
2,Great for the jawbone.,1
3,Tied to charger for conversations lasting more...,0
4,The mic is great.,1


In [46]:
data_amazon['Sentiment'].value_counts()

0    500
1    500
Name: Sentiment, dtype: int64

The two sentiment classes are perfectly balanced (500 reviews labelled 0 and 500 labelled 1), which makes the analysis easier.

In [47]:
data_amazon.isnull().sum()

Review       0
Sentiment    0
dtype: int64

There are no missing records. Let's separate the dataset into the input (review text) and the output (sentiment label).

In [48]:
# x holds the raw review text (model input); y holds the 0/1 sentiment label (target).
x = data_amazon['Review']
y = data_amazon['Sentiment']

# Data Cleaning 

In the data cleaning step, we lemmatize each token and then remove stop words and punctuation.

In [49]:
import string

In [50]:
punc = string.punctuation

In [51]:
punc

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [52]:
import spacy

In [53]:
nlp = spacy.load("en_core_web_sm")

In [54]:
from spacy.lang.en.stop_words import STOP_WORDS

In [65]:
stopword = list(STOP_WORDS)

In [66]:
stopword

['on',
 'against',
 'herein',
 'mine',
 'sometimes',
 'each',
 'noone',
 'will',
 'never',
 'eleven',
 'behind',
 'could',
 'from',
 'them',
 'give',
 'here',
 "'m",
 'than',
 'six',
 'whence',
 'too',
 'whoever',
 'yet',
 'ourselves',
 'keep',
 'why',
 'seeming',
 'same',
 'doing',
 'so',
 'as',
 'go',
 'namely',
 'there',
 'no',
 'whole',
 'several',
 'what',
 'get',
 'most',
 'already',
 'very',
 "'re",
 'seemed',
 'sixty',
 '‘re',
 'further',
 'just',
 'until',
 'moreover',
 'anything',
 'least',
 'another',
 "'s",
 'show',
 'other',
 'thru',
 'am',
 'becoming',
 'even',
 '‘m',
 'your',
 'elsewhere',
 'next',
 '’d',
 'have',
 '‘s',
 'one',
 'more',
 'down',
 'therein',
 'or',
 'indeed',
 'been',
 'whereas',
 'used',
 'it',
 'were',
 'a',
 'beyond',
 'still',
 'while',
 'hundred',
 'sometime',
 'now',
 'might',
 'n’t',
 'without',
 'neither',
 'part',
 'per',
 'put',
 'and',
 'whereafter',
 'who',
 '’ve',
 'hereupon',
 'these',
 'four',
 'across',
 'under',
 'nobody',
 'perhaps',
 '

Create a function to clean data 

In [75]:
def text_data_cleaning(sentence):
    """Tokenize, lemmatize and filter a sentence for use as a TF-IDF tokenizer.

    The sentence is parsed with the notebook-level spaCy pipeline ``nlp``.
    Each token is replaced by its lower-cased, whitespace-stripped lemma;
    tokens whose lemma is the ``'-PRON-'`` placeholder keep their lower-cased
    surface form instead. Tokens found in ``stopword`` or in ``punc`` are
    then discarded.

    Parameters
    ----------
    sentence : str
        Raw text to clean.

    Returns
    -------
    list of str
        The cleaned tokens, in their original order.
    """
    doc = nlp(sentence)

    tokens = []
    for token in doc:
        # Compare the *string* attribute `lemma_`. `token.lemma` (no trailing
        # underscore) is spaCy's integer ID for the lemma, so comparing it
        # with '-PRON-' was always unequal and the else-branch never ran.
        if token.lemma_ != '-PRON-':
            temp = token.lemma_.lower().strip()
        else:
            # Keep the pronoun's own text rather than the generic placeholder.
            temp = token.lower_
        tokens.append(temp)

    cleaned_tokens = []
    for token in tokens:
        # `punc` is a plain string, so `in` is a substring test; this also
        # drops empty strings left behind by whitespace-only tokens.
        if token not in stopword and token not in punc:
            cleaned_tokens.append(token)
    return cleaned_tokens

In [76]:
text_data_cleaning('Hi buudy')

['hi', 'buudy']

# Vectorization and Feature Engineering

In [79]:
from sklearn.svm import LinearSVC

In [80]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [82]:
from sklearn.pipeline import Pipeline

In [83]:
tfidf = TfidfVectorizer(tokenizer = text_data_cleaning )

In [84]:
classifier = LinearSVC()

# Training the model

In [87]:
from sklearn.model_selection import train_test_split

In [89]:
# Hold out 20% of the reviews for testing; random_state is fixed so the split
# is the same on every run.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 0 )

In [90]:
x_train.shape, x_test.shape

((800,), (200,))

In [91]:
x_train.head()

687    i would advise to not purchase this item it ne...
500    The bose noise cancelling is amazing, which is...
332                          The reception is excellent!
979                                   Not enough volume.
817       Gets a signal when other Verizon phones won't.
Name: Review, dtype: object

In [92]:
# Chain the TF-IDF vectorizer and the linear SVM so that raw review strings
# can be passed straight to clf.fit / clf.predict.
clf = Pipeline([('tfidf',tfidf), ('clf',classifier)])

In [93]:
clf.fit(x_train,y_train)

Pipeline(steps=[('tfidf',
                 TfidfVectorizer(tokenizer=<function text_data_cleaning at 0x0000024A7846FAF0>)),
                ('clf', LinearSVC())])

# Predicting result

In [95]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [96]:
y_pred = clf.predict(x_test)

In [97]:
confusion_matrix(y_test,y_pred)

array([[75, 21],
       [25, 79]], dtype=int64)

In [98]:
accuracy_score(y_test,y_pred)

0.77

In [99]:
clf.predict(['Wow, This is an Amazing product'])

array([1], dtype=int64)

In [100]:
clf.predict(['Worst product ever'])

array([0], dtype=int64)