## Sentiment Classification of Amazon Product Reviews using SpaCy

In [1]:
import spacy
import en_core_web_md, en_core_web_sm
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.base import TransformerMixin
from spacy.lang.en import English

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score

### Read in the data and do basic EDA

Read in the Product Review file

In [2]:
df = pd.read_csv('amazon_alexa.tsv', sep='\t')
df.head()

Unnamed: 0,rating,date,variation,verified_reviews,feedback
0,5,31-Jul-18,Charcoal Fabric,Love my Echo!,1
1,5,31-Jul-18,Charcoal Fabric,Loved it!,1
2,4,31-Jul-18,Walnut Finish,"Sometimes while playing a game, you can answer...",1
3,5,31-Jul-18,Charcoal Fabric,I have had a lot of fun with this thing. My 4 ...,1
4,5,31-Jul-18,Charcoal Fabric,Music,1


Check the proportion of positive ('1') and negative ('0') reviews

In [3]:
round(df['feedback'].value_counts(normalize=True) * 100,1)

1    91.8
0     8.2
Name: feedback, dtype: float64

Check the number of rows, columns, any missing data etc.

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3150 entries, 0 to 3149
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   rating            3150 non-null   int64 
 1   date              3150 non-null   object
 2   variation         3150 non-null   object
 3   verified_reviews  3150 non-null   object
 4   feedback          3150 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 123.2+ KB


Drop the columns not required

In [5]:
df = df[['verified_reviews','feedback']]
df.head()

Unnamed: 0,verified_reviews,feedback
0,Love my Echo!,1
1,Loved it!,1
2,"Sometimes while playing a game, you can answer...",1
3,I have had a lot of fun with this thing. My 4 ...,1
4,Music,1


### Text preprocessing using SpaCy

In [6]:
nlp = en_core_web_md.load()

##### Define a tokenizer using SpaCy

In [7]:
def spacy_tokenizer(sentence):
    
    # Convert to lowercase
    sentence = sentence.lower()
    
    # Split into tokens
    tokens = nlp(sentence)

    # Remove Stop-words, punctuations, spaces
    tokens = [word for word in tokens if not word.is_stop and not word.is_space and not word.is_punct]
    
    # Lemmatize
    tokens = [word.lemma_ for word in tokens]
    
    return tokens

Check that the tokenizer is working fine

In [9]:
spacy_tokenizer("This is running fine! It runs incredibly. I would be vouching for it")

['run', 'fine', 'run', 'incredibly', 'vouch']

##### Define a BOW Tf-Idf vectorizer and a Logistic Regrssion classifier

In [10]:
bow_vector = CountVectorizer(tokenizer=spacy_tokenizer, ngram_range=(1,1))
classifier = LogisticRegression(random_state=0)
# classifier = XGBClassifier(random_state=0)

##### Define the data processing pipeline

In [11]:
pipe = Pipeline([("preprocessor", bow_vector),
                 ("classifier",classifier)])

##### Split the data into Training and Test sets

In [12]:
X = df['verified_reviews']
y = df['feedback']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

##### Train the model

In [13]:
pipe.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('preprocessor',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=<function spacy_tokenizer at 0x0000015629337828>,
                                 vocabulary=None)),
                ('classifier',
                 LogisticRegression(C=1.0, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_scaling=1,
                                    l1_ratio=None, max_iter=100,
   

##### Check the model performance

In [14]:
pred_test = pipe.predict(X_test)

In [15]:
print("Accuracy on test set :", accuracy_score(pred_test, y_test))
print("Precision on test set :", precision_score(pred_test, y_test))
print("Recall on test set :", recall_score(pred_test, y_test))

Accuracy on test set : 0.944973544973545
Precision on test set : 0.9976878612716763
Recall on test set : 0.9452354874041621
