# Libraries

In [72]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

#!pip3 install https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.0/en_core_web_sm-2.2.0.tar.gz
import string
from spacy.lang.en.stop_words import STOP_WORDS
import spacy
import en_core_web_sm
nlp = en_core_web_sm.load()
#nlp = spacy.load('en_core_web_sm')


from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# 1. Load Datasets

In [2]:
# Load yelp data
data_yelp = pd.read_csv('/Users/krieger/Desktop/NLP/yelp_labelled.txt', sep = '\t', header = None)
data_amazon = pd.read_csv('/Users/krieger/Desktop/NLP/amazon_cells_labelled.txt', sep = '\t', header = None)
data_imdb = pd.read_csv('/Users/krieger/Desktop/NLP/imdb_labelled.txt', sep = '\t', header = None)

# Define the column names 
column_name = ['Review', 'Sentiment']

# Set column names for all three datasets 
data_yelp.columns = column_name
data_amazon.columns = column_name
data_imdb.columns = column_name

In [3]:
data_yelp.shape

(1000, 2)

In [4]:
# Append all datasets into one df
data = data_yelp.append([data_amazon, data_imdb], ignore_index = True)

data.shape

data.sample(6)

Unnamed: 0,Review,Sentiment
964,Del Taco is pretty nasty and should be avoided...,0
1811,#1 It Works - #2 It is Comfortable.,1
2453,"The acting is fantastic, the stories are seaml...",1
583,Their steaks are 100% recommended!,1
1035,You need at least 3 mins to get to your phone ...,0
2366,One of the most disappointing aspects is the l...,0


In [5]:
data['Sentiment'].value_counts()
# 1386 positive records
# 1362 negative records

1    1386
0    1362
Name: Sentiment, dtype: int64

In [6]:
data.isnull().sum()
# There are no NAs

Review       0
Sentiment    0
dtype: int64

In [7]:
# Input Data
X = data['Review']
# Output Data
y = data['Sentiment']

# 2. Data Cleaning

This section will focus on removing stopwords, punctuations as well as apply lemmatization

In [8]:
# import punctuations
punct = string.punctuation
# import stopwords
stopwords = list(STOP_WORDS)

In [22]:
# Create test
sentence = data['Review'][5]
sentence

'Now I am getting angry and I want my damn pho.'

In [23]:
doc = nlp(sentence)

In [52]:
def text_data_cleaning(sentence):
    doc = nlp(sentence)
    
    tokens = []
    for token in doc: 
        if token.lemma_ == '-PRON-': 
            temp = token.lower_
        else:
            temp = token.lemma_.lower().strip()
        tokens.append(temp)

    cleaned_tokens = []
    for token in tokens:
        if token not in stopwords and token not in punct:
            cleaned_tokens.append(token)
    
    return cleaned_tokens
    


In [53]:
text_data_cleaning(sentence)

['angry', 'want', 'damn', 'pho']

# 3. TFIDF 

In [60]:
tfidf = TfidfVectorizer(tokenizer = text_data_cleaning)

In [61]:
classifier = LinearSVC()

# 4. Train the model

In [63]:
# Split the dataset  
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

In [66]:
X_train.shape, X_test.shape

((2198,), (550,))

In [67]:
X_train.head()

2572    An Italian reviewer called this "a small, grea...
526                          And it was way to expensive.
1509    As an earlier review noted, plug in this charg...
144     Nice blanket of moz over top but i feel like t...
2483    The film gives meaning to the phrase, "Never i...
Name: Review, dtype: object

# 5. Pipeline

In [69]:
# first vectorization and then classifier creation
clf = Pipeline([('tfidf', tfidf), ('clf', classifier)])

In [71]:
clf.fit(X_train, y_train)

Pipeline(steps=[('tfidf',
                 TfidfVectorizer(tokenizer=<function text_data_cleaning at 0x7fcf8ca760d0>)),
                ('clf', LinearSVC())])

# 6. Predict the results

In [75]:
y_pred = clf.predict(X_test)

In [76]:
confusion_matrix(y_test, y_pred)

array([[203,  76],
       [ 49, 222]])

In [78]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.81      0.73      0.76       279
           1       0.74      0.82      0.78       271

    accuracy                           0.77       550
   macro avg       0.78      0.77      0.77       550
weighted avg       0.78      0.77      0.77       550



In [79]:
accuracy_score(y_test, y_pred)
# 77.2% Accuracy

0.7727272727272727

In [80]:
clf.predict(["Wow, I'm learning Natural language processing in fun fashion!"])

array([1])

In [81]:
clf.predict(["It's hard to learn new things"])

array([0])