In [15]:
import spacy
import re
import pandas as pd
from string import punctuation

In [60]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
from sklearn import pipeline
from sklearn.metrics import confusion_matrix,classification_report

In [4]:
# Load the small English spaCy pipeline once and reuse it for every review.
# The cleaning step only reads tokens, stop-word/punctuation flags and lemmas,
# so the dependency parser and the named-entity recognizer are switched off to
# avoid paying for them on every document.
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])

### Data Cleaning

In [14]:
# Strip
# Remove New line character
# Apply lower to text
# Tokenize
# Remove Stop words
# Remove punctuation (use spaCy's is_punct; in NLTK, we can use the regex substitute re.sub('[^A-Za-z]',' ',review) before tokenization, or use string.punctuation)
# Get Lemma

In [48]:
def clean_review(review):
    """Normalize one raw review into a list of lemma strings.

    Steps: strip surrounding whitespace, replace newlines with spaces,
    lower-case, tokenize with the module-level spaCy pipeline ``nlp``, drop
    stop words, punctuation and whitespace-only tokens, then lemmatize.

    Parameters
    ----------
    review : str
        Raw review text.

    Returns
    -------
    list of str
        One lemma per kept token. Every element is a plain string, so the
        result can be used directly as the output of a vectorizer tokenizer.
    """
    text = re.sub('\n', ' ', review.strip()).lower()
    doc = nlp(text)
    return [
        # Older spaCy models use the placeholder "-PRON-" as the lemma of
        # pronouns; fall back to the token's text (a str, not the Token
        # object) so the returned list stays homogeneous.
        token.lemma_ if token.lemma_ != "-PRON-" else token.text
        for token in doc
        # is_space filters the whitespace tokens spaCy emits for runs of
        # spaces (e.g. where newlines were replaced above).
        if not (token.is_stop or token.is_punct or token.is_space)
    ]
    

In [7]:
# Read the tab-separated reviews file into a DataFrame.
df = pd.read_csv('Restaurant_Reviews.tsv', sep='\t')

In [12]:
# Features are the raw review text; the target is the Liked column.
# A fixed random_state makes the 75/25 split, and every metric computed from
# it, reproducible across kernel restarts.
X, y = df['Review'], df['Liked']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.25, random_state=42)

In [55]:
# TF-IDF features built from the tokens produced by clean_review.
# token_pattern=None states explicitly that the default regex tokenizer is not
# used, which avoids scikit-learn's "token_pattern will not be used" warning.
tfidf = TfidfVectorizer(tokenizer=clean_review, token_pattern=None)

In [53]:
# Linear support-vector classifier with default hyper-parameters.
svc = LinearSVC()

In [56]:
# Chain vectorizer and classifier so raw text goes in and labels come out.
clf = pipeline.make_pipeline(tfidf, svc)

In [57]:
# Fit the whole pipeline (vectorizer + classifier) on the training split.
clf.fit(X_train, y_train)

Pipeline(steps=[('tfidfvectorizer',
                 TfidfVectorizer(tokenizer=<function clean_review at 0x12b5dd0d0>)),
                ('linearsvc', LinearSVC())])

In [58]:
# Score the fitted pipeline on the held-out split.
clf.score(X_test, y_test)

0.744

In [64]:
# Confusion matrix as a labelled table: rows are the true classes,
# columns are the predicted classes.
pd.DataFrame(
    confusion_matrix(y_test, clf.predict(X_test)),
    index=['Actual-0', 'Actual-1'],
    columns=['Prediction-0', 'Prediction-1'],
)

Unnamed: 0,Prediction-0,Prediction-1
Actual-0,86,28
Actual-1,36,100


In [66]:
# Per-class precision / recall / F1 on the held-out split.
print(classification_report(y_test, clf.predict(X_test)))

              precision    recall  f1-score   support

           0       0.70      0.75      0.73       114
           1       0.78      0.74      0.76       136

    accuracy                           0.74       250
   macro avg       0.74      0.74      0.74       250
weighted avg       0.75      0.74      0.74       250



## Test your own review

In [82]:
# Free-text review to classify; edit this string to try other inputs.
# NOTE(review): this example reads like a footwear review, while the data file
# loaded above is named Restaurant_Reviews.tsv -- presumably out-of-domain for
# the model, so treat the prediction with caution.
sample="""
Fit was slightly small, strap was abrasive feeling. Sandals looked good online but looked and felt cheap in person. I’m going back to Olukai...
"""

In [83]:
# Show the cleaned tokens for the sample, then the pipeline's predicted label.
print('Cleaned Text:\n', clean_review(sample))
print('\n')
# predict() returns one label per input; take the single prediction explicitly.
is_positive = clf.predict([sample])[0] == 1
print('Review was ', 'POSITIVE' if is_positive else 'NEGATIVE')

Cleaned Text:
 ['fit', 'slightly', 'small', 'strap', 'abrasive', 'feeling', 'sandal', 'look', 'good', 'online', 'look', 'feel', 'cheap', 'person', 'go', 'olukai']


Review was  NEGATIVE
