In [1]:
import pandas as pd
import spacy
from spacy import displacy

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

### Reading the yelp dataset

In [3]:
!gdown --id 1be2yACVr1X_4CKrzogJUTC_WXPT_NBYI

Downloading...
From: https://drive.google.com/uc?id=1be2yACVr1X_4CKrzogJUTC_WXPT_NBYI
To: /content/yelp_labelled.txt
  0% 0.00/61.3k [00:00<?, ?B/s]100% 61.3k/61.3k [00:00<00:00, 4.06MB/s]


In [4]:
df_yelp = pd.read_csv("yelp_labelled.txt", sep ="\t", header = None)

In [5]:
df_yelp.head()

Unnamed: 0,0,1
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [6]:
df_yelp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   0       1000 non-null   object
 1   1       1000 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 15.8+ KB


In [7]:
df_yelp.columns = ["Review", "Sentiment"]

In [8]:
df_yelp.head()

Unnamed: 0,Review,Sentiment
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [9]:
df_yelp.shape

(1000, 2)

### Reading the Amazon dataset

In [10]:
!gdown --id 10hVUauQJFcsdKsrRggBsdmsMgl2ICaRf

Downloading...
From: https://drive.google.com/uc?id=10hVUauQJFcsdKsrRggBsdmsMgl2ICaRf
To: /content/amazon_cells_labelled.txt
  0% 0.00/58.2k [00:00<?, ?B/s]100% 58.2k/58.2k [00:00<00:00, 8.00MB/s]


In [11]:
df_amazon = pd.read_csv("amazon_cells_labelled.txt", sep="\t", header= None)

In [12]:
df_amazon.head()

Unnamed: 0,0,1
0,So there is no way for me to plug it in here i...,0
1,"Good case, Excellent value.",1
2,Great for the jawbone.,1
3,Tied to charger for conversations lasting more...,0
4,The mic is great.,1


In [13]:
df_amazon.columns = ["Review", "Sentiment"]

In [14]:
df_amazon.head()

Unnamed: 0,Review,Sentiment
0,So there is no way for me to plug it in here i...,0
1,"Good case, Excellent value.",1
2,Great for the jawbone.,1
3,Tied to charger for conversations lasting more...,0
4,The mic is great.,1


In [15]:
df_amazon.shape

(1000, 2)

### Reading the IMDB dataset

In [16]:
!gdown --id 1KhQ2Rgpoc0yRVtZ0Rp-c1eY2R5o2NmuM

Downloading...
From: https://drive.google.com/uc?id=1KhQ2Rgpoc0yRVtZ0Rp-c1eY2R5o2NmuM
To: /content/imdb_labelled.txt
  0% 0.00/85.3k [00:00<?, ?B/s]100% 85.3k/85.3k [00:00<00:00, 5.67MB/s]


In [17]:
df_imdb = pd.read_csv("imdb_labelled.txt", sep = "\t", header= None)

In [18]:
df_imdb.head()

Unnamed: 0,0,1
0,"A very, very, very slow-moving, aimless movie ...",0
1,Not sure who was more lost - the flat characte...,0
2,Attempting artiness with black & white and cle...,0
3,Very little music or anything to speak of.,0
4,The best scene in the movie was when Gerardo i...,1


In [19]:
df_imdb.columns = ["Review", "Sentiment"]

In [20]:
df_imdb.head()

Unnamed: 0,Review,Sentiment
0,"A very, very, very slow-moving, aimless movie ...",0
1,Not sure who was more lost - the flat characte...,0
2,Attempting artiness with black & white and cle...,0
3,Very little music or anything to speak of.,0
4,The best scene in the movie was when Gerardo i...,1


In [21]:
df_imdb.shape

(748, 2)

### Appending all three dataframes

In [22]:
df_all = df_yelp.append([df_amazon, df_imdb], ignore_index= True)

In [23]:
df_all.head()

Unnamed: 0,Review,Sentiment
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [24]:
df_all.shape

(2748, 2)

In [25]:
df_all.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2748 entries, 0 to 2747
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Review     2748 non-null   object
 1   Sentiment  2748 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 43.1+ KB


In [26]:
df_all["Sentiment"].value_counts()

1    1386
0    1362
Name: Sentiment, dtype: int64

In [27]:
nlp = spacy.load("en_core_web_sm")

In [28]:
from spacy.lang.en.stop_words import STOP_WORDS
stopwords = list(STOP_WORDS)

In [29]:
sent = nlp.create_pipe('sentencizer')
nlp.add_pipe(sent, before='parser')

In [30]:
import string

In [31]:
punct = string.punctuation
punct

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

### Function to tokenize, lemmatize, and remove stop words and punctuations

In [32]:
def text_cleaning(sentence):
  text = nlp(sentence)
  
  tokens = []
  for tkn in text:
    if tkn.lemma_ != "-PRON-": 
      temp = tkn.lemma_.lower().strip()
    else:
      temp = tkn.lower_
    tokens.append(temp)
  #print(tokens)

  cleaned_tokens = []
  for tkn in tokens:
      if tkn not in stopwords and tkn not in punct:
        cleaned_tokens.append(tkn)
  return(cleaned_tokens)

In [33]:
text_cleaning("I like apples and she likes oranges. What do you like?")

['like', 'apple', 'like', 'orange', 'like']

### Pipeline

In [34]:
from sklearn.svm import LinearSVC

In [35]:
tfidf = TfidfVectorizer(tokenizer= text_cleaning)
classifier = LinearSVC()

In [36]:
X = df_all["Review"]
y = df_all["Sentiment"]

In [37]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [38]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((2198,), (550,), (2198,), (550,))

In [39]:
clf = Pipeline([("tfidf", tfidf), ("clf", classifier)])

In [40]:
clf.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=<function text_cleaning at 0x7f4b7e110ef0>,
                                 use_idf=True, vocabulary=None)),
                ('clf',
                 LinearSVC(C=1.0, class_weight=None, dual=True,
                           fit_intercept=True

In [41]:
y_pred = clf.predict(X_test)

In [42]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.77      0.82      0.80       285
           1       0.79      0.74      0.77       265

    accuracy                           0.78       550
   macro avg       0.78      0.78      0.78       550
weighted avg       0.78      0.78      0.78       550



In [43]:
print(confusion_matrix(y_test, y_pred))

[[233  52]
 [ 68 197]]


### Testing with some random samples

In [44]:
clf.predict(["Great food"])

array([1])

In [45]:
clf.predict(["The movie is a waste of time"])

array([0])

In [46]:
clf.predict(["Don't eat here."])

array([0])

In [47]:
clf.predict(["It's meh :("])

array([0])

In [48]:
clf.predict(["The food is awesome but the service is very slow and the servers are rude"])

array([0])

In [50]:
clf.predict(["It's fun"])

array([1])

In [49]:
clf.predict(["I enjoyed the movie but my dog found it boring"])

array([1])