## 1. Text Processing using SpaCy

### 1.1 Lemmatization

In [1]:
import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("run ran running")

for token in doc:
    print(token.text, token.lemma_)
    
# to NOT confuse the model, we want to convert words to their lemma
# for very powerful neural network like Transformer (huggingface), NO NEED TO LEMMATIZATION, bc they understand



run run
ran run
running run


### 1.2 Stopwords

In [2]:
from spacy.lang.en.stop_words import STOP_WORDS

stopwords = list(STOP_WORDS)
print(stopwords[:5])

['call', 'part', 'name', 'due', 'not']


In [3]:
# let's demonstrate how to remove stopword
doc = nlp("Chaky is going to eat at Thammasat with his best friend Peter.")

In [4]:
clean_tokens = []

for token in doc:
    if token.text not in stopwords:
        clean_tokens.append(token.text)
        
clean_tokens

['Chaky', 'going', 'eat', 'Thammasat', 'best', 'friend', 'Peter', '.']

In [5]:
doc = nlp("The movie should have been good.")

clean_tokens = []

for token in doc:
    if token.text not in stopwords:
        clean_tokens.append(token.text)
        
clean_tokens  # not good

['The', 'movie', 'good', '.']

### 1.3 Removing punctuation

In [6]:
# removing punctuation
doc = nlp("Chaky, the teacher $  /   @ # at AIT,!!!???? likes to eat naan.")

In [7]:
token_no_punct = []

for token in doc:
    if token.pos_ != 'PUNCT' and token.pos_ != 'SPACE' and token.pos_ != 'SYM':
        token_no_punct.append(token.text)

In [8]:
token_no_punct

['Chaky',
 'the',
 'teacher',
 '@',
 '#',
 'at',
 'AIT',
 'likes',
 'to',
 'eat',
 'naan']

### 1.4 Lowercasing and Unnecessary Spaces

In [9]:
stripped_lowercase_tokens = []

for token in doc:
    stripped_lowercase_tokens.append(token.text.lower().strip())
    
stripped_lowercase_tokens

['chaky',
 ',',
 'the',
 'teacher',
 '$',
 '',
 '/',
 '',
 '@',
 '#',
 'at',
 'ait',
 ',',
 '!',
 '!',
 '!',
 '?',
 '?',
 '?',
 '?',
 'likes',
 'to',
 'eat',
 'naan',
 '.']

### 1.5 Combine Everything

In [10]:
# nowadays, we don't preprocess anymore, especially for big models, because you lose a lot of information
# if there is something you can clean, is extra spaces or like duplicate symbols.....

# if we use ML, e.g., SVM, KNN, RF, we need to preprocess
def preprocessing(sentence):
    
    stopwords = list(STOP_WORDS)
    doc = nlp(sentence)
    cleaned_tokens = []
    
    for token in doc:
        if token.text not in stopwords and token.pos_ != 'PUNCT' and token.pos_ != 'SPACE' and \
            token.pos_ != 'SYM':
                cleaned_tokens.append(token.text)
                
    return cleaned_tokens

## 2. Sentiment Analysis with Sklearn and SpaCy!!!

In [11]:
# importing stuffs
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

### 2.1 Loading Data

In [12]:
data_yelp   = pd.read_csv('/content/yelp_labelled.txt', sep='\t', header = None, names = ['Review', 'Sentiment'])
data_amazon = pd.read_csv('/content/amazon_labelled.txt', sep='\t', header = None, names = ['Review', 'Sentiment'])
data_imdb = pd.read_csv('/content/imdb_labelled.txt', sep='\t', header = None, names = ['Review', 'Sentiment'])

In [14]:
data_yelp.head()

Unnamed: 0,Review,Sentiment
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


Unnamed: 0,Review,Sentiment
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [15]:
data_yelp.shape, data_amazon.shape, data_imdb.shape

((1000, 2), (1000, 2), (748, 2))

### 2.2 EDA

In [16]:
data = pd.concat([data_yelp, data_amazon, data_imdb], ignore_index=True)
data.shape

(2748, 2)

In [17]:
data['Sentiment'].value_counts()

1    1386
0    1362
Name: Sentiment, dtype: int64

In [18]:
data.isna().sum()

Review       0
Sentiment    0
dtype: int64

### Countvectorizer

In [19]:
# counting the frequency of words in postive and negative samples
# CountVectorizer

from sklearn.feature_extraction.text import CountVectorizer

countvec = CountVectorizer(tokenizer = preprocessing)

# let's try
corpus = [
    'Chaky is coding python     ',
    'Deep learning is very deep',
    'Are you sure about this?????',
    'please hashtag #ilovepython'
]
result   = countvec.fit_transform(corpus)

# list of tokens
print(countvec.get_feature_names_out())

# count
# rows are sentences
# columns are
print(result.toarray())

['chaky' 'coding' 'deep' 'hashtag' 'ilovepython' 'learning' 'python'
 'sure']
[[1 1 0 0 0 0 1 0]
 [0 0 2 0 0 1 0 0]
 [0 0 0 0 0 0 0 1]
 [0 0 0 1 1 0 0 0]]


In [20]:
# let's look at top words categorized by postive and negative
import numpy as np

neg_cond = data.Sentiment == 0
pos_cond = data.Sentiment == 1

neg_df   = data[neg_cond]
pos_df   = data[pos_cond]

In [21]:
# count
neg_result = countvec.fit_transform(neg_df.Review)
neg_vocabs = countvec.get_feature_names_out()

pos_result = countvec.fit_transform(pos_df.Review)
pos_vocabs = countvec.get_feature_names_out()

In [22]:
neg_result.shape

(1362, 3155)

In [23]:
# sum the counts
neg_counts = np.sum(neg_result, axis=0)
pos_counts = np.sum(pos_result, axis=0)

In [24]:
# data frame
df = pd.DataFrame(neg_counts, columns = neg_vocabs).T.sort_values(by=0, ascending=False)

In [25]:
df.head(10)

Unnamed: 0,0
1,103
bad,96
movie,95
0,92
phone,78
film,72
like,67
food,66
time,62
good,57


In [26]:
df = pd.DataFrame(pos_counts, columns = pos_vocabs).T.sort_values(by=0, ascending=False)
df.head(10)

Unnamed: 0,0
great,192
good,171
film,91
movie,87
phone,87
food,60
best,59
place,58
like,58
service,55


### TfidfVectorizer

In [27]:
tfidvec = TfidfVectorizer(tokenizer=preprocessing)

# count
neg_result   = tfidvec.fit_transform(neg_df.Review)
neg_vocabs   = tfidvec.get_feature_names_out()
pos_result   = tfidvec.fit_transform(pos_df.Review)
pos_vocabs   = tfidvec.get_feature_names_out()

# sum words across all documents
neg_counts = np.sum(neg_result, axis=0)
pos_counts = np.sum(pos_result, axis=0)

print(neg_counts.shape, pos_counts.shape)
print(neg_vocabs.shape, pos_vocabs.shape)

(1, 3155) (1, 3116)
(3155,) (3116,)


In [28]:
# top ten negative terms
df = pd.DataFrame(neg_counts, columns = neg_vocabs).T.sort_values(by=0, ascending=False)
df.head(10)

Unnamed: 0,0
bad,27.352373
phone,21.732597
service,21.427283
food,20.67629
movie,18.446709
time,18.194959
place,17.815239
good,16.930471
like,16.75698
waste,15.609785


In [29]:
# top ten positive terms
df = pd.DataFrame(pos_counts, columns = pos_vocabs).T.sort_values(by=0, ascending=False)
df.head(10)

Unnamed: 0,0
great,56.705259
good,47.769443
phone,30.258919
food,22.290479
place,21.910917
service,21.79469
works,21.240647
film,20.164956
movie,19.952672
excellent,19.037116


## 3. Modeling and Training

In [30]:
from sklearn.svm import LinearSVC  # here i am using machine learning, NOT deep learning

# defining model
classifier = LinearSVC()
tfidvec    = TfidfVectorizer()

# defining X and y
X = data['Review']
y = data['Sentiment']

# splitting data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state=42)
print(X_train.shape)

(1923,)


In [31]:
# making pipeline
clf = Pipeline([('tfidf', tfidvec),  ('clf', classifier)])

In [32]:
# #pipeline is the same as:

# X_train_transformed = tfidvec.fit_transform(X_train)
# X_train_transformed.shape  #(words, features)
# classifier.fit(X_train_transformed, y_train)

In [33]:
# training
clf.fit(X_train, y_train)

# predicting
yhat = clf.predict(X_test)

# evaluating metrics
print(classification_report(yhat, y_test))

              precision    recall  f1-score   support

           0       0.82      0.84      0.83       426
           1       0.83      0.81      0.82       399

    accuracy                           0.82       825
   macro avg       0.82      0.82      0.82       825
weighted avg       0.82      0.82      0.82       825



In [34]:
#confusion matrix
confusion_matrix(yhat, y_test)

array([[358,  68],
       [ 77, 322]])

## 4. Real-world

In [35]:
clf.predict(['Chaky loves to eat sushi.'])

array([0])

In [36]:
clf.predict(['This movie is good.'])

array([1])

In [37]:
clf.predict(['This movie should have been good.'])
# double negative is a very good test !!!!
# do remember the sentiment tree bank

array([1])

In [38]:
clf.predict(['This movie is crazily amazing.'])

array([1])

In [39]:
clf.predict(['This bad movie is good.'])

array([0])