In [30]:
# Importing libraries
import spacy
from spacy import displacy

In [31]:
# import small english library
nlp = spacy.load("en_core_web_sm")

In [32]:
# import text that we want to tokenize
text = "Apple is a Great company. Google is a competitor of Apple. Amazon is the richest company."

In [33]:
doc = nlp(text)

In [34]:
doc

Apple is a Great company. Google is a competitor of Apple. Amazon is the richest company.

- When we call doc now, we do not see any changes. But doc holds alot of information in it. Like tokenization, Lammatization, parsing, tagging etc.

In [35]:
# print the tokens

for token in doc:
    print(token)

Apple
is
a
Great
company
.
Google
is
a
competitor
of
Apple
.
Amazon
is
the
richest
company
.


In [36]:
# Breaking the doc into sentences

sent = nlp.create_pipe("sentencizer") # creating a sentencizer

In [37]:
# Adding sentencizer to pipe before parser because, parsing divides the sentences into words and hence we cannot sentencize the words
nlp.add_pipe(sent, before = "parser")

In [38]:
doc = nlp(text)

In [39]:
for sent in doc.sents:
    print(sent)

Apple is a Great company.
Google is a competitor of Apple.
Amazon is the richest company.


In [40]:
# Fetch the stopwords

from spacy.lang.en.stop_words import STOP_WORDS 

In [41]:
stopwords = list(STOP_WORDS)

In [42]:
print(stopwords)

['therefore', 'somehow', 'meanwhile', 'hence', 'becoming', 'when', 'ten', 'beyond', 'his', 'please', 'ours', '‘ve', 'all', 'some', 'both', '’re', "'m", 'fifteen', 'herself', 'often', 'formerly', 'few', 'always', 'can', 'among', 'mostly', 'many', 'first', 'in', 'she', 'four', 'would', 'move', 'get', 'unless', 'whence', '‘ll', 'except', 'every', 'indeed', "'re", 'has', 'seem', 'whither', 'thus', 'hereupon', 'during', 'yourself', 'least', 'the', 'next', 'cannot', 'did', 'hers', 'whose', 'everyone', 'under', 'full', 'alone', 'such', 'made', 'each', 'afterwards', 'others', 'however', 'see', 'could', 'after', 'what', 'anyway', 'latterly', 'any', 'too', 'herein', 'them', 'myself', 'should', 'onto', 'here', 'hereby', 'thereupon', 'therein', 'neither', 'we', 'nine', 'while', 'five', 'twenty', 'thence', 'although', 'whereby', 'may', 'top', 'within', 'seemed', 'nothing', 'whereafter', 'became', 'once', 'part', 'being', 'up', 'whereupon', 'not', 'out', 'there', 'you', 'something', 'empty', 'throug

In [43]:
len(stopwords)

326

In [44]:
# Removing stopwords from the sentences

for token in doc:
    if token.is_stop == False:
        print(token)

Apple
Great
company
.
Google
competitor
Apple
.
Amazon
richest
company
.


#### The reason we remove stop words is as follows:

1. It does not provide much information about the text.
2. The BOW matrix is compressed after removal of stopwords as unnecessay words are removed.
3. Hence we get meaningful bag of words vector which helps in improving the accuracy.

### Lemmatization

In [46]:
doc = nlp("run runs running runner") # Eg for lemmatization

In [48]:
for lem in doc:
    print(lem.text, lem.lemma_)

run run
runs run
running run
runner runner


### POS tagging

In [49]:
doc = nlp("ALL is Well at your end!")

In [50]:
for token in doc:
    print(token.text, token.pos_)

ALL DET
is AUX
Well ADV
at ADP
your PRON
end NOUN
! PUNCT


### Displaying Dependency using displacy

In [51]:
displacy.render(doc, style = "dep") # The labels on the arrow show us how the words are depended on each other

### Entity detection

In [55]:
doc = nlp("Historically, cricket's origins are uncertain and the earliest definite reference is in south-east England in the middle of the 16th century. It spread globally with the expansion of the British Empire, leading to the first international matches in the second half of the 19th century. The game's governing body is the International Cricket Council (ICC), which has over 100 members, twelve of which are full members who play Test matches. The game's rules are held in a code called the Laws of Cricket which is owned and maintained by Marylebone Cricket Club (MCC) in London. The sport is followed primarily in the Indian subcontinent, Australasia, the United Kingdom, southern Africa and the West Indies, its globalisation occurring during the expansion of the British Empire and remaining popular into the 21st century.[1] Women's cricket, which is organised and played separately, has also achieved international standard. The most successful side playing international cricket is Australia, which has won seven One Day International trophies, including five World Cups, more than any other country and has been the top-rated Test side more than any other country. Virat Kohli is the captain of India. Sachin Tendulkar is the God of Cricket.")

In [56]:
doc

Historically, cricket's origins are uncertain and the earliest definite reference is in south-east England in the middle of the 16th century. It spread globally with the expansion of the British Empire, leading to the first international matches in the second half of the 19th century. The game's governing body is the International Cricket Council (ICC), which has over 100 members, twelve of which are full members who play Test matches. The game's rules are held in a code called the Laws of Cricket which is owned and maintained by Marylebone Cricket Club (MCC) in London. The sport is followed primarily in the Indian subcontinent, Australasia, the United Kingdom, southern Africa and the West Indies, its globalisation occurring during the expansion of the British Empire and remaining popular into the 21st century.[1] Women's cricket, which is organised and played separately, has also achieved international standard. The most successful side playing international cricket is Australia, whic

In [57]:
displacy.render(doc, style = "ent")

- Cardinal means number

### Text Classification

In [59]:
import pandas as pd

In [60]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [64]:
data_yelp = pd.read_csv("C:/Users/Ashish/Desktop/Python Projects/Amazon and IMDB sentiment/yelp_labelled.txt", sep = "\t", header = None)

In [65]:
data_yelp.head()

Unnamed: 0,0,1
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [66]:
columns_name = ["Reviews", "Sentiments"]

In [67]:
data_yelp.columns = columns_name

In [68]:
data_yelp.head()

Unnamed: 0,Reviews,Sentiments
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [69]:
data_yelp.shape

(1000, 2)

- The dataset has 1000 reviews and 2 columns ie sentiments and Reviews

In [70]:
# Getting the amazon reviews data

data_amazon = pd.read_csv("C:/Users/Ashish/Desktop/Python Projects/Amazon and IMDB sentiment/amazon_cells_labelled.txt", sep = "\t", header = None)

In [71]:
data_amazon.head()

Unnamed: 0,0,1
0,So there is no way for me to plug it in here i...,0
1,"Good case, Excellent value.",1
2,Great for the jawbone.,1
3,Tied to charger for conversations lasting more...,0
4,The mic is great.,1


In [72]:
data_amazon.columns = columns_name

In [73]:
data_amazon.head()

Unnamed: 0,Reviews,Sentiments
0,So there is no way for me to plug it in here i...,0
1,"Good case, Excellent value.",1
2,Great for the jawbone.,1
3,Tied to charger for conversations lasting more...,0
4,The mic is great.,1


In [74]:
data_amazon.shape

(1000, 2)

In [75]:
data_imdb = pd.read_csv("C:/Users/Ashish/Desktop/Python Projects/Amazon and IMDB sentiment/imdb_labelled.txt", sep = "\t", header = None)

In [76]:
data_imdb.columns = columns_name

In [77]:
data_imdb.head()

Unnamed: 0,Reviews,Sentiments
0,"A very, very, very slow-moving, aimless movie ...",0
1,Not sure who was more lost - the flat characte...,0
2,Attempting artiness with black & white and cle...,0
3,Very little music or anything to speak of.,0
4,The best scene in the movie was when Gerardo i...,1


In [78]:
data_imdb.shape

(748, 2)

- We have 3 datasets in total with reviews and sentiments. We will append all the three datasets together

In [79]:
data = data_yelp.append([data_amazon, data_imdb], ignore_index= True)

In [80]:
data.shape

(2748, 2)

In [81]:
data.head()

Unnamed: 0,Reviews,Sentiments
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [82]:
# Check the distribution of sentiment

data["Sentiments"].value_counts()

1    1386
0    1362
Name: Sentiments, dtype: int64

- There 1386 positive reviews and 1342 negative reviews

In [83]:
# Check if there are any null values

data.isnull().sum()

Reviews       0
Sentiments    0
dtype: int64

- There are no null values in any columns

### Tokenization

In [84]:
import string

In [85]:
punct = string.punctuation

In [86]:
punct

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

#### Creating a function for data cleaning

- We check if the word is present in stopwords or not. 
- Then it will check if the word is not a punctuation
- Then it will check if the word is not a pronoun. 
- finally the word will be converted to lower case. 

In [88]:
def text_data_cleaning(sentence):
    doc  = nlp(sentence)
    
    tokens = []
    for token in doc:
        if token.lemma_ != "-PRON-":
            temp = token.lemma_.lower().strip() #.strip is used to remove any punctuation connected token
        else:
            temp = token.lemma_ #lemmatization of pronouns
        tokens.append(temp)
        
    cleaned_token = []
    for token in tokens:
        if token not in stopwords and token not in punct:
            cleaned_token.append(token)
    return(cleaned_token)        

In [94]:
# testing the function:

text_data_cleaning("Hi how are you? Please like the video.")

['hi', '-PRON-', 'like', 'video']

### Vectorization Feature Engineering (TF-IDF)

In [96]:
from sklearn.svm import LinearSVC

In [97]:
# instantiating TDIDF
tfidf = TfidfVectorizer(tokenizer = text_data_cleaning)

In [98]:
classifier = LinearSVC()

In [100]:
X = data["Reviews"]
y = data["Sentiments"]

In [101]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [102]:
X_train.shape, X_test.shape

((2198,), (550,))

In [103]:
clf = Pipeline([("tfidf", tfidf), ("clf", classifier)])

In [104]:
clf.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,...ax_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0))])

In [105]:
y_pred = clf.predict(X_test)

In [106]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.78      0.80      0.79       285
           1       0.78      0.75      0.77       265

   micro avg       0.78      0.78      0.78       550
   macro avg       0.78      0.78      0.78       550
weighted avg       0.78      0.78      0.78       550



In [107]:
confusion_matrix(y_test, y_pred)

array([[228,  57],
       [ 65, 200]], dtype=int64)

In [108]:
clf.predict(["I hate you when you dont love to love me"])

array([1], dtype=int64)

In [109]:
clf.predict(["I love this product"])

array([1], dtype=int64)

In [110]:
clf.predict(["I hate this product"])

array([0], dtype=int64)