In [1]:
import pandas as pd
import numpy as np

import spacy
from spacy.lang.en.stop_words import STOP_WORDS
df = pd.read_csv("A:/training.1600000.processed.noemoticon.csv", encoding = "latin1", header = None)
nlp = spacy.load("en_core_web_sm")

### Detecting Entities using NER of Spacy

- Entity detection means identifying the names of people, countries, dates, times, etc. in the text

In [2]:
x = "Breaking News: Donald Trump, the President of USA is planning to sign a deal to mine the Moon."

In [3]:
# Run the spaCy pipeline on the sample sentence, then list every detected
# entity as "<text>-<label>-<human readable meaning of the label>".
doc = nlp(x)
for entity in doc.ents:
    label_meaning = spacy.explain(entity.label_)
    print(f"{entity.text}-{entity.label_}-{label_meaning}")

Breaking News-ORG-Companies, agencies, institutions, etc.
Donald Trump-PERSON-People, including fictional
USA-GPE-Countries, cities, states
Moon-PERSON-People, including fictional


- We can also visualize all the entities in a text using displacy

In [4]:
from spacy import displacy

In [5]:
displacy.render(doc, style = "ent")

- The entities can be extracted and can be used as feature vectors

### Detecting Nouns Present in the text

In [6]:
x

'Breaking News: Donald Trump, the President of USA is planning to sign a deal to mine the Moon.'

In [7]:
for noun in doc.noun_chunks:
    print(noun)

Breaking News
Donald Trump
the President
USA
a deal
the Moon


### Translation and Language Detection

- Language Code: http://www.loc.gov/standards/iso639-2/php/code_list.php

In [8]:
from textblob import TextBlob
tb = TextBlob(x)

In [9]:
tb.detect_language()

'en'

In [10]:
tb.translate(to = "hi")

TextBlob("ब्रेकिंग न्यूज: डोनाल्ड ट्रम्प, संयुक्त राज्य अमेरिका के राष्ट्रपति चंद्रमा को मेरा सौदा करने के लिए हस्ताक्षर करने की योजना बना रहे हैं।")

### Use Inbuilt Sentiment Classifier

In [11]:
from textblob.sentiments import NaiveBayesAnalyzer

In [12]:
x = "We all are standing together to fight Corona virus. We will win together"

In [13]:
tb = TextBlob(x, analyzer = NaiveBayesAnalyzer())

In [14]:
tb.sentiment

Sentiment(classification='pos', p_pos=0.800625679351197, p_neg=0.19937432064880503)

In [15]:
x = "We all are suffering from Corona"

In [16]:
tb = TextBlob(x, analyzer = NaiveBayesAnalyzer())

In [17]:
tb.sentiment

Sentiment(classification='pos', p_pos=0.8530393643704917, p_neg=0.1469606356295085)

- The Inbuilt Classifier is not very accurate. Hence we need to create our own classifier and train our model on large dataset

## Advanced Text Processing

### 1. Ngram

- N-grams are sequences of N consecutive words taken together
- N = 1 is unigram
- N = 2 is bigram
- N = 3 is trigram and so on..

In [19]:
x = "thanks for watching this video"

In [20]:
tb = TextBlob(x)

In [27]:
tb.ngrams(2)

[WordList(['thanks', 'for']),
 WordList(['for', 'watching']),
 WordList(['watching', 'this']),
 WordList(['this', 'video'])]

### 2. Bag of Words (BOW)

- BOW models are the simplest of models to extract the features from text data.
- It creates a dictionary and put the count of words in the dictionary for all the rows.

In [38]:
x = ["This is first sentence in this sentence", "This is second sentence", "This is third sentence"]

In [39]:
from sklearn.feature_extraction.text import CountVectorizer

In [40]:
cv = CountVectorizer(ngram_range= (1,1))

In [41]:
text_count = cv.fit_transform(x)

In [42]:
text_count

<3x7 sparse matrix of type '<class 'numpy.int64'>'
	with 13 stored elements in Compressed Sparse Row format>

- The Count vectorizer works as follows:
    1. Each sentence is treated as a document
    2. The count vectorizer collects the unique words from all the documents.
    3. Then it calculates how many times each unique word has occurred in each document

In [47]:
# Count vector on display
text_count.toarray()

array([[1, 1, 1, 0, 2, 0, 2],
       [0, 0, 1, 1, 1, 0, 1],
       [0, 0, 1, 0, 1, 1, 1]], dtype=int64)

In [48]:
# get unique words (the learned vocabulary, in column order of the count matrix)
# `get_feature_names()` was removed in scikit-learn 1.2; use its replacement
# `get_feature_names_out()` when available and fall back only on older versions.
(
    cv.get_feature_names_out().tolist()
    if hasattr(cv, "get_feature_names_out")
    else cv.get_feature_names()
)

['first', 'in', 'is', 'second', 'sentence', 'third', 'this']

In [78]:
# Convert the count matrix to a pandas dataframe with one column per vocabulary word.
# `get_feature_names()` was removed in scikit-learn 1.2; use its replacement
# `get_feature_names_out()` when available and fall back only on older versions.
vocabulary = (
    cv.get_feature_names_out()
    if hasattr(cv, "get_feature_names_out")
    else cv.get_feature_names()
)
bow = pd.DataFrame(text_count.toarray(), columns=vocabulary)

In [79]:
bow

Unnamed: 0,first,in,is,second,sentence,third,this
0,1,1,1,0,2,0,2
1,0,0,1,1,1,0,1
2,0,0,1,0,1,1,1


### 3. Term Frequency

- Term frequency is simply the ratio of Number of Times a word appeared in a document divided by total number of words present in the document

In [80]:
x

['This is first sentence in this sentence',
 'This is second sentence',
 'This is third sentence']

In [81]:
bow

Unnamed: 0,first,in,is,second,sentence,third,this
0,1,1,1,0,2,0,2
1,0,0,1,1,1,0,1
2,0,0,1,0,1,1,1


In [82]:
bow.shape

(3, 7)

In [83]:
TF = bow/bow.shape[1]

In [84]:
TF

Unnamed: 0,first,in,is,second,sentence,third,this
0,0.142857,0.142857,0.142857,0.0,0.285714,0.0,0.285714
1,0.0,0.0,0.142857,0.142857,0.142857,0.0,0.142857
2,0.0,0.0,0.142857,0.0,0.142857,0.142857,0.142857


In [85]:
1/7

0.14285714285714285

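- The above formula is not the correct formula for calculating the **term frequency**: it divides by the vocabulary size (`bow.shape[1]`, the number of columns) instead of the total number of words in each document. Hence we use the formula below.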

In [86]:
tf = bow.copy()

In [87]:
# Term frequency = count of a word in a document / total number of words in that document.
# Dividing each row by its own sum is vectorised and produces a float frame directly,
# instead of writing floats one cell at a time into the integer columns copied from `bow`.
tf = tf.div(tf.sum(axis=1), axis=0)

In [88]:
tf

Unnamed: 0,first,in,is,second,sentence,third,this
0,0.142857,0.142857,0.142857,0.0,0.285714,0.0,0.285714
1,0.0,0.0,0.25,0.25,0.25,0.0,0.25
2,0.0,0.0,0.25,0.0,0.25,0.25,0.25


### Inverse Document Frequency

- The problem with BOW and term frequency is that both techniques give a lot of importance to the most frequently occurring words.
- Hence stop words, which occur very frequently, may get more emphasis than other words that are more important.

The basic intuition of IDF is that a word which appears in many documents gets less emphasis, while words that appear in only a few documents get more emphasis.

- There are various formulas used to calculate IDF. The one used by SK learn is as follows:
    
    idf = log((1+N)/(n+1))+1 is used in sklearn when smooth_idf = True

where, N = total number of rows 
       
       n = The number of rows in which the word was present

In [65]:
import numpy as np
x

['This is first sentence in this sentence',
 'This is second sentence',
 'This is third sentence']

In [66]:
x_df = pd.DataFrame(x, columns = ["Words"])

In [67]:
x_df

Unnamed: 0,Words
0,This is first sentence in this sentence
1,This is second sentence
2,This is third sentence


In [89]:
bow

Unnamed: 0,first,in,is,second,sentence,third,this
0,1,1,1,0,2,0,2
1,0,0,1,1,1,0,1
2,0,0,1,0,1,1,1


In [90]:
N = bow.shape[0]

In [91]:
N

3

In [92]:
bb = bow.astype(bool)

In [93]:
bb

Unnamed: 0,first,in,is,second,sentence,third,this
0,True,True,True,False,True,False,True
1,False,False,True,True,True,False,True
2,False,False,True,False,True,True,True


In [94]:
bb["is"].sum() # This means the word "is" is present in all 3 documents

3

In [95]:
cols = bb.columns

In [97]:
cols

Index(['first', 'in', 'is', 'second', 'sentence', 'third', 'this'], dtype='object')

In [99]:
# Document frequency: for every vocabulary word, the number of documents that contain it.
nz = [bb[col].sum() for col in cols]

In [100]:
nz

[1, 1, 3, 1, 3, 1, 3]

In [101]:
# Smoothed inverse document frequency, one value per vocabulary word:
# idf = ln((N + 1) / (1 + n)) + 1, where n is the word's document frequency from `nz`.
idf = [np.log((N+1)/(1+doc_freq))+1 for doc_freq in nz]

In [102]:
idf

[1.6931471805599454,
 1.6931471805599454,
 1.0,
 1.6931471805599454,
 1.0,
 1.6931471805599454,
 1.0]

### Calculating TF-IDF using Inbuilt formula

In [103]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [104]:
tfidf = TfidfVectorizer()

In [106]:
x_tfidf = tfidf.fit_transform(x_df["Words"])

In [107]:
x_tfidf.toarray()

array([[0.44110484, 0.44110484, 0.26052363, 0.        , 0.52104725,
        0.        , 0.52104725],
       [0.        , 0.        , 0.41285857, 0.69903033, 0.41285857,
        0.        , 0.41285857],
       [0.        , 0.        , 0.41285857, 0.        , 0.41285857,
        0.69903033, 0.41285857]])

In [109]:
tfidf.idf_

array([1.69314718, 1.69314718, 1.        , 1.69314718, 1.        ,
       1.69314718, 1.        ])

### Word Embeddings

- Word embeddings is a representation of **text** in the form of vectors
- The ML algorithm works only on numerical data. 
- So feature extraction techniques such as BOW, Ngrams, Wordembeddings, TF, IDF, TF-IDF help us to convert words to vectors

### For Word Embedding, we will be making use of Spacy Word2Vec

In [117]:
import spacy
nlp = spacy.load("en_core_web_sm")

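- spaCy's larger English models (`en_core_web_md`, `en_core_web_lg`) ship pre-trained word vectors for a large vocabulary.
- Note: the small model loaded here (`en_core_web_sm`) does not include static word vectors; its `.vector` values come from context-sensitive tensors, which is why even a made-up token reports a vector below and why the similarity scores are less reliable. Load a larger model for real word vectors.
- Let's check if the following words have vectors in the spaCy library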

In [144]:
doc = nlp("Thank You! dog cat lion dsasdf")

In [145]:
# lets check if each word in doc has a vector in spacy library

for token in doc:
    print(token.text , token.has_vector)

Thank True
You True
! True
dog True
cat True
lion True
dsasdf True


In [146]:
## Calculating the size of vectors associated with the word

nlp("cat").vector

array([-2.1064289e+00,  2.9962975e-01, -2.2410669e+00, -1.9169322e+00,
       -1.2184558e+00,  6.1042982e-01, -1.1767293e+00,  3.1830788e+00,
        4.2653184e+00, -1.1056970e+00,  3.4563899e-01, -1.1708775e-01,
        1.0620141e+00, -1.5057577e+00, -2.5000391e+00,  2.3436433e-01,
       -1.2607279e+00, -2.1381488e+00, -1.8432605e+00, -2.1306306e-02,
        2.0283775e+00,  1.1143034e+00,  2.1585524e+00,  2.1318121e+00,
        1.1857803e+00,  8.9906502e-01,  1.1214937e+00,  2.7884957e-01,
        4.0419281e-01,  1.1440701e+00, -1.6959293e+00, -3.5782902e+00,
        6.0869837e-01, -5.5951458e-01, -1.1826438e+00,  1.5249169e-01,
        7.5581133e-01, -1.4273125e+00, -3.3585079e+00,  1.1555356e+00,
       -1.5454226e+00,  1.2355652e+00, -1.0535346e+00,  5.1444755e+00,
       -1.9380704e+00, -4.3583474e+00, -3.7239835e-01,  8.7266159e-01,
       -1.6602010e-02, -3.9437079e+00, -3.2916846e+00,  3.7859478e+00,
       -3.1600520e-01,  2.6734476e+00, -6.5972328e-01,  8.6227185e-01,
      

In [147]:
nlp("cat").vector.shape

(96,)

### Calculating the similarity between these words using the above vector technique

- Similarity is the cosine of the angle between two vectors: the smaller the angle (below 90°), the higher the similarity.
- If the angle between 2 vectors is 90° (cosine 0), they are not similar to one another.

In [148]:
for token1 in doc:
    for token2 in doc:
        print(token1.text, token2.text, token1.similarity(token2))
    print()

Thank Thank 1.0
Thank You 0.044279758
Thank ! -0.0037732057
Thank dog 0.17940184


  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)


Thank cat 0.047102816
Thank lion 0.15755837
Thank dsasdf 0.1877713

You Thank 0.044279758
You You 1.0
You ! 0.22056843


  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)


You dog 0.1331808
You cat 0.09001851
You lion 0.12837133
You dsasdf 0.032905307



  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)


! Thank -0.0037732057
! You 0.22056843
! ! 1.0
! dog 0.1887487
! cat 0.044803303


  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)


! lion -0.00029753448
! dsasdf -0.0061445185

dog Thank 0.17940184
dog You 0.1331808
dog ! 0.1887487
dog dog 1.0


  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)


dog cat 0.4410823
dog lion 0.3119457
dog dsasdf 0.40510246

cat Thank 0.047102816
cat You 0.09001851


  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)


cat ! 0.044803303
cat dog 0.4410823
cat cat 1.0
cat lion 0.3998844
cat dsasdf 0.38268736

lion Thank 0.15755837


  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)


lion You 0.12837133
lion ! -0.00029753448
lion dog 0.3119457
lion cat 0.3998844
lion lion 1.0


  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)


lion dsasdf 0.4242544

dsasdf Thank 0.1877713
dsasdf You 0.032905307
dsasdf ! -0.0061445185
dsasdf dog 0.40510246
dsasdf cat 0.38268736
dsasdf lion 0.4242544
dsasdf dsasdf 1.0



  "__main__", mod_spec)
  "__main__", mod_spec)
  "__main__", mod_spec)


## Machine Learning Models For Text Classification

#### 1. How to use Bag of Words method to identify sentiments in Twitter data.

In [149]:
df.head()

Unnamed: 0,0,1,2,3,4,5
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [151]:
# This is a very large dataset. We need a smaller dataset
df.shape

(1600000, 6)

In [None]:
# We take 2000 samples of negative sentiments and 2000 samples of positive sentiments.
# The CSV was read with header=None, so the columns are labelled 0..5 and there is no
# "sentiment" label; column 0 holds the sentiment code (0 = negative, 4 = positive).
# A fixed random_state keeps the sampled subset reproducible across runs.

df0 = df[df[0] == 0].sample(2000, random_state=42)
df4 = df[df[0] == 4].sample(2000, random_state=42)

In [None]:
# We combine the 2 datasets together to get a reduced dataframe of 4000 samples.
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent
# and, like append, keeps the original row index of each part.

dfr = pd.concat([df0, df4])

### Feature Extraction: BOW

In [None]:
# First we get rid of unnecessary columns. We are combining all the manually extracted features together
# `drop` expects column labels; the previous code passed `df[...]` (a lookup on the full
# frame with a tuple key), which raises KeyError before `drop` is even called.
# NOTE(review): "emails", "sentiments" and "tweets" are not columns of the frame loaded at
# the top of this notebook (its labels are 0..5) — presumably a feature-extraction /
# renaming step belongs before this cell; confirm and add it.

dfr_feat = dfr.drop(columns=["emails", "sentiments", "tweets"]).reset_index(drop = True)

In [None]:
# taking the value of y
# The frame was loaded with header=None, so the sentiment label lives in column 0
# (there is no "sentiments" column).

y = dfr[0]

In [154]:
# Use BOW

from sklearn.feature_extraction.text import CountVectorizer

In [None]:
cv = CountVectorizer()
# Column 5 holds the tweet text; the frame was loaded with header=None, so there is
# no "tweets" label to index by.
text_count = cv.fit_transform(dfr[5])

In [None]:
# We get the bag of words array of vectors
text_count.toarray()

In [None]:
#Shape of bow
text_count.toarray().shape

#The shape is (4000, 9750) that means there are 4000 rows and 9750 unique words

In [None]:
# Converting the array to dataframe (one column per vocabulary word)
# `get_feature_names()` was removed in scikit-learn 1.2; use its replacement
# `get_feature_names_out()` when available and fall back only on older versions.
bog_columns = (
    cv.get_feature_names_out()
    if hasattr(cv, "get_feature_names_out")
    else cv.get_feature_names()
)
dfr_bog = pd.DataFrame(text_count.toarray(), columns=bog_columns)

### ML Algorithms

In [155]:
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV # logreg with cross validation
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.preprocessing import MinMaxScaler

In [156]:
# Instantiating all the algorithms [n_jobs = -1 means use all the available CPUs]
sgd = SGDClassifier(n_jobs=-1, random_state=42, max_iter=200)
lgr = LogisticRegression(random_state=42, max_iter=200)
lgrcv = LogisticRegressionCV(cv = 2, random_state=42, max_iter=1000)
svm = LinearSVC(random_state=42, max_iter=200)
rfc = RandomForestClassifier(random_state=42, n_jobs=-1, n_estimators=200)

In [158]:
# We create a dictionary so that we can train all the algorithms using a single for loop

clf = {"SGD" : sgd, "LGR" : lgr, "LGR-CV" : lgrcv, "SVM" : svm, "RFC" : rfc}

In [159]:
clf.keys()

dict_keys(['SGD', 'LGR', 'LGR-CV', 'SVM', 'RFC'])

#### Training and calculating accuracy

In [None]:
# Creating a function for train test split
def classify (X,y):
    """Train every classifier in ``clf`` on an 80/20 split and print its test accuracy.

    Parameters
    ----------
    X : feature matrix (DataFrame or array), one row per sample.
    y : labels aligned positionally with the rows of ``X``.

    The split is stratified on ``y`` with ``random_state=42``. Each estimator stored in
    the module-level ``clf`` dict is fitted in place, so after the call ``clf`` holds
    the models trained on this feature set. Prints ``<name> ----> <accuracy>`` for
    each classifier and returns nothing.
    """
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42, stratify = y)

    # Fit the scaler on the training rows only and reuse it for the test rows.
    # Fitting it on the full matrix before splitting would let the test set's
    # min/max leak into training.
    scaler = MinMaxScaler(feature_range= (0,1)) # All the features/parameters are scaled down to one range
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # Train all the classifiers together and report the accuracy of each one
    for key, model in clf.items():
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        ac = accuracy_score(y_test, y_pred)
        print(key, "---->", ac)

In [None]:
# this will start giving the accuracy score for each algorithm
classify(dfr_bog , y)

In [None]:
# Highest accuracy is Logistic Regression (LGR) = 65.325%

### Calculating ACCURACY using manually extracted features

In [None]:
# this will start giving the accuracy score for each algorithm
classify (dfr_feat , y)

In [None]:
# Highest accuracy is Logistic Regression with cross-validation (LGR-CV) = 65%

- Since the sampled dataset is small (4,000 tweets), the accuracy is limited. Training on a larger sample should improve it.

### Calculating Accuracy using Manual + BOW

In [None]:
X = dfr_feat.join(dfr_bog)

In [None]:
classify(X,y)

In [None]:
# Highest accuracy is Random Forest Classifier (RFC) = 70.50%

### Calculating Accuracy using TF-IDF

In [161]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Dataset of 4000 extracted data
dfr.shape

In [None]:
tfidf = TfidfVectorizer()
# Column 5 holds the tweet text; the frame was loaded with header=None, so there is
# no "tweets" label to index by.
X = tfidf.fit_transform(dfr[5])
X = pd.DataFrame(X.toarray())

In [None]:
classify (X , y)

In [None]:
# Highest accuracy is Logistic Regression (LGR) = 65.1250%

### Calculating Accuracy using Word2Vec

In [162]:
# create a function to vectorize all the tweets
def get_vec(x):
    """Return the spaCy document vector of text ``x`` reshaped to a single row (1, n)."""
    embedding = nlp(x).vector
    return embedding.reshape(1, -1)

In [None]:
# Column 5 holds the tweet text (the frame was loaded with header=None, so there is
# no "tweets" label). `get_vec` is passed directly; wrapping it in a lambda adds nothing.
dfr["vec"] = dfr[5].apply(get_vec)

In [None]:
# Join all the vectors to into a single array
X = np.concatenate(dfr["vec"].to_numpy(), axis = 0)

In [None]:
X = pd.DataFrame(X)

In [None]:
classify (X, y)

In [None]:
# Highest accuracy is Logistic Regression (LGR) = 70.1250%

### Predicting Sentiments

In [None]:
def predict_w2v(x, classifiers=None, vectorizer=None):
    """Print the prediction of every trained classifier for the text ``x``.

    Parameters
    ----------
    x : str
        Raw text to classify.
    classifiers : dict, optional
        Mapping of name -> fitted estimator with a ``predict`` method.
        Defaults to the module-level ``clf`` dict.
    vectorizer : callable, optional
        Function turning ``x`` into a 2-D feature array. Defaults to ``get_vec``.

    Prints ``<name> ---> <prediction>`` for each classifier and returns nothing.

    NOTE(review): ``classify`` min-max scales the features before fitting, while the
    vector built here is passed to ``predict`` unscaled — confirm whether the same
    scaling should be applied before predicting.
    """
    if classifiers is None:
        classifiers = clf
    if vectorizer is None:
        vectorizer = get_vec

    # Vectorise once and reuse the same features for every classifier.
    features = vectorizer(x)
    # The previous loop indexed `clf[keys]` with an undefined name `keys`;
    # iterate over the (name, estimator) pairs instead.
    for key, model in classifiers.items():
        y_pred = model.predict(features)
        print (key, "--->", y_pred)

- In this case, the prediction is done without any preprocessing.