### Library n Package

In [2]:
import numpy as np
import pandas as pd
import nltk
import textblob
from textblob import TextBlob


### 1. Basic feature extraction using text data    
    a) Number of words   
    b) Number of characters   
    c) Average word length   
    d) Number of stopwords   
    e) Number of special characters   
    f) Number of numerics   
    g) Number of uppercase words   

In [51]:
# Dataset

train = pd.read_csv(dir_path+'Dataset/train_E6oV3lV.csv')

In [52]:
print(train.shape)
train.head()

(31962, 3)


Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation


In [8]:
# Creating a copy of train dataset for text analysis

df_train = train.copy()

### 1.1 Number of Words

In [9]:
df_train['word_count'] = df_train['tweet'].apply(lambda x: len(str(x).split(" ")))
df_train_sort_wordcount = df_train.sort_values(by='word_count', ascending=True)
df_train_sort_wordcount[['tweet','word_count']].head()

Unnamed: 0,tweet,word_count
1474,ðð»ðð½ðð¿ðð¼ððð¼ð...,3
7987,ðð»ðð½ðð¿ðð¼ððð¼ð...,3
22911,"@user hititonthehead! #killarypeople,needtorea...",3
16928,idc anymore.. ð,3
784,holy city#tissues !,3


* Need to find the correct #words as with the above code count a lil' more

### 1.2 Number of Characters

In [10]:
df_train['char_count'] = df_train['tweet'].str.len() ## this also includes spaces
df_train_sort_charcount = df_train.sort_values(by='char_count', ascending=True)
df_train_sort_charcount[['id','tweet','char_count']].head()

Unnamed: 0,id,tweet,char_count
25195,25196,i love you,11
24669,24670,ohh no how?,12
9130,9131,got this on,12
3397,3398,shut up rat,12
13172,13173,my king k,12


### 1.3 Average Word Length

In [11]:
def avg_word_len (sentence):
    words = sentence.split()
    avg_len = sum(len(word) for word in words)/len(words)
    return avg_len

df_train['avg_word_len'] = df_train['tweet'].apply(lambda x: avg_word_len(x))
df_train[['tweet', 'avg_word_len']].head()

Unnamed: 0,tweet,avg_word_len
0,@user when a father is dysfunctional and is s...,4.555556
1,@user @user thanks for #lyft credit i can't us...,5.315789
2,bihday your majesty,5.666667
3,#model i love u take with u all the time in ...,4.928571
4,factsguide: society now #motivation,8.0


### 1.4 Number of stopwords

In [12]:
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

df_train['stopwords'] = df_train['tweet'].apply(lambda x: len([i for i in x.split() if i in stop_words]))
df_train[['tweet', 'stopwords']].head()

Unnamed: 0,tweet,stopwords
0,@user when a father is dysfunctional and is s...,10
1,@user @user thanks for #lyft credit i can't us...,5
2,bihday your majesty,1
3,#model i love u take with u all the time in ...,5
4,factsguide: society now #motivation,1


### 1.5 Number of special characters

In [13]:
# Number of hashtags in a tweet

df_train['hashtags'] = df_train['tweet'].apply(lambda x: len([x for x in x.split() if x.startswith('#')]))
df_train[['tweet', 'hashtags']].head()

Unnamed: 0,tweet,hashtags
0,@user when a father is dysfunctional and is s...,1
1,@user @user thanks for #lyft credit i can't us...,3
2,bihday your majesty,0
3,#model i love u take with u all the time in ...,1
4,factsguide: society now #motivation,1


### 1.6 Number of numerics

In [14]:
df_train['numeric'] = df_train['tweet'].apply(lambda x: len([x for x in x.split() if x.isdigit()]))
df_train[['tweet', 'numeric']].head()

Unnamed: 0,tweet,numeric
0,@user when a father is dysfunctional and is s...,0
1,@user @user thanks for #lyft credit i can't us...,0
2,bihday your majesty,0
3,#model i love u take with u all the time in ...,0
4,factsguide: society now #motivation,0


### 1.7 Number of Uppercase words

In [15]:
df_train['upper_word'] = df_train['tweet'].apply(lambda x: len([x for x in x.split() if x.isupper()]))
df_train[['tweet', 'upper_word']].head()

Unnamed: 0,tweet,upper_word
0,@user when a father is dysfunctional and is s...,0
1,@user @user thanks for #lyft credit i can't us...,0
2,bihday your majesty,0
3,#model i love u take with u all the time in ...,0
4,factsguide: society now #motivation,0


In [16]:
df_train.head()

Unnamed: 0,id,label,tweet,word_count,char_count,avg_word_len,stopwords,hashtags,numeric,upper_word
0,1,0,@user when a father is dysfunctional and is s...,21,102,4.555556,10,1,0,0
1,2,0,@user @user thanks for #lyft credit i can't us...,22,122,5.315789,5,3,0,0
2,3,0,bihday your majesty,5,21,5.666667,1,0,0,0
3,4,0,#model i love u take with u all the time in ...,17,86,4.928571,5,1,0,0
4,5,0,factsguide: society now #motivation,8,39,8.0,1,1,0,0


### 2. Basic Text Pre-processing of text data
     
Lower casing    
Punctuation removal   
Stopwords removal   
Frequent words removal   
Rare words removal  
Spelling correction   
Tokenization    
Stemming   
Lemmatization


In [17]:
# Creating a copy od dataset to preprocess the data

df_train_dpp = df_train.copy()

### 2.1 Lower casing

In [18]:
df_train_dpp['tweet_lower'] = df_train_dpp['tweet'].apply(lambda x: " ".join(x.lower() for x in x.split()))
df_train_dpp[['tweet', 'tweet_lower']].head()


Unnamed: 0,tweet,tweet_lower
0,@user when a father is dysfunctional and is s...,@user when a father is dysfunctional and is so...
1,@user @user thanks for #lyft credit i can't us...,@user @user thanks for #lyft credit i can't us...
2,bihday your majesty,bihday your majesty
3,#model i love u take with u all the time in ...,#model i love u take with u all the time in ur...
4,factsguide: society now #motivation,factsguide: society now #motivation


### 2.2 Removing Punctuation

In [19]:
df_train_dpp['tweet_punc'] = df_train_dpp['tweet_lower'].str.replace('[^\w\s]', '')
df_train_dpp[['tweet', 'tweet_punc']].head()

Unnamed: 0,tweet,tweet_punc
0,@user when a father is dysfunctional and is s...,user when a father is dysfunctional and is so ...
1,@user @user thanks for #lyft credit i can't us...,user user thanks for lyft credit i cant use ca...
2,bihday your majesty,bihday your majesty
3,#model i love u take with u all the time in ...,model i love u take with u all the time in urð...
4,factsguide: society now #motivation,factsguide society now motivation


### 2.3 Removal of Stop Words

In [20]:
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

df_train_dpp['tweet_stopwords'] = df_train_dpp['tweet_punc'].apply(lambda x: " ".join(x for x in x.split() if x not in stop_words))
df_train_dpp[['tweet', 'tweet_stopwords']].head()

Unnamed: 0,tweet,tweet_stopwords
0,@user when a father is dysfunctional and is s...,user father dysfunctional selfish drags kids d...
1,@user @user thanks for #lyft credit i can't us...,user user thanks lyft credit cant use cause do...
2,bihday your majesty,bihday majesty
3,#model i love u take with u all the time in ...,model love u take u time urð ðððð ððð
4,factsguide: society now #motivation,factsguide society motivation


### 2.4 Common word removal

* Frequency of common words in all the tweets

In [21]:
common_top10 = pd.Series(' '.join(df_train_dpp['tweet_stopwords']).split()).value_counts()[:10]
common_top10

user     17473
love      2647
ð         2511
day       2199
â         1797
happy     1663
amp       1582
im        1139
u         1136
time      1110
dtype: int64

In [22]:
# Remove these top 10 freq words

common = list(common_top10.index)

df_train_dpp['tweet_commwords'] = df_train_dpp['tweet_stopwords'].apply(lambda x: " ".join(x for x in x.split() if x not in common))
df_train_dpp[['tweet','tweet_commwords']].head()

Unnamed: 0,tweet,tweet_commwords
0,@user when a father is dysfunctional and is s...,father dysfunctional selfish drags kids dysfun...
1,@user @user thanks for #lyft credit i can't us...,thanks lyft credit cant use cause dont offer w...
2,bihday your majesty,bihday majesty
3,#model i love u take with u all the time in ...,model take urð ðððð ððð
4,factsguide: society now #motivation,factsguide society motivation


### 2.5 Rare words removal

* Frequency of common words in all the tweets

In [25]:
rare_top10 = pd.Series(" ".join(df_train_dpp['tweet_commwords']).split()).value_counts()[-10:]
rare_top10

manspreadingâ      1
girfriend          1
laoag              1
2daywho            1
livewithcourage    1
girlswholift       1
updo               1
tvhostâ            1
firstbihdayâ       1
flourishingi_am    1
dtype: int64

In [26]:
# Remove these top 10 common words

rare = list(rare_top10.index)

df_train_dpp['tweet_rarewords'] = df_train_dpp['tweet_commwords'].apply(lambda x: " ".join(x for x in x.split() if x not in rare))
df_train_dpp[['tweet','tweet_rarewords']].head()

Unnamed: 0,tweet,tweet_rarewords
0,@user when a father is dysfunctional and is s...,father dysfunctional selfish drags kids dysfun...
1,@user @user thanks for #lyft credit i can't us...,thanks lyft credit cant use cause dont offer w...
2,bihday your majesty,bihday majesty
3,#model i love u take with u all the time in ...,model take urð ðððð ððð
4,factsguide: society now #motivation,factsguide society motivation


### 2.6 Spelling correction

* Using textblob

In [27]:
df_train_dpp['tweet_rarewords'][:5].apply(lambda x: str(TextBlob(x).correct()))


0    father dysfunctional selfish drags kiss dysfun...
1    thanks left credit can use cause dont offer wh...
2                                       midday majesty
3                               model take or ðððð ððð
4                        factsguide society motivation
Name: tweet_rarewords, dtype: object

### 2.7 Tokenization

In [28]:
df_train_dpp['tweet_rarewords'][:10].apply(lambda x: TextBlob(x).words)

0    [father, dysfunctional, selfish, drags, kids, ...
1    [thanks, lyft, credit, cant, use, cause, dont,...
2                                    [bihday, majesty]
3                        [model, take, urð, ðððð, ððð]
4                    [factsguide, society, motivation]
5    [22, huge, fan, fare, big, talking, leave, cha...
6                          [camping, tomorrow, dannyâ]
7    [next, school, year, year, examsð, cant, think...
8    [land, allin, cavs, champions, cleveland, clev...
9                                       [welcome, gr8]
Name: tweet_rarewords, dtype: object

### 2.8 Stemming

In [29]:
from nltk.stem import PorterStemmer
st = PorterStemmer()
df_train_dpp['tweet_rarewords'][:5].apply(lambda x: " ".join([st.stem(word) for word in x.split()]))

0        father dysfunct selfish drag kid dysfunct run
1    thank lyft credit cant use caus dont offer whe...
2                                       bihday majesti
3                              model take urð ðððð ððð
4                              factsguid societi motiv
Name: tweet_rarewords, dtype: object

### 2.9 Lemmatization

In [30]:
from textblob import Word
df_train_dpp['tweet_rarewords'][:5].apply(lambda x: " ".join(Word(word) for word in x.split()))

0    father dysfunctional selfish drags kids dysfun...
1    thanks lyft credit cant use cause dont offer w...
2                                       bihday majesty
3                              model take urð ðððð ððð
4                        factsguide society motivation
Name: tweet_rarewords, dtype: object

** Lemmatization is better than Stemming **

### 3. Advance Text Processing
     
N-grams   
Term Frequency  
Inverse Document Frequency   
Term Frequency-Inverse Document Frequency (TF-IDF)   
Bag of Words    
Sentiment Analysis    
Word Embedding

#### Extract features using NLP techniques

### 3.1 N-grams

*  Ngrams with N=1 are called Unigrams. Bigrams > (N=2), Trigrams > (N=3)

In [31]:
TextBlob(df_train_dpp['tweet_rarewords'][1]).ngrams(2)

[WordList(['thanks', 'lyft']),
 WordList(['lyft', 'credit']),
 WordList(['credit', 'cant']),
 WordList(['cant', 'use']),
 WordList(['use', 'cause']),
 WordList(['cause', 'dont']),
 WordList(['dont', 'offer']),
 WordList(['offer', 'wheelchair']),
 WordList(['wheelchair', 'vans']),
 WordList(['vans', 'pdx']),
 WordList(['pdx', 'disapointed']),
 WordList(['disapointed', 'getthanked'])]

In [32]:
# Function to generate n-grams from sentences

def extract_ngrams(data, num):
    n_grams = TextBlob(data).ngrams(num)
    return [ ' '.join(grams) for grams in n_grams]
 
data = 'A class is a blueprint for the object.'
 
print("1-gram: ", extract_ngrams(data, 1))
print("2-gram: ", extract_ngrams(data, 2))
print("3-gram: ", extract_ngrams(data, 3))
print("4-gram: ", extract_ngrams(data, 4))

1-gram:  ['A', 'class', 'is', 'a', 'blueprint', 'for', 'the', 'object']
2-gram:  ['A class', 'class is', 'is a', 'a blueprint', 'blueprint for', 'for the', 'the object']
3-gram:  ['A class is', 'class is a', 'is a blueprint', 'a blueprint for', 'blueprint for the', 'for the object']
4-gram:  ['A class is a', 'class is a blueprint', 'is a blueprint for', 'a blueprint for the', 'blueprint for the object']


* Need to find n-grams for a dataframe

### 3.2 Term frequency

* TF = (Number of times term T appears in the particular row) / (number of terms in that row)

In [33]:
tf = df_train_dpp['tweet_rarewords'][1:2].apply(lambda x: pd.value_counts(x.split())/len(x.split())).sum(axis=0).reset_index()
tf.columns = ['words', 'tf']
tf

Unnamed: 0,words,tf
0,credit,0.076923
1,offer,0.076923
2,vans,0.076923
3,getthanked,0.076923
4,disapointed,0.076923
5,lyft,0.076923
6,dont,0.076923
7,cant,0.076923
8,cause,0.076923
9,pdx,0.076923


* Need to check what happens to a word which occurs in multiple sentences.

### 3.3 Inverse Document Frequency

* IDF = log(N/n), where, N is the total number of rows and n is the number of rows in which the word was present

In [34]:
for i,word in enumerate(tf['words']):
    tf.loc[i, 'idf'] = np.log(df_train_dpp.shape[0]/(len(df_train_dpp[df_train_dpp['tweet_rarewords'].str.contains(word)])))
    
tf

Unnamed: 0,words,tf,idf
0,credit,0.076923,7.327781
1,offer,0.076923,6.522155
2,vans,0.076923,8.426393
3,getthanked,0.076923,9.679156
4,disapointed,0.076923,10.372303
5,lyft,0.076923,8.762865
6,dont,0.076923,3.745585
7,cant,0.076923,3.538194
8,cause,0.076923,5.690172
9,pdx,0.076923,8.762865


### 3.4 Term Frequency – Inverse Document Frequency (TF-IDF)

* TF-IDF is the multiplication of the TF and IDF

In [35]:
tf['tfidf'] = tf['tf'] * tf['idf']
tf

Unnamed: 0,words,tf,idf,tfidf
0,credit,0.076923,7.327781,0.563675
1,offer,0.076923,6.522155,0.501704
2,vans,0.076923,8.426393,0.648184
3,getthanked,0.076923,9.679156,0.74455
4,disapointed,0.076923,10.372303,0.797869
5,lyft,0.076923,8.762865,0.674067
6,dont,0.076923,3.745585,0.288122
7,cant,0.076923,3.538194,0.272169
8,cause,0.076923,5.690172,0.437706
9,pdx,0.076923,8.762865,0.674067


* For tf-idf no need to calculate tf and idf every time.    
* sklearn TfidfVectorizer comes handy

In [36]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(max_features=1000, lowercase=True, analyzer='word', stop_words= 'english',ngram_range=(1,1))
train_vect = tfidf.fit_transform(df_train_dpp['tweet_rarewords'])

train_vect

<31962x1000 sparse matrix of type '<class 'numpy.float64'>'
	with 108912 stored elements in Compressed Sparse Row format>

* Need to build the code to see each word's tfidf from this sklearn's Tfidf Vectorizer

### 3.5 Bag of Words

* Bag of Words (BoW) refers to the representation of text which describes the presence of words within the text data.   
* The intuition behind this is that two similar text fields will contain similar kind of words, and will therefore have a similar bag of words.    
* Further, that from the text alone we can learn something about the meaning of the document.

In [37]:
from sklearn.feature_extraction.text import CountVectorizer

bow = CountVectorizer(max_features=1000, lowercase=True, ngram_range=(1,1),analyzer = "word")
train_bow = bow.fit_transform(df_train_dpp['tweet_rarewords'])
train_bow

<31962x1000 sparse matrix of type '<class 'numpy.int64'>'
	with 123029 stored elements in Compressed Sparse Row format>

### 3.6 Sentiment from TextBlob 

In [38]:
df_train_dpp['tweet_rarewords'][:5].apply(lambda x: TextBlob(x).sentiment)

0    (-0.5, 1.0)
1     (0.2, 0.2)
2     (0.0, 0.0)
3     (0.0, 0.0)
4     (0.0, 0.0)
Name: tweet_rarewords, dtype: object

In [39]:
df_train_dpp['sentiment'] = df_train_dpp['tweet_rarewords'][:20].apply(lambda x: TextBlob(x).sentiment[0])
df_train_dpp[['tweet','sentiment']].head(5)

Unnamed: 0,tweet,sentiment
0,@user when a father is dysfunctional and is s...,-0.5
1,@user @user thanks for #lyft credit i can't us...,0.2
2,bihday your majesty,0.0
3,#model i love u take with u all the time in ...,0.0
4,factsguide: society now #motivation,0.0


* Sentiment(polarity=0.8, subjectivity=0.75)

* We can see that polarity is 0.8, which means that the statement is positive and 0.75 subjectivity refers that mostly it is a public opinion and not a factual information.

Sentiment analysis is basically the process of determining the attitude or the emotion of the writer, i.e., whether it is positive or negative or neutral.  
The sentiment function of textblob returns two properties, polarity, and subjectivity.   
Polarity is float which lies in the range of [-1,1] where 1 means positive statement and -1 means a negative statement.    Subjective sentences generally refer to personal opinion, emotion or judgment whereas objective refers to factual information. Subjectivity is also a float which lies in the range of [0,1].   

### 3.7 Word Embeddings

Word Embedding is the representation of text in the form of vectors.    
The underlying idea here is that similar words will have a minimum distance between their vectors.

In [53]:
from gensim.scripts.glove2word2vec import glove2word2vec

glove_input_file = dir_path + 'glove.6B.100d.txt'
word2vec_output_file = dir_path + 'glove.6B.100d.txt.word2vec'
glove2word2vec(glove_input_file, word2vec_output_file)

(400000, 100)

In [54]:
from gensim.models import KeyedVectors # load the Stanford GloVe model

filename = dir_path + 'glove.6B.100d.txt.word2vec'
model = KeyedVectors.load_word2vec_format(filename, binary=False)

In [55]:
model['model']

array([-0.27663  ,  0.55094  ,  0.13618  ,  0.59683  , -0.24868  ,
       -0.34311  ,  0.7466   ,  0.12718  , -0.28819  ,  0.78735  ,
       -0.020772 , -0.48316  ,  0.23378  , -0.0064957,  0.33922  ,
        0.042282 ,  0.54477  ,  0.60575  ,  0.27294  ,  0.13011  ,
       -0.0439   , -0.51364  ,  0.2555   ,  0.33046  ,  0.43983  ,
       -0.22455  ,  0.092492 , -1.239    , -0.58248  ,  0.29795  ,
        0.12634  ,  1.3109   , -0.22366  , -0.18947  ,  0.83993  ,
        0.38651  ,  0.41031  , -0.047329 ,  0.3379   , -0.43655  ,
       -0.19865  , -0.30696  ,  0.12327  ,  0.22901  , -0.34216  ,
        0.1925   ,  0.15737  , -0.3082   , -0.39696  ,  0.080463 ,
       -0.28006  ,  0.081266 ,  0.15054  ,  1.3843   ,  0.39917  ,
       -1.997    , -0.74011  ,  0.59262  ,  1.09     , -0.11469  ,
        0.092272 ,  0.33024  , -0.61947  ,  0.63913  ,  1.1857   ,
       -0.042564 , -0.60858  , -0.3032   ,  0.16855  ,  0.56586  ,
        0.36661  ,  0.12138  , -0.060965 , -0.057785 ,  0.4706

In [56]:
model['take']

array([-2.7064e-01,  5.1896e-03,  1.4970e-01, -9.8242e-02, -3.4941e-01,
        5.3679e-02, -4.9698e-01,  6.5251e-01, -3.4078e-01, -2.3466e-01,
        9.1924e-02,  4.3280e-01, -5.2570e-02,  2.5661e-01, -7.3174e-02,
       -3.1834e-01,  2.4386e-01,  5.2261e-01, -6.4237e-01,  3.4460e-01,
        6.7449e-01, -4.1091e-01, -6.8067e-02,  1.1036e-01, -3.1174e-01,
       -1.8380e-01, -3.2548e-01, -5.6073e-01,  4.6353e-01, -3.8417e-01,
       -6.6699e-01,  5.1162e-01, -1.9582e-01, -1.6548e-01, -1.1617e-01,
        4.0172e-01, -2.7041e-01,  1.2839e-01, -2.4684e-01, -8.6713e-02,
       -5.1182e-01, -1.1955e-01,  1.4814e-01, -8.5205e-01, -4.2312e-01,
        2.7046e-01, -1.9395e-01, -1.6860e-01, -2.2328e-02, -7.9142e-01,
       -1.3786e-01,  8.4995e-02, -1.9315e-01,  1.2555e+00,  4.1198e-03,
       -2.7418e+00,  8.3024e-02, -2.0155e-01,  1.8789e+00,  7.9497e-02,
       -1.3951e-01,  8.3795e-01, -2.8992e-01,  3.5695e-02,  8.1729e-01,
        2.5042e-01,  6.9560e-02,  6.1749e-01, -3.3027e-01, -4.90

In [57]:
(model['model'] + model['take'])/2

array([-2.7363500e-01,  2.7806479e-01,  1.4294000e-01,  2.4929401e-01,
       -2.9904500e-01, -1.4471550e-01,  1.2480998e-01,  3.8984498e-01,
       -3.1448501e-01,  2.7634501e-01,  3.5575997e-02, -2.5179997e-02,
        9.0604998e-02,  1.2505715e-01,  1.3302299e-01, -1.3802901e-01,
        3.9431500e-01,  5.6418002e-01, -1.8471499e-01,  2.3735499e-01,
        3.1529498e-01, -4.6227500e-01,  9.3716495e-02,  2.2041000e-01,
        6.4044997e-02, -2.0417500e-01, -1.1649401e-01, -8.9986497e-01,
       -5.9475005e-02, -4.3109998e-02, -2.7032501e-01,  9.1126001e-01,
       -2.0974001e-01, -1.7747501e-01,  3.6188000e-01,  3.9411500e-01,
        6.9949999e-02,  4.0530499e-02,  4.5530006e-02, -2.6163149e-01,
       -3.5523501e-01, -2.1325499e-01,  1.3570499e-01, -3.1152001e-01,
       -3.8264000e-01,  2.3148000e-01, -1.8289998e-02, -2.3840000e-01,
       -2.0964399e-01, -3.5547850e-01, -2.0896000e-01,  8.3130501e-02,
       -2.1305002e-02,  1.3199000e+00,  2.0164491e-01, -2.3694000e+00,
      

### Thank You