In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
train = pd.read_csv("train_E6oV3lV.csv")

In [43]:
train['tweet'].head()

0    father dysfunctional selfish drag kid dysfunct...
1    thanks lyft credit cant use cause dont offer w...
2                                       bihday majesty
3                              model take urð ðððð ððð
4                        factsguide society motivation
Name: tweet, dtype: object

In [4]:
#Count the number of words in each line
train['word_count'] = train['tweet'].apply(lambda x: len(str(x).split(" ")))

In [5]:
bad_word = " "

#codes to calculate characters in the every line, including white-spaces
#train['char_count'] = train['tweet'].apply(lambda x: len([letter for letter in x]))
#train['char_count'] = train['tweet'].str.len()

#code to calculate characters in every line without white-spaces
train['char_count'] = train['tweet'].apply(lambda x: len([letter for letter in x if letter not in bad_word]))

In [6]:
#Average word length

def avg_len(word):
    word1 = word.split()
    return (sum(len(words) for words in word1)/len(word1))

train['avg_word_len'] = train['tweet'].apply(lambda x: avg_len(x))

In [7]:
from nltk.corpus import stopwords
stop = stopwords.words('english')

In [8]:
#Counting stopwords in each tweet
train['stopwords'] = train['tweet'].apply(lambda x:len([x for x in x.split() if x in stop]))

In [9]:
#To find # and @ in every tweet
train['No. of #'] = train['tweet'].apply(lambda x: len([x for x in x.split() if x.startswith('#')]))
train['No. of @'] = train['tweet'].apply(lambda x: len([x for x in x.split() if x.startswith('@')]))

In [10]:
#Count the digits in every tweet
train['numeric'] = train['tweet'].apply(lambda x: len([x for x in x.split() if x.isdigit()]))

Basic Pre-Processing

In [11]:
#Converting all tweets in lower case
train['tweet'] = train['tweet'].apply(lambda x: " ".join(x.lower() for x in x.split()))

In [12]:
#To remove all the punctuations, ^\w\s is regular expression, read about it
train['tweet'] = train['tweet'].str.replace('[^\w\s]','')

In [13]:
#To remove all the stopwords
train['tweet'] = train['tweet'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))

In [14]:
#Removing common words, as sometimes they don't contribute much to the analysis
freq = pd.Series(' '.join(train['tweet']).split()).value_counts()[:10]
freq = list(freq.index)
train['tweet'] = train['tweet'].apply(lambda x: " ".join(x for x in x.split() if x not in freq))

In [15]:
#Removing all the rare words
freq = pd.Series(' '.join(train['tweet']).split()).value_counts()[-10:]
freq = list(freq.index)
train['tweet'] = train['tweet'].apply(lambda x: " ".join(x for x in x.split() if x not in freq))

In [20]:
from textblob import TextBlob

#Spelling correction. However, it takes a lot of time that is why we are doing it on first 5 tweets
train['tweet'][:5].apply(lambda x:str(TextBlob(x).correct()))
#There are certain problem with this, like your is often written as ur in text messages which get corrected as or and not your.

0    father dysfunctional selfish drags kiss dysfun...
1    thanks left credit can use cause dont offer wh...
2                                       midday majesty
3                               model take or ðððð ððð
4                        factsguide society motivation
Name: tweet, dtype: object

In [36]:
#Tokenization: dividing the text into a sequence of words or sentences.
TextBlob(text=train['tweet'][1]).words

WordList(['thanks', 'lyft', 'credit', 'cant', 'use', 'cause', 'dont', 'offer', 'wheelchair', 'vans', 'pdx', 'disapointed', 'getthanked'])

In [37]:
#Stemming: removal of suffices like 'ing', 'ly' etc
from nltk.stem import PorterStemmer
st = PorterStemmer()

In [38]:
train['tweet'][:5].apply(lambda x: " ".join(st.stem(word) for word in x.split()))

0        father dysfunct selfish drag kid dysfunct run
1    thank lyft credit cant use caus dont offer whe...
2                                       bihday majesti
3                              model take urð ðððð ððð
4                              factsguid societi motiv
Name: tweet, dtype: object

In [41]:
#Lemmatization: converts the word into its root word, rather than just striping the suffices. It makes use of the vocabulary and does a morphological analysis to obtain the root word.
from textblob import Word
train['tweet'] = train['tweet'].apply(lambda x: ' '.join(Word(word).lemmatize() for word in x.split()))

Advance Text Processing

In [44]:
#N-grams are the combination of multiple words used together. The basic principle behind n-grams is that they capture the language structure like what letter or word is likely to follow the given one.
TextBlob(train['tweet'][0]).ngrams(2)

[WordList(['father', 'dysfunctional']),
 WordList(['dysfunctional', 'selfish']),
 WordList(['selfish', 'drag']),
 WordList(['drag', 'kid']),
 WordList(['kid', 'dysfunction']),
 WordList(['dysfunction', 'run'])]

In [45]:
#Term frequency: is the ratio of the count of a word present in a sentence to the length of the sentence.
tf1 = (train['tweet'][1:2]).apply(lambda x: pd.value_counts(x.split(' '))).sum(axis=0).reset_index()
tf1.columns = ['words','tf']
tf1

Unnamed: 0,words,tf
0,offer,1
1,disapointed,1
2,wheelchair,1
3,thanks,1
4,credit,1
5,lyft,1
6,use,1
7,dont,1
8,van,1
9,getthanked,1


In [88]:
#Inverse Document Frequency: the intuition behind IDF is that a word is not of much use to us if it's appearing in all the docuents
#IDF is the log of the ratio of the total number of rows to the number of rows in which that word is present. IDF = log(N/n) where N is the total number of rows and n is the number of rows to the number of rows in which that word is present

for i, word in enumerate(tf1['words']):
    tf1.loc[i, 'idf'] = np.log(train.shape[0]/(len(train[train['tweet'].str.contains(word)])))
tf1

Unnamed: 0,words,tf,idf
0,offer,1,6.522155
1,disapointed,1,10.372303
2,wheelchair,1,9.273691
3,thanks,1,4.597751
4,credit,1,7.327781
5,lyft,1,8.762865
6,use,1,3.552287
7,dont,1,3.745585
8,van,1,5.236505
9,getthanked,1,9.679156


In [100]:
#TF-IDF: penalizes commonly occurring words
tf1['tfidf'] = tf1['tf']*tf1['idf']
tf1

Unnamed: 0,words,tf,idf,tfidf
0,offer,1,6.522155,6.522155
1,disapointed,1,10.372303,10.372303
2,wheelchair,1,9.273691,9.273691
3,thanks,1,4.597751,4.597751
4,credit,1,7.327781,7.327781
5,lyft,1,8.762865,8.762865
6,use,1,3.552287,3.552287
7,dont,1,3.745585,3.745585
8,van,1,5.236505,5.236505
9,getthanked,1,9.679156,9.679156


In [104]:
#sklearn to calculate tfidf
#it does pre-processing steps as well

from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(max_features=1000, lowercase=True, analyzer='word', stop_words='english', ngram_range=(1,1))
train_vect = tfidf.fit_transform(train['tweet'])
train_vect

<31962x1000 sparse matrix of type '<class 'numpy.float64'>'
	with 114037 stored elements in Compressed Sparse Row format>

In [107]:
#Bag of Words: refers to the representation of text which describes the presence of words within the text data. 
#The intuition behind this is that two similar text fields will contain similar kind of words, and will therefore have a similar text bag of words.
from sklearn.feature_extraction.text import CountVectorizer
bow = CountVectorizer(max_features=1000, lowercase=True, ngram_range=(1,1), analyzer='word')
train_bow = bow.fit_transform(train['tweet'])
train_bow

<31962x1000 sparse matrix of type '<class 'numpy.int64'>'
	with 128382 stored elements in Compressed Sparse Row format>

In [108]:
#Sentiment Analysis
train['sentiment'] = train['tweet'].apply(lambda x: TextBlob(x).sentiment[0])

In [None]:
#Word Embeddings: representation of text in the form of vectors.
#The underlying idea here is that similar words will have a minimum distance between their vectors.

#just copied these from the website

from gensim.scripts.glove2word2vec import glove2word2vec
glove_input_file = 'glove.6B.100d.txt'
word2vec_output_file = 'glove.6B.100d.txt.word2vec'
glove2word2vec(glove_input_file, word2vec_output_file)