# Table of Contents

### I. Loading and Preprocessing Data
### II. Extracting Text based Features
> ##### 1. Special Characters and Numbers
> ##### 2. Word Count
> ##### 3. Number of Characters
> ##### 4. Average Word Length
> ##### 5. Stop words
> ##### 6. POS tags
> ##### 7. NER

# I. Loading and Preprocessing Data

In [150]:
# from google.colab import drive
# drive.mount('/content/drive')

In [152]:
# Import libraries
import pandas as pd
import re
from collections import Counter

In [154]:
# Import spacy library
import spacy
# Import stopwords from spacy
from spacy.lang.en.stop_words import STOP_WORDS

# Load English language model
nlp = spacy.load('en_core_web_sm')

In [156]:
# Load dataset
df = pd.read_csv('tweets.csv', nrows=1000)

In [158]:
# Explore dataset
df.head()

Unnamed: 0,text,favorited,favoriteCount,replyToSN,created,truncated,replyToSID,id,replyToUID,statusSource,screenName,retweetCount,isRetweet,retweeted,ner
0,RT @rssurjewala: Critical question: Was PayTM ...,False,0,,2016-11-23 18:40:30,False,,8.014957e+17,,"<a href=""http://twitter.com/download/android"" ...",HASHTAGFARZIWAL,331,True,False,2
1,RT @Hemant_80: Did you vote on #Demonetization...,False,0,,2016-11-23 18:40:29,False,,8.014957e+17,,"<a href=""http://twitter.com/download/android"" ...",PRAMODKAUSHIK9,66,True,False,2
2,"RT @roshankar: Former FinSec, RBI Dy Governor,...",False,0,,2016-11-23 18:40:03,False,,8.014955e+17,,"<a href=""http://twitter.com/download/android"" ...",rahulja13034944,12,True,False,3
3,RT @ANI_news: Gurugram (Haryana): Post office ...,False,0,,2016-11-23 18:39:59,False,,8.014955e+17,,"<a href=""http://twitter.com/download/android"" ...",deeptiyvd,338,True,False,3
4,RT @satishacharya: Reddy Wedding! @mail_today ...,False,0,,2016-11-23 18:39:39,False,,8.014954e+17,,"<a href=""http://cpimharyana.com"" rel=""nofollow...",CPIMBadli,120,True,False,2


In [160]:
# Only keep text column
df.drop(df.columns[1:], axis=1, inplace=True)

In [None]:
# Dataframe
df.head()

In [114]:
# Example Tweet
df.loc[0,'text']

"RT @rssurjewala: Critical question: Was PayTM informed about #Demonetization edict by PM? It's clearly fishy and requires full disclosure &amp;\x85"

In [115]:
# Example Tweet
df.loc[512,'text']

'RT @smita_muk: BREAKING NEWS\r\nPMapps result amnounced!\r\n90% Indians support #demonetization\r\n<ed><U+00A0><U+00BD><ed><U+00B1><U+008F><ed><U+00A0><U+00BD><ed><U+00B1><U+008F><ed><U+00A0><U+00BD><ed><U+00B1><U+008F><ed><U+00A0><U+00BD><ed><U+00B1><U+008F><ed><U+00A0><U+00BD><ed><U+00B1><U+008F><ed><U+00A0><U+00BD><ed><U+00B1><U+008F><U+270C><U+270C><U+270C><U+270C><U+270C><U+270C><ed><U+00A0><U+00BD><ed><U+00B1><U+0086><ed><U+00A0><U+00BD><ed><U+00B1><U+0086><ed><U+00A0><U+00BD><ed><U+00B1><U+0086><ed><U+00A0><U+00BD><ed><U+00B1><U+0086><ed><U+00A0><U+00BD><ed><U+00B1><U+0086><ed><U+00A0><U+00BD><ed><U+00B1><U+0086>\r\n@narendramodi Zindabad!'

In [116]:
# Preprocess tweets
def preprocess(text):

    # Remove unicode characters
    text = re.sub(r"<U\+[A-Z0-9]+>|<ed>", "", text)
    # Remove newline and rawstring characters
    text = re.sub(r"\n|\r", "", text)

    return text

In [117]:
# Apply function
df['text'] = df['text'].apply(preprocess)

In [118]:
# Print dataframe
df.head()

Unnamed: 0,text
0,RT @rssurjewala: Critical question: Was PayTM ...
1,RT @Hemant_80: Did you vote on #Demonetization...
2,"RT @roshankar: Former FinSec, RBI Dy Governor,..."
3,RT @ANI_news: Gurugram (Haryana): Post office ...
4,RT @satishacharya: Reddy Wedding! @mail_today ...


In [119]:
df.shape

(1000, 1)

# II. Extracting Text based Features

## 1. Special characters

### 1.1 Number of mentions used in Tweets

In [120]:
# Function to count number of mentions in Tweet
def mentions(text):
    
    # Find mentions
    mentions = re.findall('@\w+', text)

    # Return count of mentions
    return len(mentions)

In [121]:
# Apply function
df['mentions_count'] = df['text'].apply(mentions)

In [122]:
# Print features
df.head(10)

Unnamed: 0,text,mentions_count
0,RT @rssurjewala: Critical question: Was PayTM ...,1
1,RT @Hemant_80: Did you vote on #Demonetization...,1
2,"RT @roshankar: Former FinSec, RBI Dy Governor,...",1
3,RT @ANI_news: Gurugram (Haryana): Post office ...,1
4,RT @satishacharya: Reddy Wedding! @mail_today ...,2
5,@DerekScissors1: Indias #demonetization: #Bla...,2
6,RT @gauravcsawant: Rs 40 lakh looted from a ba...,1
7,RT @Joydeep_911: Calling all Nationalists to j...,1
8,RT @sumitbhati2002: Many opposition leaders ar...,2
9,National reform now destroyed even the essence...,0


In [123]:
# Describe
df['mentions_count'].describe()

count    1000.000000
mean        0.866000
std         0.988444
min         0.000000
25%         0.000000
50%         1.000000
75%         1.000000
max         8.000000
Name: mentions_count, dtype: float64

### 1.2 Number of hashtags used in Tweets

In [124]:
# Function to count number of hashtags in Tweet
def hashtags(text):
    
    # Find hashtags
    hashtags = re.findall('#\w+', text)

    # Return count of hashtags
    return len(hashtags)

In [125]:
# Apply function
df['hashtags_count'] = df['text'].apply(hashtags)

In [126]:
# Print features
df.head(10)

Unnamed: 0,text,mentions_count,hashtags_count
0,RT @rssurjewala: Critical question: Was PayTM ...,1,1
1,RT @Hemant_80: Did you vote on #Demonetization...,1,1
2,"RT @roshankar: Former FinSec, RBI Dy Governor,...",1,1
3,RT @ANI_news: Gurugram (Haryana): Post office ...,1,1
4,RT @satishacharya: Reddy Wedding! @mail_today ...,2,2
5,@DerekScissors1: Indias #demonetization: #Bla...,2,2
6,RT @gauravcsawant: Rs 40 lakh looted from a ba...,1,1
7,RT @Joydeep_911: Calling all Nationalists to j...,1,2
8,RT @sumitbhati2002: Many opposition leaders ar...,2,1
9,National reform now destroyed even the essence...,0,1


In [127]:
# Describe
df['hashtags_count'].describe()

count    1000.000000
mean        1.688000
std         1.272114
min         0.000000
25%         1.000000
50%         1.000000
75%         2.000000
max        10.000000
Name: hashtags_count, dtype: float64

### 1.3 Number of name titles in Tweet

In [128]:
# Function to count name titles Tweet
def title(text):
    count = re.findall('Mr\.|Mrs\.|Dr\.|Miss\s*', text)
    return len(count)

In [129]:
# Test output
df['text'].apply(title)

0      0
1      0
2      0
3      0
4      0
      ..
995    0
996    0
997    0
998    0
999    0
Name: text, Length: 1000, dtype: int64

## 2.1 Word Count

In [130]:
# List comprehension to count number of words in Tweet
df['word_count'] = [len(i.split()) for i in df['text']]

In [131]:
# Print features
df.head(10)

Unnamed: 0,text,mentions_count,hashtags_count,word_count
0,RT @rssurjewala: Critical question: Was PayTM ...,1,1,20
1,RT @Hemant_80: Did you vote on #Demonetization...,1,1,11
2,"RT @roshankar: Former FinSec, RBI Dy Governor,...",1,1,21
3,RT @ANI_news: Gurugram (Haryana): Post office ...,1,1,16
4,RT @satishacharya: Reddy Wedding! @mail_today ...,2,2,9
5,@DerekScissors1: Indias #demonetization: #Bla...,2,2,12
6,RT @gauravcsawant: Rs 40 lakh looted from a ba...,1,1,22
7,RT @Joydeep_911: Calling all Nationalists to j...,1,2,18
8,RT @sumitbhati2002: Many opposition leaders ar...,2,1,17
9,National reform now destroyed even the essence...,0,1,18


In [132]:
# Describe
df['word_count'].describe()

count    1000.000000
mean       16.685000
std         4.566468
min         3.000000
25%        14.000000
50%        17.000000
75%        20.000000
max        28.000000
Name: word_count, dtype: float64

## 2.2 Number of Characters

In [133]:
# List comprehension to count number of characters in Tweet
df['character_count'] = [len(i) for i in df['text']]

In [134]:
# Print features
df.head(10)

Unnamed: 0,text,mentions_count,hashtags_count,word_count,character_count
0,RT @rssurjewala: Critical question: Was PayTM ...,1,1,20,144
1,RT @Hemant_80: Did you vote on #Demonetization...,1,1,11,66
2,"RT @roshankar: Former FinSec, RBI Dy Governor,...",1,1,21,138
3,RT @ANI_news: Gurugram (Haryana): Post office ...,1,1,16,140
4,RT @satishacharya: Reddy Wedding! @mail_today ...,2,2,9,107
5,@DerekScissors1: Indias #demonetization: #Bla...,2,2,12,121
6,RT @gauravcsawant: Rs 40 lakh looted from a ba...,1,1,22,143
7,RT @Joydeep_911: Calling all Nationalists to j...,1,2,18,139
8,RT @sumitbhati2002: Many opposition leaders ar...,2,1,17,139
9,National reform now destroyed even the essence...,0,1,18,140


In [135]:
# Describe
df['character_count'].describe()

count    1000.000000
mean      124.673000
std        21.427861
min        34.000000
25%       117.000000
50%       135.000000
75%       139.000000
max       148.000000
Name: character_count, dtype: float64

Note that spaces are being couted above.  Could create a new feture counting the spaces.

### Spaces Exercise

In [136]:
# Function to count name titles Tweet
def tmp_spaces(text):
    count = re.findall('\s', text)
    return len(count)

In [137]:
tmp_text = "How many spaces are there in this?"
tmp_spaces(tmp_text)

6

In [138]:
# Apply function
df['spaces_count'] = df['text'].apply(tmp_spaces)
df.head()

Unnamed: 0,text,mentions_count,hashtags_count,word_count,character_count,spaces_count
0,RT @rssurjewala: Critical question: Was PayTM ...,1,1,20,144,20
1,RT @Hemant_80: Did you vote on #Demonetization...,1,1,11,66,10
2,"RT @roshankar: Former FinSec, RBI Dy Governor,...",1,1,21,138,21
3,RT @ANI_news: Gurugram (Haryana): Post office ...,1,1,16,140,16
4,RT @satishacharya: Reddy Wedding! @mail_today ...,2,2,9,107,8


In [139]:
# Describe
df['spaces_count'].describe()

count    1000.000000
mean       16.203000
std         4.751881
min         3.000000
25%        13.000000
50%        17.000000
75%        20.000000
max        28.000000
Name: spaces_count, dtype: float64

## 3. Average Word Length

In [140]:
# Function to calculate average word length of a Tweet
def avg_word_len(text):
    
    # Variable to store word lengths
    word_lens = 0
    
    # Iterate over all the words in Tweet
    for token in text.split():
        word_lens += len(token)
    
    # Number of words in Tweet
    word_count = text.split()

    # Return average length of words in Tweet
    return word_lens/len(word_count)

In [141]:
# Apply function
df['avg_word_len'] = df['text'].apply(avg_word_len)

In [142]:
# Print features
df.head(10)

Unnamed: 0,text,mentions_count,hashtags_count,word_count,character_count,spaces_count,avg_word_len
0,RT @rssurjewala: Critical question: Was PayTM ...,1,1,20,144,20,6.2
1,RT @Hemant_80: Did you vote on #Demonetization...,1,1,11,66,10,5.090909
2,"RT @roshankar: Former FinSec, RBI Dy Governor,...",1,1,21,138,21,5.571429
3,RT @ANI_news: Gurugram (Haryana): Post office ...,1,1,16,140,16,7.75
4,RT @satishacharya: Reddy Wedding! @mail_today ...,2,2,9,107,8,11.0
5,@DerekScissors1: Indias #demonetization: #Bla...,2,2,12,121,11,9.166667
6,RT @gauravcsawant: Rs 40 lakh looted from a ba...,1,1,22,143,22,5.5
7,RT @Joydeep_911: Calling all Nationalists to j...,1,2,18,139,18,6.722222
8,RT @sumitbhati2002: Many opposition leaders ar...,2,1,17,139,17,7.176471
9,National reform now destroyed even the essence...,0,1,18,140,18,6.777778


In [143]:
# Describe
df['avg_word_len'].describe()

count    1000.000000
mean        6.865341
std         1.698611
min         3.892857
25%         5.650000
50%         6.578947
75%         7.648810
max        16.666667
Name: avg_word_len, dtype: float64

## 4. Stopwords

In [144]:
# Function to count the number of stopwords in Tweets
def stopwords(text):

    # Create spacy object
    doc = nlp(text)

    # Variable to store count of stopwords
    count = 0
    for token in doc:
        if token.is_stop == True:
            count += 1
    return count

In [145]:
# Apply function
df['stopwords'] = df['text'].apply(stopwords)

In [146]:
# Print features
df.head(10)

Unnamed: 0,text,mentions_count,hashtags_count,word_count,character_count,spaces_count,avg_word_len,stopwords
0,RT @rssurjewala: Critical question: Was PayTM ...,1,1,20,144,20,6.2,7
1,RT @Hemant_80: Did you vote on #Demonetization...,1,1,11,66,10,5.090909,4
2,"RT @roshankar: Former FinSec, RBI Dy Governor,...",1,1,21,138,21,5.571429,5
3,RT @ANI_news: Gurugram (Haryana): Post office ...,1,1,16,140,16,7.75,2
4,RT @satishacharya: Reddy Wedding! @mail_today ...,2,2,9,107,8,11.0,0
5,@DerekScissors1: Indias #demonetization: #Bla...,2,2,12,121,11,9.166667,4
6,RT @gauravcsawant: Rs 40 lakh looted from a ba...,1,1,22,143,22,5.5,11
7,RT @Joydeep_911: Calling all Nationalists to j...,1,2,18,139,18,6.722222,8
8,RT @sumitbhati2002: Many opposition leaders ar...,2,1,17,139,17,7.176471,8
9,National reform now destroyed even the essence...,0,1,18,140,18,6.777778,7


In [147]:
# Describe
df['stopwords'].describe()

count    1000.000000
mean        6.038000
std         3.285019
min         0.000000
25%         4.000000
50%         6.000000
75%         8.000000
max        19.000000
Name: stopwords, dtype: float64

## 5. POS tags

In [148]:
# Function to calculate the number of specific POS tags
def pos(text):

    # Create spacy object
    doc = nlp(text)

    # Variable to store the count of POS tags
    count = 0

    # Iterate over the tokens
    for token in doc:
        # Count Noun, Preposition, Adjective
        if token.pos_ in ["NOUN","ADP","ADJ"]:
            count += 1
    
    # Return the count
    return count

In [149]:
# Apply function
df['pos'] = df['text'].apply(pos)

In [151]:
# Print features
df.head(10)

Unnamed: 0,text,mentions_count,hashtags_count,word_count,character_count,spaces_count,avg_word_len,stopwords,pos
0,RT @rssurjewala: Critical question: Was PayTM ...,1,1,20,144,20,6.2,7,10
1,RT @Hemant_80: Did you vote on #Demonetization...,1,1,11,66,10,5.090909,4,5
2,"RT @roshankar: Former FinSec, RBI Dy Governor,...",1,1,21,138,21,5.571429,5,4
3,RT @ANI_news: Gurugram (Haryana): Post office ...,1,1,16,140,16,7.75,2,11
4,RT @satishacharya: Reddy Wedding! @mail_today ...,2,2,9,107,8,11.0,0,5
5,@DerekScissors1: Indias #demonetization: #Bla...,2,2,12,121,11,9.166667,4,6
6,RT @gauravcsawant: Rs 40 lakh looted from a ba...,1,1,22,143,22,5.5,11,13
7,RT @Joydeep_911: Calling all Nationalists to j...,1,2,18,139,18,6.722222,8,7
8,RT @sumitbhati2002: Many opposition leaders ar...,2,1,17,139,17,7.176471,8,7
9,National reform now destroyed even the essence...,0,1,18,140,18,6.777778,7,10


In [153]:
# Describe
df['pos'].describe()

count    1000.000000
mean        7.216000
std         3.100477
min         0.000000
25%         5.000000
50%         7.000000
75%         9.000000
max        17.000000
Name: pos, dtype: float64

## 6. NER (Named Entity Recognition)

In [155]:
# Function to count NER
def ner(text):

    # Create spacy object
    doc = nlp(text)

    # Variable to store the count of POS tags
    count = 0

    # Iterate over the tokens
    for ent in doc.ents:
        # Increment counter if token is a NER
        if ent.label_:
            count += 1
    # Return count
    return count

In [167]:
# Apply function
df['ner'] = df['text'].apply(ner)

In [168]:
# Print features
df.head(10)

Unnamed: 0,text,ner
0,RT @rssurjewala: Critical question: Was PayTM ...,2
1,RT @Hemant_80: Did you vote on #Demonetization...,2
2,"RT @roshankar: Former FinSec, RBI Dy Governor,...",3
3,RT @ANI_news: Gurugram (Haryana): Post office ...,3
4,RT @satishacharya: Reddy Wedding! @mail_today ...,2
5,@DerekScissors1: Indias #demonetization: #Bla...,2
6,RT @gauravcsawant: Rs 40 lakh looted from a ba...,3
7,RT @Joydeep_911: Calling all Nationalists to j...,2
8,RT @sumitbhati2002: Many opposition leaders ar...,0
9,National reform now destroyed even the essence...,1


In [169]:
# Describe
df['ner'].describe()

count    1000.000000
mean        2.471000
std         1.503136
min         0.000000
25%         1.000000
50%         2.000000
75%         3.000000
max         7.000000
Name: ner, dtype: float64

TypeError: Argument 'string' has incorrect type (expected str, got Series)