<a href="https://colab.research.google.com/github/arutraj/.githubcl/blob/main/3_Text_Feature_Engineering_Implementation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Table of Contents

### I. Loading and Preprocessing Data
### II. Extracting Text based Features
> ##### 1. Special Characters and Numbers
> ##### 2. Word Count
> ##### 3. Number of Characters
> ##### 4. Average Word Length
> ##### 5. Stop words
> ##### 6. POS tags
> ##### 7. NER

# I. Loading and Preprocessing Data

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [1]:
# Import libraries
import pandas as pd
import re

In [2]:
# Import spacy library
import spacy
# Import stopwords from spacy
from spacy.lang.en.stop_words import STOP_WORDS

# Load English language model
nlp = spacy.load('en_core_web_sm')

In [4]:
# Load dataset with 1000 rows
df = pd.read_csv(r'/content/tweets.csv', nrows=1000)

In [5]:
# Explore dataset
df.head()

Unnamed: 0,text,favorited,favoriteCount,replyToSN,created,truncated,replyToSID,id,replyToUID,statusSource,screenName,retweetCount,isRetweet,retweeted
0,RT @rssurjewala: Critical question: Was PayTM ...,False,0,,2016-11-23 18:40:30,False,,8.014957e+17,,"<a href=""http://twitter.com/download/android"" ...",HASHTAGFARZIWAL,331,True,False
1,RT @Hemant_80: Did you vote on #Demonetization...,False,0,,2016-11-23 18:40:29,False,,8.014957e+17,,"<a href=""http://twitter.com/download/android"" ...",PRAMODKAUSHIK9,66,True,False
2,"RT @roshankar: Former FinSec, RBI Dy Governor,...",False,0,,2016-11-23 18:40:03,False,,8.014955e+17,,"<a href=""http://twitter.com/download/android"" ...",rahulja13034944,12,True,False
3,RT @ANI_news: Gurugram (Haryana): Post office ...,False,0,,2016-11-23 18:39:59,False,,8.014955e+17,,"<a href=""http://twitter.com/download/android"" ...",deeptiyvd,338,True,False
4,RT @satishacharya: Reddy Wedding! @mail_today ...,False,0,,2016-11-23 18:39:39,False,,8.014954e+17,,"<a href=""http://cpimharyana.com"" rel=""nofollow...",CPIMBadli,120,True,False


In [6]:
# Only keep text column
df.drop(df.columns[1:], axis=1, inplace=True)

In [7]:
# Dataframe
df.head()

Unnamed: 0,text
0,RT @rssurjewala: Critical question: Was PayTM ...
1,RT @Hemant_80: Did you vote on #Demonetization...
2,"RT @roshankar: Former FinSec, RBI Dy Governor,..."
3,RT @ANI_news: Gurugram (Haryana): Post office ...
4,RT @satishacharya: Reddy Wedding! @mail_today ...


In [9]:
# Example Tweet
df.loc[3,'text']

'RT @ANI_news: Gurugram (Haryana): Post office employees provide cash exchange to patients in hospitals #demonetization https://t.co/uGMxUP9\x85'

In [10]:
# Example Tweet
df.loc[512,'text']

'RT @smita_muk: BREAKING NEWS\r\nPMapps result amnounced!\r\n90% Indians support #demonetization\r\n<ed><U+00A0><U+00BD><ed><U+00B1><U+008F><ed><U+00A0><U+00BD><ed><U+00B1><U+008F><ed><U+00A0><U+00BD><ed><U+00B1><U+008F><ed><U+00A0><U+00BD><ed><U+00B1><U+008F><ed><U+00A0><U+00BD><ed><U+00B1><U+008F><ed><U+00A0><U+00BD><ed><U+00B1><U+008F><U+270C><U+270C><U+270C><U+270C><U+270C><U+270C><ed><U+00A0><U+00BD><ed><U+00B1><U+0086><ed><U+00A0><U+00BD><ed><U+00B1><U+0086><ed><U+00A0><U+00BD><ed><U+00B1><U+0086><ed><U+00A0><U+00BD><ed><U+00B1><U+0086><ed><U+00A0><U+00BD><ed><U+00B1><U+0086><ed><U+00A0><U+00BD><ed><U+00B1><U+0086>\r\n@narendramodi Zindabad!'

In [11]:
# Preprocess tweets
def preprocess(text):

    # Remove unicode characters
    text = re.sub(r"<U\+[A-Z0-9]+>|<ed>", "", text)
    # Remove newline and rawstring characters
    text = re.sub(r"\n|\r", "", text)

    return text

In [12]:
# Apply function
df['text'] = df['text'].apply(preprocess)

In [13]:
# Print dataframe
df.head()

Unnamed: 0,text
0,RT @rssurjewala: Critical question: Was PayTM ...
1,RT @Hemant_80: Did you vote on #Demonetization...
2,"RT @roshankar: Former FinSec, RBI Dy Governor,..."
3,RT @ANI_news: Gurugram (Haryana): Post office ...
4,RT @satishacharya: Reddy Wedding! @mail_today ...


In [14]:
df.shape

(1000, 1)

# II. Extracting Text based Features

## 1. Special characters

### 1.1 Number of mentions used in Tweets

In [16]:
# Function to count number of mentions in Tweet .. pattern 1 or more word
def mentions(text):

    # Find mentions
    mentions = re.findall('@\w+', text)

    # Return count of mentions
    return len(mentions)

In [17]:
# Apply function
df['mentions_count'] = df['text'].apply(mentions)

In [18]:
# Print features
df.head(10)

Unnamed: 0,text,mentions_count
0,RT @rssurjewala: Critical question: Was PayTM ...,1
1,RT @Hemant_80: Did you vote on #Demonetization...,1
2,"RT @roshankar: Former FinSec, RBI Dy Governor,...",1
3,RT @ANI_news: Gurugram (Haryana): Post office ...,1
4,RT @satishacharya: Reddy Wedding! @mail_today ...,2
5,@DerekScissors1: Indias #demonetization: #Bla...,2
6,RT @gauravcsawant: Rs 40 lakh looted from a ba...,1
7,RT @Joydeep_911: Calling all Nationalists to j...,1
8,RT @sumitbhati2002: Many opposition leaders ar...,2
9,National reform now destroyed even the essence...,0


In [19]:
# Describe
df['mentions_count'].describe()

Unnamed: 0,mentions_count
count,1000.0
mean,0.866
std,0.988444
min,0.0
25%,0.0
50%,1.0
75%,1.0
max,8.0


### 1.2 Number of hashtags used in Tweets

In [20]:
# Function to count number of hashtags in Tweet
def hashtags(text):

    # Find hashtags
    hashtags = re.findall('#\w+', text)

    # Return count of hashtags
    return len(hashtags)

In [21]:
# Apply function
df['hashtags_count'] = df['text'].apply(hashtags)

In [22]:
# Print features
df.head(10)

Unnamed: 0,text,mentions_count,hashtags_count
0,RT @rssurjewala: Critical question: Was PayTM ...,1,1
1,RT @Hemant_80: Did you vote on #Demonetization...,1,1
2,"RT @roshankar: Former FinSec, RBI Dy Governor,...",1,1
3,RT @ANI_news: Gurugram (Haryana): Post office ...,1,1
4,RT @satishacharya: Reddy Wedding! @mail_today ...,2,2
5,@DerekScissors1: Indias #demonetization: #Bla...,2,2
6,RT @gauravcsawant: Rs 40 lakh looted from a ba...,1,1
7,RT @Joydeep_911: Calling all Nationalists to j...,1,2
8,RT @sumitbhati2002: Many opposition leaders ar...,2,1
9,National reform now destroyed even the essence...,0,1


In [23]:
# Describe
df['hashtags_count'].describe()

Unnamed: 0,hashtags_count
count,1000.0
mean,1.688
std,1.272114
min,0.0
25%,1.0
50%,1.0
75%,2.0
max,10.0


### 1.3 Number of name titles in Tweet

In [24]:
# Function to count name titles Tweet
def title(text):
    count = re.findall('Mr\.|Mrs\.|Dr\.|Miss\s*', text)
    return len(count)

In [25]:
# Test output
df['text'].apply(title)

Unnamed: 0,text
0,0
1,0
2,0
3,0
4,0
5,0
6,0
7,0
8,0
9,0


## 2.1 Word Count

In [26]:
# List comprehension to count number of words in Tweet
df['word_count'] = [len(i.split()) for i in df['text']]

In [27]:
# Print features
df.head(10)

Unnamed: 0,text,mentions_count,hashtags_count,word_count
0,RT @rssurjewala: Critical question: Was PayTM ...,1,1,20
1,RT @Hemant_80: Did you vote on #Demonetization...,1,1,11
2,"RT @roshankar: Former FinSec, RBI Dy Governor,...",1,1,21
3,RT @ANI_news: Gurugram (Haryana): Post office ...,1,1,16
4,RT @satishacharya: Reddy Wedding! @mail_today ...,2,2,9
5,@DerekScissors1: Indias #demonetization: #Bla...,2,2,12
6,RT @gauravcsawant: Rs 40 lakh looted from a ba...,1,1,22
7,RT @Joydeep_911: Calling all Nationalists to j...,1,2,18
8,RT @sumitbhati2002: Many opposition leaders ar...,2,1,17
9,National reform now destroyed even the essence...,0,1,18


In [28]:
# Describe
df['word_count'].describe()

Unnamed: 0,word_count
count,1000.0
mean,16.685
std,4.566468
min,3.0
25%,14.0
50%,17.0
75%,20.0
max,28.0


## 2.2 Number of Characters

In [29]:
# List comprehension to count number of characters in Tweet
df['character_count'] = [len(i) for i in df['text']]

In [30]:
# Print features
df.head(10)

Unnamed: 0,text,mentions_count,hashtags_count,word_count,character_count
0,RT @rssurjewala: Critical question: Was PayTM ...,1,1,20,144
1,RT @Hemant_80: Did you vote on #Demonetization...,1,1,11,66
2,"RT @roshankar: Former FinSec, RBI Dy Governor,...",1,1,21,138
3,RT @ANI_news: Gurugram (Haryana): Post office ...,1,1,16,140
4,RT @satishacharya: Reddy Wedding! @mail_today ...,2,2,9,107
5,@DerekScissors1: Indias #demonetization: #Bla...,2,2,12,121
6,RT @gauravcsawant: Rs 40 lakh looted from a ba...,1,1,22,143
7,RT @Joydeep_911: Calling all Nationalists to j...,1,2,18,139
8,RT @sumitbhati2002: Many opposition leaders ar...,2,1,17,139
9,National reform now destroyed even the essence...,0,1,18,140


In [31]:
# Describe
df['character_count'].describe()

Unnamed: 0,character_count
count,1000.0
mean,124.673
std,21.427861
min,34.0
25%,117.0
50%,135.0
75%,139.0
max,148.0


## 3. Average Word Length

In [32]:
# Function to calculate average word length of a Tweet
def avg_word_len(text):

    # Variable to store word lengths
    word_lens = 0

    # Iterate over all the words in Tweet
    for token in text.split():
        word_lens += len(token)

    # Number of words in Tweet
    word_count = text.split()

    # Return average length of words in Tweet
    return word_lens/len(word_count)

In [33]:
# Apply function
df['avg_word_len'] = df['text'].apply(avg_word_len)

In [34]:
# Print features
df.head(10)

Unnamed: 0,text,mentions_count,hashtags_count,word_count,character_count,avg_word_len
0,RT @rssurjewala: Critical question: Was PayTM ...,1,1,20,144,6.2
1,RT @Hemant_80: Did you vote on #Demonetization...,1,1,11,66,5.090909
2,"RT @roshankar: Former FinSec, RBI Dy Governor,...",1,1,21,138,5.571429
3,RT @ANI_news: Gurugram (Haryana): Post office ...,1,1,16,140,7.75
4,RT @satishacharya: Reddy Wedding! @mail_today ...,2,2,9,107,11.0
5,@DerekScissors1: Indias #demonetization: #Bla...,2,2,12,121,9.166667
6,RT @gauravcsawant: Rs 40 lakh looted from a ba...,1,1,22,143,5.5
7,RT @Joydeep_911: Calling all Nationalists to j...,1,2,18,139,6.722222
8,RT @sumitbhati2002: Many opposition leaders ar...,2,1,17,139,7.176471
9,National reform now destroyed even the essence...,0,1,18,140,6.777778


In [35]:
# Describe
df['avg_word_len'].describe()

Unnamed: 0,avg_word_len
count,1000.0
mean,6.865341
std,1.698611
min,3.892857
25%,5.65
50%,6.578947
75%,7.64881
max,16.666667


## 4. Stopwords

In [36]:
# Function to count the number of stopwords in Tweets
def stopwords(text):

    # Create spacy object
    doc = nlp(text)

    # Variable to store count of stopwords
    count = 0
    for token in doc:
        if token.is_stop == True:
            count += 1
    return count

In [37]:
# Apply function
df['stopwords'] = df['text'].apply(stopwords)

In [38]:
# Print features
df.head(10)

Unnamed: 0,text,mentions_count,hashtags_count,word_count,character_count,avg_word_len,stopwords
0,RT @rssurjewala: Critical question: Was PayTM ...,1,1,20,144,6.2,7
1,RT @Hemant_80: Did you vote on #Demonetization...,1,1,11,66,5.090909,4
2,"RT @roshankar: Former FinSec, RBI Dy Governor,...",1,1,21,138,5.571429,5
3,RT @ANI_news: Gurugram (Haryana): Post office ...,1,1,16,140,7.75,2
4,RT @satishacharya: Reddy Wedding! @mail_today ...,2,2,9,107,11.0,0
5,@DerekScissors1: Indias #demonetization: #Bla...,2,2,12,121,9.166667,4
6,RT @gauravcsawant: Rs 40 lakh looted from a ba...,1,1,22,143,5.5,11
7,RT @Joydeep_911: Calling all Nationalists to j...,1,2,18,139,6.722222,8
8,RT @sumitbhati2002: Many opposition leaders ar...,2,1,17,139,7.176471,8
9,National reform now destroyed even the essence...,0,1,18,140,6.777778,7


In [39]:
# Describe
df['stopwords'].describe()

Unnamed: 0,stopwords
count,1000.0
mean,6.038
std,3.285019
min,0.0
25%,4.0
50%,6.0
75%,8.0
max,19.0


## 5. POS tags

In [40]:
# Function to calculate the number of specific POS tags
def pos(text):

    # Create spacy object
    doc = nlp(text)

    # Variable to store the count of POS tags
    count = 0

    # Iterate over the tokens
    for token in doc:
        # Count Noun, Preposition, Adjective
        if token.pos_ in ["NOUN","ADP","ADJ"]:
            count += 1

    # Return the count
    return count

In [41]:
# Apply function
df['pos'] = df['text'].apply(pos)

In [42]:
# Print features
df.head(10)

Unnamed: 0,text,mentions_count,hashtags_count,word_count,character_count,avg_word_len,stopwords,pos
0,RT @rssurjewala: Critical question: Was PayTM ...,1,1,20,144,6.2,7,9
1,RT @Hemant_80: Did you vote on #Demonetization...,1,1,11,66,5.090909,4,4
2,"RT @roshankar: Former FinSec, RBI Dy Governor,...",1,1,21,138,5.571429,5,6
3,RT @ANI_news: Gurugram (Haryana): Post office ...,1,1,16,140,7.75,2,10
4,RT @satishacharya: Reddy Wedding! @mail_today ...,2,2,9,107,11.0,0,6
5,@DerekScissors1: Indias #demonetization: #Bla...,2,2,12,121,9.166667,4,6
6,RT @gauravcsawant: Rs 40 lakh looted from a ba...,1,1,22,143,5.5,11,11
7,RT @Joydeep_911: Calling all Nationalists to j...,1,2,18,139,6.722222,8,7
8,RT @sumitbhati2002: Many opposition leaders ar...,2,1,17,139,7.176471,8,10
9,National reform now destroyed even the essence...,0,1,18,140,6.777778,7,10


In [43]:
# Describe
df['pos'].describe()

Unnamed: 0,pos
count,1000.0
mean,7.657
std,3.096601
min,0.0
25%,5.0
50%,8.0
75%,10.0
max,19.0


## 6. NER

In [45]:
# Function to count NER - people, places, orgs etc
def ner(text):

    # Create spacy object
    doc = nlp(text)

    # Variable to store the count of POS tags
    count = 0

    # Iterate over the tokens
    for ent in doc.ents:
        # Increment counter if token is a NER
        if ent.label_:
            count += 1
    # Return count
    return count

In [46]:
# Apply function
df['ner'] = df['text'].apply(ner)

In [47]:
# Print features
df.head(10)

Unnamed: 0,text,mentions_count,hashtags_count,word_count,character_count,avg_word_len,stopwords,pos,ner
0,RT @rssurjewala: Critical question: Was PayTM ...,1,1,20,144,6.2,7,9,3
1,RT @Hemant_80: Did you vote on #Demonetization...,1,1,11,66,5.090909,4,4,1
2,"RT @roshankar: Former FinSec, RBI Dy Governor,...",1,1,21,138,5.571429,5,6,4
3,RT @ANI_news: Gurugram (Haryana): Post office ...,1,1,16,140,7.75,2,10,1
4,RT @satishacharya: Reddy Wedding! @mail_today ...,2,2,9,107,11.0,0,6,2
5,@DerekScissors1: Indias #demonetization: #Bla...,2,2,12,121,9.166667,4,6,2
6,RT @gauravcsawant: Rs 40 lakh looted from a ba...,1,1,22,143,5.5,11,11,3
7,RT @Joydeep_911: Calling all Nationalists to j...,1,2,18,139,6.722222,8,7,2
8,RT @sumitbhati2002: Many opposition leaders ar...,2,1,17,139,7.176471,8,10,1
9,National reform now destroyed even the essence...,0,1,18,140,6.777778,7,10,3


In [48]:
# Describe
df['ner'].describe()

Unnamed: 0,ner
count,1000.0
mean,2.269
std,1.464508
min,0.0
25%,1.0
50%,2.0
75%,3.0
max,8.0
