# Activity 2: Extracting general features from text

In [1]:
import pandas as pd
import nltk
from nltk.tag import pos_tag
from nltk import word_tokenize
from collections import Counter
import string
from textblob import TextBlob

In [43]:
# Load the sample sentences; the 'text' column is the input for every feature cell below.
df = pd.read_csv('data/data.csv')
# Bare last expression so the notebook renders the loaded frame.
df

Unnamed: 0,text
0,What you do defines you
1,Your deeds define you
2,Once upon a time there lived a king.
3,Who is your queen?
4,He is desperate
5,Is he not desperate?
6,Data Science is an overlap between Arts and Sc...
7,"Generally, Arts graduates are right-brained an..."
8,Excelling in both Arts and Science at a time b...
9,Natural Language Processing is a part of Data ...


# Finding the number of occurrences of each part of speech

In [44]:
def count_pos(sentence):
    """Tally the part-of-speech tags assigned to the tokens of ``sentence``.

    The sentence is split into tokens with ``word_tokenize`` and tagged with
    ``pos_tag``. The returned ``Counter`` maps each tag to the number of
    tokens that received it.
    """
    tagged_tokens = pos_tag(word_tokenize(sentence))
    tag_counts = Counter()
    for _token, tag in tagged_tokens:
        tag_counts[tag] += 1
    return tag_counts

In [45]:
# Build one tag-count mapping per row; str() coerces each cell before tagging.
df['pos tagging'] = df['text'].map(lambda sentence: count_pos(str(sentence)))

In [46]:
df

Unnamed: 0,text,pos tagging
0,What you do defines you,"{'WP': 1, 'PRP': 2, 'VBP': 1, 'NNS': 1}"
1,Your deeds define you,"{'PRP$': 1, 'NNS': 1, 'VBP': 1, 'PRP': 1}"
2,Once upon a time there lived a king.,"{'RB': 2, 'IN': 1, 'DT': 2, 'NN': 2, 'VBD': 1,..."
3,Who is your queen?,"{'WP': 1, 'VBZ': 1, 'PRP$': 1, 'NN': 1, '.': 1}"
4,He is desperate,"{'PRP': 1, 'VBZ': 1, 'JJ': 1}"
5,Is he not desperate?,"{'VBZ': 1, 'PRP': 1, 'RB': 1, 'VB': 1, '.': 1}"
6,Data Science is an overlap between Arts and Sc...,"{'NNP': 3, 'VBZ': 1, 'DT': 1, 'NN': 1, 'IN': 1..."
7,"Generally, Arts graduates are right-brained an...","{'RB': 1, ',': 1, 'NNS': 3, 'VBP': 2, 'JJ': 2,..."
8,Excelling in both Arts and Science at a time b...,"{'VBG': 1, 'IN': 2, 'DT': 2, 'NNS': 1, 'CC': 1..."
9,Natural Language Processing is a part of Data ...,"{'JJ': 1, 'NNP': 4, 'VBZ': 1, 'DT': 1, 'NN': 1..."


# Find the number of punctuation marks

In [47]:
# Set of every character in string.punctuation, for fast membership checks.
punc = {mark for mark in string.punctuation}
print(punc)

{'`', '%', '_', '}', '*', '?', '@', '|', '=', '~', '.', ')', '[', '^', '<', '-', '"', '\\', '$', '/', "'", '+', '{', ';', ']', '#', ',', ':', '>', '(', '&', '!'}


In [48]:
# Count every punctuation character in the text, repeats included.
# The previous `len(set(x).intersection(punc))` only counted *distinct*
# punctuation characters, so "Wait... what?!" scored 3 instead of 5.
# str() coerces non-string cells (e.g. NaN) instead of raising, matching the
# POS-tagging and letter-count cells.
# The column name keeps its original spelling so existing references still work.
df['number of puncuations'] = df['text'].apply(
    lambda x: sum(1 for ch in str(x) if ch in punc)
)

In [49]:
df

Unnamed: 0,text,pos tagging,number of puncuations
0,What you do defines you,"{'WP': 1, 'PRP': 2, 'VBP': 1, 'NNS': 1}",0
1,Your deeds define you,"{'PRP$': 1, 'NNS': 1, 'VBP': 1, 'PRP': 1}",0
2,Once upon a time there lived a king.,"{'RB': 2, 'IN': 1, 'DT': 2, 'NN': 2, 'VBD': 1,...",1
3,Who is your queen?,"{'WP': 1, 'VBZ': 1, 'PRP$': 1, 'NN': 1, '.': 1}",1
4,He is desperate,"{'PRP': 1, 'VBZ': 1, 'JJ': 1}",0
5,Is he not desperate?,"{'VBZ': 1, 'PRP': 1, 'RB': 1, 'VB': 1, '.': 1}",1
6,Data Science is an overlap between Arts and Sc...,"{'NNP': 3, 'VBZ': 1, 'DT': 1, 'NN': 1, 'IN': 1...",1
7,"Generally, Arts graduates are right-brained an...","{'RB': 1, ',': 1, 'NNS': 3, 'VBP': 2, 'JJ': 2,...",3
8,Excelling in both Arts and Science at a time b...,"{'VBG': 1, 'IN': 2, 'DT': 2, 'NNS': 1, 'CC': 1...",1
9,Natural Language Processing is a part of Data ...,"{'JJ': 1, 'NNP': 4, 'VBZ': 1, 'DT': 1, 'NN': 1...",1


# Find the number of uppercase and lowercase words

In [50]:
# A word counts as "uppercase" here when its first character is an uppercase letter.
df['num_uppercase_words'] = df['text'].apply(
    lambda sentence: sum(1 for token in sentence.split() if token[0].isupper())
)

In [51]:
# A word counts as "lowercase" here when its first character is a lowercase letter.
df['num_lowercase_words'] = df['text'].apply(
    lambda sentence: sum(1 for token in sentence.split() if token[0].islower())
)

In [52]:
df

Unnamed: 0,text,pos tagging,number of puncuations,num_uppercase_words,num_lowercase_words
0,What you do defines you,"{'WP': 1, 'PRP': 2, 'VBP': 1, 'NNS': 1}",0,1,4
1,Your deeds define you,"{'PRP$': 1, 'NNS': 1, 'VBP': 1, 'PRP': 1}",0,1,3
2,Once upon a time there lived a king.,"{'RB': 2, 'IN': 1, 'DT': 2, 'NN': 2, 'VBD': 1,...",1,1,7
3,Who is your queen?,"{'WP': 1, 'VBZ': 1, 'PRP$': 1, 'NN': 1, '.': 1}",1,1,3
4,He is desperate,"{'PRP': 1, 'VBZ': 1, 'JJ': 1}",0,1,2
5,Is he not desperate?,"{'VBZ': 1, 'PRP': 1, 'RB': 1, 'VB': 1, '.': 1}",1,1,3
6,Data Science is an overlap between Arts and Sc...,"{'NNP': 3, 'VBZ': 1, 'DT': 1, 'NN': 1, 'IN': 1...",1,4,5
7,"Generally, Arts graduates are right-brained an...","{'RB': 1, ',': 1, 'NNS': 3, 'VBP': 2, 'JJ': 2,...",3,3,7
8,Excelling in both Arts and Science at a time b...,"{'VBG': 1, 'IN': 2, 'DT': 2, 'NNS': 1, 'CC': 1...",1,3,8
9,Natural Language Processing is a part of Data ...,"{'JJ': 1, 'NNP': 4, 'VBZ': 1, 'DT': 1, 'NN': 1...",1,5,4


# Number of letters

In [53]:
# Count alphabetic characters one by one with str.isalpha().
# The num_uppercase_words / num_lowercase_words columns count words, not
# letters, so adding them does not give this value.
df['num_letters'] = df['text'].apply(
    lambda text: sum(1 for symbol in str(text) if symbol.isalpha())
)

In [54]:
df

Unnamed: 0,text,pos tagging,number of puncuations,num_uppercase_words,num_lowercase_words,num_letters
0,What you do defines you,"{'WP': 1, 'PRP': 2, 'VBP': 1, 'NNS': 1}",0,1,4,19
1,Your deeds define you,"{'PRP$': 1, 'NNS': 1, 'VBP': 1, 'PRP': 1}",0,1,3,18
2,Once upon a time there lived a king.,"{'RB': 2, 'IN': 1, 'DT': 2, 'NN': 2, 'VBD': 1,...",1,1,7,28
3,Who is your queen?,"{'WP': 1, 'VBZ': 1, 'PRP$': 1, 'NN': 1, '.': 1}",1,1,3,14
4,He is desperate,"{'PRP': 1, 'VBZ': 1, 'JJ': 1}",0,1,2,13
5,Is he not desperate?,"{'VBZ': 1, 'PRP': 1, 'RB': 1, 'VB': 1, '.': 1}",1,1,3,16
6,Data Science is an overlap between Arts and Sc...,"{'NNP': 3, 'VBZ': 1, 'DT': 1, 'NN': 1, 'IN': 1...",1,4,5,43
7,"Generally, Arts graduates are right-brained an...","{'RB': 1, ',': 1, 'NNS': 3, 'VBP': 2, 'JJ': 2,...",3,3,7,70
8,Excelling in both Arts and Science at a time b...,"{'VBG': 1, 'IN': 2, 'DT': 2, 'NNS': 1, 'CC': 1...",1,3,8,52
9,Natural Language Processing is a part of Data ...,"{'JJ': 1, 'NNP': 4, 'VBZ': 1, 'DT': 1, 'NN': 1...",1,5,4,45


# Number of digits

In [60]:
# Tally the characters of each text for which str.isdigit() is true.
df['num_digits'] = df['text'].apply(
    lambda sentence: sum(1 for symbol in sentence if symbol.isdigit())
)

In [64]:
df

Unnamed: 0,text,pos tagging,number of puncuations,num_uppercase_words,num_lowercase_words,num_letters,num_digits
0,What you do defines you,"{'WP': 1, 'PRP': 2, 'VBP': 1, 'NNS': 1}",0,1,4,19,0
1,Your deeds define you,"{'PRP$': 1, 'NNS': 1, 'VBP': 1, 'PRP': 1}",0,1,3,18,0
2,Once upon a time there lived a king.,"{'RB': 2, 'IN': 1, 'DT': 2, 'NN': 2, 'VBD': 1,...",1,1,7,28,0
3,Who is your queen?,"{'WP': 1, 'VBZ': 1, 'PRP$': 1, 'NN': 1, '.': 1}",1,1,3,14,0
4,He is desperate,"{'PRP': 1, 'VBZ': 1, 'JJ': 1}",0,1,2,13,0
5,Is he not desperate?,"{'VBZ': 1, 'PRP': 1, 'RB': 1, 'VB': 1, '.': 1}",1,1,3,16,0
6,Data Science is an overlap between Arts and Sc...,"{'NNP': 3, 'VBZ': 1, 'DT': 1, 'NN': 1, 'IN': 1...",1,4,5,43,0
7,"Generally, Arts graduates are right-brained an...","{'RB': 1, ',': 1, 'NNS': 3, 'VBP': 2, 'JJ': 2,...",3,3,7,70,0
8,Excelling in both Arts and Science at a time b...,"{'VBG': 1, 'IN': 2, 'DT': 2, 'NNS': 1, 'CC': 1...",1,3,8,52,0
9,Natural Language Processing is a part of Data ...,"{'JJ': 1, 'NNP': 4, 'VBZ': 1, 'DT': 1, 'NN': 1...",1,5,4,45,0


# Number of words

In [65]:
# A word is any whitespace-delimited token, so the count is the length of split().
# Adding num_uppercase_words and num_lowercase_words gives the same total only
# when every token starts with a cased letter; tokens that start with a digit
# or punctuation fall in neither of those columns.
df['num_words'] = df['text'].map(lambda sentence: len(sentence.split()))

In [66]:
df

Unnamed: 0,text,pos tagging,number of puncuations,num_uppercase_words,num_lowercase_words,num_letters,num_digits,num_words
0,What you do defines you,"{'WP': 1, 'PRP': 2, 'VBP': 1, 'NNS': 1}",0,1,4,19,0,5
1,Your deeds define you,"{'PRP$': 1, 'NNS': 1, 'VBP': 1, 'PRP': 1}",0,1,3,18,0,4
2,Once upon a time there lived a king.,"{'RB': 2, 'IN': 1, 'DT': 2, 'NN': 2, 'VBD': 1,...",1,1,7,28,0,8
3,Who is your queen?,"{'WP': 1, 'VBZ': 1, 'PRP$': 1, 'NN': 1, '.': 1}",1,1,3,14,0,4
4,He is desperate,"{'PRP': 1, 'VBZ': 1, 'JJ': 1}",0,1,2,13,0,3
5,Is he not desperate?,"{'VBZ': 1, 'PRP': 1, 'RB': 1, 'VB': 1, '.': 1}",1,1,3,16,0,4
6,Data Science is an overlap between Arts and Sc...,"{'NNP': 3, 'VBZ': 1, 'DT': 1, 'NN': 1, 'IN': 1...",1,4,5,43,0,9
7,"Generally, Arts graduates are right-brained an...","{'RB': 1, ',': 1, 'NNS': 3, 'VBP': 2, 'JJ': 2,...",3,3,7,70,0,10
8,Excelling in both Arts and Science at a time b...,"{'VBG': 1, 'IN': 2, 'DT': 2, 'NNS': 1, 'CC': 1...",1,3,8,52,0,11
9,Natural Language Processing is a part of Data ...,"{'JJ': 1, 'NNP': 4, 'VBZ': 1, 'DT': 1, 'NN': 1...",1,5,4,45,0,9


# Number of white spaces

In [71]:
# Count the whitespace characters themselves with str.isspace().
# The earlier `len(x.split()) - 1` shortcut counted gaps between words: it
# returned -1 for an empty string and undercounted repeated, leading or
# trailing whitespace (e.g. "a  b " gave 1 instead of 3).
# str() coerces non-string cells (e.g. NaN) instead of raising.
df['num_whitespaces'] = df['text'].apply(
    lambda x: sum(1 for ch in str(x) if ch.isspace())
)

In [73]:
df.head()

Unnamed: 0,text,pos tagging,number of puncuations,num_uppercase_words,num_lowercase_words,num_letters,num_digits,num_words,num_whitespaces
0,What you do defines you,"{'WP': 1, 'PRP': 2, 'VBP': 1, 'NNS': 1}",0,1,4,19,0,5,4
1,Your deeds define you,"{'PRP$': 1, 'NNS': 1, 'VBP': 1, 'PRP': 1}",0,1,3,18,0,4,3
2,Once upon a time there lived a king.,"{'RB': 2, 'IN': 1, 'DT': 2, 'NN': 2, 'VBD': 1,...",1,1,7,28,0,8,7
3,Who is your queen?,"{'WP': 1, 'VBZ': 1, 'PRP$': 1, 'NN': 1, '.': 1}",1,1,3,14,0,4,3
4,He is desperate,"{'PRP': 1, 'VBZ': 1, 'JJ': 1}",0,1,2,13,0,3,2
