# Dealing with Natural Language (Text)
We will use the Tweets dataset from Kaggle. The tweets are from US customers about their air travel experience.

In [41]:
import warnings
# Suppress every warning so library notices do not clutter the cell outputs.
# NOTE(review): this blanket filter also hides deprecation warnings from pandas/NLTK;
# consider narrowing it to specific warning categories.
warnings.filterwarnings("ignore")

In [42]:
# One-time setup for NLTK (Natural Language ToolKit).
# Uncomment the install line if the package is missing:
# !pip install nltk
import nltk
# Uncomment whichever resources are missing locally.
# These calls need `nltk` to be imported first, so they are listed after the import.
# nltk.download('punkt')
# nltk.download('stopwords')
# nltk.download('wordnet')
# nltk.download('averaged_perceptron_tagger')
# nltk.download('tagsets')

In [43]:
import pandas as pd
import numpy as np
import csv

In [None]:
# display all columns
#pd.set_option('display.max_columns', None)
#pd.set_option('display.max_rows', None)
#pd.set_option('display.max_colwidth', None)

In [44]:
# Load the tweets CSV into a DataFrame.
# NOTE(review): the path is absolute at the filesystem root; adjust it if Tweets.csv lives elsewhere.
df = pd.read_csv("/Tweets.csv")

In [45]:
# Preview the first five rows to check which columns were loaded.
df.head(n=5)

Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,570306133677760513,neutral,1.0,,,Virgin America,,cairdin,,0.0,@VirginAmerica What @dhepburn said.,,2015-02-24 11:35:52 -0800,,Eastern Time (US & Canada)
1,570301130888122368,positive,0.3486,,0.0,Virgin America,,jnardino,,0.0,@VirginAmerica plus you've added commercials t...,,2015-02-24 11:15:59 -0800,,Pacific Time (US & Canada)
2,570301083672813571,neutral,0.6837,,,Virgin America,,yvonnalynn,,0.0,@VirginAmerica I didn't today... Must mean I n...,,2015-02-24 11:15:48 -0800,Lets Play,Central Time (US & Canada)
3,570301031407624196,negative,1.0,Bad Flight,0.7033,Virgin America,,jnardino,,0.0,@VirginAmerica it's really aggressive to blast...,,2015-02-24 11:15:36 -0800,,Pacific Time (US & Canada)
4,570300817074462722,negative,1.0,Can't Tell,1.0,Virgin America,,jnardino,,0.0,@VirginAmerica and it's a really big bad thing...,,2015-02-24 11:14:45 -0800,,Pacific Time (US & Canada)


In [46]:
# Column 'text' contains the tweets; print five randomly chosen ones.
# A seeded generator keeps the sample stable across "Restart & Run All", and
# replace=False prevents the same tweet from being shown twice.
rng = np.random.default_rng(42)
print(rng.choice(df['text'].to_numpy(), size=5, replace=False))

["@SouthwestAir where are our pilots? Plane is here flight says it's still on time, but we should of been off the ground 15 minutes ago."
 "@SouthwestAir I always brag about ur service, but very disappointed today. Apparently I'll be sleeping on floor of Dallas airport tonight 👎"
 '@AmericanAir thanks for responding ... will do!'
 "@united once united's service levels reaches those of Etihad or lets be more realistically Lufthansa then ill consider it again"
 '@USAirways Easily the most ridiculous experience trying to spend money with your company.']


In [None]:
#Types of tokenization: a) by word b) by sentence

In [47]:
# Naive word tokenization: split the tweets at index labels 0 through 5 on single spaces.
# Casing is untouched here, so some tokens are still uppercase; lowercasing is the next step.
first_tweets = df['text'].loc[:5]
first_tweets.str.split(' ')

0             [@VirginAmerica, What, @dhepburn, said.]
1    [@VirginAmerica, plus, you've, added, commerci...
2    [@VirginAmerica, I, didn't, today..., Must, me...
3    [@VirginAmerica, it's, really, aggressive, to,...
4    [@VirginAmerica, and, it's, a, really, big, ba...
5    [@VirginAmerica, seriously, would, pay, $30, a...
Name: text, dtype: object

In [48]:
# Same split as before, but on lowercased tweets.
# Special characters such as '@' are still attached to the tokens; they are removed next.
lowercased_tweets = df.loc[:5, 'text'].str.lower()
lowercased_tweets.str.split(' ')

0             [@virginamerica, what, @dhepburn, said.]
1    [@virginamerica, plus, you've, added, commerci...
2    [@virginamerica, i, didn't, today..., must, me...
3    [@virginamerica, it's, really, aggressive, to,...
4    [@virginamerica, and, it's, a, really, big, ba...
5    [@virginamerica, seriously, would, pay, $30, a...
Name: text, dtype: object

In [49]:
# Strip the '@' mention marker, lowercase, then split on single spaces.
# regex=False makes the literal replacement explicit, so the result does not depend on
# the installed pandas version's default for `regex`.
df.loc[:5, 'text'].str.replace('@', '', regex=False).str.lower().str.split(' ')

# How many issues can be resolved this way? Quotes (single and double), exclamation marks,
# hashtags and other special characters would each need their own rule.
# Note that the last word still has a '.' attached at the end.
# There are many such small issues that would need to be handled by hand.

0               [virginamerica, what, dhepburn, said.]
1    [virginamerica, plus, you've, added, commercia...
2    [virginamerica, i, didn't, today..., must, mea...
3    [virginamerica, it's, really, aggressive, to, ...
4    [virginamerica, and, it's, a, really, big, bad...
5    [virginamerica, seriously, would, pay, $30, a,...
Name: text, dtype: object

In [50]:
# NLTK provides a function, word_tokenize, which takes care of most of these issues.
lowered_tweets = df.loc[:5, 'text'].str.lower()
for tweet in lowered_tweets:
    tokens = nltk.word_tokenize(tweet)
    print(tokens)

['@', 'virginamerica', 'what', '@', 'dhepburn', 'said', '.']
['@', 'virginamerica', 'plus', 'you', "'ve", 'added', 'commercials', 'to', 'the', 'experience', '...', 'tacky', '.']
['@', 'virginamerica', 'i', 'did', "n't", 'today', '...', 'must', 'mean', 'i', 'need', 'to', 'take', 'another', 'trip', '!']
['@', 'virginamerica', 'it', "'s", 'really', 'aggressive', 'to', 'blast', 'obnoxious', '``', 'entertainment', "''", 'in', 'your', 'guests', "'", 'faces', '&', 'amp', ';', 'they', 'have', 'little', 'recourse']
['@', 'virginamerica', 'and', 'it', "'s", 'a', 'really', 'big', 'bad', 'thing', 'about', 'it']
['@', 'virginamerica', 'seriously', 'would', 'pay', '$', '30', 'a', 'flight', 'for', 'seats', 'that', 'did', "n't", 'have', 'this', 'playing', '.', 'it', "'s", 'really', 'the', 'only', 'bad', 'thing', 'about', 'flying', 'va']


# Removing Stopwords
> Stopwords are words that you want to ignore, so you filter them out of your text when you’re processing it.
> Very common words like 'in', 'is', and 'an' are often used as stop words since they don’t add a lot of meaning to a text in and of themselves.


In [51]:
# Next step: remove the stopwords (words that do not carry semantic importance).

# Fetch NLTK's English stopwords and keep them in a set.
from nltk.corpus import stopwords
english_stopwords = stopwords.words('english')
sw_list = set(english_stopwords)
print(sw_list)

# The printed set contains no special characters or symbols,
# so it is extended in the next step.

{'don', 'y', 'to', 'some', 'most', 'its', 'all', "hadn't", "you'll", 'does', 'being', 'other', "won't", 'under', 'so', "hasn't", 'am', 'itself', 'shan', 'couldn', 'theirs', "that'll", 'just', 'won', 'herself', 'was', 'that', 'own', "doesn't", 'from', 'through', 'our', 'she', 'into', 'why', 'mightn', 'an', 'o', "don't", 'with', 've', 'before', 'in', 'hasn', "weren't", 'or', 'too', "mustn't", 'ourselves', 'during', 'haven', 'what', 'ma', 'by', 'hers', 'them', 'again', 'are', 'those', 'because', 'after', 'very', 'shouldn', 'is', 'were', 'such', 'me', "aren't", 'when', "isn't", 'out', 'now', 'at', 'for', 'they', 'over', 'above', "haven't", 'his', 'hadn', 'ain', 'weren', 'have', 'more', 'it', 'who', 'him', 'below', 'doing', 'than', 'has', 'which', 'themselves', 'do', 'the', 'needn', 'about', 'wasn', 'each', "needn't", 'your', 'how', 'while', 'nor', 'this', "should've", "she's", 'any', 'you', 'having', 'he', 'should', 'few', "you'd", 'll', 'my', 'whom', 'can', 'ours', 'mustn', 's', 'her', 'w

In [52]:
# Add punctuation marks and contraction fragments to the stopword set.
extra_tokens = {'@', "'", '.', '"', '/', '!', ',', "'ve", "...", "n't", '$', "'s"}
sw_list |= extra_tokens
print(sw_list)

{'don', 'y', 'to', 'some', 'most', 'its', '@', 'all', "hadn't", ',', "you'll", 'does', 'being', 'other', "won't", 'under', 'so', "hasn't", 'am', 'itself', 'shan', 'couldn', 'theirs', "that'll", 'just', 'won', 'herself', '!', 'was', 'that', 'own', "doesn't", 'from', 'through', 'our', 'she', 'into', 'why', 'mightn', 'an', 'o', "don't", 'with', 've', 'before', 'in', 'hasn', "weren't", 'or', 'too', "mustn't", 'ourselves', 'during', 'haven', 'what', 'ma', 'by', 'hers', 'them', '.', 'again', 'are', 'those', 'because', 'after', 'very', 'shouldn', 'is', 'were', 'such', 'me', "aren't", 'when', "isn't", 'out', 'now', 'at', 'for', 'they', 'over', 'above', '/', "haven't", 'his', 'hadn', 'ain', 'weren', 'have', 'more', 'it', 'who', 'him', 'below', 'doing', 'than', 'has', 'which', 'themselves', '"', 'do', 'the', 'needn', 'about', 'wasn', 'each', "needn't", 'your', 'how', 'while', 'nor', 'this', "should've", "she's", 'any', 'you', 'having', "'", 'he', 'should', 'few', "you'd", 'll', 'my', "n't", 'who

In [53]:
# Tokenize the lowercased tweets at index labels 0 through 2 into word-level tokens.
tokenized_data = [nltk.word_tokenize(tweet) for tweet in df.loc[:2, 'text'].str.lower()]
print(tokenized_data)

[['@', 'virginamerica', 'what', '@', 'dhepburn', 'said', '.'], ['@', 'virginamerica', 'plus', 'you', "'ve", 'added', 'commercials', 'to', 'the', 'experience', '...', 'tacky', '.'], ['@', 'virginamerica', 'i', 'did', "n't", 'today', '...', 'must', 'mean', 'i', 'need', 'to', 'take', 'another', 'trip', '!']]


In [54]:
# Drop every token that appears in the stopword set, one tweet at a time.
data = [
    [token for token in tokens if token not in sw_list]
    for tokens in tokenized_data
]
print(data)
# We are left with only semantically meaningful words.
# Depending on the output, more stopwords can be added to sw_list above.

[['virginamerica', 'dhepburn', 'said'], ['virginamerica', 'plus', 'added', 'commercials', 'experience', 'tacky'], ['virginamerica', 'today', 'must', 'mean', 'need', 'take', 'another', 'trip']]


# Sentence Tokenization

In [55]:
# Some tasks need the data tokenized into sentences instead of words.
# Naive sentence tokenization: lowercase one tweet and split it on '.'.
sample_tweet = df.loc[5, 'text']
sample_tweet.lower().split('.')

["@virginamerica seriously would pay $30 a flight for seats that didn't have this playing",
 "\nit's really the only bad thing about flying va"]

In [56]:
# Alternative: use the sentence tokenizer from NLTK on the same lowercased tweet.
lowered_sample = df.loc[5, 'text'].lower()
nltk.sent_tokenize(lowered_sample)

["@virginamerica seriously would pay $30 a flight for seats that didn't have this playing.",
 "it's really the only bad thing about flying va"]

# Stemming
Reduce words to their root, which is the core part of a word. Take note that the core part ('root') may not be a complete English word.

For example, the words “helping” and “helped” share the root “help.”

> NLTK provides several stemmers; two widely used ones are PorterStemmer() and SnowballStemmer(). The Snowball stemmer, which is also called Porter2, is an improvement on the original Porter stemmer.

In [57]:
from nltk.stem import PorterStemmer
p_stemmer = PorterStemmer()

# Stem three inflections of the same verb with the Porter stemmer.
for form in ('help', 'helped', 'helping'):
    print(p_stemmer.stem(form))

help
help
help


In [58]:
from nltk.stem import SnowballStemmer
s_stemmer = SnowballStemmer('english')

# Stem the same three inflections with the Snowball stemmer.
for form in ('help', 'helped', 'helping'):
    print(s_stemmer.stem(form))

help
help
help


In [59]:
# Take the lowercased tweets at index labels 0 through 100 as the subset on which
# the two stemmers are compared.
strings_for_stemming = df['text'].loc[:100].str.lower()
strings_for_stemming

0                    @virginamerica what @dhepburn said.
1      @virginamerica plus you've added commercials t...
2      @virginamerica i didn't today... must mean i n...
3      @virginamerica it's really aggressive to blast...
4      @virginamerica and it's a really big bad thing...
                             ...                        
96     @virginamerica i can't check in or add a bag. ...
97     @virginamerica - let 2 scanned in passengers l...
98     @virginamerica what is your phone number. i ca...
99     @virginamerica is anyone doing anything there ...
100    @virginamerica trying to add my boy prince to ...
Name: text, Length: 101, dtype: object

In [60]:
from nltk import word_tokenize

# Tokenize each tweet of the subset into its own list of word tokens.
words_in_each_string = [word_tokenize(tweet) for tweet in strings_for_stemming]

words_in_each_string

[['@', 'virginamerica', 'what', '@', 'dhepburn', 'said', '.'],
 ['@',
  'virginamerica',
  'plus',
  'you',
  "'ve",
  'added',
  'commercials',
  'to',
  'the',
  'experience',
  '...',
  'tacky',
  '.'],
 ['@',
  'virginamerica',
  'i',
  'did',
  "n't",
  'today',
  '...',
  'must',
  'mean',
  'i',
  'need',
  'to',
  'take',
  'another',
  'trip',
  '!'],
 ['@',
  'virginamerica',
  'it',
  "'s",
  'really',
  'aggressive',
  'to',
  'blast',
  'obnoxious',
  '``',
  'entertainment',
  "''",
  'in',
  'your',
  'guests',
  "'",
  'faces',
  '&',
  'amp',
  ';',
  'they',
  'have',
  'little',
  'recourse'],
 ['@',
  'virginamerica',
  'and',
  'it',
  "'s",
  'a',
  'really',
  'big',
  'bad',
  'thing',
  'about',
  'it'],
 ['@',
  'virginamerica',
  'seriously',
  'would',
  'pay',
  '$',
  '30',
  'a',
  'flight',
  'for',
  'seats',
  'that',
  'did',
  "n't",
  'have',
  'this',
  'playing',
  '.',
  'it',
  "'s",
  'really',
  'the',
  'only',
  'bad',
  'thing',
  'about',


In [61]:
# Three parallel lists: every token, its Porter stem, and its Snowball stem.
original_words_list, p_stemmed_list, s_stemmed_list = [], [], []

for tokens in words_in_each_string:
    print(tokens)
    original_words_list.extend(tokens)
    p_stemmed_list.extend(p_stemmer.stem(token) for token in tokens)
    s_stemmed_list.extend(s_stemmer.stem(token) for token in tokens)

['@', 'virginamerica', 'what', '@', 'dhepburn', 'said', '.']
['@', 'virginamerica', 'plus', 'you', "'ve", 'added', 'commercials', 'to', 'the', 'experience', '...', 'tacky', '.']
['@', 'virginamerica', 'i', 'did', "n't", 'today', '...', 'must', 'mean', 'i', 'need', 'to', 'take', 'another', 'trip', '!']
['@', 'virginamerica', 'it', "'s", 'really', 'aggressive', 'to', 'blast', 'obnoxious', '``', 'entertainment', "''", 'in', 'your', 'guests', "'", 'faces', '&', 'amp', ';', 'they', 'have', 'little', 'recourse']
['@', 'virginamerica', 'and', 'it', "'s", 'a', 'really', 'big', 'bad', 'thing', 'about', 'it']
['@', 'virginamerica', 'seriously', 'would', 'pay', '$', '30', 'a', 'flight', 'for', 'seats', 'that', 'did', "n't", 'have', 'this', 'playing', '.', 'it', "'s", 'really', 'the', 'only', 'bad', 'thing', 'about', 'flying', 'va']
['@', 'virginamerica', 'yes', ',', 'nearly', 'every', 'time', 'i', 'fly', 'vx', 'this', '“', 'ear', 'worm', '”', 'won', '’', 't', 'go', 'away', ':', ')']
['@', 'virgin

In [62]:
# Put the original tokens and both stemmed versions side by side, one row per token.
data = pd.DataFrame({
    'Original Word': original_words_list,
    'Porter_Stemming': p_stemmed_list,
    'Snowball_Stemming': s_stemmed_list,
})

data

Unnamed: 0,Original Word,Porter_Stemming,Snowball_Stemming
0,@,@,@
1,virginamerica,virginamerica,virginamerica
2,what,what,what
3,@,@,@
4,dhepburn,dhepburn,dhepburn
...,...,...,...
2061,from,from,from
2062,lax,lax,lax
2063,http,http,http
2064,:,:,:


In [63]:
# Keep only the unique words by dropping repeated rows.
data = data.drop_duplicates()
data
# In the original run, only 672 words remained in the 'data' dataframe at this point.

Unnamed: 0,Original Word,Porter_Stemming,Snowball_Stemming
0,@,@,@
1,virginamerica,virginamerica,virginamerica
2,what,what,what
4,dhepburn,dhepburn,dhepburn
5,said,said,said
...,...,...,...
2051,prince,princ,princ
2054,ressie,ressi,ressi
2056,sf,sf,sf
2058,thursday,thursday,thursday


In [64]:
# Show only the words on which the two stemmers give different results.
stems_differ = data['Porter_Stemming'].ne(data['Snowball_Stemming'])
data[stems_differ]

# In this output the Snowball stems look more accurate.
# Also note that the root of some words is not a complete English word.

Unnamed: 0,Original Word,Porter_Stemming,Snowball_Stemming
9,plus,plu,plus
11,'ve,'ve,ve
87,this,thi,this
102,yes,ye,yes
104,nearly,nearli,near
137,https,http,https
155,was,wa,was
165,'re,'re,re
351,amazingly,amazingli,amaz
428,bos,bo,bos


# Lemmatization
Stemming is a process that removes the last few characters from a word, which often produces strings with an incorrect meaning or spelling. Lemmatization considers the context (such as the word's part of speech) and converts the word to its meaningful base form, which is called the lemma.

> For instance, stemming the word 'amazed' would return 'amaz'. However, lemmatizing the word 'amazed' would give 'amaze'.

In [65]:
# Stem 'amazed' with the Porter stemmer first, then with the Snowball stemmer.
for stemmer in (p_stemmer, s_stemmer):
    print(stemmer.stem('amazed'))

amaz
amaz


In [66]:
# nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
# Lemmatize without giving a part of speech, so the lemmatizer's default is used.
sample_word = 'amazed'
lemmatizer.lemmatize(sample_word)

'amazed'

In [32]:
# Passing the part of speech changes the result; compare with the call that omits `pos`.
verb_pos = 'v'  # 'v' stands for verb
lemmatizer.lemmatize('amazed', pos=verb_pos)

# Lemmatization needs the part of speech to generate the correct lemma.
# The default pos is 'n' (noun).

'amaze'

# Tagging Parts of Speech (PoS)

In [67]:
import nltk

# Pick one tweet (original casing kept) and print it.
text = df.at[5, 'text']
print(text)

# Split the tweet into word tokens.
tokens_list = nltk.word_tokenize(text)
tokens_list

@VirginAmerica seriously would pay $30 a flight for seats that didn't have this playing.
it's really the only bad thing about flying VA


['@',
 'VirginAmerica',
 'seriously',
 'would',
 'pay',
 '$',
 '30',
 'a',
 'flight',
 'for',
 'seats',
 'that',
 'did',
 "n't",
 'have',
 'this',
 'playing',
 '.',
 'it',
 "'s",
 'really',
 'the',
 'only',
 'bad',
 'thing',
 'about',
 'flying',
 'VA']

In [69]:
# nltk.download('averaged_perceptron_tagger')
# Remove stopwords before tagging. The tokens keep their original casing while the
# stopword set is lowercase, so the comparison is done on the lowercased token;
# otherwise capitalized stopwords such as "It" or "The" would slip through the filter.
tokens_after_SW = [word for word in tokens_list if word.lower() not in sw_list]

# NOTE(review): the tagger only sees the tokens that survived the filter; confirm that
# tagging the filtered list (rather than the full sentence) is intended.
tagged_tokens = nltk.pos_tag(tokens_after_SW)  # Tag each remaining word with its PoS
tagged_tokens

[('VirginAmerica', 'NNP'),
 ('seriously', 'RB'),
 ('would', 'MD'),
 ('pay', 'VB'),
 ('30', 'CD'),
 ('flight', 'NN'),
 ('seats', 'NNS'),
 ('playing', 'VBG'),
 ('really', 'RB'),
 ('bad', 'JJ'),
 ('thing', 'NN'),
 ('flying', 'VBG'),
 ('VA', 'NNP')]

In [70]:
# Print NLTK's reference for the tag set used above: each tag with its meaning and examples.
# NOTE(review): the output is long; consider collapsing or clearing it before sharing the notebook.
nltk.help.upenn_tagset()

$: dollar
    $ -$ --$ A$ C$ HK$ M$ NZ$ S$ U.S.$ US$
'': closing quotation mark
    ' ''
(: opening parenthesis
    ( [ {
): closing parenthesis
    ) ] }
,: comma
    ,
--: dash
    --
.: sentence terminator
    . ! ?
:: colon or ellipsis
    : ; ...
CC: conjunction, coordinating
    & 'n and both but either et for less minus neither nor or plus so
    therefore times v. versus vs. whether yet
CD: numeral, cardinal
    mid-1890 nine-thirty forty-two one-tenth ten million 0.5 one forty-
    seven 1987 twenty '79 zero two 78-degrees eighty-four IX '60s .025
    fifteen 271,124 dozen quintillion DM2,000 ...
DT: determiner
    all an another any both del each either every half la many much nary
    neither no some such that the them these this those
EX: existential there
    there
FW: foreign word
    gemeinschaft hund ich jeux habeas Haementeria Herr K'ang-si vous
    lutihaw alai je jour objets salutaris fille quibusdam pas trop Monte
    terram fiche oui corporis ...
IN: preposition or

FYI, NLTK has another Tagger: WordNet Tagger. You can explore it in your free time.

In [71]:
# Define a function to get the POS tag for lemmatization
def get_pos(tag):
    """Map a PoS tag to the single-letter pos code passed to the lemmatizer.

    The decision is made on the tag's first letter:
    'J' -> 'a' (adjective), 'V' -> 'v' (verb), 'N' -> 'n' (noun), 'R' -> 'r' (adverb).
    Any other tag falls back to 'n'.
    """
    prefix_to_pos = (('J', 'a'), ('V', 'v'), ('N', 'n'), ('R', 'r'))
    for prefix, pos in prefix_to_pos:
        if tag.startswith(prefix):
            return pos
    # Unrecognized tags are treated as nouns.
    return 'n'

In [72]:
# Lemmatize the sample tweet token by token (stopwords were already removed).
# Each token is lowercased and lemmatized using the pos code derived from its tag.
for token, tag in tagged_tokens:
    wordnet_pos = get_pos(tag)
    lemma = lemmatizer.lemmatize(token.lower(), pos=wordnet_pos)
    print("Word:", token, "\nLemmatized Word:", lemma)
    print('***********************')

Word: VirginAmerica 
Lemmatized Word: virginamerica
***********************
Word: seriously 
Lemmatized Word: seriously
***********************
Word: would 
Lemmatized Word: would
***********************
Word: pay 
Lemmatized Word: pay
***********************
Word: 30 
Lemmatized Word: 30
***********************
Word: flight 
Lemmatized Word: flight
***********************
Word: seats 
Lemmatized Word: seat
***********************
Word: playing 
Lemmatized Word: play
***********************
Word: really 
Lemmatized Word: really
***********************
Word: bad 
Lemmatized Word: bad
***********************
Word: thing 
Lemmatized Word: thing
***********************
Word: flying 
Lemmatized Word: fly
***********************
Word: VA 
Lemmatized Word: va
***********************
