In [2]:
import numpy as np
import re
import matplotlib.pyplot as plt
%matplotlib inline
import pandas as pd 

In [3]:
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 5000)

In [4]:
def readdf(path):
    any_df = pd.read_excel(path)
    return any_df

def writedf2xlsx(any_df, name):
    from pandas import ExcelWriter
    writer = ExcelWriter(name + '.xlsx')
    any_df.to_excel(writer,'Sheet1')
    writer.save()

In [5]:
text1 = "For whomsoever shall hold this hammer, if he shall be worthy shall possess the Power of Thor" 
text2 = "Did you do it ? yeS"
text3 = "What dId it coSt ? EveryThing"
text4 = text2 +' '+ text3

In [6]:
print(text1, text2, text3, text4, sep="\n")

For whomsoever shall hold this hammer, if he shall be worthy shall possess the Power of Thor
Did you do it ? yeS
What dId it coSt ? EveryThing
Did you do it ? yeS What dId it coSt ? EveryThing


In [7]:
print(len(text1), len(text2), len(text3), len(text4), sep="\n")

92
19
29
49


### Split the sentence in words - How ? 
### Split them by a delimiter. CSV(comma separated value) files are read by discarding the *commas* in between data, same way these words are separated by whitespace.

https://nlp.stanford.edu/IR-book/html/htmledition/tokenization-1.html

In [8]:
text1a = text1.split(' ')
text4a = text4.split(' ')
print(len(text1a), len(text4a))
print(text1a, text4a, sep='\n')

17 12
['For', 'whomsoever', 'shall', 'hold', 'this', 'hammer,', 'if', 'he', 'shall', 'be', 'worthy', 'shall', 'possess', 'the', 'Power', 'of', 'Thor']
['Did', 'you', 'do', 'it', '?', 'yeS', 'What', 'dId', 'it', 'coSt', '?', 'EveryThing']


### Now the words are tokenized(chopping sentence into pieces/tokens). This O/P is then fed as I/P for Parsing or Text Mining

# String Operations

## Remember List Comprehension from C1 - Iterate through collections in one line

In [9]:
# Long words: Words that have more than 3 characters
[w for w in text1a if len(w)>3] 
# Long words: Words that have less than 3 characters
[w for w in text1a if len(w)<3] 

['if', 'he', 'be', 'of']

In [10]:
print([w for w in text1a if w.istitle()])              # Words that have 1st character in Capital
print([w for w in text1a if w.endswith('r')])          # Words that end with character 'r'

['For', 'Power', 'Thor']
['For', 'whomsoever', 'Power', 'Thor']


### unique words in a list of words using set() function

In [11]:
print("Words that are unique without preprocessing (1 '?', 1 'it', 2 'Did & dId'")
print(set(text4a))
print("\nWords that are unique with preprocessing (1 '?', 1 'it', 1 'did'")
print(set([w.lower() for w in text4a]))

Words that are unique without preprocessing (1 '?', 1 'it', 2 'Did & dId'
{'coSt', 'What', 'Did', 'EveryThing', '?', 'you', 'it', 'dId', 'do', 'yeS'}

Words that are unique with preprocessing (1 '?', 1 'it', 1 'did'
{'what', 'cost', '?', 'you', 'it', 'everything', 'yes', 'do', 'did'}


### Convert all words to lowercase || uppercase || title

In [12]:
print('Uppercase : ', [w.upper() for w in text4a], '\n') # Convert all words to lowercase
print('Lowercase : ', [w.lower() for w in text4a], '\n') # Convert all words to uppercase
print('Title     : ', [w.title() for w in text4a], '\n') # Convert all words to title(1st letter caps)

Uppercase :  ['DID', 'YOU', 'DO', 'IT', '?', 'YES', 'WHAT', 'DID', 'IT', 'COST', '?', 'EVERYTHING'] 

Lowercase :  ['did', 'you', 'do', 'it', '?', 'yes', 'what', 'did', 'it', 'cost', '?', 'everything'] 

Title     :  ['Did', 'You', 'Do', 'It', '?', 'Yes', 'What', 'Did', 'It', 'Cost', '?', 'Everything'] 



### Splitting a sentence based a word(stopper word/connectors)

In [13]:
print(text1)
print(text1.split('shall'), '\n')  

For whomsoever shall hold this hammer, if he shall be worthy shall possess the Power of Thor
['For whomsoever ', ' hold this hammer, if he ', ' be worthy ', ' possess the Power of Thor'] 



### Merging 3 sentences with a Connector (and/will/hence etc) using .join() function

In [14]:
print('shall'.join(text1.split('shall')))
print('will'.join(text1.split('shall')))
print('should'.join(text1.split('shall')))

For whomsoever shall hold this hammer, if he shall be worthy shall possess the Power of Thor
For whomsoever will hold this hammer, if he will be worthy will possess the Power of Thor
For whomsoever should hold this hammer, if he should be worthy should possess the Power of Thor


In [15]:
temp = "ouagadougou"
print("split a string - ouagadougou(eg:sentence/word) into substring - ou(eg:words):")
print("ouagadougou".split('ou')) 

split a string - ouagadougou(eg:sentence/word) into substring - ou(eg:words):
['', 'agad', 'g', '']


In [16]:
print("\nsplit a substring - ouagadougou(eg:word) into separate characters:")
print("Method 1", list("ouagadougou"))       
print("Method 2", [c for c in "ouagadougou"])


split a substring - ouagadougou(eg:word) into separate characters:
Method 1 ['o', 'u', 'a', 'g', 'a', 'd', 'o', 'u', 'g', 'o', 'u']
Method 2 ['o', 'u', 'a', 'g', 'a', 'd', 'o', 'u', 'g', 'o', 'u']


In [17]:
"ouagadougou".split('ou')# Error         --> split a word into character

['', 'agad', 'g', '']

In [18]:
textnick = "      See, it's things like this that give me trust issues.  "
print(textnick.split(' '))

['', '', '', '', '', '', 'See,', "it's", 'things', 'like', 'this', 'that', 'give', 'me', 'trust', 'issues.', '', '']


### Note Below, the disappearance of the whitespaces at the beginning and end of the string

In [19]:
textnick = "      See, it's things like this that give me trust issues.  "
print(textnick.strip().split(' '))

['See,', "it's", 'things', 'like', 'this', 'that', 'give', 'me', 'trust', 'issues.']


## Find first occurance of a substring/character in a String/Sentence
#### To find all occurances, we can use *Regular Expressions* re.find_iter function 
https://docs.python.org/2/library/re.html#module-contents

In [20]:
Movies = "Marvel - Hulk, Marvel - Thor, Marvel - Ironman, Marvel - Captain America, Marvel - Avengers"
print("INDEX of First occurance of the substring Marvel in the sentence Movies from the start", Movies.find('Marvel'))
print("\nINDEX of First occurance of the substring Marvel in the sentence Movies from the end", Movies.rfind('Marvel'))
print("\nAll occurances of Marvel in the sentence is replaced by DC:\n", Movies.replace('Marvel', 'DC'))

INDEX of First occurance of the substring Marvel in the sentence Movies from the start 0

INDEX of First occurance of the substring Marvel in the sentence Movies from the end 74

All occurances of Marvel in the sentence is replaced by DC:
 DC - Hulk, DC - Thor, DC - Ironman, DC - Captain America, DC - Avengers


# Handling larger Texts
## Reading files by line

In [21]:
f = open('Data/UNHDR.txt', 'r')
EOL_space = f.readline()
EOL_space

'Universal Declaration of Human Rights\n'

In [22]:
EOL_space.rstrip()  # To remove the \n character from the end of the line use strip() and its add on versions 

'Universal Declaration of Human Rights'

## Reading the full file

In [23]:
f = open('Data/UNHDR.txt', 'r')
f.seek(0) # Reset the reading pointer to the start
HDRUN = f.read()
print("There are {} sentences in this file".format(len(HDRUN.splitlines())))
HDRUN.splitlines()

There are 210 sentences in this file


['Universal Declaration of Human Rights',
 'Preamble',
 'Whereas recognition of the inherent dignity and of the equal and inalienable',
 'rights of all members of the human family is the foundation of freedom, justice',
 'and peace in the world,',
 'Whereas disregard and contempt for human rights have resulted in barbarous',
 'acts which have outraged the conscience of mankind, and the advent of a world',
 'in which human beings shall enjoy freedom of speech and belief and freedom',
 'from fear and want has been proclaimed as the highest aspiration of the common',
 'people,',
 'Whereas it is essential, if man is not to be compelled to have recourse, as a last',
 'resort, to rebellion against tyranny and oppression, that human rights should be',
 'protected by the rule of law,',
 'Whereas it is essential to promote the development of friendly relations between',
 'nations,',
 'Whereas the peoples of the United Nations have in the Charter reaffirmed their',
 'faith in fundamental human r

## File Operations

In [25]:
# f = open('/Data/filename.txt', mode='r' or 'w') # Open File with said filename in read | write mode
# f.readline() | f.read() | f.read(n)       # 
# for line in f:
#     doSomething(line)
# f.seek(n)
# f.write(message)
# f.close()
# f.closed()

# Most Annoying part of Text preprocessing - Regular Expressions [Regex]
## But also the most important part

###  Python Regular Expressions Made Easy (2017) -->  https://www.youtube.com/playlist?list=PLGKQkV4guDKH1TpfM-FvPGLUyjsPGdXOg

In [26]:
text5 = '"Ethics are built right into the ideals and objectives of the United Nations" \
#UNSG @ NY Society for Ethical Culture bit.ly/2guVelr @UN @UN_Women'
text5a = text5.split(' ')
print(text5a)

['"Ethics', 'are', 'built', 'right', 'into', 'the', 'ideals', 'and', 'objectives', 'of', 'the', 'United', 'Nations"', '#UNSG', '@', 'NY', 'Society', 'for', 'Ethical', 'Culture', 'bit.ly/2guVelr', '@UN', '@UN_Women']


### Extract elements with special symbols (meanings) @ and # are used in tweets

In [27]:
print([w for w in text5a if w.startswith('#')])
print([w for w in text5a if w.startswith('@')])

['#UNSG']
['@', '@UN', '@UN_Women']


In [28]:
import re
[w for w in text5a if re.search('@[A-Za-z0-9_]+', w)]

['@UN', '@UN_Women']

### Meta Characters in Regular Expressions

```
 .       --> Wilcard Character that matches a single character
 ^       --> Start of the String
 $       --> End of the String
 []      --> Matches one of the charaters, that are within the square brackets 
 [a-z]   --> Matches one of the range of characters a,b,c,d,e,f,g......x,y,z
 [^abc]  --> Matches any character except a,b,c - Exclude a,b,c
 [a|b]   --> Matches either a or b, where a & b are strings
 ()      --> Scoping for Operators - just normal use
 \       --> Escape character for special characters (\n,\t,\b,\d, \D)

 Meta Characters: Character Symbols
 \b      --> Matches Word boundary
 \d      --> Any digit, equivalent to [0-9]
 \D      --> Any non-digit, equivalent to [^0-9]
 \s      --> Any whitespace, equivalent to [\t\n\r\f\v]
 \S      --> Any non-whitespace, equivalent to [^\t\n\r\f\v]
 \w      --> Any alphanumeric Character, equivalent to [a-zA-Z0-9_]
 \W      --> Any non-alphanumeric Character, equivalent to [^a-zA-Z0-9_]
    
 Meta Characters : Repetitions
 *       --> Matches zero or more occurences
 +       --> Matches one or more occurences
 ?       --> Matches zero or one occurences
 {n}     --> Matches exactly in repetitions, n>=0
 {n,0}   --> Matches Atleast n repetition    
 {0,n}   --> Matches Atmost n repetition
 {m,n}   --> Atleast Atleast m and Atmost n repetitions  
```

In [29]:
[w for w in text5a if re.search('@\w+', w)]

['@UN', '@UN_Women']

### Find (Ctr + F) all vowels

In [30]:
temp = "ouagadougou"
re.findall(r'[aeiou]', temp)

['o', 'u', 'a', 'a', 'o', 'u', 'o', 'u']

### Find (Ctr + F) all consonants 

In [31]:
re.findall(r'[^aeiou]', temp)

['g', 'd', 'g']

## Date Variations 
01-11-2018
01/11/2018
01/11/18
11/01/2018
11 Nov 2018
11 November 2018
Nov 11, 2018
November 11, 2018

In [32]:
# Regular Expression for Dates
dateStr = '01-11-2018\n01/11/2018\n01/11/18\n11/01/2018\n11 Nov 2018\n11 November 2018\nNov 11, 2018\nNovember 11, 2018\n'
print(re.findall(r'\d{2}[/-]\d{2}[/-]\d{4}', dateStr))
print(re.findall(r'\d{2}[/-]\d{2}[/-]\d{2,4}', dateStr))
print(re.findall(r'\d{1,2}[/-]\d{1,2}[/-]\d{2,4}', dateStr))

['01-11-2018', '01/11/2018', '11/01/2018']
['01-11-2018', '01/11/2018', '01/11/18', '11/01/2018']
['01-11-2018', '01/11/2018', '01/11/18', '11/01/2018']


In [33]:
print(re.findall(r'\d{2} (Jan|Feb|Mar|Apr|May|Jun|July|Aug|Sep|Oct|Nov|Dec) \d{4}', dateStr))
print(re.findall(r'\d{2} (?:Jan|Feb|Mar|Apr|May|Jun|July|Aug|Sep|Oct|Nov|Dec) \d{4}', dateStr))
print(re.findall(r'\d{2} (?:Jan|Feb|Mar|Apr|May|Jun|July|Aug|Sep|Oct|Nov|Dec)[a-z]* \d{4}', dateStr))
print(re.findall(r'(?:\d{2}? )?(?:Jan|Feb|Mar|Apr|May|Jun|July|Aug|Sep|Oct|Nov|Dec)[a-z]* (?:\d{2}, )?\d{4}', dateStr))
print(re.findall(r'(?:\d{1,2}? )?(?:Jan|Feb|Mar|Apr|May|Jun|July|Aug|Sep|Oct|Nov|Dec)[a-z]* (?:\d{1,2}, )?\d{2,4}', dateStr))

['Nov']
['11 Nov 2018']
['11 Nov 2018', '11 November 2018']
['11 Nov 2018', '11 November 2018', 'Nov 11, 2018', 'November 11, 2018']
['11 Nov 2018', '11 November 2018', 'Nov 11, 2018', 'November 11, 2018']


In [34]:
import pandas as pd
time_sentences = ["Monday: The doctor's appointment is at 2:45pm.", 
                  "Tuesday: The dentist's appointment is at 11:30 am.",
                  "Wednesday: At 7:00pm, there is a basketball game!",
                  "Thursday: Be back home by 11:15 pm at the latest.",
                  "Friday: Take the train at 08:10 am, arrive at 09:00am."]

df = pd.DataFrame(time_sentences, columns=['text'])
df

Unnamed: 0,text
0,Monday: The doctor's appointment is at 2:45pm.
1,Tuesday: The dentist's appointment is at 11:30...
2,"Wednesday: At 7:00pm, there is a basketball game!"
3,Thursday: Be back home by 11:15 pm at the latest.
4,"Friday: Take the train at 08:10 am, arrive at ..."


# To Perform String Operations on Series/Dataframe - .str is used

In [35]:
df['text'].str.len()

0    46
1    50
2    49
3    49
4    54
Name: text, dtype: int64

In [36]:
df['text'].str.split().str.len()

0     7
1     8
2     8
3    10
4    10
Name: text, dtype: int64

In [37]:
print("\nColumns with Word Appointment : \n",df['text'].str.contains('appointment'))
print("\nColumns with Word day : \n",df['text'].str.contains('day'))
print("\nColumns with timestamp at night - pm : \n",df['text'].str.contains('pm'))


Columns with Word Appointment : 
 0     True
1     True
2    False
3    False
4    False
Name: text, dtype: bool

Columns with Word day : 
 0    True
1    True
2    True
3    True
4    True
Name: text, dtype: bool

Columns with timestamp at night - pm : 
 0     True
1    False
2     True
3     True
4    False
Name: text, dtype: bool


#### Find count of digits in the data 

In [38]:
# How many times a digit occurs in every sentence
df['text'].str.count(r'\d')

0    3
1    4
2    3
3    4
4    8
Name: text, dtype: int64

#### Find all unique digits in the data 

In [39]:
df['text'].str.findall(r'\d')

0                   [2, 4, 5]
1                [1, 1, 3, 0]
2                   [7, 0, 0]
3                [1, 1, 1, 5]
4    [0, 8, 1, 0, 0, 9, 0, 0]
Name: text, dtype: object

#### Find time values in the form of digits in the data 

In [40]:
df['text'].str.findall(r'(\d?\d):(\d\d)')

0               [(2, 45)]
1              [(11, 30)]
2               [(7, 00)]
3              [(11, 15)]
4    [(08, 10), (09, 00)]
Name: text, dtype: object

In [41]:
df['text'].str.replace(r'\w+day\b', ':-)   ')

0       :-)   : The doctor's appointment is at 2:45pm.
1    :-)   : The dentist's appointment is at 11:30 am.
2       :-)   : At 7:00pm, there is a basketball game!
3      :-)   : Be back home by 11:15 pm at the latest.
4    :-)   : Take the train at 08:10 am, arrive at ...
Name: text, dtype: object

In [42]:
df['text'].str.replace(r'(\w+day\b)', lambda x: x.group()[0][:3])

0            M: The doctor's appointment is at 2:45pm.
1         T: The dentist's appointment is at 11:30 am.
2            W: At 7:00pm, there is a basketball game!
3           T: Be back home by 11:15 pm at the latest.
4    F: Take the train at 08:10 am, arrive at 09:00am.
Name: text, dtype: object

In [43]:
df['text'].str.extract(r'(\d?\d):(\d\d)')

  """Entry point for launching an IPython kernel.


Unnamed: 0,0,1
0,2,45
1,11,30
2,7,0
3,11,15
4,8,10


In [44]:
df['text'].str.extractall(r'((\d?\d):(\d\d) ?([ap]m))')

Unnamed: 0_level_0,Unnamed: 1_level_0,0,1,2,3
Unnamed: 0_level_1,match,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0,2:45pm,2,45,pm
1,0,11:30 am,11,30,am
2,0,7:00pm,7,0,pm
3,0,11:15 pm,11,15,pm
4,0,08:10 am,8,10,am
4,1,09:00am,9,0,am


## Name groups in str

In [45]:
df['text'].str.extractall(r'(?P<Time>(?P<Hour>\d?\d):(?P<Minute>\d\d) ?(?P<Period>[ap]m))')

Unnamed: 0_level_0,Unnamed: 1_level_0,Time,Hour,Minute,Period
Unnamed: 0_level_1,match,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0,2:45pm,2,45,pm
1,0,11:30 am,11,30,am
2,0,7:00pm,7,0,pm
3,0,11:15 pm,11,15,pm
4,0,08:10 am,8,10,am
4,1,09:00am,9,0,am


# NLTK - Natural Language Processing Toolkit

In [46]:
import nltk

In [47]:
#nltk.download()

In [48]:
print(dir(nltk))

['AbstractLazySequence', 'AffixTagger', 'AlignedSent', 'Alignment', 'AnnotationTask', 'ApplicationExpression', 'Assignment', 'BigramAssocMeasures', 'BigramCollocationFinder', 'BigramTagger', 'BinaryMaxentFeatureEncoding', 'BlanklineTokenizer', 'BllipParser', 'BottomUpChartParser', 'BottomUpLeftCornerChartParser', 'BottomUpProbabilisticChartParser', 'Boxer', 'BrillTagger', 'BrillTaggerTrainer', 'CFG', 'CRFTagger', 'CfgReadingCommand', 'ChartParser', 'ChunkParserI', 'ChunkScore', 'ClassifierBasedPOSTagger', 'ClassifierBasedTagger', 'ClassifierI', 'ConcordanceIndex', 'ConditionalExponentialClassifier', 'ConditionalFreqDist', 'ConditionalProbDist', 'ConditionalProbDistI', 'ConfusionMatrix', 'ContextIndex', 'ContextTagger', 'ContingencyMeasures', 'CoreNLPDependencyParser', 'CoreNLPParser', 'Counter', 'CrossValidationProbDist', 'DRS', 'DecisionTreeClassifier', 'DefaultTagger', 'DependencyEvaluator', 'DependencyGrammar', 'DependencyGraph', 'DependencyProduction', 'DictionaryConditionalProbDis

In [49]:
from nltk.book import *

*** Introductory Examples for the NLTK Book ***
Loading text1, ..., text9 and sent1, ..., sent9
Type the name of the text or sentence to view it.
Type: 'texts()' or 'sents()' to list the materials.
text1: Moby Dick by Herman Melville 1851
text2: Sense and Sensibility by Jane Austen 1811
text3: The Book of Genesis
text4: Inaugural Address Corpus
text5: Chat Corpus
text6: Monty Python and the Holy Grail
text7: Wall Street Journal
text8: Personals Corpus
text9: The Man Who Was Thursday by G . K . Chesterton 1908


#### Counting the vocabulary of words

In [50]:
text7

<Text: Wall Street Journal>

In [51]:
sents()

sent1: Call me Ishmael .
sent2: The family of Dashwood had long been settled in Sussex .
sent3: In the beginning God created the heaven and the earth .
sent4: Fellow - Citizens of the Senate and of the House of Representatives :
sent5: I have a problem with people PMing me to lol JOIN
sent6: SCENE 1 : [ wind ] [ clop clop clop ] KING ARTHUR : Whoa there !
sent7: Pierre Vinken , 61 years old , will join the board as a nonexecutive director Nov. 29 .
sent8: 25 SEXY MALE , seeks attrac older single lady , for discreet encounters .
sent9: THE suburb of Saffron Park lay on the sunset side of London , as red and ragged as a cloud of sunset .


In [52]:
print(sent7)

['Pierre', 'Vinken', ',', '61', 'years', 'old', ',', 'will', 'join', 'the', 'board', 'as', 'a', 'nonexecutive', 'director', 'Nov.', '29', '.']


In [53]:
len(sent7)

18

In [54]:
len(text7)

100676

In [55]:
len(set(text7))

12408

In [56]:
len(set([w.lower() for w in text7]))

11387

In [57]:
print(list(set(text7))[:10])

['installing', 'crossed', 'elected', 'Jefferson', 'syndrome', 'Traded', 'Family', 'advertised', 'Spiro', 'discos']


### Frequency of Words

In [58]:
dist = FreqDist(text7)
print(len(dist))
print(dist)

12408
<FreqDist with 12408 samples and 100676 outcomes>


In [59]:
dist.items()



In [60]:
#df=pd.DataFrame.from_items(dist.items(),columns=['col1','col2'])

In [61]:
type(dist.items())

dict_items

In [62]:
vocab = dist.keys()
list(vocab)[:10]

['Pierre', 'Vinken', ',', '61', 'years', 'old', 'will', 'join', 'the', 'board']

In [63]:
dist['four']

20

#### How many times a large word occurs in the text corpus

In [64]:
freqwords = [w for w in vocab if len(w)>5 and dist[w]>100]
freqwords

['billion',
 'company',
 'president',
 'because',
 'market',
 'million',
 'shares',
 'trading',
 'program']

## Stemming and Lemmatization 
#### https://www.youtube.com/watch?v=p1ccbR2P_xA

### Normalization and Stemming

In [65]:
input1 = "List listed lists listing listings"
words1 = input1.lower().split(' ')
words1

['list', 'listed', 'lists', 'listing', 'listings']

In [66]:
porter = nltk.PorterStemmer()
[porter.stem(t) for t in words1]

['list', 'list', 'list', 'list', 'list']

In [67]:
input2 = "Trouble troubling troubled troubler"
words2 = input2.lower().split(' ')
words2

['trouble', 'troubling', 'troubled', 'troubler']

In [68]:
porter = nltk.PorterStemmer()
[porter.stem(t) for t in words2]

['troubl', 'troubl', 'troubl', 'troubler']

In [69]:
lancast = nltk.LancasterStemmer()
[lancast.stem(t) for t in words2]

['troubl', 'troubl', 'troubl', 'troubl']

### Lemmatization

In [70]:
udhr = nltk.corpus.udhr.words('English-Latin1')
print(udhr[:20])

['Universal', 'Declaration', 'of', 'Human', 'Rights', 'Preamble', 'Whereas', 'recognition', 'of', 'the', 'inherent', 'dignity', 'and', 'of', 'the', 'equal', 'and', 'inalienable', 'rights', 'of']


In [71]:
WNlemma = nltk.WordNetLemmatizer()
print([WNlemma.lemmatize(t) for t in udhr[:20]])

['Universal', 'Declaration', 'of', 'Human', 'Rights', 'Preamble', 'Whereas', 'recognition', 'of', 'the', 'inherent', 'dignity', 'and', 'of', 'the', 'equal', 'and', 'inalienable', 'right', 'of']


### Tokenization

In [72]:
text11 = "Children shouldn't drink a sugary drink before bed."
text11.split(' ')

['Children', "shouldn't", 'drink', 'a', 'sugary', 'drink', 'before', 'bed.']

#### Note that not & . are separate tokens  (inorder to account for negations in the text)

In [73]:
print(nltk.word_tokenize(text11))

['Children', 'should', "n't", 'drink', 'a', 'sugary', 'drink', 'before', 'bed', '.']


#### Note that not all . are end of sentence like in "U.S." and "2.99." 

In [74]:
text12 = "This is the first sentence. A gallon of milk in the U.S. costs $2.99. Is this the third sentence? Yes, it is!"
sentences = nltk.sent_tokenize(text12)
print("There are {} sentences in the above document|text corpus".format(len(sentences)))
sentences

There are 4 sentences in the above document|text corpus


['This is the first sentence.',
 'A gallon of milk in the U.S. costs $2.99.',
 'Is this the third sentence?',
 'Yes, it is!']

## Advanced NLP taks with NLTK
### POS (Parts-of-Speech)Tagging 

In [75]:
from nltk.help import upenn_tagset
dir(upenn_tagset)

['__annotations__',
 '__call__',
 '__class__',
 '__closure__',
 '__code__',
 '__defaults__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__get__',
 '__getattribute__',
 '__globals__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__kwdefaults__',
 '__le__',
 '__lt__',
 '__module__',
 '__name__',
 '__ne__',
 '__new__',
 '__qualname__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__']

In [76]:
nltk.help.upenn_tagset(tagpattern='MD')

MD: modal auxiliary
    can cannot could couldn't dare may might must need ought shall should
    shouldn't will would


In [77]:
nltk.help.upenn_tagset(tagpattern='V*')

$: dollar
    $ -$ --$ A$ C$ HK$ M$ NZ$ S$ U.S.$ US$
'': closing quotation mark
    ' ''
(: opening parenthesis
    ( [ {
): closing parenthesis
    ) ] }
,: comma
    ,
--: dash
    --
.: sentence terminator
    . ! ?
:: colon or ellipsis
    : ; ...
CC: conjunction, coordinating
    & 'n and both but either et for less minus neither nor or plus so
    therefore times v. versus vs. whether yet
CD: numeral, cardinal
    mid-1890 nine-thirty forty-two one-tenth ten million 0.5 one forty-
    seven 1987 twenty '79 zero two 78-degrees eighty-four IX '60s .025
    fifteen 271,124 dozen quintillion DM2,000 ...
DT: determiner
    all an another any both del each either every half la many much nary
    neither no some such that the them these this those
EX: existential there
    there
FW: foreign word
    gemeinschaft hund ich jeux habeas Haementeria Herr K'ang-si vous
    lutihaw alai je jour objets salutaris fille quibusdam pas trop Monte
    terram fiche oui corporis ...
IN: preposition or

In [78]:
text11 = "Children shouldn't drink a sugary drink before bed."
nltk.pos_tag(nltk.word_tokenize(text11))

[('Children', 'NNP'),
 ('should', 'MD'),
 ("n't", 'RB'),
 ('drink', 'VB'),
 ('a', 'DT'),
 ('sugary', 'JJ'),
 ('drink', 'NN'),
 ('before', 'IN'),
 ('bed', 'NN'),
 ('.', '.')]

In [79]:
nltk.help.upenn_tagset(tagpattern='NNP')
nltk.help.upenn_tagset(tagpattern='RB')
nltk.help.upenn_tagset(tagpattern='VB')
nltk.help.upenn_tagset(tagpattern='DT')
nltk.help.upenn_tagset(tagpattern='JJ')
nltk.help.upenn_tagset(tagpattern='NN')
nltk.help.upenn_tagset(tagpattern='IN')

NNP: noun, proper, singular
    Motown Venneboerger Czestochwa Ranzer Conchita Trumplane Christos
    Oceanside Escobar Kreisler Sawyer Cougar Yvette Ervin ODI Darryl CTCA
    Shannon A.K.C. Meltex Liverpool ...
RB: adverb
    occasionally unabatingly maddeningly adventurously professedly
    stirringly prominently technologically magisterially predominately
    swiftly fiscally pitilessly ...
VB: verb, base form
    ask assemble assess assign assume atone attention avoid bake balkanize
    bank begin behold believe bend benefit bevel beware bless boil bomb
    boost brace break bring broil brush build ...
DT: determiner
    all an another any both del each either every half la many much nary
    neither no some such that the them these this those
JJ: adjective or numeral, ordinal
    third ill-mannered pre-war regrettable oiled calamitous first separable
    ectoplasmic battery-powered participatory fourth still-to-be-named
    multilingual multi-disciplinary ...
NN: noun, common, sin

#### Ambiguity in POS Tagging

In [80]:
nltk.pos_tag(nltk.word_tokenize("Visiting aunts can be a nuisance"))

[('Visiting', 'VBG'),
 ('aunts', 'NNS'),
 ('can', 'MD'),
 ('be', 'VB'),
 ('a', 'DT'),
 ('nuisance', 'NN')]

In [81]:
nltk.pos_tag(nltk.word_tokenize("I never said you stole my money"))

[('I', 'PRP'),
 ('never', 'RB'),
 ('said', 'VBD'),
 ('you', 'PRP'),
 ('stole', 'VBP'),
 ('my', 'PRP$'),
 ('money', 'NN')]

#### Parsing sentence structure

In [82]:
text15 = nltk.word_tokenize("Alice loves Bob")
grammar = nltk.CFG.fromstring("""
S -> NP VP
VP -> V NP
NP -> 'Alice' | 'Bob'
V -> 'loves'
""")

parser = nltk.ChartParser(grammar=grammar)
trees = parser.parse_all(text15)
for tree in trees:
    print(tree)

(S (NP Alice) (VP (V loves) (NP Bob)))


#### Ambiguity in Parsing sentences "I saw the man with a telescope" 
#### 1. Did you see with the telescope ?
#### 2. Did you see the man who was holding a telescope ?

In [83]:
text16 = nltk.word_tokenize("I saw the man with a telescope")
grammar1 = nltk.data.load('Data/mygrammar.cfg')
grammar1

<Grammar with 13 productions>

In [84]:
parser = nltk.ChartParser(grammar1)
trees = parser.parse_all(text16)
for tree in trees:
    print(tree)

(S
  (NP I)
  (VP
    (VP (V saw) (NP (Det the) (N man)))
    (PP (P with) (NP (Det a) (N telescope)))))
(S
  (NP I)
  (VP
    (V saw)
    (NP (Det the) (N man) (PP (P with) (NP (Det a) (N telescope))))))


In [85]:
from nltk.corpus import treebank
text17 = treebank.parsed_sents('wsj_0001.mrg')[0]
print(text17)

(S
  (NP-SBJ
    (NP (NNP Pierre) (NNP Vinken))
    (, ,)
    (ADJP (NP (CD 61) (NNS years)) (JJ old))
    (, ,))
  (VP
    (MD will)
    (VP
      (VB join)
      (NP (DT the) (NN board))
      (PP-CLR (IN as) (NP (DT a) (JJ nonexecutive) (NN director)))
      (NP-TMP (NNP Nov.) (CD 29))))
  (. .))


### POS tagging and parsing ambiguity

In [86]:
text18 = nltk.word_tokenize("The old man the boat")
nltk.pos_tag(text18)

[('The', 'DT'), ('old', 'JJ'), ('man', 'NN'), ('the', 'DT'), ('boat', 'NN')]

In [87]:
text19 = nltk.word_tokenize("Colorless green ideas sleep furiously")
nltk.pos_tag(text19)

[('Colorless', 'NNP'),
 ('green', 'JJ'),
 ('ideas', 'NNS'),
 ('sleep', 'VBP'),
 ('furiously', 'RB')]

## Case Study : Sentiment Analysis

In [100]:
import time
start = time.time()
df = pd.read_csv('Data/Amazon_Unlocked_Mobile.csv')
end = time.time()

In [101]:
df['Reviews']

0         I feel so LUCKY to have found this used (phone...
1         nice phone, nice up grade from my pantach revu...
2                                              Very pleased
3         It works good but it goes slow sometimes but i...
4         Great phone to replace my lost phone. The only...
5         I already had a phone with problems... I know ...
6         The charging port was loose. I got that solder...
7         Phone looks good but wouldn't stay charged, ha...
8         I originally was using the Samsung S2 Galaxy f...
9         It's battery life is great. It's very responsi...
10        My fiance had this phone previously, but cause...
11        This is a great product it came after two days...
12        These guys are the best! I had a little situat...
13        I'm really disappointed about my phone and ser...
14        Ordered this phone as a replacement for the sa...
15        Had this phone before and loved it but was not...
16        I was able to get the phone I 

### Drop missing values and remove neutral ratings (3)

In [102]:
df.dropna(inplace=True)
df = df[df['Rating']!=3]
df.head()

Unnamed: 0,Product Name,Brand Name,Price,Rating,Reviews,Review Votes
0,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,5,I feel so LUCKY to have found this used (phone...,1.0
1,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,4,"nice phone, nice up grade from my pantach revu...",0.0
2,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,5,Very pleased,0.0
3,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,4,It works good but it goes slow sometimes but i...,0.0
4,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,4,Great phone to replace my lost phone. The only...,0.0


### Encode Ratings 4,5 as 1 (Positive Sentiment)
### Encode Ratings 1,2 as 0 (Negative Sentiment)

In [104]:
df['Positively_Rated'] = np.where(df['Rating'] >3,1,0)
df.head()

Unnamed: 0,Product Name,Brand Name,Price,Rating,Reviews,Review Votes,Positively_Rated
0,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,5,I feel so LUCKY to have found this used (phone...,1.0,1
1,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,4,"nice phone, nice up grade from my pantach revu...",0.0,1
2,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,5,Very pleased,0.0,1
3,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,4,It works good but it goes slow sometimes but i...,0.0,1
4,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,4,Great phone to replace my lost phone. The only...,0.0,1


### Looking at the mean - we have imbalanced classes

In [105]:
df['Positively_Rated'].mean()

0.7482686025879323

#### Most ratings are positive

In [107]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df['Reviews'], df['Positively_Rated'], random_state=0)

print('X_train first entry:\n\n', X_train.iloc[0])
print('\n\nX_train shape: ', X_train.shape)

X_train first entry:

 I bought a BB Black and was deliveried a White BB.Really is not a serious provider...Next time is better to cancel the order.


X_train shape:  (231207,)


## Count Vectorizor

In [108]:
from sklearn.feature_extraction.text import CountVectorizer

vect = CountVectorizer().fit(X_train)

In [109]:
vect.get_feature_names()[::2000]

['00',
 '4less',
 'adr6275',
 'assignment',
 'blazingly',
 'cassettes',
 'condishion',
 'debi',
 'dollarsshipping',
 'esteem',
 'flashy',
 'gorila',
 'human',
 'irullu',
 'like',
 'microsaudered',
 'nightmarish',
 'p770',
 'poori',
 'quirky',
 'responseive',
 'send',
 'sos',
 'synch',
 'trace',
 'utiles',
 'withstanding']

In [110]:
len(vect.get_feature_names())

53216

### transform the documents in the training data to a document-term matrix ==> Bad of Words Representation

In [111]:
X_train_vectorized = vect.transform(X_train)
X_train_vectorized

<231207x53216 sparse matrix of type '<class 'numpy.int64'>'
	with 6117776 stored elements in Compressed Sparse Row format>

In [112]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()
model.fit(X_train_vectorized, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [113]:
from sklearn.metrics import roc_auc_score

y_predictions = model.predict(vect.transform(X_test))
print('AUC : ',roc_auc_score(y_test, y_predictions))

AUC :  0.9279984634090939


In [127]:
# get the feature names as numpy array
feature_names = np.array(vect.get_feature_names())

# Sort the coefficients from the model
sorted_coef_index = model.coef_[0].argsort()

#### Find the 10 smallest and 10 largest coefficients

In [114]:
# The 10 largest coefficients are being indexed using [:-11:-1] 
# so the list returned is in order of largest to smallest
print('Smallest Coefs:\n{}\n'.format(feature_names[sorted_coef_index[:10]]))
print('Largest Coefs: \n{}'.format(feature_names[sorted_coef_index[:-11:-1]]))

Smallest Coefs:
['worst' 'false' 'mony' 'worthless' 'junk' 'garbage' 'messing' 'useless'
 'blacklist' 'unsatisfied']

Largest Coefs: 
['excelent' 'excelente' 'exelente' 'excellent' 'loving' 'efficient'
 'loves' 'perfecto' 'lovely' 'amazing']


## Tf-idf

In [124]:
from sklearn.feature_extraction import text
print([w for w in dir(text) if not w.startswith('_')])

['BaseEstimator', 'CountVectorizer', 'ENGLISH_STOP_WORDS', 'FeatureHasher', 'HashingVectorizer', 'Mapping', 'TfidfTransformer', 'TfidfVectorizer', 'TransformerMixin', 'VectorizerMixin', 'array', 'check_is_fitted', 'defaultdict', 'itemgetter', 'normalize', 'np', 'numbers', 're', 'six', 'sp', 'strip_accents_ascii', 'strip_accents_unicode', 'strip_tags', 'unicode_literals', 'unicodedata', 'xrange']


In [126]:
from sklearn.feature_extraction.text import TfidfVectorizer
vect = TfidfVectorizer(min_df=5).fit(X_train)
len(vect.get_feature_names())

17951

In [130]:
X_train_vectorized = vect.transform(X_train)

model = LogisticRegression()
model.fit(X_train_vectorized, y_train)

y_predictions = model.predict(vect.transform(X_test))
print('AUC : ',roc_auc_score(y_test, y_predictions))

AUC :  0.9266100666746837


In [131]:
feature_names = np.array(vect.get_feature_names())

sorted_tfidf_index = X_train_vectorized.max(0).toarray()[0].argsort()

print('Smallest tfidf:\n{}\n'.format(feature_names[sorted_tfidf_index[:10]]))
print('Largest tfidf: \n{}'.format(feature_names[sorted_tfidf_index[:-11:-1]]))

Smallest tfidf:
['commenter' 'pthalo' 'warmness' 'storageso' 'aggregration' '1300'
 '625nits' 'a10' 'submarket' 'brawns']

Largest tfidf: 
['defective' 'batteries' 'gooood' 'epic' 'luis' 'goood' 'basico'
 'aceptable' 'problems' 'excellant']


In [132]:
sorted_coef_index = model.coef_[0].argsort()

print('Smallest Coefs:\n{}\n'.format(feature_names[sorted_coef_index[:10]]))
print('Largest Coefs: \n{}'.format(feature_names[sorted_coef_index[:-11:-1]]))

Smallest Coefs:
['not' 'worst' 'useless' 'disappointed' 'terrible' 'return' 'waste' 'poor'
 'horrible' 'doesn']

Largest Coefs: 
['love' 'great' 'excellent' 'perfect' 'amazing' 'awesome' 'perfectly'
 'easy' 'best' 'loves']


### these reviews are treated the same by our current model but they are not (two different)

In [133]:
print(model.predict(vect.transform(['not an issue, phone is working',
                                    'an issue, phone is not working'])))

[0 0]


In [139]:
print(model.predict(vect.transform(['it is not that i am so smart it is just that i stay with problems longer'])))

[1]


## n-grams
#### Fit the CountVectorizer to the training data specifiying a minimum document frequency of 5 and extracting 1-grams and 2-grams

In [140]:
vect = CountVectorizer(min_df=5, ngram_range=(1,2)).fit(X_train)
X_train_vectorized = vect.transform(X_train)
len(vect.get_feature_names())

198917

In [141]:
model = LogisticRegression()
model.fit(X_train_vectorized, y_train)

y_predictions = model.predict(vect.transform(X_test))

print('AUC: ', roc_auc_score(y_test, y_predictions))

AUC:  0.9671524431795406


In [142]:
feature_names = np.array(vect.get_feature_names())

sorted_coef_index = model.coef_[0].argsort()

print('Smallest Coefs:\n{}\n'.format(feature_names[sorted_coef_index[:10]]))
print('Largest Coefs: \n{}'.format(feature_names[sorted_coef_index[:-11:-1]]))

Smallest Coefs:
['no good' 'worst' 'junk' 'not good' 'not happy' 'horrible' 'garbage'
 'terrible' 'looks ok' 'nope']

Largest Coefs: 
['not bad' 'excelent' 'excelente' 'excellent' 'perfect' 'no problems'
 'exelente' 'awesome' 'no issues' 'great']


### these reviews are now correctly identified

In [143]:
print(model.predict(vect.transform(['not an issue, phone is working',
                                    'an issue, phone is not working'])))

[1 0]


### Models in NLTK for Text classification

In [156]:
[w for w in dir(nltk) if ('classifier' in w.lower())|('tree' in w.lower())]

['ClassifierBasedPOSTagger',
 'ClassifierBasedTagger',
 'ClassifierI',
 'ConditionalExponentialClassifier',
 'DecisionTreeClassifier',
 'ImmutableMultiParentedTree',
 'ImmutableParentedTree',
 'ImmutableProbabilisticTree',
 'ImmutableTree',
 'MaxentClassifier',
 'MultiClassifierI',
 'MultiParentedTree',
 'NaiveBayesClassifier',
 'ParentedTree',
 'PositiveNaiveBayesClassifier',
 'ProbabilisticTree',
 'SklearnClassifier',
 'Tree',
 'TreebankWordTokenizer',
 'WekaClassifier',
 '__classifiers__',
 'conllstr2tree',
 'conlltags2tree',
 'decisiontree',
 'elementtree_indent',
 'ieerstr2tree',
 'rte_classifier',
 'tagstr2tree',
 'tree',
 'tree2conllstr',
 'tree2conlltags',
 'treebank',
 'treetransforms']

In [159]:
from nltk.classify import NaiveBayesClassifier

vect = CountVectorizer(min_df=5, ngram_range=(1,2)).fit(X_train)
X_train_vectorized = vect.transform(X_train)

classifier = NaiveBayesClassifier.train(X_train_vectorized, y_train)
classifier.classify(X_test)

nltk.classify.util.accuracy(classifier, y_test)
classifier.label()

ValueError: not enough values to unpack (expected 2, got 1)

# WordNet


In [161]:
from nltk.corpus import wordnet as wn

In [165]:
deer = wn.synset('deer.n.01')
elk = wn.synset('elk.n.01')
horse = wn.synset('horse.n.01')

In [163]:
deer.path_similarity(elk)

0.5

In [166]:
deer.path_similarity(horse)

0.14285714285714285

### Information Criteria to find Lin Similarity

In [173]:
from nltk.corpus import wordnet_ic

In [174]:
brown_ic = wordnet_ic.ic('ic-brown.dat')

deer.lin_similarity(elk, brown_ic)

0.8623778273893673

In [175]:
deer.lin_similarity(horse, brown_ic)

0.7726998936065773

## Collocations and Distribution Similarity

In [181]:
import nltk
from nltk.collocations import *
from nltk.book import *

In [182]:
bigram_measures = nltk.collocations.BigramAssocMeasures()
finder = BigramCollocationFinder.from_words(text)
finder.nbest(bigram_measures.pmi, 10)

TypeError: 'module' object is not iterable

In [None]:
finder.apply_freq_filter(10)

## LDA Model

In [None]:
import gensim
from gensim import corpora, models

dictionary = corporta.Dictionary(doc_set)
corpus = [dictionary.doc2bow(doc) for doc in doc_set]
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics =4, id2word = dictionary, passes=50)
print(ldamodel.print_topics(num_topics=4, num_words=5))