In [1]:
import numpy as np
import string 
from sklearn.linear_model import LinearRegression
import nltk
from nltk.corpus import brown
from nltk.corpus import stopwords
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
nltk.download('brown')
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.naive_bayes import MultinomialNB,BernoulliNB

[nltk_data] Downloading package brown to /Users/albertyou/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/albertyou/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/albertyou/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# NLTK Tokenizer

In [2]:
# Example sentence used throughout the normalization demo
text = "Hello, my name is Bob, and I have many cats and dogs."
tokenizer = nltk.tokenize.TreebankWordTokenizer()
# Split the sentence into a list of tokens (words and punctuation marks
# become separate items); `tokens` is reused by the following cells
tokens = tokenizer.tokenize(text)

In [3]:
# Reduce every token to its Porter stem and show the result as one
# space-separated string (note the lowercasing and "many" -> "mani")
stemmer = nltk.stem.PorterStemmer()
" ".join(map(stemmer.stem, tokens))

'hello , my name is bob , and i have mani cat and dog .'

In [4]:
# Lemmatize every token with WordNet and show the result as one string.
# NOTE: the variable is still called `stemmer` even though it now holds a
# lemmatizer; the name is kept because it is shared notebook state.
stemmer = nltk.stem.WordNetLemmatizer()
" ".join(map(stemmer.lemmatize, tokens))

'Hello , my name is Bob , and I have many cat and dog .'

In [5]:
# Lowercase each token before lemmatizing for further normalization.
# NOTE: `stemmer` holds a WordNet lemmatizer here, not a stemmer.
stemmer = nltk.stem.WordNetLemmatizer()
lowercased_tokens = (token.lower() for token in tokens)
" ".join(map(stemmer.lemmatize, lowercased_tokens))

'hello , my name is bob , and i have many cat and dog .'

# Brown Corpus Example

In [6]:
# ID	File	Genre	Description
# A16	ca16	news	Chicago Tribune: Society Reportage
# B02	cb02	editorial	Christian Science Monitor: Editorials
# C17	cc17	reviews	Time Magazine: Reviews
# D12	cd12	religion	Underwood: Probing the Ethics of Realtors
# E36	ce36	hobbies	Norling: Renting a Car in Europe
# F25	cf25	lore	Boroff: Jewish Teenage Culture
# G22	cg22	belles_lettres	Reiner: Coping with Runaway Technology
# H15	ch15	government	US Office of Civil and Defence Mobilization: The Family Fallout Shelter
# J17	cj19	learned	Mosteller: Probability with Statistical Applications
# K04	ck04	fiction	W.E.B. Du Bois: Worlds of Color
# L13	cl13	mystery	Hitchens: Footsteps in the Night
# M01	cm01	science_fiction	Heinlein: Stranger in a Strange Land
# N14	cn15	adventure	Field: Rattlesnake Ridge
# P12	cp12	romance	Callaghan: A Passion in Rome
# R06	cr06	humor	Thurber: The Future, If Any, of Comedy

In [7]:
# List the genre categories available in the Brown corpus
brown.categories()

['adventure',
 'belles_lettres',
 'editorial',
 'fiction',
 'government',
 'hobbies',
 'humor',
 'learned',
 'lore',
 'mystery',
 'news',
 'religion',
 'reviews',
 'romance',
 'science_fiction']

In [8]:
# Keep the list of genre names for reuse in later cells
categories = brown.categories()

In [9]:
# Tokens belonging to the 'fiction' category
brown.words(categories = 'fiction')

['Thirty-three', 'Scotty', 'did', 'not', 'go', 'back', ...]

In [10]:
# Tokens belonging to the 'news' category
brown.words(categories = 'news')

['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', ...]

In [11]:
# Sentences of the 'news' category: a sequence of sentences, each one a list
# of token strings. Sentences differ in length, so this is a nested (ragged)
# list rather than a true order-2 tensor.
brown.sents(categories = 'news')

[['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', 'Friday', 'an', 'investigation', 'of', "Atlanta's", 'recent', 'primary', 'election', 'produced', '``', 'no', 'evidence', "''", 'that', 'any', 'irregularities', 'took', 'place', '.'], ['The', 'jury', 'further', 'said', 'in', 'term-end', 'presentments', 'that', 'the', 'City', 'Executive', 'Committee', ',', 'which', 'had', 'over-all', 'charge', 'of', 'the', 'election', ',', '``', 'deserves', 'the', 'praise', 'and', 'thanks', 'of', 'the', 'City', 'of', 'Atlanta', "''", 'for', 'the', 'manner', 'in', 'which', 'the', 'election', 'was', 'conducted', '.'], ...]

In [12]:
# Copy every token of the Brown corpus (all categories) into a NumPy array
# of strings so it can be indexed and sliced like any other ndarray
wordArray = np.array(brown.words())

In [13]:
# Show the container type first, then the type of a single element
for inspected in (wordArray, wordArray[0]):
    print(type(inspected))

<class 'numpy.ndarray'>
<class 'numpy.str_'>


In [14]:
# There are 1,161,192 words.
# `len` reports the element count of the 1-D array directly; no Python-level
# loop over more than a million items is needed.
count = len(wordArray)
print(count)
# Slicing and indexing work as for any NumPy array
print(wordArray)
print(wordArray[1])
print(wordArray[1:5])


1161192
['The' 'Fulton' 'County' ... 'was' 'stupefying' '.']
Fulton
['Fulton' 'County' 'Grand' 'Jury']


# Conditional Frequency Distribution

In [15]:
# All tokens of the 'news' category of the Brown corpus
news_text = brown.words(categories='news')

# Case-insensitive frequency of each token in the news category
fdist = nltk.FreqDist(map(str.lower, news_text))

# Question words to look up (the list is named `modals`, although these are
# wh-words rather than modal verbs)
modals = ['who', 'what', 'when', 'where', 'why', 'how']
print(*(f"{m}: {fdist[m]}" for m in modals), end=' ')

who: 268 what: 95 when: 169 where: 59 why: 14 how: 42 

In [16]:
# Genres (conditions) and question words (samples) to compare
genres = ['news', 'religion', 'hobbies', 'science_fiction', 'romance', 'humor']
modals = ['who', 'what', 'when', 'where', 'why', 'how']

# One (genre, lowercased word) pair for every token of every Brown category;
# the conditional frequency distribution counts words separately per genre
lowercased_pairs = (
    (genre, word.lower())
    for genre in brown.categories()
    for word in brown.words(categories=genre)
)
cfd = nltk.ConditionalFreqDist(lowercased_pairs)

# Print the selected counts as a genre-by-word table
cfd.tabulate(conditions=genres, samples=modals)

                  who  what  when where   why   how 
           news   268    95   169    59    14    42 
       religion   102    86    68    21    20    28 
        hobbies   104   108   164    77    17    72 
science_fiction    13    41    28    15     8    16 
        romance    95   171   163    58    62    77 
          humor    49    46    62    16    13    25 


In [17]:
# Build an array of the lowercased tokens of `wordArray` with English
# stopwords removed.
# The stopword list is loaded once into a set: calling
# stopwords.words('english') inside the loop re-reads the list and scans it
# linearly for each of the ~1.16M tokens.
english_stopwords = set(stopwords.words('english'))
lowered_words = (word.lower() for word in wordArray)
wordsList = np.array([word for word in lowered_words if word not in english_stopwords])

In [18]:
# Summarize the filtered word array: container type, size, first five entries
print(type(wordsList), len(wordsList), wordsList[:5], sep="\n")

<class 'numpy.ndarray'>
686163
['fulton' 'county' 'grand' 'jury' 'said']


In [19]:
# Per-genre counts of the first five non-stopword tokens of the corpus
cfd.tabulate(conditions=categories, samples=wordsList[0:5])

                fulton county  grand   jury   said 
      adventure      0      3      1      5    288 
 belles_lettres      2      8      5      5    161 
      editorial      0     31      2      0     52 
        fiction      0      4      1      0    194 
     government      0      1      2      0     18 
        hobbies      0      8      4      0     11 
          humor      0      1      2      0     88 
        learned      1     18      5      3     35 
           lore      0      8      1      4     89 
        mystery      0      6      0      1    204 
           news     14     61     19     46    406 
       religion      0      0      0      0     27 
        reviews      0      5      5      1     12 
        romance      0      1      1      2    331 
science_fiction      0      0      0      0     45 


# Counting Words by Genre

In [20]:
# Collect (genre, lowercased word) pairs for the two genres being compared,
# news first and romance second
genre_word = []
for genre_name in ('news', 'romance'):
    genre_word.extend(
        (genre_name, token.lower())
        for token in brown.words(categories=genre_name)
    )

In [21]:
# Peek at the first (genre, word) pair and the size before filtering
print(genre_word[0])
print("with stopwords", len(genre_word))

# Drop pairs whose word is an English stopword.
# The stopword list is loaded once into a set; the original reloaded it from
# the corpus and scanned it linearly for every single pair.
english_stopwords = set(stopwords.words('english'))
genre_wordTemp = [pair for pair in genre_word if pair[1] not in english_stopwords]

genre_word = genre_wordTemp

print("after no stopwords", len(genre_word))

('news', 'the')
with stopwords 170576
after no stopwords 103897


In [22]:
# Build a per-genre frequency distribution from the stopword-filtered
# (genre, word) pairs and summarize it
cfd = nltk.ConditionalFreqDist(genre_word)
print(cfd.conditions())
for genre_label in ('news', 'romance'):
    print(cfd[genre_label])
print()

# Twenty most frequent tokens of each genre
for heading, genre_label in (("NEWS (top 20)): ", 'news'),
                             ("ROMANCE (top 20): ", 'romance')):
    print(heading)
    print(cfd[genre_label].most_common(20))
    print()

# Count of one specific word within a single genre
print("How often 'say' up for romance: ")
print(cfd['romance']['say'])

['news', 'romance']
<FreqDist with 12967 samples and 63739 outcomes>
<FreqDist with 7737 samples and 40158 outcomes>

NEWS (top 20)): 
[(',', 5188), ('.', 4030), ('``', 732), ("''", 702), ('said', 406), (';', 314), ('--', 300), ('mrs.', 253), ('would', 246), ('new', 241), ('one', 213), ('last', 177), ('two', 174), (')', 171), ('mr.', 170), ('(', 168), ('first', 158), ('state', 153), (':', 149), ('year', 142)]

ROMANCE (top 20): 
[(',', 3899), ('.', 3736), ('``', 1045), ("''", 1044), ('?', 690), ('said', 331), ('!', 316), ('--', 291), (';', 264), ('would', 247), ('could', 195), ('like', 189), ('one', 182), ('back', 128), ('thought', 106), ('little', 104), ('man', 100), ('get', 95), ('time', 94), ('old', 90)]

How often 'say' up for romance: 
61


In [54]:
# Next step: remove punctuation.

# string.punctuation is a plain str of ASCII punctuation characters
print(type(string.punctuation))
print(string.punctuation)

# A single string (not a list) holding those characters followed by the
# multi-character quote and dash tokens "``", "--" and "''"
punct = "".join([string.punctuation, "``", "--", "''"])


<class 'str'>
!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


In [55]:
# Rebuild the (genre, lowercased word) pairs for news and romance from scratch
genre_word = [(genre, word.lower())
              for genre in ['news', 'romance']
              for word in brown.words(categories=genre)]

# Load the stopword list once into a set instead of re-reading it per pair
english_stopwords = set(stopwords.words('english'))

# Exact-match set of punctuation tokens. Testing `word not in punct` against
# a str is a *substring* test, so any token that happens to be a substring of
# the punctuation string would be dropped; a set compares whole tokens only.
punct_tokens = set(punct) | {"``", "--", "''"}

# Use a dedicated lemmatizer so this cell does not depend on whichever object
# the global `stemmer` currently holds (a PorterStemmer has no `.lemmatize`)
lemmatizer = nltk.stem.WordNetLemmatizer()

# Keep pairs that are neither stopwords nor punctuation, lemmatizing the word
genre_wordTemp = [
    (genre, lemmatizer.lemmatize(word))
    for genre, word in genre_word
    if word not in english_stopwords and word not in punct_tokens
]

genre_word = genre_wordTemp


In [56]:
# Container type, then the pair count after punctuation was removed
print(type(genre_word), len(genre_word), sep="\n")

<class 'list'>
80536


In [57]:
# Recompute the per-genre frequency distribution from the cleaned pairs to
# check whether punctuation still shows up among the most common tokens
cfd = nltk.ConditionalFreqDist(genre_word)
print(cfd.conditions())
for genre_label in ('news', 'romance'):
    print(cfd[genre_label])
print()

# Twenty most frequent tokens of each genre
for heading, genre_label in (("NEWS (top 20)): ", 'news'),
                             ("ROMANCE (top 20): ", 'romance')):
    print(heading)
    print(cfd[genre_label].most_common(20))
    print()

# Count of one specific word within a single genre
print("How often 'say' up for romance: ")
print(cfd['romance']['say'])

['news', 'romance']
<FreqDist with 11739 samples and 51777 outcomes>
<FreqDist with 7148 samples and 28759 outcomes>

NEWS (top 20)): 
[('said', 406), ('mrs.', 253), ('would', 246), ('year', 244), ('new', 241), ('one', 221), ('state', 213), ('last', 177), ('two', 174), ('mr.', 170), ('first', 158), ('president', 143), ('home', 141), ('also', 129), ('school', 125), ('time', 123), ('week', 120), ('day', 116), ('member', 109), ('made', 107)]

ROMANCE (top 20): 
[('said', 331), ('would', 247), ('could', 195), ('one', 192), ('like', 192), ('back', 128), ('thought', 109), ('time', 108), ('little', 104), ('man', 100), ('get', 99), ('day', 96), ('know', 91), ('old', 90), ('got', 90), ('way', 88), ('eye', 88), ('never', 87), ('go', 87), ('even', 86)]

How often 'say' up for romance: 
68
