# Regular expressions

In [1]:
import re

In [2]:
# regular expressions to recognize several different human greetings at the start of a conversation
r = "(hi|hello|hey)[ ]*([a-z]*)"
re.match(r,'Hello Rosa',flags=re.IGNORECASE)


<re.Match object; span=(0, 10), match='Hello Rosa'>

In [3]:
re.match(r,"hi ho, hi ho, it's off to work...")

<re.Match object; span=(0, 5), match='hi ho'>

In [4]:
re.match(r,"hi ho")

<re.Match object; span=(0, 5), match='hi ho'>

In [5]:
re.match(r,"hey! whats up")

<re.Match object; span=(0, 3), match='hey'>

In [6]:
re.match(r,"hi asd")

<re.Match object; span=(0, 6), match='hi asd'>

# Word Tokenization

In [7]:
sentence = """Thomas Jefferson began building Monticello at the
... age of 26."""

In [8]:
sentence.split()

['Thomas',
 'Jefferson',
 'began',
 'building',
 'Monticello',
 'at',
 'the',
 'age',
 'of',
 '26.']

In [9]:
str.split(sentence)

['Thomas',
 'Jefferson',
 'began',
 'building',
 'Monticello',
 'at',
 'the',
 'age',
 'of',
 '26.']

In [10]:
import numpy as np
token_sequence = str.split(sentence)
vocab = sorted(set(token_sequence))
', '.join(vocab)

'26., Jefferson, Monticello, Thomas, age, at, began, building, of, the'

In [11]:
num_tokens = len(token_sequence)
vocab_size = len(vocab)
one_hot_vectors = np.zeros((num_tokens,vocab_size),int)
one_hot_vectors

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])

In [12]:
for i,word in enumerate(token_sequence):
    one_hot_vectors[i,vocab.index(word)] =1
" ".join(vocab)

'26. Jefferson Monticello Thomas age at began building of the'

In [13]:
one_hot_vectors

array([[0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
       [0, 1, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 1, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 1, 0, 0],
       [0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 1, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
       [0, 0, 0, 0, 1, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 1, 0],
       [1, 0, 0, 0, 0, 0, 0, 0, 0, 0]])

In [14]:
token_sequence

['Thomas',
 'Jefferson',
 'began',
 'building',
 'Monticello',
 'at',
 'the',
 'age',
 'of',
 '26.']

In [15]:
import pandas as pd
df = pd.DataFrame(one_hot_vectors,columns=vocab)
df

Unnamed: 0,26.,Jefferson,Monticello,Thomas,age,at,began,building,of,the
0,0,0,0,1,0,0,0,0,0,0
1,0,1,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,1,0,0,0
3,0,0,0,0,0,0,0,1,0,0
4,0,0,1,0,0,0,0,0,0,0
5,0,0,0,0,0,1,0,0,0,0
6,0,0,0,0,0,0,0,0,0,1
7,0,0,0,0,1,0,0,0,0,0
8,0,0,0,0,0,0,0,0,1,0
9,1,0,0,0,0,0,0,0,0,0


In [16]:
sentence_bow = {}
for word in sentence.split():
    sentence_bow[word]=1
sentence_bow

{'Thomas': 1,
 'Jefferson': 1,
 'began': 1,
 'building': 1,
 'Monticello': 1,
 'at': 1,
 'the': 1,
 'age': 1,
 'of': 1,
 '26.': 1}

In [32]:
df = pd.DataFrame(pd.Series(dict([token,1] for token in sentence.split())),columns=['sent']).T

In [34]:
df

Unnamed: 0,Thomas,Jefferson,began,building,Monticello,at,the,age,of,26.
sent,1,1,1,1,1,1,1,1,1,1


In [35]:
sentences = """Thomas Jefferson began building Monticello at the\
... age of 26.\n"""
sentences += """Construction was done mostly by local masons and\
... carpenters.\n"""
sentences += "He moved into the South Pavilion in 1770.\n"
sentences += """Turning Monticello into a neoclassical masterpiece\
... was Jefferson's obsession."""

In [44]:
corpus = {}
for i,sent in enumerate(sentences.split('\n')):
    corpus[f'sent{i}']=dict((tok,1)for tok in sent.split())
df = pd.DataFrame.from_records(corpus).fillna(0).T

In [45]:
df

Unnamed: 0,Thomas,Jefferson,began,building,Monticello,at,theage,of,26.,Construction,...,South,Pavilion,in,1770.,Turning,a,neoclassical,masterpiecewas,Jefferson's,obsession.
sent0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
sent1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
sent2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
sent3,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0


In [49]:
a = pd.np.array([1,2,3])
b = pd.np.array([[4,5,6],
                [4,5,6],
                [4,5,6]])
np.dot(a,b)

  a = pd.np.array([1,2,3])
  b = pd.np.array([[4,5,6],


array([24, 30, 36])

In [54]:
a = pd.np.array([[1,2,3],
                [1,2,3]])
b = pd.np.array([[4,5,6,4],
                [4,5,6,4],
                [4,5,6,4]])
np.dot(a,b)

  a = pd.np.array([[1,2,3],
  b = pd.np.array([[4,5,6,4],


array([[24, 30, 36, 24],
       [24, 30, 36, 24]])

In [64]:
df = df.T

In [66]:
df.sent0.dot(df.sent1)

0.0

In [67]:
df.sent0.dot(df.sent2)

0.0

In [68]:
df.sent0.dot(df.sent3)

1.0

In [71]:
import re
sentence = """Thomas Jefferson began building Monticello at the\
... age of 26."""


In [73]:
tokens = re.split(r'[-\s?.,;!]+',sentence)
tokens

['Thomas',
 'Jefferson',
 'began',
 'building',
 'Monticello',
 'at',
 'the',
 'age',
 'of',
 '26',
 '']

In [77]:
list(filter(lambda x: x if x not in '- \t.;!?' else None,tokens))

['Thomas',
 'Jefferson',
 'began',
 'building',
 'Monticello',
 'at',
 'the',
 'age',
 'of',
 '26']

In [81]:
from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer(r'\w+|$[0-9.]+|\S+')
tokenizer.tokenize(sentence)

['Thomas',
 'Jefferson',
 'began',
 'building',
 'Monticello',
 'at',
 'the',
 '...',
 'age',
 'of',
 '26',
 '.']

In [82]:
from nltk.tokenize import TreebankWordTokenizer
sentence = """Monticello wasn't designated as UNESCO World Heritage\
... Site until 1987."""

In [83]:
tokenizer = TreebankWordTokenizer()
tokenizer.tokenize(sentence)

['Monticello',
 'was',
 "n't",
 'designated',
 'as',
 'UNESCO',
 'World',
 'Heritage',
 '...',
 'Site',
 'until',
 '1987',
 '.']

In [91]:
# Casual tokenizer 

message = """RT @TJMonticello Best day everrrrrrr at Monticello.\
... Awesommmmmmeeeeeeee day :*)"""
from nltk.tokenize import casual_tokenize
casual_tokenize(message,reduce_len=True,strip_handles=True)

['RT',
 'Best',
 'day',
 'everrr',
 'at',
 'Monticello',
 '...',
 'Awesommmeee',
 'day',
 ':*)']

In [104]:
# n-grams (2-grams)
from nltk.util import ngrams
sentence = """Thomas Jefferson began building Monticello at the\
... age of 26."""
pattern = re.compile(r"([-\s.,;!?])+")
tokens = pattern.split(sentence)
tokens = list(filter(lambda x: x if x not in '- \t\n.,;!?' else None,tokens))
list(ngrams(tokens,2))


[('Thomas', 'Jefferson'),
 ('Jefferson', 'began'),
 ('began', 'building'),
 ('building', 'Monticello'),
 ('Monticello', 'at'),
 ('at', 'the'),
 ('the', 'age'),
 ('age', 'of'),
 ('of', '26')]

In [106]:
# Joining the tuples as one from above list
tokens = list(ngrams(tokens,2))
list(" ".join(x) for x in tokens)

['Thomas Jefferson',
 'Jefferson began',
 'began building',
 'building Monticello',
 'Monticello at',
 'at the',
 'the age',
 'age of',
 'of 26']

In [107]:
# Stopwords
import nltk
nltk.download('stopwords')
stop_words = nltk.corpus.stopwords.words('english')
len(stop_words)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\91866\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


179

In [108]:
stop_words

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [110]:
# Normalizing vocabulary
# Casefold
tokens = ['House', 'Visitor', 'Center']
normalized_tokens = [str.lower(token) for token in tokens]
print(normalized_tokens)

['house', 'visitor', 'center']


In [114]:
# Normalizing vocabulary
# Stemming
from nltk.stem.porter import PorterStemmer
x = "dish washer's washed dishes"
stemmer = PorterStemmer()
' '.join([stemmer.stem(word) for word in x.split()])

"dish washer' wash dish"

In [115]:
# Normalizing vocabulary
# Lemmatization
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
lemmatizer.lemmatize('Better')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\91866\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


'Better'

In [116]:
lemmatizer.lemmatize('good')

'good'

In [117]:
lemmatizer.lemmatize('goods')

'good'

In [118]:
lemmatizer.lemmatize('goodness')

'goodness'

In [119]:
lemmatizer.lemmatize('best')

'best'

In [128]:
# Sentiment analysis with VADER
from nltk.sentiment.vader import SentimentIntensityAnalyzer
sa = SentimentIntensityAnalyzer()
sa.lexicon

{'$:': -1.5,
 '%)': -0.4,
 '%-)': -1.5,
 '&-:': -0.4,
 '&:': -0.7,
 "( '}{' )": 1.6,
 '(%': -0.9,
 "('-:": 2.2,
 "(':": 2.3,
 '((-:': 2.1,
 '(*': 1.1,
 '(-%': -0.7,
 '(-*': 1.3,
 '(-:': 1.6,
 '(-:0': 2.8,
 '(-:<': -0.4,
 '(-:o': 1.5,
 '(-:O': 1.5,
 '(-:{': -0.1,
 '(-:|>*': 1.9,
 '(-;': 1.3,
 '(-;|': 2.1,
 '(8': 2.6,
 '(:': 2.2,
 '(:0': 2.4,
 '(:<': -0.2,
 '(:o': 2.5,
 '(:O': 2.5,
 '(;': 1.1,
 '(;<': 0.3,
 '(=': 2.2,
 '(?:': 2.1,
 '(^:': 1.5,
 '(^;': 1.5,
 '(^;0': 2.0,
 '(^;o': 1.9,
 '(o:': 1.6,
 ")':": -2.0,
 ")-':": -2.1,
 ')-:': -2.1,
 ')-:<': -2.2,
 ')-:{': -2.1,
 '):': -1.8,
 '):<': -1.9,
 '):{': -2.3,
 ');<': -2.6,
 '*)': 0.6,
 '*-)': 0.3,
 '*-:': 2.1,
 '*-;': 2.4,
 '*:': 1.9,
 '*<|:-)': 1.6,
 '*\\0/*': 2.3,
 '*^:': 1.6,
 ',-:': 1.2,
 "---'-;-{@": 2.3,
 '--<--<@': 2.2,
 '.-:': -1.2,
 '..###-:': -1.7,
 '..###:': -1.9,
 '/-:': -1.3,
 '/:': -1.3,
 '/:<': -1.4,
 '/=': -0.9,
 '/^:': -1.0,
 '/o:': -1.4,
 '0-8': 0.1,
 '0-|': -1.2,
 '0:)': 1.9,
 '0:-)': 1.4,
 '0:-3': 1.5,
 '0:03': 1.9,
 '

In [129]:
[(tok,score) for tok,score in sa.lexicon.items()]

[('$:', -1.5),
 ('%)', -0.4),
 ('%-)', -1.5),
 ('&-:', -0.4),
 ('&:', -0.7),
 ("( '}{' )", 1.6),
 ('(%', -0.9),
 ("('-:", 2.2),
 ("(':", 2.3),
 ('((-:', 2.1),
 ('(*', 1.1),
 ('(-%', -0.7),
 ('(-*', 1.3),
 ('(-:', 1.6),
 ('(-:0', 2.8),
 ('(-:<', -0.4),
 ('(-:o', 1.5),
 ('(-:O', 1.5),
 ('(-:{', -0.1),
 ('(-:|>*', 1.9),
 ('(-;', 1.3),
 ('(-;|', 2.1),
 ('(8', 2.6),
 ('(:', 2.2),
 ('(:0', 2.4),
 ('(:<', -0.2),
 ('(:o', 2.5),
 ('(:O', 2.5),
 ('(;', 1.1),
 ('(;<', 0.3),
 ('(=', 2.2),
 ('(?:', 2.1),
 ('(^:', 1.5),
 ('(^;', 1.5),
 ('(^;0', 2.0),
 ('(^;o', 1.9),
 ('(o:', 1.6),
 (")':", -2.0),
 (")-':", -2.1),
 (')-:', -2.1),
 (')-:<', -2.2),
 (')-:{', -2.1),
 ('):', -1.8),
 ('):<', -1.9),
 ('):{', -2.3),
 (');<', -2.6),
 ('*)', 0.6),
 ('*-)', 0.3),
 ('*-:', 2.1),
 ('*-;', 2.4),
 ('*:', 1.9),
 ('*<|:-)', 1.6),
 ('*\\0/*', 2.3),
 ('*^:', 1.6),
 (',-:', 1.2),
 ("---'-;-{@", 2.3),
 ('--<--<@', 2.2),
 ('.-:', -1.2),
 ('..###-:', -1.7),
 ('..###:', -1.9),
 ('/-:', -1.3),
 ('/:', -1.3),
 ('/:<', -1.4),

In [135]:
sa.polarity_scores("Python is very readable and it's great for NLP. :(")

{'neg': 0.193, 'neu': 0.533, 'pos': 0.273, 'compound': 0.296}

In [136]:
sa.polarity_scores("Python is not a bad choice for most applications.")

{'neg': 0.0, 'neu': 0.711, 'pos': 0.289, 'compound': 0.431}

In [4]:
# Sentiment analysis (Naive Bayes)
import pandas as pd
import numpy as np
movies = pd.read_csv(r"C:\Users\91866\Downloads\hutto_movies.csv")

In [6]:
from nltk.tokenize import casual_tokenize
bag_of_words = []
from collections import Counter
for text in movies['text']:
    bag_of_words.append(Counter(casual_tokenize(text)))
df_bows = pd.DataFrame.from_records(bag_of_words)
df_bows = df_bows.fillna(0).astype(int)
df_bows

Unnamed: 0,The,Rock,is,destined,to,be,the,21st,Century's,new,...,Ill,slummer,Rashomon,dipsticks,Bearable,Staggeringly,’,ve,muttering,dissing
0,1,1,1,1,2,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,0,1,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,1,0,4,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10600,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10601,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10602,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10603,0,0,0,0,0,0,2,0,0,0,...,0,0,0,0,0,0,2,1,0,0


In [25]:
pd.DataFrame.from_records(list(({'A':1},{'B':2},{'C':3})))


Unnamed: 0,A,B,C
0,1.0,,
1,,2.0,
2,,,3.0


In [36]:
from sklearn.naive_bayes import BernoulliNB,GaussianNB, MultinomialNB
model = MultinomialNB()
model.fit(df_bows,movies.sentiment>0)

In [64]:
movies['predicted_sentiment'] = model.predict_proba(df_bows)[:,1]*8-4
movies['is_positive_original'] = (movies['sentiment']>0).astype(int)
movies['is_positive_predicted'] = (movies['predicted_sentiment']>0).astype(int)

In [65]:
movies

Unnamed: 0,id,sentiment,text,predicted_sentiment,is_positive_original,is_positive_predicted
0,1,2.266667,The Rock is destined to be the 21st Century's ...,2.511515,1,1
1,2,3.533333,The gorgeously elaborate continuation of ''The...,3.999904,1,1
2,3,-0.600000,Effective but too tepid biopic,-3.655976,0,0
3,4,1.466667,If you sometimes like to go to the movies to h...,1.940954,1,1
4,5,1.733333,"Emerges as something rare, an issue movie that...",3.910373,1,1
...,...,...,...,...,...,...
10600,10601,-0.062500,Well made but mush hearted.,-3.166489,0,0
10601,10602,-1.500000,A real snooze.,-1.056805,0,0
10602,10603,-0.625000,No surprises.,-1.481449,0,0
10603,10604,1.437500,We’ve seen the hippie turned yuppie plot befor...,3.988988,1,1


In [66]:
from sklearn.metrics import classification_report
print(classification_report(movies['is_positive_original'],movies['is_positive_predicted']))

              precision    recall  f1-score   support

           0       0.93      0.94      0.94      5363
           1       0.94      0.92      0.93      5242

    accuracy                           0.93     10605
   macro avg       0.93      0.93      0.93     10605
weighted avg       0.93      0.93      0.93     10605



In [67]:
from nltk.tokenize import TreebankWordTokenizer
sentence = """The faster Harry got to the store, the faster Harry,
the faster, would get home."""

In [69]:
tokenizer = TreebankWordTokenizer()
token = tokenizer.tokenize(sentence)
token

['The',
 'faster',
 'Harry',
 'got',
 'to',
 'the',
 'store',
 ',',
 'the',
 'faster',
 'Harry',
 ',',
 'the',
 'faster',
 ',',
 'would',
 'get',
 'home',
 '.']

In [71]:
from collections import Counter
bag_of_words = Counter(token)
bag_of_words

Counter({'The': 1,
         'faster': 3,
         'Harry': 2,
         'got': 1,
         'to': 1,
         'the': 3,
         'store': 1,
         ',': 3,
         'would': 1,
         'get': 1,
         'home': 1,
         '.': 1})

In [73]:
bag_of_words.most_common(4)

[('faster', 3), ('the', 3), (',', 3), ('Harry', 2)]

In [75]:
times_harry_appears = bag_of_words['Harry']
num_unique_words = len(bag_of_words)

In [76]:
times_harry_appears/num_unique_words

0.16666666666666666

In [85]:
from collections import Counter
from nltk.tokenize import TreebankWordTokenizer
tokenizer = TreebankWordTokenizer()
file_path = open(r"C:\Users\91866\Downloads\kite_text.txt",'r')
with file_path as file:
    x = file.read()
x        

"A kite is traditionally a tethered heavier-than-air craft with wing surfaces that react against the air to create lift and drag. A kite consists of wings, tethers, and anchors. Kites often have a bridle to guide the face of the kite at the correct angle so the wind can lift it. A kite's wing also may be so designed so a bridle is not needed; when kiting a sailplane for launch, the tether meets the wing at a single point. A kite may have fixed or moving anchors. Untraditionally in technical kiting, a kite consists of tether-set-coupled wing sets; even in technical kiting, though, a wing in the system is still often called the kite.\n\nThe lift that sustains the kite in flight is generated when air flows around the kite's surface, producing low pressure above and high pressure below the wings. The interaction with the wind also generates horizontal drag along the direction of the wind. The resultant force vector from the lift and drag force components is opposed by the tension of one or

In [86]:
tokens = tokenizer.tokenize(x.lower())
token_counts = Counter(tokens)
token_counts

Counter({'a': 20,
         'kite': 16,
         'is': 7,
         'traditionally': 1,
         'tethered': 2,
         'heavier-than-air': 1,
         'craft': 2,
         'with': 2,
         'wing': 5,
         'surfaces': 1,
         'that': 2,
         'react': 1,
         'against': 1,
         'the': 26,
         'air': 2,
         'to': 5,
         'create': 1,
         'lift': 4,
         'and': 10,
         'drag.': 1,
         'consists': 2,
         'of': 10,
         'wings': 1,
         ',': 15,
         'tethers': 2,
         'anchors.': 2,
         'kites': 8,
         'often': 2,
         'have': 4,
         'bridle': 2,
         'guide': 1,
         'face': 1,
         'at': 3,
         'correct': 1,
         'angle': 1,
         'so': 3,
         'wind': 2,
         'can': 3,
         'it.': 1,
         "'s": 2,
         'also': 3,
         'may': 4,
         'be': 5,
         'designed': 2,
         'not': 1,
         'needed': 1,
         ';': 2,
         'when': 2,


In [91]:
import nltk
nltk.download('stopwords',quiet=True)
stopwords = nltk.corpus.stopwords.words('english')
tokens = [x for x in tokens if x not in stopwords]
kite_counts = Counter(tokens)

In [92]:
kite_counts

Counter({'kite': 16,
         'traditionally': 1,
         'tethered': 2,
         'heavier-than-air': 1,
         'craft': 2,
         'wing': 5,
         'surfaces': 1,
         'react': 1,
         'air': 2,
         'create': 1,
         'lift': 4,
         'drag.': 1,
         'consists': 2,
         'wings': 1,
         ',': 15,
         'tethers': 2,
         'anchors.': 2,
         'kites': 8,
         'often': 2,
         'bridle': 2,
         'guide': 1,
         'face': 1,
         'correct': 1,
         'angle': 1,
         'wind': 2,
         'it.': 1,
         "'s": 2,
         'also': 3,
         'may': 4,
         'designed': 2,
         'needed': 1,
         ';': 2,
         'kiting': 3,
         'sailplane': 1,
         'launch': 1,
         'tether': 1,
         'meets': 1,
         'single': 1,
         'point.': 1,
         'fixed': 1,
         'moving': 2,
         'untraditionally': 1,
         'technical': 2,
         'tether-set-coupled': 1,
         'sets': 1,

In [93]:
# Ordered collection

document_vector = []
doc_length = len(tokens)
for key,value in kite_counts.most_common():
    document_vector.append(value/doc_length)
document_vector

[0.07207207207207207,
 0.06756756756756757,
 0.036036036036036036,
 0.02252252252252252,
 0.018018018018018018,
 0.018018018018018018,
 0.013513513513513514,
 0.013513513513513514,
 0.013513513513513514,
 0.009009009009009009,
 0.009009009009009009,
 0.009009009009009009,
 0.009009009009009009,
 0.009009009009009009,
 0.009009009009009009,
 0.009009009009009009,
 0.009009009009009009,
 0.009009009009009009,
 0.009009009009009009,
 0.009009009009009009,
 0.009009009009009009,
 0.009009009009009009,
 0.009009009009009009,
 0.009009009009009009,
 0.009009009009009009,
 0.009009009009009009,
 0.009009009009009009,
 0.009009009009009009,
 0.009009009009009009,
 0.009009009009009009,
 0.009009009009009009,
 0.009009009009009009,
 0.0045045045045045045,
 0.0045045045045045045,
 0.0045045045045045045,
 0.0045045045045045045,
 0.0045045045045045045,
 0.0045045045045045045,
 0.0045045045045045045,
 0.0045045045045045045,
 0.0045045045045045045,
 0.0045045045045045045,
 0.0045045045045045045,
 0.

In [94]:
docs = ["The faster Harry got to the store, the faster and faster Harry would get home."]
docs.append("Harry is hairy and faster than Jill.")
docs.append("Jill is not as hairy as Harry.")

In [95]:
docs

['The faster Harry got to the store, the faster and faster Harry would get home.',
 'Harry is hairy and faster than Jill.',
 'Jill is not as hairy as Harry.']

In [98]:
doc_tokens = []
for doc in docs:
    doc_tokens += [sorted(tokenizer.tokenize(doc.lower()))]
len(doc_tokens)

3

In [99]:
doc_tokens

[[',',
  '.',
  'and',
  'faster',
  'faster',
  'faster',
  'get',
  'got',
  'harry',
  'harry',
  'home',
  'store',
  'the',
  'the',
  'the',
  'to',
  'would'],
 ['.', 'and', 'faster', 'hairy', 'harry', 'is', 'jill', 'than'],
 ['.', 'as', 'as', 'hairy', 'harry', 'is', 'jill', 'not']]

In [100]:
all_doc_tokens = sum(doc_tokens,[])
all_doc_tokens

[',',
 '.',
 'and',
 'faster',
 'faster',
 'faster',
 'get',
 'got',
 'harry',
 'harry',
 'home',
 'store',
 'the',
 'the',
 'the',
 'to',
 'would',
 '.',
 'and',
 'faster',
 'hairy',
 'harry',
 'is',
 'jill',
 'than',
 '.',
 'as',
 'as',
 'hairy',
 'harry',
 'is',
 'jill',
 'not']

In [102]:
lexicon = sorted(set(all_doc_tokens))
lexicon

[',',
 '.',
 'and',
 'as',
 'faster',
 'get',
 'got',
 'hairy',
 'harry',
 'home',
 'is',
 'jill',
 'not',
 'store',
 'than',
 'the',
 'to',
 'would']

In [103]:
from collections import OrderedDict
zero_vector = OrderedDict((token,0)for token in lexicon)
zero_vector

OrderedDict([(',', 0),
             ('.', 0),
             ('and', 0),
             ('as', 0),
             ('faster', 0),
             ('get', 0),
             ('got', 0),
             ('hairy', 0),
             ('harry', 0),
             ('home', 0),
             ('is', 0),
             ('jill', 0),
             ('not', 0),
             ('store', 0),
             ('than', 0),
             ('the', 0),
             ('to', 0),
             ('would', 0)])

In [105]:
import copy
doc_vectors = []
for doc in docs:
    vec = copy.copy(zero_vector)
    tokens = tokenizer.tokenize(doc.lower())
    token_counts = Counter(tokens)
    for key, value in token_counts.items():
        vec[key] = value/len(lexicon)
    doc_vectors.append(vec)

In [106]:
doc_vectors

[OrderedDict([(',', 0.05555555555555555),
              ('.', 0.05555555555555555),
              ('and', 0.05555555555555555),
              ('as', 0),
              ('faster', 0.16666666666666666),
              ('get', 0.05555555555555555),
              ('got', 0.05555555555555555),
              ('hairy', 0),
              ('harry', 0.1111111111111111),
              ('home', 0.05555555555555555),
              ('is', 0),
              ('jill', 0),
              ('not', 0),
              ('store', 0.05555555555555555),
              ('than', 0),
              ('the', 0.16666666666666666),
              ('to', 0.05555555555555555),
              ('would', 0.05555555555555555)]),
 OrderedDict([(',', 0),
              ('.', 0.05555555555555555),
              ('and', 0.05555555555555555),
              ('as', 0),
              ('faster', 0.05555555555555555),
              ('get', 0),
              ('got', 0),
              ('hairy', 0.05555555555555555),
              ('harry', 0.05

In [107]:
import nltk
nltk.download('brown')
from nltk.corpus import brown
brown.words()[:10]

[nltk_data] Downloading package brown to
[nltk_data]     C:\Users\91866\AppData\Roaming\nltk_data...
[nltk_data]   Package brown is already up-to-date!


['The',
 'Fulton',
 'County',
 'Grand',
 'Jury',
 'said',
 'Friday',
 'an',
 'investigation',
 'of']

In [109]:
brown.tagged_words()

[('The', 'AT'), ('Fulton', 'NP-TL'), ...]

In [110]:
from collections import Counter
puncs = set((',', '.', '--', '-', '!', '?', ':', ';', '``', "''", '(', ')', '[', ']'))
word_list = [x.lower() for x in brown.words() if x not in puncs]
token_counts = Counter(word_list)
token_counts.most_common(20)

[('the', 69971),
 ('of', 36412),
 ('and', 28853),
 ('to', 26158),
 ('a', 23195),
 ('in', 21337),
 ('that', 10594),
 ('is', 10109),
 ('was', 9815),
 ('he', 9548),
 ('for', 9489),
 ('it', 8760),
 ('with', 7289),
 ('as', 7253),
 ('his', 6996),
 ('on', 6741),
 ('be', 6377),
 ('at', 5372),
 ('by', 5306),
 ('i', 5164)]

In [111]:
# IDF
# Read the intro file
file = open(r"c:/Users/91866/Downloads/kite_text.txt",'r')
with file:
    kite_intro = file.read()
# Read the history file
file = open(r"c:/Users/91866/Downloads/kite_history.txt")
with file:
    kite_history = file.read()

intro_tokens = tokenizer.tokenize(kite_intro)
history_tokens = tokenizer.tokenize(kite_history)

intro_total = len(intro_tokens)
print(f'Length of Intro tokens: {intro_total}')

history_total = len(history_tokens)
print(f'Length of Intro tokens: {history_total}')

Length of Intro tokens: 363
Length of Intro tokens: 297


In [112]:
intro_tf = {}
history_tf = {}
intro_counts = Counter(intro_tokens)
history_counts = Counter(history_tokens)
intro_tf['kite'] = intro_counts['kite']/intro_total
history_tf['kite'] = history_counts['kite']/history_total
print(intro_tf)
print(history_tf)

{'kite': 0.0440771349862259}
{'kite': 0.020202020202020204}


In [113]:
intro_tf = {}
history_tf = {}
intro_counts = Counter(intro_tokens)
history_counts = Counter(history_tokens)
intro_tf['and'] = intro_counts['and']/intro_total
history_tf['and'] = history_counts['and']/history_total
print(intro_tf)
print(history_tf)

{'and': 0.027548209366391185}
{'and': 0.030303030303030304}


In [120]:
# Relevance ranking
document_tfidf = []
for doc in docs:
    vec = copy.copy(zero_vector)
    tokens = tokenizer.tokenize((doc.lower()))
    token_counts = Counter(tokens)
    for key,value in token_counts.items():
        docs_containing_key = 0
        for _doc in docs:
            if key in _doc:
                docs_containing_key+=1
        tf = value/len(lexicon)
        if docs_containing_key:
            idf = len(docs)/docs_containing_key
        else:
            idf = 0
        vec[key] = tf*idf
    document_tfidf.append(vec)

In [121]:
vec

OrderedDict([(',', 0),
             ('.', 0.05555555555555555),
             ('and', 0),
             ('as', 0.1111111111111111),
             ('faster', 0),
             ('get', 0),
             ('got', 0),
             ('hairy', 0.08333333333333333),
             ('harry', 0.0),
             ('home', 0),
             ('is', 0.08333333333333333),
             ('jill', 0.0),
             ('not', 0.16666666666666666),
             ('store', 0),
             ('than', 0),
             ('the', 0),
             ('to', 0),
             ('would', 0)])

In [122]:
# TFIDF with sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
corpus = docs 
vectorizer = TfidfVectorizer()
model = vectorizer.fit_transform(corpus)
model

<3x16 sparse matrix of type '<class 'numpy.float64'>'
	with 23 stored elements in Compressed Sparse Row format>

In [126]:
model.toarray()

array([[0.1614879 , 0.        , 0.48446369, 0.21233718, 0.21233718,
        0.        , 0.25081952, 0.21233718, 0.        , 0.        ,
        0.        , 0.21233718, 0.        , 0.63701154, 0.21233718,
        0.21233718],
       [0.36930805, 0.        , 0.36930805, 0.        , 0.        ,
        0.36930805, 0.28680065, 0.        , 0.36930805, 0.36930805,
        0.        , 0.        , 0.48559571, 0.        , 0.        ,
        0.        ],
       [0.        , 0.75143242, 0.        , 0.        , 0.        ,
        0.28574186, 0.22190405, 0.        , 0.28574186, 0.28574186,
        0.37571621, 0.        , 0.        , 0.        , 0.        ,
        0.        ]])

In [128]:
topic = {}

tfidf = dict(list(zip('cat dog apple lion NYC love'.split(),np.random.rand(6))))

In [129]:
tfidf

{'cat': 0.8928601514360016,
 'dog': 0.3319798053011772,
 'apple': 0.8212291230578318,
 'lion': 0.0416966257252499,
 'NYC': 0.10765667993596795,
 'love': 0.5950520642062402}

In [130]:
topic['petness'] = (.3 * tfidf['cat']+\
                   .3 * tfidf['dog']+\
                   0 * tfidf['apple']+\
                   0 * tfidf['lion']-\
                   .2 * tfidf['NYC']+\
                   0.2 * tfidf['love'])
topic['animalness'] = (.1 * tfidf['cat'] +\
                        .1 * tfidf['dog'] -\
                        .1 * tfidf['apple'] +\
                        .5 * tfidf['lion'] +\
                        .1 * tfidf['NYC'] -\
                        .1 * tfidf['love'])
topic['cityness'] = ( 0 * tfidf['cat'] -\
                     .1 * tfidf['dog'] +\
                    .2 * tfidf['apple'] -\
                    .1 * tfidf['lion'] +\
                    .5 * tfidf['NYC'] +\
                    .1 * tfidf['love'])

In [131]:
topic

{'petness': 0.4649310638752081,
 'animalness': 0.012469857803532436,
 'cityness': 0.24021172789753165}