In [None]:
#!pip install nlpia
import nlpia
from nlpia.data.loaders import kite_text, kite_history

In [None]:
# Display the kite introduction text imported from nlpia.data.loaders.
kite_text

"A kite is traditionally a tethered heavier-than-air craft with wing surfaces that react against the air to create lift and drag. A kite consists of wings, tethers, and anchors. Kites often have a bridle to guide the face of the kite at the correct angle so the wind can lift it. A kite's wing also may be so designed so a bridle is not needed; when kiting a sailplane for launch, the tether meets the wing at a single point. A kite may have fixed or moving anchors. Untraditionally in technical kiting, a kite consists of tether-set-coupled wing sets; even in technical kiting, though, a wing in the system is still often called the kite.\n\nThe lift that sustains the kite in flight is generated when air flows around the kite's surface, producing low pressure above and high pressure below the wings. The interaction with the wind also generates horizontal drag along the direction of the wind. The resultant force vector from the lift and drag force components is opposed by the tension of one or

In [None]:
# Display the kite history text imported from nlpia.data.loaders.
kite_history

'Kites were invented in China, where materials ideal for kite building were readily available: silk fabric for sail material; fine, high-tensile-strength silk for flying line; and resilient bamboo for a strong, lightweight framework.\n\nThe kite has been claimed as the invention of the 5th-century BC Chinese philosophers Mozi (also Mo Di) and Lu Ban (also Gongshu Ban). By 549 AD paper kites were certainly being flown, as it was recorded that in that year a paper kite was used as a message for a rescue mission. Ancient and medieval Chinese sources describe kites being used for measuring distances, testing the wind, lifting men, signaling, and communication for military operations. The earliest known Chinese kites were flat (not bowed) and often rectangular. Later, tailless kites incorporated a stabilizing bowline. Kites were decorated with mythological motifs and legendary figures; some were fitted with strings and whistles to make musical sounds while flying. From China, kites were int

Text Data pre-processing: 

In [None]:
# Pre-process both kite documents the same way: lowercase the raw text, then
# split it into word tokens with NLTK's TreebankWordTokenizer.
from nltk.tokenize import TreebankWordTokenizer

tokenizer = TreebankWordTokenizer()

# --- kite_text (introduction) ---
kite_intro = kite_text.lower()
intro_tokens = tokenizer.tokenize(kite_intro)
intro_total = len(intro_tokens)   # total token count, used later as the TF denominator
print(intro_tokens[:10])

# --- kite_history ---
# NOTE: this rebinds kite_history to its lowercased form for all later cells.
kite_history = kite_history.lower()
history_tokens = tokenizer.tokenize(kite_history)
history_total = len(history_tokens)   # total token count, used later as the TF denominator
print(history_tokens[:10])


['a', 'kite', 'is', 'traditionally', 'a', 'tethered', 'heavier-than-air', 'craft', 'with', 'wing']
['kites', 'were', 'invented', 'in', 'china', ',', 'where', 'materials', 'ideal', 'for']


TF-IDF Application:

In [None]:
from collections import Counter

# Empty term-frequency tables; they are filled per word in later cells.
intro_tf, history_tf = {}, {}

# Raw occurrence count of every token in each document.
intro_counts, history_counts = map(Counter, (intro_tokens, history_tokens))

print(intro_counts, history_counts, sep="\n")

Counter({'the': 26, 'a': 20, 'kite': 16, ',': 15, 'and': 10, 'of': 10, 'kites': 8, 'is': 7, 'in': 7, 'or': 6, 'wing': 5, 'to': 5, 'be': 5, 'as': 5, 'lift': 4, 'have': 4, 'may': 4, 'at': 3, 'so': 3, 'can': 3, 'also': 3, 'kiting': 3, 'are': 3, 'flown': 3, 'tethered': 2, 'craft': 2, 'with': 2, 'that': 2, 'air': 2, 'consists': 2, 'tethers': 2, 'anchors.': 2, 'often': 2, 'bridle': 2, 'wind': 2, "'s": 2, 'designed': 2, ';': 2, 'when': 2, 'for': 2, 'moving': 2, 'technical': 2, 'even': 2, 'called': 2, 'surface': 2, 'pressure': 2, 'drag': 2, 'force': 2, 'by': 2, 'which': 2, '.': 2, 'used': 2, 'power': 2, 'traditionally': 1, 'heavier-than-air': 1, 'surfaces': 1, 'react': 1, 'against': 1, 'create': 1, 'drag.': 1, 'wings': 1, 'guide': 1, 'face': 1, 'correct': 1, 'angle': 1, 'it.': 1, 'not': 1, 'needed': 1, 'sailplane': 1, 'launch': 1, 'tether': 1, 'meets': 1, 'single': 1, 'point.': 1, 'fixed': 1, 'untraditionally': 1, 'tether-set-coupled': 1, 'sets': 1, 'though': 1, 'system': 1, 'still': 1, 'kite.

- step1: calculate TF for word 'kite'
- step2: calculate IDF for word 'kite'
- step3: calculate TF-IDF for word 'kite'

In [None]:
# Term frequency of the word 'kite' in each text (intro and history):
# occurrences of 'kite' divided by the total number of tokens in that text.
intro_tf.update({'kite': intro_counts['kite'] / intro_total})
print(intro_tf)

history_tf.update({'kite': history_counts['kite'] / history_total})
print(history_tf)

{'kite': 0.0440771349862259}
{'kite': 0.020202020202020204}


In [None]:
# Inverse document frequency (IDF) for the word 'kite'.
# In this cell IDF is the plain ratio:
#   (number of documents) / (number of documents containing the word)

# the corpus consists of just two documents: kite_text and kite_history
total_num_docs = 2

# count how many of the tokenized documents contain 'kite'
num_docs_containing_kite = 0
for doc in (intro_tokens, history_tokens):
    num_docs_containing_kite += int('kite' in doc)
print(num_docs_containing_kite)
# a printed 2 means both intro_tokens and history_tokens contain the word 'kite'

# the corpus-level ratio is the same for both documents
intro_idf = {'kite': total_num_docs / num_docs_containing_kite}
history_idf = {'kite': total_num_docs / num_docs_containing_kite}
print(intro_idf, history_idf)


2
{'kite': 1.0} {'kite': 1.0}


So in both docs the IDF of the word 'kite' is 1.0.

In [None]:
# TF-IDF of 'kite' in the intro text = its term frequency times its IDF.
intro_tfidf = {'kite': intro_tf['kite'] * intro_idf['kite']}
intro_tfidf

{'kite': 0.0440771349862259}

- Use of TfidfVectorizer on a corpus:

In [None]:
# Toy corpus of three short sentences used to demonstrate TfidfVectorizer.
docs = [
    'the faster Harry got to the store, the faster and faster Harry would get home',
    'Harry is hairy and faster than jill',
    'jill is not as hairy as Harry',
]

# Vectorize the toy corpus with scikit-learn's TfidfVectorizer (default settings).
from sklearn.feature_extraction.text import TfidfVectorizer
corpus = docs

vectorizer = TfidfVectorizer()
# fit_transform returns a sparse matrix; the recorded output below shows it as
# 3x16 with 23 stored elements, i.e. one row for each of the 3 sentences in docs.
model = vectorizer.fit_transform(corpus)
model

<3x16 sparse matrix of type '<class 'numpy.float64'>'
	with 23 stored elements in Compressed Sparse Row format>

In [None]:
# Convert the sparse TF-IDF matrix to dense form and round to 2 decimals so it is readable.
print(model.todense().round(2))

[[0.16 0.   0.48 0.21 0.21 0.   0.25 0.21 0.   0.   0.   0.21 0.   0.64
  0.21 0.21]
 [0.37 0.   0.37 0.   0.   0.37 0.29 0.   0.37 0.37 0.   0.   0.49 0.
  0.   0.  ]
 [0.   0.75 0.   0.   0.   0.29 0.22 0.   0.29 0.29 0.38 0.   0.   0.
  0.   0.  ]]


- So we have converted each word in each sentence into a number.
- Here, a rarer word gets a higher TF-IDF value.

### Apply TfidfVectorizer on a dataset:

In [None]:
import pandas as pd
import numpy as np

# On Google Colab the reviews file is uploaded through Colab's file picker.
# Outside Colab the google.colab package cannot be imported, so instead of
# failing, fall back to a moviereviews.tsv already in the working directory.
try:
    from google.colab import files
except ImportError:
    uploaded = None  # nothing was uploaded; the local copy is read below
else:
    uploaded = files.upload()

# Tab-separated file; the recorded output shows 'label' and 'review' columns.
df = pd.read_csv('moviereviews.tsv', sep='\t')
df.head()

Saving moviereviews.tsv to moviereviews.tsv


Unnamed: 0,label,review
0,neg,how do films like mouse hunt get into theatres...
1,neg,some talented actresses are blessed with a dem...
2,pos,this has been an extraordinary year for austra...
3,pos,according to hollywood movies made in last few...
4,neg,my first press screening of 1998 and already i...


In [None]:
# Count the missing values in each column.
df.isna().sum()

label      0
review    35
dtype: int64

In [None]:
# Drop every row that has a missing value. Rebinding df to the frame returned
# by dropna() avoids inplace=True while leaving later cells with the same df.
df = df.dropna()

In [None]:
# Check class balance: number of reviews per label.
df['label'].value_counts()

neg    983
pos    982
Name: label, dtype: int64

In [None]:
from sklearn.model_selection import train_test_split

# features are the review texts, targets are their labels
x, y = df['review'], df['label']

# hold out 33% of the rows for testing; the fixed random_state makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.33,
    random_state=42,
)

#### sklearn pipelines code:

- from sklearn.feature_extraction.text import TfidfVectorizer
- vectorizer = TfidfVectorizer()
- model = vectorizer.fit_transform(X_train,y_train)
- from sklearn.linear_model import LogisticRegression
- lr = LogisticRegression()
- lr.fit(model, y_train)

In [None]:
# Rather than vectorizing and fitting in separate manual steps, chain the
# TF-IDF vectorizer and the classifier in a single sklearn Pipeline.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

text_clf_nb = Pipeline(
    steps=[
        ('tfidf', TfidfVectorizer()),
        ('clf', MultinomialNB()),
    ]
)
text_clf_nb.fit(X_train, y_train)

# Predict labels for the held-out reviews and show the first ten.
# (The variable name 'predctions' is kept as-is because later cells use it.)
predctions = text_clf_nb.predict(X_test)
print(predctions[:10])

['neg' 'neg' 'neg' 'neg' 'neg' 'pos' 'neg' 'neg' 'neg' 'neg']


In [None]:
# How many test reviews were predicted as each label.
pd.Series(predctions).value_counts()

neg    417
pos    232
dtype: int64

In [None]:
# Evaluate the predictions against the true test labels.
from sklearn.metrics import classification_report, confusion_matrix
# per-class precision / recall / f1-score / support, plus overall accuracy
print(classification_report(y_test,predctions))
# in the recorded output each row sums to that class's support, so rows are
# the true labels and columns the predicted labels
print(confusion_matrix(y_test,predctions))

              precision    recall  f1-score   support

         neg       0.73      0.94      0.82       322
         pos       0.92      0.65      0.76       327

    accuracy                           0.80       649
   macro avg       0.82      0.80      0.79       649
weighted avg       0.82      0.80      0.79       649

[[303  19]
 [114 213]]
