<a href="https://colab.research.google.com/github/baut-jc/ddds-c18/blob/lectures/Lectures/5-3c_K_Nearest_Neighbors_end.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd

from textblob import TextBlob
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors

# Let pandas render up to 100 columns before truncating a displayed frame.
pd.options.display.max_columns = 100

import nltk
# Optional NLTK resource, currently disabled.
# nltk.download('omw-1.4')
# Tokenizer data; the log below shows it being unpacked under tokenizers/punkt_tab.
nltk.download('punkt_tab')
# Optional NLTK resource, currently disabled.
# nltk.download('averaged_perceptron_tagger_eng')


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

### Gary Example


In [None]:
# Toy data set: nine people with an age, a gender (as text and as a 0/1 code)
# and a favourite band. Built in one call; column order follows the dict order.
my_df = pd.DataFrame({
  "names": ['Amantha', 'Brendon', 'Nate', 'Sam', 'Betty', 'Christine', 'Gin', 'Ken', 'Susy'],
  "ages": [ 19, 23, 24, 30, 16, 18, 22, 18, 15 ],
  "genders_txt": "female male male male female female female male female".split(),
  "genders": [ 1, 0, 0, 0, 1, 1, 1, 0, 1 ],
  "music_band_txt": "Coldplay Coldplay LinkinPark LinkinPark Coldplay LinkinPark LinkinPark Coldplay Coldplay".split(),
})

my_df


Unnamed: 0,names,ages,genders_txt,genders,music_band_txt
0,Amantha,19,female,1,Coldplay
1,Brendon,23,male,0,Coldplay
2,Nate,24,male,0,LinkinPark
3,Sam,30,male,0,LinkinPark
4,Betty,16,female,1,Coldplay
5,Christine,18,female,1,LinkinPark
6,Gin,22,female,1,LinkinPark
7,Ken,18,male,0,Coldplay
8,Susy,15,female,1,Coldplay


In [None]:
# Preview the integer-typed columns (ages, genders) that are used as features.
my_df.select_dtypes(include="int")

Unnamed: 0,ages,genders
0,19,1
1,23,0
2,24,0
3,30,0
4,16,1
5,18,1
6,22,1
7,18,0
8,15,1


Fit nearest neighbors


In [None]:
# Fit the neighbour search on the integer-coded columns only; the text columns are left out.
nn = NearestNeighbors().fit(X = my_df.select_dtypes(include="int"))


Get nearest neighbors distances


In [None]:
# Query point: one row with the same feature columns the model was fitted on
# (age 23, gender code 0).
gary = pd.DataFrame([{"ages": 23, "genders": 0}])
gary

Unnamed: 0,ages,genders
0,23,0


In [None]:
# Look up Gary's 3 closest rows; both results have one row per query point.
distances, indices = nn.kneighbors(gary, n_neighbors = 3)


In [None]:
# Row 0 belongs to the single query (Gary). Squaring the distances gives the whole
# numbers shown below (0, 1, 2), i.e. the summed squared age/gender differences.
distances[0]**2


array([0., 1., 2.])

In [None]:
indices[0]


array([1, 2, 6])

Get people matching index


In [None]:
my_df.iloc[indices[0]]


Unnamed: 0,names,ages,genders_txt,genders,music_band_txt
1,Brendon,23,male,0,Coldplay
2,Nate,24,male,0,LinkinPark
6,Gin,22,female,1,LinkinPark


Vote

In [None]:
# Majority vote: the most frequent band among the selected neighbours.
my_df["music_band_txt"].iloc[indices[0]].mode().iloc[0]

'LinkinPark'

Repeat with K = all rows

In [None]:
# Ask for every row as a neighbour so the complete ranking for Gary can be inspected.
d_i = nn.kneighbors(gary, n_neighbors = my_df.shape[0])
# kneighbors returns a (distances, indices) pair with one row per query point.
# Take row 0 of each array separately: stacking both into a single np.array cast
# the integer row positions to float and required a hard-coded (2, 9) reshape that
# broke as soon as the number of rows changed.
distances, indices = (per_query[0] for per_query in d_i)
distances**2, indices


(array([ 0.,  1.,  2., 17., 25., 26., 49., 50., 65.]),
 array([1., 2., 6., 0., 7., 5., 3., 4., 8.]))

In [None]:
# Rank every person by distance to Gary and show the squared distance beside each row.
# `indices` holds row *positions*, so cast them to int (they may arrive as floats) and
# translate them into my_df's own labels for the join. This keeps integer row labels
# in the result and no longer relies on my_df having a default 0..n-1 index.
row_positions = np.asarray(indices).astype(int)
(
  my_df
    .iloc[row_positions]
    .join( pd.DataFrame( { "distances^2": distances**2 }, index = my_df.index[row_positions] ) )
)

Unnamed: 0,names,ages,genders_txt,genders,music_band_txt,distances^2
1.0,Brendon,23,male,0,Coldplay,0.0
2.0,Nate,24,male,0,LinkinPark,1.0
6.0,Gin,22,female,1,LinkinPark,2.0
0.0,Amantha,19,female,1,Coldplay,17.0
7.0,Ken,18,male,0,Coldplay,25.0
5.0,Christine,18,female,1,LinkinPark,26.0
3.0,Sam,30,male,0,LinkinPark,49.0
4.0,Betty,16,female,1,Coldplay,50.0
8.0,Susy,15,female,1,Coldplay,65.0


Display the vote for various values of $K \in \{1, 3, 5, 7, 9\}$

In [None]:
# Bands of all neighbours, nearest first; selected once and sliced for every K.
ranked_bands = my_df.iloc[indices]["music_band_txt"]
for k in (1, 3, 5, 7, 9):
  # The K closest people vote; the most frequent band wins.
  vote = ranked_bands.iloc[:k].mode()[0]
  print(f"K = {k} : {vote}")


K = 1 : Coldplay
K = 3 : LinkinPark
K = 5 : Coldplay
K = 7 : LinkinPark
K = 9 : Coldplay


# NLP

If our text data are unlabelled (as is often the case in NLP), we can use KNN to identify documents that are similar to a given document.  In this example, our documents will be sentences and the given document will be the first sentence.

In [None]:
%%capture
# Run TextBlob's corpus downloader in a shell; %%capture keeps its log out of the notebook.
!python -m textblob.download_corpora


In [None]:
# Three tiny "documents". The last one uses plural nouns (students, guitarists)
# and has no trailing period, unlike the first two.
sentences_orig = [
  'Jen is a good student.',
  'Jen is also a great guitarist.',
  'Good students can sometimes be good guitarists',
]
sentences_orig


['Jen is a good student.',
 'Jen is also a great guitarist.',
 'Good students can sometimes be good guitarists']

# Data Cleaning
We want to singularize guitarists and students.

In [None]:
# Wrap the last sentence in a TextBlob to get at its individual words.
sentence_last_tb = TextBlob(sentences_orig[-1])
# Singularize word by word. Every token is processed, not only the nouns,
# which is why the output also turns "sometimes" into "sometime".
sentence_last_singular = list(map(lambda word: word.singularize(), sentence_last_tb.words))
# Glue the words back together into one string.
sentence_last_clean = ' '.join(sentence_last_singular)
sentence_last_clean


'Good student can sometime be good guitarist'

In [None]:
# First two sentences unchanged, followed by the singularized version of the last one.
sentences_clean = [*sentences_orig[:2], sentence_last_clean]
sentences_clean

['Jen is a good student.',
 'Jen is also a great guitarist.',
 'Good student can sometime be good guitarist']

## Bag of Words Using CountVectorizer

Perform the count transformation


In [None]:
# Bag of words: learn the vocabulary (English stop words removed) ...
vectorizer = CountVectorizer(stop_words='english')
vectorizer.fit(sentences_clean)
# ... then count each vocabulary term per sentence.
bow_matrix = vectorizer.transform(sentences_clean)


In [None]:
type(bow_matrix), bow_matrix.shape


(scipy.sparse._csr.csr_matrix, (3, 5))

In [None]:
bow_matrix.toarray()


array([[1, 0, 0, 1, 1],
       [0, 1, 1, 1, 0],
       [2, 0, 1, 0, 1]])

## TF-IDF using BoW


Perform the TF-IDF transformation


In [None]:
# Note: despite its name, tf_idf_matrix holds the fitted transformer object;
# tf_idf_jen is the resulting TF-IDF weighted matrix.
tf_idf_matrix = TfidfTransformer().fit(bow_matrix)
tf_idf_jen = tf_idf_matrix.transform(bow_matrix)


In [None]:
type(tf_idf_jen), tf_idf_jen.shape


(scipy.sparse._csr.csr_matrix, (3, 5))

In [None]:
tf_idf_jen.toarray()


array([[0.57735027, 0.        , 0.        , 0.57735027, 0.57735027],
       [0.        , 0.68091856, 0.51785612, 0.51785612, 0.        ],
       [0.81649658, 0.        , 0.40824829, 0.        , 0.40824829]])

Print out results in a dataframe


In [None]:
# Dense, labelled view of the TF-IDF weights: one row per sentence, one column per term.
tf_df = pd.DataFrame(tf_idf_jen.toarray(), columns = vectorizer.get_feature_names_out())
tf_df


Unnamed: 0,good,great,guitarist,jen,student
0,0.57735,0.0,0.0,0.57735,0.57735
1,0.0,0.680919,0.517856,0.517856,0.0
2,0.816497,0.0,0.408248,0.0,0.408248


> Note: Converting a sparse matrix to a data frame is NOT something you will normally do, especially for large matrices.

## K Nearest Neighbors

Fit nearest neighbors


In [None]:
# Re-use the name `nn` for a new neighbour search fitted on the sentence TF-IDF vectors.
nn = NearestNeighbors().fit(X = tf_idf_jen)


Create the reference matrix from the tf_idf matrix


In [None]:
# Slice (rather than index) the first row so the 2-D (1, n_terms) shape is explicit.
sent0 = tf_idf_jen[0:1]
sent0.shape

(1, 5)

Or ...

Create the reference matrix from the data frame


In [None]:
# Selecting with a list keeps the row two-dimensional, giving a (1, n_terms) array.
sent0 = tf_df.iloc[[0]].to_numpy()
sent0.shape

(1, 5)

Get nearest neighbors distances


In [None]:
# Two nearest sentences to the reference sentence; the closest hit is the sentence itself (distance 0).
distances, indices = nn.kneighbors(sent0, n_neighbors = 2)


In [None]:
distances


array([[0.        , 0.76536686]])

In [None]:
indices


array([[0, 2]])

Pull out the original sentences given the indices.

In [None]:
# Using list comprehension: walk the neighbour indices in their ranked order so the
# sentences come back nearest-first. Filtering enumerate(sentences_orig) instead
# returned them in corpus order and lost the ranking.
[ sentences_orig[i] for i in indices[0] ]

['Jen is a good student.', 'Good students can sometimes be good guitarists']

In [None]:
# Same lookup through NumPy: index an array of the sentences with the whole
# (n_queries, n_neighbors) index array, which keeps that 2-D shape.
np.asarray(sentences_orig)[indices]


array([['Jen is a good student.',
        'Good students can sometimes be good guitarists']], dtype='<U46')

# Another Example - Using Wikipedia API

## Get text and clean

Install Wikipedia API

In [None]:
%%capture
!pip3 install wikipedia-api

In [None]:
import wikipediaapi

Pull out page from Wikipedia


In [None]:
# https://en.wikipedia.org/wiki/Munchkin
topic = 'munchkin'
# Wikimedia asks API clients to identify themselves with a descriptive User-Agent;
# a placeholder value may be throttled or blocked.
wikip = wikipediaapi.Wikipedia(
  user_agent = 'ddds-c18-knn-lecture (https://github.com/baut-jc/ddds-c18)',
  language = 'en',
)
page_ex = wikip.page(topic)
# Fail loudly here instead of letting an empty text flow into the cleaning steps.
if not page_ex.exists():
  raise ValueError(f"Wikipedia page not found for topic {topic!r}")
wiki_text = page_ex.text
# Preview only: the full article is long and would swamp the notebook output.
wiki_text[:500]


'A Munchkin is a native of the fictional Munchkin Country in the Oz books by American author L. Frank Baum. Although a common fixture in Germanic fairy tales, they are introduced to modern audiences with the first appearance in the classic children\'s novel The Wonderful Wizard of Oz (1900) where they welcome Dorothy Gale to their city in Oz. The Munchkins are described as being the same height as Dorothy and they wear only shades of blue clothing, as blue is the Munchkins\' favorite color. Blue is also the predominating color that officially represents the eastern quadrant in the Land of Oz. The Munchkins have appeared in various media, including the 1939 film The Wizard of Oz, as well as in various other films and comedy acts.\n\nConcept\nWhile Baum may have written about it, there are no surviving notes for the composition of The Wonderful Wizard of Oz. The lack of this information has resulted in speculation of the term origins he used in the book, which include the word Munchkin. 

Lowercase the text and replace newline characters with spaces before doing any further processing. Strip the possessive `'s`, then remove any remaining apostrophes, parentheses, and double quotes.


In [None]:
# Normalise the article text:
#   1. lowercase everything,
#   2. drop the possessive "'s" (must happen while the apostrophe is still there),
#   3. in one pass, turn newlines into spaces and delete the remaining
#      apostrophes, parentheses and double quotes.

wiki_text_clean = (
  wiki_text
  .lower()
  .replace("'s", "")
  .translate(str.maketrans({"\n": " ", "'": None, "(": None, ")": None, '"': None}))
)
wiki_text_clean


'a munchkin is a native of the fictional munchkin country in the oz books by american author l. frank baum. although a common fixture in germanic fairy tales, they are introduced to modern audiences with the first appearance in the classic children novel the wonderful wizard of oz 1900 where they welcome dorothy gale to their city in oz. the munchkins are described as being the same height as dorothy and they wear only shades of blue clothing, as blue is the munchkins favorite color. blue is also the predominating color that officially represents the eastern quadrant in the land of oz. the munchkins have appeared in various media, including the 1939 film the wizard of oz, as well as in various other films and comedy acts.  concept while baum may have written about it, there are no surviving notes for the composition of the wonderful wizard of oz. the lack of this information has resulted in speculation of the term origins he used in the book, which include the word munchkin. baum resea

Convert to textblob

In [None]:
# Wrap the cleaned text in a TextBlob; later cells read its `.sentences`.
wiki_blob = TextBlob(wiki_text_clean)
wiki_blob

TextBlob("a munchkin is a native of the fictional munchkin country in the oz books by american author l. frank baum. although a common fixture in germanic fairy tales, they are introduced to modern audiences with the first appearance in the classic children novel the wonderful wizard of oz 1900 where they welcome dorothy gale to their city in oz. the munchkins are described as being the same height as dorothy and they wear only shades of blue clothing, as blue is the munchkins favorite color. blue is also the predominating color that officially represents the eastern quadrant in the land of oz. the munchkins have appeared in various media, including the 1939 film the wizard of oz, as well as in various other films and comedy acts.  concept while baum may have written about it, there are no surviving notes for the composition of the wonderful wizard of oz. the lack of this information has resulted in speculation of the term origins he used in the book, which include the word munchkin. b

Only look at the first 20 sentences


In [None]:
len(wiki_blob.sentences)

109

In [None]:
# Keep just the first 20 of the article's sentences for this example.
my_sentences = wiki_blob.sentences[:20]
my_sentences


[Sentence("a munchkin is a native of the fictional munchkin country in the oz books by american author l. frank baum."),
 Sentence("although a common fixture in germanic fairy tales, they are introduced to modern audiences with the first appearance in the classic children novel the wonderful wizard of oz 1900 where they welcome dorothy gale to their city in oz."),
 Sentence("the munchkins are described as being the same height as dorothy and they wear only shades of blue clothing, as blue is the munchkins favorite color."),
 Sentence("blue is also the predominating color that officially represents the eastern quadrant in the land of oz."),
 Sentence("the munchkins have appeared in various media, including the 1939 film the wizard of oz, as well as in various other films and comedy acts."),
 Sentence("concept while baum may have written about it, there are no surviving notes for the composition of the wonderful wizard of oz."),
 Sentence("the lack of this information has resulted in spe

Singularize and convert back to string


In [None]:
# Singularize each word and rebuild every sentence as a plain string.
# A new list is built instead of overwriting items in place, and entries that are
# already plain strings are passed through unchanged, so re-running this cell is
# harmless (previously a second run failed because str has no `.words`).
# NOTE: singularize() is applied to every token, not only nouns, so words such as
# "this", "has" and "various" come out as "thi", "ha" and "variou" in the output.
my_sentences = [
  ' '.join(word.singularize() for word in sentence.words) if hasattr(sentence, 'words') else sentence
  for sentence in my_sentences
]
my_sentences


['a munchkin is a native of the fictional munchkin country in the oz book by american author l frank baum',
 'although a common fixture in germanic fairy tale they are introduced to modern audience with the first appearance in the classic child novel the wonderful wizard of oz 1900 where they welcome dorothy gale to their city in oz',
 'the munchkin are described a being the same height a dorothy and they wear only shade of blue clothing a blue is the munchkin favorite color',
 'blue is also the predominating color that officially represent the eastern quadrant in the land of oz',
 'the munchkin have appeared in variou medium including the 1939 film the wizard of oz a well a in variou other film and comedy act',
 'concept while baum may have written about it there are no surviving note for the composition of the wonderful wizard of oz',
 'the lack of thi information ha resulted in speculation of the term origin he used in the book which include the word munchkin',
 'baum researcher bri

## TF-IDF without using BoW

Perform the TF-IDF Vectorization


In [None]:
# TfidfVectorizer goes from raw strings to TF-IDF weights directly.
# As above, tf_idf_matrix names the fitted vectorizer and tf_idf the weighted matrix.
tf_idf_matrix = TfidfVectorizer(stop_words = 'english')
tf_idf_matrix.fit(my_sentences)
tf_idf = tf_idf_matrix.transform(my_sentences)


In [None]:
tf_idf.shape

(20, 173)

Print out results in a data frame


In [None]:
# Raise the row limit so the transposed table (one row per term) is shown in full.
pd.set_option('display.max_rows', 200)

# One row per sentence, one column per vocabulary term; transposed for display
# so that terms run down the page.
results_df = pd.DataFrame(tf_idf.toarray(), columns = tf_idf_matrix.get_feature_names_out())
results_df.T


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
13th,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.307943,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1900,0.0,0.206757,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.282341,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1939,0.0,0.0,0.0,0.0,0.256986,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
act,0.0,0.0,0.0,0.0,0.256986,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
american,0.361256,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
appearance,0.0,0.235214,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
appeared,0.0,0.0,0.0,0.0,0.256986,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
attebery,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.266199,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
attend,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.282903
attracting,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.321201,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## K Nearest Neighbors

Fit nearest neighbors


In [None]:
# Re-use the name `nn` once more, now fitted on the Wikipedia sentence vectors.
nn = NearestNeighbors().fit(X = tf_idf)


Get nearest neighbors distances to first sentence


In [None]:
# Six nearest sentences to the first sentence; the first hit is that sentence itself (distance 0).
distances, indices = nn.kneighbors(tf_idf[0], n_neighbors = 6)


In [None]:
distances


array([[0.        , 1.14391441, 1.26882977, 1.32678985, 1.32992975,
        1.33659313]])

In [None]:
indices


In [None]:
# Nearest sentences in their cleaned (singularized) form; the next cell shows the same rows from the original text.
np.asarray(my_sentences)[indices]


In [None]:
# The same neighbours, looked up in the sentences as they were before singularizing.
np.array(list(map(str, wiki_blob.sentences[:20])))[indices]