In [1]:
import pandas as pd 
import numpy as np 

In [2]:
# Load the people_wiki dataset from its CSV file and preview the first rows.
PEOPLE_WIKI_CSV = '../GitHub_data/people_wiki/people_wiki.csv'
people_df = pd.read_csv(PEOPLE_WIKI_CSV)
people_df.head()

Unnamed: 0,URI,name,text
0,<http://dbpedia.org/resource/Digby_Morrell>,Digby Morrell,digby morrell born 10 october 1979 is a former...
1,<http://dbpedia.org/resource/Alfred_J._Lewy>,Alfred J. Lewy,alfred j lewy aka sandy lewy graduated from un...
2,<http://dbpedia.org/resource/Harpdog_Brown>,Harpdog Brown,harpdog brown is a singer and harmonica player...
3,<http://dbpedia.org/resource/Franz_Rottensteiner>,Franz Rottensteiner,franz rottensteiner born in waidmannsfeld lowe...
4,<http://dbpedia.org/resource/G-Enka>,G-Enka,henry krvits born 30 december 1974 in tallinn ...


In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [4]:
# Build TF-IDF features from every article's text, with scikit-learn's
# built-in English stop words removed from the vocabulary.
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
text_tfidf = tfidf_vectorizer.fit_transform(raw_documents=people_df['text'])

In [5]:
import scipy

In [6]:
# Sanity check: L2 norm of every row of the TfidfVectorizer output.
# The submodule is imported explicitly because a bare `import scipy` is not
# guaranteed to load `scipy.sparse.linalg`; without this the attribute access
# below only works if some other package happened to import it first.
import scipy.sparse.linalg

scipy.sparse.linalg.norm(text_tfidf, axis=1)

array([1., 1., 1., ..., 1., 1., 1.])

In [7]:
from sklearn.feature_extraction.text import CountVectorizer

In [8]:
# Raw term counts per article (English stop words removed); this is the
# starting point for the hand-built TF-IDF below.
cv = CountVectorizer(stop_words='english')
text_cv = cv.fit_transform(raw_documents=people_df['text'])

In [9]:
# Binary document-term indicator: 1 wherever a term occurs at least once in a
# document. Multiplying the boolean comparison result by 1 makes it numeric.
non_zero_text_cv = (text_cv > 0) * 1

In [10]:
# Inverse document frequency per term: log(N / df), where N is the number of
# documents (rows) and df is the number of documents containing the term
# (column sums of the binary indicator matrix). No smoothing is applied.
n_docs = non_zero_text_cv.shape[0]
doc_freq = non_zero_text_cv.sum(axis=0)
idf = np.log(n_docs / doc_freq)

In [11]:
# Hand-built TF-IDF: scale each raw term count by that term's idf weight
# (element-wise product, with the single idf row applied to every document).
# No row normalisation is done here.
# NOTE(review): the `_nn` suffix presumably means "not normalised" — confirm.
text_tfidf_nn = text_cv.multiply(idf)

In [12]:
# Shape of the hand-built TF-IDF matrix, as (rows, columns).
text_tfidf_nn.shape

(59071, 548115)

In [13]:
# L2 norm of every row of the hand-built TF-IDF matrix, for comparison with
# the TfidfVectorizer output.
# The submodule is imported explicitly because a bare `import scipy` is not
# guaranteed to load `scipy.sparse.linalg`; without this the attribute access
# below only works if some other package happened to import it first.
import scipy.sparse.linalg

scipy.sparse.linalg.norm(text_tfidf_nn, axis=1)

array([77.38465859, 96.90653   , 68.5970974 , ..., 77.2497272 ,
       70.39147365, 75.75702447])

In [14]:
# Inspect which sparse container type the `multiply` call produced.
type(text_tfidf_nn)

scipy.sparse.coo.coo_matrix

In [15]:
# Create a KNN model
from sklearn.neighbors import KNeighborsClassifier

In [16]:
# Euclidean-distance KNN fitted on the hand-built (unnormalised) TF-IDF
# features, with each article's name as its label. In the cells visible here
# the model is only queried through kneighbors().
model_euclidean = KNeighborsClassifier(metric='euclidean')
model_euclidean.fit(X=text_tfidf_nn, y=people_df['name'])

KNeighborsClassifier(metric='euclidean')

In [17]:
# Goal of the next few cells: the 10 nearest neighbours of Barack Obama.

# Step 1: show Barack Obama's record so that its row index is visible.
people_df.loc[people_df['name'] == 'Barack Obama']

Unnamed: 0,URI,name,text
35817,<http://dbpedia.org/resource/Barack_Obama>,Barack Obama,barack hussein obama ii brk husen bm born augu...


In [18]:
# Extract Barack Obama's row from the hand-built TF-IDF matrix.
# The row position is looked up by name rather than hard-coded, so the cell
# keeps selecting the right article if the CSV is re-ordered or extended, and
# fails loudly (IndexError) if the name is absent. The matrix rows follow the
# row order of people_df because the counts were built from people_df['text'].
obama_row = np.flatnonzero((people_df['name'] == 'Barack Obama').to_numpy())[0]
# The matrix is in COO format, which cannot be sliced by row; convert to CSR.
obama_tfidf_nn = text_tfidf_nn.tocsr()[obama_row, :]

In [19]:
# Query Obama's 10 nearest neighbours. kneighbors() returns a
# (distances, indices) pair, so one call supplies both arrays and avoids
# running the same search over the whole corpus a second time.
euc_dist_matrix, euc_index_matrix = model_euclidean.kneighbors(obama_tfidf_nn, n_neighbors=10)
# A single query row was passed in, so row 0 holds its neighbours.
euc_neighbor_dist = euc_dist_matrix[0]
euc_neighbor_index = euc_index_matrix[0]

euc_neighbors = pd.DataFrame({'distance' : euc_neighbor_dist , 'index' : euc_neighbor_index})

# Attach article names: reset_index() exposes people_df's row labels as an
# 'index' column that the neighbour indices are joined against.
euc_neighbors = pd.merge(euc_neighbors , people_df['name'].reset_index() , on='index' , how='left')
euc_neighbors


Unnamed: 0,distance,index,name
0,0.0,35817,Barack Obama
1,105.491028,7914,Phil Schiliro
2,107.400537,38376,Samantha Power
3,107.424469,44681,Jesse Lee (politician)
4,107.865568,46811,Jeff Sessions
5,108.383341,6507,Bob Menendez
6,108.407927,33417,Tulsi Gabbard
7,108.76033,44368,Roland Grossenbacher
8,108.910817,38714,Eric Stern (politician)
9,108.96997,49944,Howard Dawson


In [20]:
# Pull the full people_df records for the Euclidean neighbours (matched by
# name). .copy() yields an independent frame so columns can be added to it
# without affecting people_df.
euc_obama_neighbors = people_df.loc[people_df['name'].isin(euc_neighbors['name'])].copy()

In [21]:
def find_length(row: str) -> int:
    """Return the number of whitespace-separated words in ``row``.

    Splitting on arbitrary whitespace (``str.split()`` with no separator)
    rather than on a single space means an empty string counts as 0 words,
    and repeated, leading or trailing whitespace does not create phantom
    empty "words".

    Parameters
    ----------
    row : str
        The text to measure.

    Returns
    -------
    int
        Word count of ``row``.
    """
    return len(row.split())
# Add a word-count column for each neighbour's article, then display the frame.
euc_obama_neighbors['text_len'] = euc_obama_neighbors['text'].map(find_length)
euc_obama_neighbors

Unnamed: 0,URI,name,text,text_len
6507,<http://dbpedia.org/resource/Bob_Menendez>,Bob Menendez,robert bob menendez born january 1 1954 is the...,220
7914,<http://dbpedia.org/resource/Phil_Schiliro>,Phil Schiliro,phil schiliro is an american political consult...,208
33417,<http://dbpedia.org/resource/Tulsi_Gabbard>,Tulsi Gabbard,tulsi gabbard born april 12 1981 is an america...,228
35817,<http://dbpedia.org/resource/Barack_Obama>,Barack Obama,barack hussein obama ii brk husen bm born augu...,540
38376,<http://dbpedia.org/resource/Samantha_Power>,Samantha Power,samantha power born september 21 1970 is an ir...,310
38714,<http://dbpedia.org/resource/Eric_Stern_(polit...,Eric Stern (politician),eric stern is the director of operations for t...,255
44368,<http://dbpedia.org/resource/Roland_Grossenbac...,Roland Grossenbacher,dr roland edouard grossenbacher is a swiss att...,201
44681,<http://dbpedia.org/resource/Jesse_Lee_(politi...,Jesse Lee (politician),jesse lee born 1979 was named the white house ...,216
46811,<http://dbpedia.org/resource/Jeff_Sessions>,Jeff Sessions,jefferson beauregard jeff sessions iii born de...,230
49944,<http://dbpedia.org/resource/Howard_Dawson>,Howard Dawson,howard a dawson jr born october 23 1922 in oko...,203


In [22]:
# Average article length (in words) over all ten rows, Obama included.
euc_obama_neighbors.loc[:, 'text_len'].mean()

261.1

In [31]:
# Average article length (in words) of the neighbours only, excluding Obama's
# own article.
euc_obama_neighbors.loc[euc_obama_neighbors['name'].ne('Barack Obama'), 'text_len'].mean()

230.11111111111111

In [23]:
# Re-fit the same estimator object on the TfidfVectorizer features. From this
# point on model_euclidean no longer reflects the earlier fit on text_tfidf_nn.
model_euclidean.fit(X=text_tfidf, y=people_df['name'])

KNeighborsClassifier(metric='euclidean')

In [24]:
# Goal of the next few cells: Obama's 10 nearest neighbours under the
# TfidfVectorizer features.

# Step 1: vectorise Barack Obama's article text with the already-fitted
# vectorizer to get his sparse feature row.
obama_tfidf = tfidf_vectorizer.transform(people_df.loc[people_df['name'] == 'Barack Obama', 'text'])

In [25]:
# kneighbors() returns a (distances, indices) pair, so one call supplies both
# arrays and avoids running the same search over the whole corpus twice.
dist_matrix, index_matrix = model_euclidean.kneighbors(obama_tfidf, n_neighbors=10)
# A single query row was passed in, so row 0 holds its neighbours.
neighbor_dist = dist_matrix[0]
neighbor_index = index_matrix[0]

In [26]:
# Tabulate the neighbour distances alongside their row indices.
neighbors = pd.DataFrame(
    {
        'distance': neighbor_dist,
        'index': neighbor_index,
    }
)

In [27]:
# Attach article names: reset_index() exposes people_df's row labels as an
# 'index' column that the neighbour indices are joined against.
neighbors = neighbors.merge(people_df['name'].reset_index(), on='index', how='left')
neighbors

Unnamed: 0,distance,index,name
0,0.0,35817,Barack Obama
1,1.165145,24478,Joe Biden
2,1.207369,38376,Samantha Power
3,1.21964,57108,Hillary Rodham Clinton
4,1.222509,38714,Eric Stern (politician)
5,1.236178,46140,Robert Gibbs
6,1.243057,18827,Henry Waxman
7,1.244667,44681,Jesse Lee (politician)
8,1.248296,6796,Eric Holder
9,1.251607,2412,Joe the Plumber


In [28]:
# Pull the full people_df records for these neighbours (matched by name).
# .copy() yields an independent frame so columns can be added to it without
# affecting people_df.
obama_neighbors = people_df.loc[people_df['name'].isin(neighbors['name'])].copy()

In [29]:
# Add a word-count column for each neighbour's article, then display the frame.
obama_neighbors['text_len'] = obama_neighbors['text'].map(find_length)
obama_neighbors

Unnamed: 0,URI,name,text,text_len
2412,<http://dbpedia.org/resource/Joe_the_Plumber>,Joe the Plumber,samuel joseph wurzelbacher wrzlbkr born decemb...,217
6796,<http://dbpedia.org/resource/Eric_Holder>,Eric Holder,eric himpton holder jr born january 21 1951 is...,232
18827,<http://dbpedia.org/resource/Henry_Waxman>,Henry Waxman,henry arnold waxman born september 12 1939 is ...,279
24478,<http://dbpedia.org/resource/Joe_Biden>,Joe Biden,joseph robinette joe biden jr dosf rbnt badn b...,414
35817,<http://dbpedia.org/resource/Barack_Obama>,Barack Obama,barack hussein obama ii brk husen bm born augu...,540
38376,<http://dbpedia.org/resource/Samantha_Power>,Samantha Power,samantha power born september 21 1970 is an ir...,310
38714,<http://dbpedia.org/resource/Eric_Stern_(polit...,Eric Stern (politician),eric stern is the director of operations for t...,255
44681,<http://dbpedia.org/resource/Jesse_Lee_(politi...,Jesse Lee (politician),jesse lee born 1979 was named the white house ...,216
46140,<http://dbpedia.org/resource/Robert_Gibbs>,Robert Gibbs,robert lane gibbs born march 29 1971 is an ame...,257
57108,<http://dbpedia.org/resource/Hillary_Rodham_Cl...,Hillary Rodham Clinton,hillary diane rodham clinton hlri dan rdm klnt...,580


In [30]:
# Average article length (in words) over all ten rows, Obama included.
obama_neighbors.loc[:, 'text_len'].mean()

330.0

In [32]:
# Average article length (in words) of the neighbours only, excluding Obama's
# own article.
obama_neighbors.loc[obama_neighbors['name'].ne('Barack Obama'), 'text_len'].mean()

306.6666666666667