In [2]:
import pandas as pd
import numpy as np
# Let pandas render up to 500 characters per cell so the article text below is readable.
pd.set_option("display.max_colwidth", 500)

# Load some text data - Wikipedia pages on people

In [3]:
# Read the people/article table from the CSV in the working directory.
df = pd.read_csv(filepath_or_buffer="people_wiki.csv")

In [4]:
# Peek at the first three rows.
df.head(n=3)

Unnamed: 0,URI,name,text
0,<http://dbpedia.org/resource/Digby_Morrell>,Digby Morrell,digby morrell born 10 october 1979 is a former australian rules footballer who played with the kangaroos and carlton in the australian football league aflfrom western australia morrell played his early senior football for west perth his 44game senior career for the falcons spanned 19982000 and he was the clubs leading goalkicker in 2000 at the age of 21 morrell was recruited to the australian football league by the kangaroos football club with its third round selection in the 2001 afl rookie...
1,<http://dbpedia.org/resource/Alfred_J._Lewy>,Alfred J. Lewy,alfred j lewy aka sandy lewy graduated from university of chicago in 1973 after studying psychiatry pharmacology and ophthalmology he is a full professor and vicechair of the department of psychiatry at ohsu oregon health science university and holds an md and phd prior to moving to oregon in 1981 lewy was at the national institute of mental health nimh in bethesda maryland working with senior colleague thomas wehr in oregon he has worked closely with robert l sack as of december 2005 he had...
2,<http://dbpedia.org/resource/Harpdog_Brown>,Harpdog Brown,harpdog brown is a singer and harmonica player who has been active in canadas blues scene since 1982 hailing from vancouver he crossed tens of thousands of miles playing club dates and festivals in canada the northwestern united states and germanyover the years he has issued seven cds in 1995 his home is where the harp is won the muddy award for the best nw blues release from the cascade blues association in portland oregon as well that year it was nominated for a canadian juno for the best ...


In [5]:
# Number of rows (one per person).
df.shape[0]

59071

# Explore the dataset and check out the text it contains

In [6]:
# Select the row(s) whose name is exactly "Barack Obama", then show the article text.
obama = df.loc[df["name"] == "Barack Obama"]

obama["text"]

35817    barack hussein obama ii brk husen bm born august 4 1961 is the 44th and current president of the united states and the first african american to hold the office born in honolulu hawaii obama is a graduate of columbia university and harvard law school where he served as president of the harvard law review he was a community organizer in chicago before earning his law degree he worked as a civil rights attorney and taught constitutional law at the university of chicago law school from 1992 to ...
Name: text, dtype: object

In [7]:
# Article text for George Clooney, selected by row mask and column label in one step.
df.loc[df["name"] == "George Clooney", "text"]

38514    george timothy clooney born may 6 1961 is an american actor writer producer director and activist he has received three golden globe awards for his work as an actor and two academy awards one for acting and the other for producingclooney made his acting debut on television in 1978 and later gained wide recognition in his role as dr doug ross on the longrunning medical drama er from 1994 to 1999 for which he received two emmy award nominations while working on er he began attracting a variety...
Name: text, dtype: object

# Get the word counts for the Obama article

In [8]:
from sklearn.feature_extraction.text import CountVectorizer
from collections import Counter

In [9]:
# Learn the vocabulary from the Obama article only, then count its words.
cv = CountVectorizer()
cv.fit(obama["text"])
wc = cv.transform(obama["text"])

In [10]:
# Build a term -> count mapping from row 0 of `wc` (the only row: the Obama article).
c = Counter()
for term, column in cv.vocabulary_.items():
    c[term] = wc[0, column]

# Sort the word counts for the Obama article

In [11]:
# Ten most frequent terms, highest count first.
c.most_common(n=10)

[('the', 40),
 ('in', 30),
 ('and', 21),
 ('of', 18),
 ('to', 14),
 ('his', 11),
 ('obama', 9),
 ('act', 8),
 ('he', 7),
 ('law', 6)]

# Compute TF-IDF for the corpus

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords

In [13]:
# Un-normalised TF-IDF (norm=None) over the whole corpus, ignoring NLTK's English stop words.
tfidf = TfidfVectorizer(stop_words=stopwords.words("english"), norm=None)
tfidf.fit(df["text"])

data = tfidf.transform(df["text"])

In [14]:
# Inverse vocabulary: column index -> term.
tfidf_voc_inv = dict(zip(tfidf.vocabulary_.values(), tfidf.vocabulary_.keys()))

# Examine the TF-IDF for the Obama article

In [15]:
# Positional row number(s) of the Obama article, for indexing the feature matrix.
obama_labels = df.loc[df["name"] == "Barack Obama"].index
bobama_ix = df.index.get_indexer_for(obama_labels)

In [16]:
# Slice the Obama row out of the corpus matrix once, instead of re-indexing the
# full matrix for every term inside the comprehension.
obama_row = data[bobama_ix, :]
# Term -> TF-IDF weight for every non-zero column of that row.
c = Counter({tfidf_voc_inv[key]: obama_row[0, key] for key in np.nonzero(obama_row)[1]})

In [17]:
# Ten highest-weighted terms for the Obama article.
c.most_common(n=10)

[('obama', 52.277113834307315),
 ('act', 35.674051187909924),
 ('iraq', 21.741727931276476),
 ('law', 20.721855882367674),
 ('control', 18.884330378434285),
 ('us', 17.592044264666814),
 ('ordered', 17.526980051210632),
 ('military', 17.114203144108135),
 ('democratic', 16.409249536745939),
 ('involvement', 15.780836746511332)]

# Manually compute distances between a few people

In [18]:
# Rows for the two people Obama is compared against ...
clinton = df.loc[df["name"] == "Bill Clinton"]
beckham = df.loc[df["name"] == "David Beckham"]

# ... and their positional row numbers in the feature matrix.
clinton_ix = df.index.get_indexer_for(clinton.index)
beckham_ix = df.index.get_indexer_for(beckham.index)

# Is Obama closer to Clinton than to Beckham?

In [19]:
from sklearn.metrics import pairwise_distances

# Cosine distance between the Obama and Bill Clinton TF-IDF rows.
pairwise_distances(data[bobama_ix, :], data[clinton_ix, :],
                   metric="cosine")

array([[ 0.80338951]])

In [21]:
# Cosine distance between the Obama and David Beckham TF-IDF rows.
pairwise_distances(X=data[bobama_ix, :],
                   Y=data[beckham_ix, :],
                   metric="cosine")

array([[ 0.97175843]])

# Build a nearest neighbor model for document retrieval

In [22]:
from sklearn.neighbors import NearestNeighbors

In [34]:
# Refit TF-IDF without norm=None (i.e. with the vectorizer's default normalisation).
# NOTE: this rebinds `tfidf` and `data`, replacing the un-normalised versions built earlier.
tfidf = TfidfVectorizer(stop_words=stopwords.words("english"))
tfidf.fit(df["text"])

data = tfidf.transform(df["text"])

# Nearest-neighbour index over the rows of `data`, using the estimator's defaults.
nn = NearestNeighbors()
nn.fit(data)

In [35]:
# Query the index with the Obama row; both results have one row per query.
dists, neigh_ixs = nn.kneighbors(data[bobama_ix, :])

# Pair each neighbour's distance with its name (looked up by positional row number).
pd.DataFrame({
    "distance": dists[0],
    "names": df["name"].iloc[neigh_ixs[0]],
})

Unnamed: 0,distance,names
35817,0.0,Barack Obama
24478,1.156099,Joe Biden
57108,1.206817,Hillary Rodham Clinton
38376,1.207333,Samantha Power
38714,1.221555,Eric Stern (politician)


# Other examples of document retrieval

In [38]:
# Locate the Taylor Swift article and its positional row number.
swift = df.loc[df["name"] == 'Taylor Swift']
swift_ix = df.index.get_indexer_for(swift.index)

# Nearest neighbours of that row.
dists, neigh_ixs = nn.kneighbors(data[swift_ix, :])

pd.DataFrame({
    "distance": dists[0],
    "names": df["name"].iloc[neigh_ixs[0]],
})

Unnamed: 0,distance,names
54264,0.0,Taylor Swift
317,1.181461,Carrie Underwood
29297,1.191863,Kelly Clarkson
9379,1.193013,Al Swift
25403,1.193043,Ed Sheeran


In [39]:
# Locate the Angelina Jolie article and its positional row number.
jolie = df.loc[df["name"] == 'Angelina Jolie']
jolie_ix = df.index.get_indexer_for(jolie.index)

# Nearest neighbours of that row.
dists, neigh_ixs = nn.kneighbors(data[jolie_ix, :])

pd.DataFrame({
    "distance": dists[0],
    "names": df["name"].iloc[neigh_ixs[0]],
})

Unnamed: 0,distance,names
39521,0.0,Angelina Jolie
24426,1.174771,Brad Pitt
16625,1.242677,Keith Jolie
21644,1.251407,Jodie Foster
34756,1.254485,Maggie Smith


In [53]:
def neighbors(name, data, model, frame=None):
    """Return the nearest neighbours of the article whose name is `name`.

    Parameters
    ----------
    name : str
        Value to match exactly against the ``name`` column.
    data : matrix supporting ``data[rows, :]``
        Feature rows; the rows at the matching positions of `frame` are
        passed to ``model.kneighbors``.
    model : object with a ``kneighbors`` method
        Called as ``model.kneighbors(query_rows)`` and expected to return
        ``(distances, indices)``, each with one row per query row.
    frame : pandas.DataFrame, optional
        Table containing the ``name`` column. Defaults to the notebook-level
        ``df`` so existing three-argument calls keep working.

    Returns
    -------
    pandas.DataFrame
        Columns ``distance`` and ``names`` for the neighbours of the first
        matching row, indexed by the neighbours' labels in `frame`.

    Raises
    ------
    ValueError
        If no row of `frame` has the requested name.
    """
    if frame is None:
        frame = df  # fall back to the notebook-level article table

    matches = frame[frame['name'] == name]
    if matches.empty:
        # Fail here with a clear message rather than deep inside the model
        # when it is handed an empty query.
        raise ValueError("no article named {!r}".format(name))

    # Positional row numbers, needed to index the feature matrix.
    row_positions = frame.index.get_indexer_for(matches.index)

    dists, neigh_ixs = model.kneighbors(data[row_positions, :])

    # Row 0 holds the neighbours of the first (normally only) matching article.
    return pd.DataFrame({"distance": dists[0],
                         "names": frame.iloc[neigh_ixs[0], frame.columns.get_loc("name")]})

In [41]:
# `neighbors` requires the feature matrix and the fitted model explicitly;
# use the TF-IDF matrix `data` and the index `nn` built from it.
neighbors("Arnold Schwarzenegger", data, nn)

Unnamed: 0,distance,names
16018,0.0,Arnold Schwarzenegger
58965,1.259826,Bonnie Garcia
35293,1.264656,Paul Grant (bodybuilder)
47709,1.283801,Gray Davis
8050,1.28498,James Tramel


# Compare top words according to word counts to TF-IDF

In [42]:
# Row for Elton John; display it.
elton = df.loc[df["name"] == "Elton John"]
elton

Unnamed: 0,URI,name,text
19923,<http://dbpedia.org/resource/Elton_John>,Elton John,sir elton hercules john cbe born reginald kenneth dwight 25 march 1947 is an english singer songwriter composer pianist record producer and occasional actor he has worked with lyricist bernie taupin as his songwriter partner since 1967 they have collaborated on more than 30 albums to datein his fivedecade career elton john has sold more than 300 million records making him one of the bestselling music artists in the world he has more than fifty top 40 hits including seven consecutive no 1 us ...


In [44]:
# word count
# Fit a vectorizer on the Elton John article itself. Reusing `cv` here would
# only count words from whatever text `cv` was last fitted on, silently
# dropping every word that is missing from that vocabulary.
elton_vectorizer = CountVectorizer()
elton_cv = elton_vectorizer.fit_transform(elton.text)

# Term -> count for row 0 (the only row: the Elton John article).
c = Counter({word: elton_cv[0, key] for word, key in elton_vectorizer.vocabulary_.items()})
c.most_common(10)

[('the', 27),
 ('in', 18),
 ('and', 15),
 ('of', 13),
 ('has', 9),
 ('he', 7),
 ('john', 7),
 ('on', 6),
 ('for', 5),
 ('is', 4)]

In [45]:
# tf-idf
# TF-IDF row for the Elton John article from the fitted vectorizer.
elton_tfidf = tfidf.transform(elton.text)

# Positional row number of the article; kept for indexing `data` elsewhere.
elton_ix = df.index.get_indexer_for(elton.index)

# Read the weights from the row computed above instead of re-slicing the
# whole corpus matrix once per term.
c = Counter({tfidf_voc_inv[key]: elton_tfidf[0, key] for key in elton_tfidf.nonzero()[1]})

c.most_common(10)

[('billboard', 0.21477858932523641),
 ('john', 0.2111476193944441),
 ('elton', 0.20637394137167683),
 ('furnish', 0.20250188284209453),
 ('songwriters', 0.13352496635258507),
 ('award', 0.13271539385555797),
 ('top', 0.13205867760125098),
 ('since', 0.12379913459911732),
 ('aids', 0.12360302687687517),
 ('million', 0.12153923778690133)]

# Measuring distance

In [46]:
# Rows for the two people Elton John is compared against ...
vic = df.loc[df["name"] == "Victoria Beckham"]
paul = df.loc[df["name"] == "Paul McCartney"]

# ... and their positional row numbers in the feature matrix.
vic_ix = df.index.get_indexer_for(vic.index)
paul_ix = df.index.get_indexer_for(paul.index)

In [47]:
# Cosine distance: Elton John vs. Victoria Beckham.
pairwise_distances(X=data[elton_ix, :],
                   Y=data[vic_ix, :],
                   metric="cosine")

array([[ 0.95172145]])

In [48]:
# Cosine distance: Elton John vs. Paul McCartney.
pairwise_distances(X=data[elton_ix, :],
                   Y=data[paul_ix, :],
                   metric="cosine")

array([[ 0.80846944]])

# Building nearest neighbors models with different input features and setting the distance metric

## word count model

In [59]:
# Raw word counts over the whole corpus (this rebinds `cv` to a corpus-wide vectorizer).
cv = CountVectorizer()
data_cv = cv.fit_transform(df["text"])

# Brute-force nearest neighbours under cosine distance on the count vectors.
nn_cv = NearestNeighbors(algorithm="brute", metric="cosine")
nn_cv.fit(data_cv)

## tf-idf model

In [60]:
# TF-IDF over the whole corpus with the vectorizer's default settings (this rebinds `tfidf`).
tfidf = TfidfVectorizer()
data_tfidf = tfidf.fit_transform(df["text"])

# Brute-force nearest neighbours under cosine distance on the TF-IDF vectors.
nn_tfidf = NearestNeighbors(algorithm="brute", metric="cosine")
nn_tfidf.fit(data_tfidf)

In [61]:
# Elton John's neighbours under the word-count model.
neighbors("Elton John", data=data_cv, model=nn_cv)

Unnamed: 0,distance,names
19923,2.442491e-15,Elton John
41668,0.1687792,Cliff Richard
25798,0.171841,Sandro Petrone
28825,0.1744907,Rod Stewart
37447,0.184013,Roger Daltrey


In [62]:
# Elton John's neighbours under the TF-IDF model.
neighbors("Elton John", data=data_tfidf, model=nn_tfidf)

Unnamed: 0,distance,names
19923,0.0,Elton John
28825,0.589361,Rod Stewart
31595,0.633658,Phil Collins
27793,0.636524,Adele
26049,0.642397,Sting (musician)


In [63]:
# Victoria Beckham's neighbours under the word-count model.
neighbors("Victoria Beckham", data=data_cv, model=nn_cv)

Unnamed: 0,distance,names
50411,3.330669e-16,Victoria Beckham
669,0.2115428,Mary Fitzgerald (artist)
45129,0.2185431,Adrienne Corri
39504,0.2218932,Beverly Jane Fry
13937,0.2224486,Raman Mundair


In [64]:
# Victoria Beckham's neighbours under the TF-IDF model.
neighbors("Victoria Beckham", data=data_tfidf, model=nn_tfidf)

Unnamed: 0,distance,names
50411,0.0,Victoria Beckham
23386,0.546477,David Beckham
17264,0.718422,Mel B
39144,0.745956,Stephen Dow Beckham
5385,0.751848,Hilary Alexander
