# Document retrieval from wikipedia data

In [1]:
import pandas as pd
import numpy as np

# Load some text data - from wikipedia, pages on people

In [2]:
people = pd.read_csv('people_wiki.csv')

Data contains:  link to wikipedia article, name of person, text of article.

In [3]:
people.head()

Unnamed: 0,URI,name,text
0,<http://dbpedia.org/resource/Digby_Morrell>,Digby Morrell,digby morrell born 10 october 1979 is a former...
1,<http://dbpedia.org/resource/Alfred_J._Lewy>,Alfred J. Lewy,alfred j lewy aka sandy lewy graduated from un...
2,<http://dbpedia.org/resource/Harpdog_Brown>,Harpdog Brown,harpdog brown is a singer and harmonica player...
3,<http://dbpedia.org/resource/Franz_Rottensteiner>,Franz Rottensteiner,franz rottensteiner born in waidmannsfeld lowe...
4,<http://dbpedia.org/resource/G-Enka>,G-Enka,henry krvits born 30 december 1974 in tallinn ...


In [4]:
len(people)

59071

# Explore the dataset and checkout the text it contains

## Exploring the entry for president Obama

In [5]:
obama = people[people['name'] == 'Barack Obama']

In [6]:
obama

Unnamed: 0,URI,name,text
35817,<http://dbpedia.org/resource/Barack_Obama>,Barack Obama,barack hussein obama ii brk husen bm born augu...


In [7]:
obama['text']

35817    barack hussein obama ii brk husen bm born augu...
Name: text, dtype: object

## Exploring the entry for actor George Clooney

In [8]:
clooney = people[people['name'] == 'George Clooney']
clooney['text']

38514    george timothy clooney born may 6 1961 is an a...
Name: text, dtype: object

# Get the word counts for Obama article

In [9]:
from sklearn.feature_extraction.text import CountVectorizer

In [10]:
count_vectorizer = CountVectorizer()
count = count_vectorizer.fit_transform(people['text'])

In [11]:
analyzer = count_vectorizer.build_analyzer()

In [12]:
voc = count_vectorizer.vocabulary_
rvoc = dict(zip(voc.itervalues(),voc.iterkeys()))


In [13]:
def count_words(doc):
    dic = {}
    if analyzer(doc):
        M = count_vectorizer.fit_transform([doc]).toarray()[0]
        for word,index in count_vectorizer.vocabulary_.items():
            dic[word] = M[index]
    return dic

In [14]:
obama['word_count'] = obama['text'].apply(count_words)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [15]:
obama

Unnamed: 0,URI,name,text,word_count
35817,<http://dbpedia.org/resource/Barack_Obama>,Barack Obama,barack hussein obama ii brk husen bm born augu...,"{u'operations': 1, u'represent': 1, u'peace': ..."


In [16]:
obama['word_count'].values[0]

{u'13th': 1,
 u'1961': 1,
 u'1992': 1,
 u'1996': 1,
 u'1997': 1,
 u'20': 2,
 u'2000in': 1,
 u'2004': 3,
 u'2007': 1,
 u'2008': 1,
 u'2009': 3,
 u'2010': 2,
 u'2011': 3,
 u'2012': 1,
 u'2012obama': 1,
 u'2013': 1,
 u'44th': 1,
 u'63': 1,
 u'act': 8,
 u'address': 1,
 u'administration': 1,
 u'affordable': 1,
 u'afghanistan': 2,
 u'african': 1,
 u'after': 4,
 u'against': 1,
 u'american': 3,
 u'americans': 1,
 u'and': 21,
 u'arms': 1,
 u'as': 6,
 u'ask': 1,
 u'at': 2,
 u'attention': 1,
 u'attorney': 1,
 u'august': 1,
 u'barack': 1,
 u'before': 1,
 u'began': 1,
 u'bin': 1,
 u'bm': 1,
 u'born': 2,
 u'briefs': 1,
 u'brk': 1,
 u'budget': 1,
 u'by': 1,
 u'californias': 1,
 u'called': 1,
 u'campaign': 3,
 u'care': 1,
 u'chicago': 2,
 u'civil': 1,
 u'clinton': 1,
 u'close': 1,
 u'columbia': 1,
 u'combat': 1,
 u'community': 1,
 u'constitutional': 1,
 u'consumer': 1,
 u'continued': 1,
 u'control': 4,
 u'convention': 1,
 u'court': 1,
 u'creation': 1,
 u'cuba': 1,
 u'current': 1,
 u'death': 1,
 u'deba

In [17]:
obama['word_count'].values[0].items()

[(u'operations', 1),
 (u'represent', 1),
 (u'peace', 1),
 (u'office', 2),
 (u'unemployment', 1),
 (u'doddfrank', 1),
 (u'over', 1),
 (u'unconstitutional', 1),
 (u'domestic', 2),
 (u'major', 1),
 (u'ending', 1),
 (u'ended', 1),
 (u'proposition', 1),
 (u'seats', 1),
 (u'graduate', 1),
 (u'debate', 1),
 (u'before', 1),
 (u'death', 1),
 (u'20', 2),
 (u'taxpayer', 1),
 (u'with', 3),
 (u'obamacare', 1),
 (u'civil', 1),
 (u'2009', 3),
 (u'barack', 1),
 (u'to', 14),
 (u'policy', 2),
 (u'marriage', 1),
 (u'has', 4),
 (u'2011', 3),
 (u'2010', 2),
 (u'2013', 1),
 (u'2012', 1),
 (u'bin', 1),
 (u'then', 1),
 (u'13th', 1),
 (u'his', 11),
 (u'march', 1),
 (u'gains', 1),
 (u'cuba', 1),
 (u'school', 3),
 (u'primary', 2),
 (u'made', 1),
 (u'not', 1),
 (u'during', 2),
 (u'years', 1),
 (u'continued', 1),
 (u'presidential', 2),
 (u'husen', 1),
 (u'down', 1),
 (u'californias', 1),
 (u'equality', 1),
 (u'prize', 1),
 (u'lost', 1),
 (u'called', 1),
 (u'stimulus', 1),
 (u'january', 3),
 (u'university', 2),
 (u

## Sort the word counts for the Obama article

### Turning dictonary of word counts into a table

In [18]:
obama_word_count_table = pd.DataFrame(obama['word_count'].values[0].items(),columns=['word','count'])

### Sorting the word counts to show most common words at the top

In [19]:
obama_word_count_table.sort_values(by='count',ascending=False).head()

Unnamed: 0,word,count
266,the,40
220,in,30
138,and,21
126,of,18
25,to,14


Most common words include uninformative words like "the", "in", "and",...

# Compute TF-IDF for the corpus 

To give more weight to informative words, we weigh them by their TF-IDF scores.

In [20]:
from sklearn.feature_extraction.text import TfidfTransformer

In [21]:
people['word_count'] = people['text'].apply(count_words)
people.head()

Unnamed: 0,URI,name,text,word_count
0,<http://dbpedia.org/resource/Digby_Morrell>,Digby Morrell,digby morrell born 10 october 1979 is a former...,"{u'selection': 1, u'carltons': 1, u'being': 1,..."
1,<http://dbpedia.org/resource/Alfred_J._Lewy>,Alfred J. Lewy,alfred j lewy aka sandy lewy graduated from un...,"{u'precise': 1, u'thomas': 1, u'closely': 1, u..."
2,<http://dbpedia.org/resource/Harpdog_Brown>,Harpdog Brown,harpdog brown is a singer and harmonica player...,"{u'just': 1, u'issued': 1, u'mainly': 1, u'nom..."
3,<http://dbpedia.org/resource/Franz_Rottensteiner>,Franz Rottensteiner,franz rottensteiner born in waidmannsfeld lowe...,"{u'all': 1, u'bauforschung': 1, u'just': 1, u'..."
4,<http://dbpedia.org/resource/G-Enka>,G-Enka,henry krvits born 30 december 1974 in tallinn ...,"{u'they': 1, u'gangstergenka': 1, u'legendaarn..."


In [22]:
#count_vectorizer = CountVectorizer()
#count = count_vectorizer.fit_transform(people['text'])

In [23]:
#voc = count_vectorizer.vocabulary_
#rvoc = dict(zip(voc.itervalues(),voc.iterkeys()))  #index to word


In [24]:
rvoc

{0: u'00',
 1: u'000',
 2: u'0000',
 3: u'00000',
 4: u'00000van',
 5: u'0001',
 6: u'00014338',
 7: u'0001sec',
 8: u'0002',
 9: u'00026',
 10: u'0003',
 11: u'0005',
 12: u'000577',
 13: u'0005sec',
 14: u'0006',
 15: u'0007',
 16: u'0007105916',
 17: u'0007200374',
 18: u'0007207328',
 19: u'0007213506',
 20: u'000721426xhe',
 21: u'0007a',
 22: u'000he',
 23: u'000in',
 24: u'000m',
 25: u'000seelenprojekt',
 26: u'000tnmickushina',
 27: u'001',
 28: u'0017',
 29: u'001cd',
 30: u'001ehebbm',
 31: u'002',
 32: u'0020849605',
 33: u'0024',
 34: u'0026183900',
 35: u'002864574x',
 36: u'0028659287',
 37: u'003',
 38: u'0033',
 39: u'0034',
 40: u'0036',
 41: u'004',
 42: u'0043',
 43: u'0046',
 44: u'004erdemir',
 45: u'005',
 46: u'006',
 47: u'0060222425',
 48: u'0060628227',
 49: u'0060628464',
 50: u'0060669667',
 51: u'006074393x',
 52: u'0064',
 53: u'0066',
 54: u'007',
 55: u'0070710481',
 56: u'0071357440',
 57: u'0071375627',
 58: u'0072131772',
 59: u'0072131896',
 60: u'0

In [25]:
tfidftransformer = TfidfTransformer(norm=None)
tfidf_weight = tfidftransformer.fit_transform(count)

In [26]:
count.shape[0]

59071

In [27]:
count[0]

<1x548429 sparse matrix of type '<type 'numpy.int64'>'
	with 138 stored elements in Compressed Sparse Row format>

In [28]:
#def compute_tfidf(count,tfidf_M,voc):
  #  ans = []
  #  num = count.shape[0]
  #  for i in range(0,num):
   #     idx = np.where(count[i]!=np.zeros(count[i].shape))[1]
    #    dic = {}
    #    dic = dict((voc[index],tfidf_M[i,index]) for index in idx)
    #    ans.append(dic)
 #   return ans

In [29]:
#def compute(i):
   # idx = np.where(count[i]!=np.zeros(count[i].shape))[1]
   # dic = {}
   # return dict((rvoc[index],tfidf_weight[i,index]) for index in idx)

## Examine the TF-IDF for the Obama article

In [30]:
obama = people[people['name'] == 'Barack Obama']

In [31]:
obama.index[0]

35817

In [32]:
tfidf_weight[35817].indices[0:5]   #index of word

array([ 96612, 259893, 533439, 488148,  58906], dtype=int32)

In [33]:
tfidf_weight[35817].data[0:5]  #weight of word

array([  2.53638213,   2.05523155,   3.60739453,  40.00406304,  21.01564776])

In [34]:
rvoc[100985]  #index to word

u'brk'

In [35]:
word = [rvoc[i] for i in tfidf_weight[obama.index[0]].indices]

In [36]:
obama_word_count_table = pd.DataFrame({'word':word,'tfidf':tfidf_weight[obama.index[0]].data})
obama_word_count_table.sort_values(by='tfidf',ascending =False).head()

Unnamed: 0,tfidf,word
171,52.277114,obama
3,40.004063,the
177,35.674051,act
5,30.028962,in
155,21.741728,iraq


Words with highest TF-IDF are much more informative.

# Manually compute distances between a few people

Let's manually compare the distances between the articles for a few famous people.  

In [37]:
clinton = people[people['name'] == 'Bill Clinton']

In [38]:
beckham = people[people['name'] == 'David Beckham']

## Is Obama closer to Clinton than to Beckham?

We will use cosine distance, which is given by

(1-cosine_similarity) 

and find that the article about president Obama is closer to the one about former president Clinton than that of footballer David Beckham.

In [39]:
from sklearn.metrics.pairwise import cosine_distances

In [40]:
cosine_distances(tfidf_weight[obama.index[0]],tfidf_weight[clinton.index[0]])[0][0]

0.67497775265246496

In [41]:
cosine_distances(tfidf_weight[obama.index[0]],tfidf_weight[beckham.index[0]])[0][0]

0.84204539753103269

# Build a nearest neighbor model for document retrieval

We now create a nearest-neighbors model and apply it to document retrieval.  

In [42]:
people['name'].head()

0          Digby Morrell
1         Alfred J. Lewy
2          Harpdog Brown
3    Franz Rottensteiner
4                 G-Enka
Name: name, dtype: object

In [43]:
from sklearn.neighbors import KNeighborsClassifier

In [44]:
knn_model = KNeighborsClassifier(algorithm='brute',metric='cosine')
knn_model.fit(tfidf_weight,people['name'])

KNeighborsClassifier(algorithm='brute', leaf_size=30, metric='cosine',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

In [45]:
tfidf_weight

<59071x548429 sparse matrix of type '<type 'numpy.float64'>'
	with 10244028 stored elements in Compressed Sparse Row format>

# Applying the nearest-neighbors model for retrieval

## Who is closest to Obama?

In [46]:
dist,ind= knn_model.kneighbors(tfidf_weight[obama.index[0]])

In [47]:
dist

array([[ 0.        ,  0.57078068,  0.61593412,  0.62499347,  0.64976509]])

In [48]:
ind

array([[35817, 24478, 57108, 38376, 38714]])

In [49]:
ind[0]

array([35817, 24478, 57108, 38376, 38714])

In [50]:
#pd.DataFrame({'reference_label':people.ix[ind[0]]['name'],'distance':dist[0]})

In [51]:
pd.DataFrame({'reference_label':people.iloc[ind[0]]['name'],'distance':dist[0]})

Unnamed: 0,distance,reference_label
35817,0.0,Barack Obama
24478,0.570781,Joe Biden
57108,0.615934,Hillary Rodham Clinton
38376,0.624993,Samantha Power
38714,0.649765,Eric Stern (politician)


As we can see, president Obama's article is closest to the one about his vice-president Biden, and those of other politicians.  

## Other examples of document retrieval

In [52]:
swift = people[people['name'] == 'Taylor Swift']

In [53]:
dist,ind = knn_model.kneighbors(tfidf_weight[swift.index[0]])
pd.DataFrame({'reference_label':people.ix[ind[0]]['name'],'distance':dist[0]})

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate_ix
  


Unnamed: 0,distance,reference_label
54264,0.0,Taylor Swift
317,0.616139,Carrie Underwood
27793,0.624745,Adele
29297,0.637545,Kelly Clarkson
1341,0.648704,Dolly Parton


In [54]:
jolie = people[people['name'] == 'Angelina Jolie']

In [55]:
dist,ind = knn_model.kneighbors(tfidf_weight[jolie.index[0]])
pd.DataFrame({'reference_label':people.ix[ind[0]]['name'],'distance':dist[0]})

Unnamed: 0,distance,reference_label
39521,0.0,Angelina Jolie
29009,0.627905,Barbara Hershey
57434,0.63377,Glenn Close
34756,0.643835,Maggie Smith
44992,0.649956,Julianne Moore


In [56]:
arnold = people[people['name'] == 'Arnold Schwarzenegger']

In [57]:
dist,ind = knn_model.kneighbors(tfidf_weight[arnold.index[0]])
pd.DataFrame({'reference_label':people.ix[ind[0]]['name'],'distance':dist[0]})

Unnamed: 0,distance,reference_label
16018,0.0,Arnold Schwarzenegger
35293,0.739783,Paul Grant (bodybuilder)
58965,0.746563,Bonnie Garcia
36682,0.759803,Abel Maldonado
10499,0.767697,David Israel


# Question 1

In [58]:
elton = people[people['name']=='Elton John']

In [59]:
word = [rvoc[i] for i in count[elton.index[0]].indices]

In [60]:
elton_word_count_table = pd.DataFrame({'word':word,'count':count[elton.index[0]].data})

In [61]:
elton_word_count_table.sort_values(by='count',ascending=False).head()

Unnamed: 0,count,word
246,27,the
244,18,in
245,15,and
237,13,of
219,9,has


In [62]:
elton_word_tfidf_table = pd.DataFrame({'word':word,'tfidf':tfidf_weight[elton.index[0]].data})


In [63]:
elton_word_tfidf_table.sort_values(by='tfidf',ascending=False).head()

Unnamed: 0,tfidf,word
5,27.002743,brits
178,21.29863,1988
92,20.938563,globe
188,20.465179,two
244,20.081204,in


# Question 2

In [64]:
victoria = people[people['name']=='Victoria Beckham']

In [65]:
paul = people[people['name']=='Paul McCartney']

In [66]:
cosine_distances(tfidf_weight[elton.index[0]],tfidf_weight[victoria.index[0]])[0][0]

0.85192118138271955

In [67]:
cosine_distances(tfidf_weight[elton.index[0]],tfidf_weight[paul.index[0]])[0][0]

0.69231324786877968

# Question 3

In [68]:
knn_model_wordcounts = KNeighborsClassifier(algorithm='brute',metric='cosine')
knn_model_wordcounts.fit(count,people['name'])

KNeighborsClassifier(algorithm='brute', leaf_size=30, metric='cosine',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

In [69]:
knn_model_tfidf = KNeighborsClassifier(algorithm='brute',metric='cosine')
knn_model_tfidf.fit(tfidf_weight,people['name'])

KNeighborsClassifier(algorithm='brute', leaf_size=30, metric='cosine',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

In [70]:
dist,ind = knn_model_wordcounts.kneighbors(tfidf_weight[elton.index[0]])
pd.DataFrame({'reference_label':people.ix[ind[0]]['name'],'distance':dist[0]})

Unnamed: 0,distance,reference_label
19923,0.237925,Elton John
28825,0.519477,Rod Stewart
29297,0.536914,Kelly Clarkson
17505,0.537179,George Michael
16423,0.5393,Usher (entertainer)


In [71]:
dist,ind = knn_model_tfidf.kneighbors(tfidf_weight[elton.index[0]])
pd.DataFrame({'reference_label':people.ix[ind[0]]['name'],'distance':dist[0]})

Unnamed: 0,distance,reference_label
19923,2.220446e-16,Elton John
28825,0.5893611,Rod Stewart
31595,0.6336579,Phil Collins
27793,0.6365243,Adele
26049,0.6423975,Sting (musician)


In [72]:
dist,ind = knn_model_wordcounts.kneighbors(tfidf_weight[victoria.index[0]])
pd.DataFrame({'reference_label':people.ix[ind[0]]['name'],'distance':dist[0]})

Unnamed: 0,distance,reference_label
50411,0.28401,Victoria Beckham
23386,0.659912,David Beckham
56064,0.668681,Yuliya Polishchuk
58438,0.676464,Mona al Mansouri
6635,0.682756,Wal%C3%A9 Adeyemi


In [73]:
dist,ind = knn_model_tfidf.kneighbors(tfidf_weight[victoria.index[0]])
pd.DataFrame({'reference_label':people.ix[ind[0]]['name'],'distance':dist[0]})

Unnamed: 0,distance,reference_label
50411,2.220446e-16,Victoria Beckham
23386,0.5464767,David Beckham
17264,0.7184218,Mel B
39144,0.7459557,Stephen Dow Beckham
5385,0.7518478,Hilary Alexander
