# Document retrieval from wikipedia data


# Load some text data - from wikipedia, pages on people

In [1]:
import numpy as np
import pandas as pd
import nltk
from nltk import FreqDist

In [2]:
people = pd.read_csv('people_wiki.csv')

Data contains:  link to wikipedia article, name of person, text of article.

In [3]:
people.head()

Unnamed: 0,URI,name,text
0,<http://dbpedia.org/resource/Digby_Morrell>,Digby Morrell,digby morrell born 10 october 1979 is a former...
1,<http://dbpedia.org/resource/Alfred_J._Lewy>,Alfred J. Lewy,alfred j lewy aka sandy lewy graduated from un...
2,<http://dbpedia.org/resource/Harpdog_Brown>,Harpdog Brown,harpdog brown is a singer and harmonica player...
3,<http://dbpedia.org/resource/Franz_Rottensteiner>,Franz Rottensteiner,franz rottensteiner born in waidmannsfeld lowe...
4,<http://dbpedia.org/resource/G-Enka>,G-Enka,henry krvits born 30 december 1974 in tallinn ...


In [4]:
len(people)

59071

# Explore the dataset and check out the text it contains

## Exploring the entry for president Obama

In [5]:
obama = people[people['name'] == 'Barack Obama']

In [6]:
john = people[people['name'] == 'Elton John']

In [7]:
obama

Unnamed: 0,URI,name,text
35817,<http://dbpedia.org/resource/Barack_Obama>,Barack Obama,barack hussein obama ii brk husen bm born augu...


In [8]:
obama['text']

35817    barack hussein obama ii brk husen bm born augu...
Name: text, dtype: object

## Exploring the entry for actor George Clooney

In [9]:
clooney = people[people['name'] == 'George Clooney']
clooney['text']

38514    george timothy clooney born may 6 1961 is an a...
Name: text, dtype: object

# Get the word counts for Obama article

In [10]:
# obama['word_count'] = graphlab.text_analytics.count_words(obama['text'])

In [11]:
tokens = nltk.tokenize.word_tokenize(obama['text'].iloc[0])
dist=FreqDist(tokens)

In [12]:
tokens1 = nltk.tokenize.word_tokenize(john['text'].iloc[0])
dist1=FreqDist(tokens1)

In [13]:
len(dist1)

255

In [14]:
# Note: word_tokenize is designed to be fed one sentence at a time.
# It can do a relatively poor job when given whole paragraphs or even whole documents,
# so an alternative is to split the text into sentences first and tokenize each one:
# fdist = FreqDist()
# for sentence in nltk.tokenize.sent_tokenize(obama['text'].iloc[0]):
#     for word in nltk.tokenize.word_tokenize(sentence):
#         fdist[word] += 1

## Sort the word counts for the Obama article

### Turning dictionary of word counts into a table

In [15]:
# Turn the word -> count distribution into a two-column table:
# the words start out as the index, so move them into a regular column.
obama_word_count_table = pd.DataFrame.from_dict(dist, orient='index').reset_index()
obama_word_count_table.columns = ['word', 'count']
obama_word_count_table.head()

Unnamed: 0,word,count
0,barack,1
1,hussein,1
2,obama,9
3,ii,1
4,brk,1


In [16]:
# Same conversion for the Elton John article: the words start out as the
# index, so move them into a regular column and label both columns.
john_word_count_table = pd.DataFrame.from_dict(dist1, orient='index').reset_index()
john_word_count_table.columns = ['word', 'count']
john_word_count_table.head()

Unnamed: 0,word,count
0,sir,1
1,elton,3
2,hercules,1
3,john,7
4,cbe,1


### Sorting the word counts to show most common words at the top

In [17]:
obama_word_count_table.sort_values(by='count',ascending=False,inplace=True)

In [18]:
obama_word_count_table.head()

Unnamed: 0,word,count
12,the,40
26,in,30
14,and,21
17,of,18
23,to,14


In [19]:
john_word_count_table.sort_values(by='count',ascending=False,inplace=True)
john_word_count_table.head()

Unnamed: 0,word,count
56,the,27
60,in,18
21,and,15
55,of,13
125,a,10


Most common words include uninformative words like "the", "in", "and",...

# Compute TF-IDF for the corpus 

To give more weight to informative words, we weigh them by their TF-IDF scores.

In [20]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Import from the public `scipy.sparse` namespace; `scipy.sparse.csr` is a
# deprecated private module.
from scipy.sparse import csr_matrix#need this if you want to save tfidf_matrix
# from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import CountVectorizer


In [21]:
ti = TfidfVectorizer()

In [22]:
tfidf=ti.fit_transform(people.text)

In [23]:
vocab=ti.vocabulary_

In [24]:
len(vocab)

548429

## Examine the TF-IDF for the Obama article

In [25]:
def get_index(name):
    """Return the row positions of all articles in `people` named `name`.

    The result is a list of plain ints (empty when nothing matches).
    Positions, not index labels, are returned because callers use the values
    to select rows of the `tfidf` matrix, which is addressed by position.
    For a frame with the default 0..n-1 index the two are identical.
    """
    is_match = (people['name'] == name).to_numpy()
    return np.flatnonzero(is_match).tolist()

In [26]:
get_index('Barack Obama')

[35817]

In [27]:
get_index('Elton John')

[19923]

In [28]:
John_tfidf=tfidf[19923]

In [29]:
John_tfidf.indices

array([ 96612,    704, 259893, 530685, 533439, 488148,  58906, 251905,
       239271, 202462, 111212, 233774, 525845,  70622, 360068,  28353,
       494541, 261721, 490181,  68179, 199447, 492517, 198987,  49096,
        19167, 151653,  20902,  23034, 207224,  26651, 451997,  27213,
       232352,  48854,  58306, 535489, 153546, 228410, 362795,  83218,
       233049, 291622, 489883, 452265,  84135, 541553, 444813,  74411,
       541127, 530205, 254386, 180881,  43179, 357311, 258054,  73354,
       252531, 512199, 238474, 362987,  16882,  54130, 196687, 505213,
        20271,  31492, 411863,  16121, 311023, 458791, 336620, 412102,
        11557,  13494,  84315,  74748, 487882, 314354, 341055, 397959,
        67216, 197194, 146757, 535924, 459389, 130681, 422943, 374846,
       458494, 369039, 473151,  67431, 267446,   7090, 352169, 204218,
       407720, 411407,  83926, 335621, 353129,  30458, 239943,  16484,
       423966, 138291, 496061,  15450, 424014, 469376, 344260, 368895,
      

In [30]:
# zip tfidf and its indices to dict
dic=dict(zip(John_tfidf.indices, John_tfidf.data))
len(dic)

252

In [31]:
import operator
#sort on values instead of keys:
sorted_dic = sorted(dic.items(), key=operator.itemgetter(1))
# To sort on keys instead of values: sorted_dic = sorted(dic.items(), key=operator.itemgetter(0))

In [32]:
sorted_dic[-6:]

[(251905, 0.16259599697900187),
 (208739, 0.18122079791032641),
 (178605, 0.18468593871019523),
 (267446, 0.1889579470891368),
 (89663, 0.19220733548400759),
 (488148, 0.24368351964209761)]

In [33]:
# another way to sort on values
list_values=sorted(dic, key=dic.get)
type(list_values)
list_values[-6:]

[251905, 208739, 178605, 267446, 89663, 488148]

In [34]:
for i in list_values[-6:]:
    print(dic[i])

0.162595996979
0.18122079791
0.18468593871
0.188957947089
0.192207335484
0.243683519642


In [35]:
# Invert the vocabulary (term -> column) once, instead of rebuilding two
# vocabulary-sized lists and scanning them for every printed term.
index_to_term = {column: term for term, column in vocab.items()}
for i in list_values[-6:]:
    print(index_to_term[i])

in
furnish
elton
john
billboard
the


# Examine the TF-IDF for Any article

In [36]:
def get_top_tfidf(name,number):
    """Print the `number` highest-weighted TF-IDF terms of the article `name`.

    Terms are printed one per line in ascending order of weight, so the
    strongest term comes last. Nothing is printed when `number` is not
    positive.

    Uses the notebook globals `tfidf` (sparse document-term matrix), `vocab`
    (term -> column index mapping) and the helper `get_index`.
    """
    if number <= 0:
        # A `[-0:]` slice would select every term instead of none.
        return
    name_tfidf = tfidf[get_index(name)]
    # Compare by value: an identity test against the literal 0 is always true
    # for float weights, so it never filtered anything.
    weights = {
        column: weight
        for column, weight in zip(name_tfidf.indices, name_tfidf.data)
        if weight != 0
    }
    # Invert the vocabulary once rather than scanning it for every term.
    index_to_term = {column: term for term, column in vocab.items()}
    for column in sorted(weights, key=weights.get)[-number:]:
        print(index_to_term[column])

In [37]:
get_top_tfidf('Elton John',5)

furnish
elton
john
billboard
the


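Words with the highest TF-IDF (e.g. "elton", "furnish", "billboard") are much more informative than raw counts. Note that very frequent words such as "the" and "in" still rank highly here, because `TfidfVectorizer` was created with its default settings (no stop-word removal).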

# Manually compute distances between a few people

Let's manually compare the distances between the articles for a few famous people.  

In [38]:
from sklearn.metrics.pairwise import linear_kernel
cosine_similarities = linear_kernel(tfidf[35817], tfidf).flatten()
cosine_similarities

array([ 0.14730606,  0.05831185,  0.10624378, ...,  0.08004373,
        0.12404702,  0.13839972])

In [39]:
related_docs_indices = cosine_similarities.argsort()[:-5:-1]
related_docs_indices

array([35817, 24478, 57108, 38376])

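Note the slicing trick: a negative step walks the array backwards, so `[:-5:-1]` returns the last four elements in reverse order (i.e. the four largest similarities, largest first). For example:
l=list(range(20))
l[:15:-1]
[19,18,17,16]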

In [40]:
cosine_similarities[related_docs_indices]

array([ 1.        ,  0.42921932,  0.38406588,  0.37500653])

## Is Obama closer to Clinton than to Beckham?

We will use cosine distance, which is given by

(1-cosine_similarity) 

and find that the article about president Obama is closer to the one about former president Clinton than that of footballer David Beckham.

In [41]:
get_index('Bill Clinton')

[36452]

In [42]:
get_index('David Beckham')

[23386]

In [43]:
1-cosine_similarities[36452]

0.67497775265246496

In [44]:
1-cosine_similarities[23386]

0.84204539753103269

In [45]:
cosine_similarities[54264]

0.15416116937122998

# Build a nearest neighbor model for document retrieval

We now create a nearest-neighbors model and apply it to document retrieval.  

In [46]:
# knn_model = graphlab.nearest_neighbors.create(people,features=['tfidf'],label='name')


In [47]:
from sklearn.neighbors import NearestNeighbors

In [48]:
knn=NearestNeighbors(n_neighbors=20,algorithm='brute',metric='cosine')
knn_fit=knn.fit(tfidf)

# Applying the nearest-neighbors model for retrieval with TF-IDF

## Who is closest to Obama?

In [49]:
def get_neighbors_index(name):
    """Return the neighbour row indices for the article called `name`.

    Looks up the article's row(s) in `tfidf`, queries the fitted `knn_fit`
    model with them and returns the indices found for the first query row
    as a plain list.
    """
    query_rows = tfidf[get_index(name)]
    _, neighbor_indices = knn_fit.kneighbors(query_rows)
    return neighbor_indices.tolist()[0]

In [50]:
def cosin_distance(name,index):
    """Return the cosine distance between article `name` and row(s) `index`.

    `index` may be a single row position (a scalar is returned) or a
    list/array/slice of positions (an array is returned).

    The distance is computed as 1 - linear_kernel, which equals the cosine
    distance only when the rows of `tfidf` are L2-normalised (assumed here;
    that is the TfidfVectorizer default).
    """
    # Compare against the requested row(s) only, instead of computing the
    # similarity to every document in the corpus and discarding the rest.
    similarities = linear_kernel(tfidf[get_index(name)], tfidf[index])[0]
    distances = 1 - similarities
    if isinstance(index, (int, np.integer)):
        return distances[0]
    return distances

In [51]:
def result_df(name):
    """Build a table of the nearest neighbours of the article called `name`.

    Returns a DataFrame with one row per neighbour and the columns
    'Index' (row position of the neighbour), 'name' (its article name) and
    'Cosine Distance' (its distance to the query article).
    """
    neighbor_rows = get_neighbors_index(name)
    Result = pd.DataFrame({'Index': neighbor_rows})
    # Neighbour indices are row positions, so look the names up by position
    # rather than by index label.
    Result['name'] = people['name'].iloc[neighbor_rows].to_numpy()
    # One call for all neighbours instead of one full computation per row.
    Result['Cosine Distance'] = cosin_distance(name, neighbor_rows)
    return Result

In [52]:
result_df("Barack Obama")

Unnamed: 0,Index,name,Cosine Distance
0,35817,Barack Obama,-2.220446e-16
1,24478,Joe Biden,0.5707807
2,57108,Hillary Rodham Clinton,0.6159341
3,38376,Samantha Power,0.6249935
4,38714,Eric Stern (politician),0.6497651
5,28447,George W. Bush,0.6586872
6,39357,John McCain,0.6616806
7,48693,Artur Davis,0.6669423
8,18827,Henry Waxman,0.6702047
9,46811,Jeff Sessions,0.6724269


As we can see, president Obama's article is closest to the one about his vice-president Biden, and those of other politicians.  

## Other examples of document retrieval

In [53]:
# Sanity check of the distance function: the distance from an article to itself should be (almost) zero.
result_df('Taylor Swift').head(5)

Unnamed: 0,Index,name,Cosine Distance
0,54264,Taylor Swift,0.0
1,317,Carrie Underwood,0.616139
2,27793,Adele,0.624745
3,29297,Kelly Clarkson,0.637545
4,1341,Dolly Parton,0.648704


In [54]:
df2=result_df('Angelina Jolie')
df2.sort_values(by='Cosine Distance', inplace=True)
df2

Unnamed: 0,Index,name,Cosine Distance
0,39521,Angelina Jolie,6.661338e-16
1,29009,Barbara Hershey,0.627905
2,57434,Glenn Close,0.6337704
3,34756,Maggie Smith,0.6438354
4,44992,Julianne Moore,0.6499563
5,54362,Konkona Sen Sharma,0.6555215
6,16242,Meryl Streep,0.6565404
7,44571,Candice Bergen,0.6571349
8,21644,Jodie Foster,0.6573323
9,51145,Kate Winslet,0.6584207


In [55]:
df3=result_df('Arnold Schwarzenegger')
df3.sort_values(by='Cosine Distance', inplace=True)
df3.head()

Unnamed: 0,Index,name,Cosine Distance
0,16018,Arnold Schwarzenegger,1.110223e-16
1,35293,Paul Grant (bodybuilder),0.7397827
2,58965,Bonnie Garcia,0.7465629
3,36682,Abel Maldonado,0.7598034
4,10499,David Israel,0.7676966


In [56]:
result_df("Elton John")

Unnamed: 0,Index,name,Cosine Distance
0,19923,Elton John,2.220446e-16
1,28825,Rod Stewart,0.5893611
2,31595,Phil Collins,0.6336579
3,27793,Adele,0.6365243
4,26049,Sting (musician),0.6423975
5,17505,George Michael,0.6476146
6,26581,Jay Sean,0.6526952
7,29297,Kelly Clarkson,0.6529692
8,41668,Cliff Richard,0.6542096
9,15936,Bryan Adams,0.6556412


In [57]:
result_df("Victoria Beckham")

Unnamed: 0,Index,name,Cosine Distance
0,50411,Victoria Beckham,5.551115e-16
1,23386,David Beckham,0.5464767
2,17264,Mel B,0.7184218
3,39144,Stephen Dow Beckham,0.7459557
4,5385,Hilary Alexander,0.7518478
5,58438,Mona al Mansouri,0.7535584
6,57583,Caroline Rush,0.7638209
7,32301,Caryn Franklin,0.7645923
8,19061,Emma Bunton,0.7671695
9,36979,Zurain Imam,0.767302


# Assignment

# Applying the nearest-neighbors model for retrieval with word counts

In [58]:
from sklearn.feature_extraction.text import CountVectorizer

In [59]:
cv=CountVectorizer()

In [60]:
coutvec=cv.fit_transform(people.text)

In [61]:
# Fit a separate estimator on the word-count vectors. `knn.fit` returns `knn`
# itself, so re-fitting it here would also change `knn_fit`, which the TF-IDF
# helpers above rely on, into a count-based model.
knn_fit_c=NearestNeighbors(n_neighbors=20,algorithm='brute',metric='cosine').fit(coutvec)

In [62]:
knn_fit_c.kneighbors(coutvec[get_index('Elton John')])[1].tolist()[0]

[19923,
 41668,
 25798,
 28825,
 37447,
 51884,
 19983,
 17852,
 50847,
 24770,
 53752,
 23677,
 34450,
 52121,
 34612,
 598,
 31286,
 3019,
 28756,
 21004]

In [63]:
people.iloc[41668]

URI           <http://dbpedia.org/resource/Cliff_Richard>
name                                        Cliff Richard
text    sir cliff richard obe born harry rodger webb 1...
Name: 41668, dtype: object

In [64]:
knn_fit_c.kneighbors(coutvec[get_index('Victoria Beckham')])[1].tolist()[0]

[50411,
 669,
 45129,
 39504,
 13937,
 48867,
 50841,
 41962,
 31221,
 2985,
 7505,
 22590,
 42381,
 57206,
 20389,
 35902,
 16663,
 24613,
 3993,
 14490]

In [65]:
people.iloc[669]

URI     <http://dbpedia.org/resource/Mary_Fitzgerald_(...
name                             Mary Fitzgerald (artist)
text    mary fitzgerald born 1956 is an irish artist w...
Name: 669, dtype: object

In [66]:
get_index('Elton John')

[19923]

In [67]:
get_index('Victoria Beckham')

[50411]

In [68]:
get_index('Paul McCartney')

[53028]

In [69]:
cosine_similarities = linear_kernel(tfidf[19923], tfidf).flatten()

In [70]:
cosin_distance('Elton John',50411)

0.85192118138271955

In [71]:
cosin_distance('Elton John',53028)

0.69231324786877968