# Document retrieval from wikipedia data

## Fire up Libraries

In [1]:
import numpy as np
import pandas as pd
import sklearn
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from collections import Counter
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn import metrics
from operator import itemgetter

from sklearn.metrics.pairwise import cosine_similarity

from sklearn.neighbors import NearestNeighbors

from sklearn.linear_model import LogisticRegression

# Load some text data - from wikipedia, pages on people

In [2]:
people = pd.read_csv('people_wiki.csv', index_col='name')

Data contains:  link to wikipedia article, name of person, text of article.

In [3]:
people.head()

Unnamed: 0_level_0,URI,text
name,Unnamed: 1_level_1,Unnamed: 2_level_1
Digby Morrell,<http://dbpedia.org/resource/Digby_Morrell>,digby morrell born 10 october 1979 is a former...
Alfred J. Lewy,<http://dbpedia.org/resource/Alfred_J._Lewy>,alfred j lewy aka sandy lewy graduated from un...
Harpdog Brown,<http://dbpedia.org/resource/Harpdog_Brown>,harpdog brown is a singer and harmonica player...
Franz Rottensteiner,<http://dbpedia.org/resource/Franz_Rottensteiner>,franz rottensteiner born in waidmannsfeld lowe...
G-Enka,<http://dbpedia.org/resource/G-Enka>,henry krvits born 30 december 1974 in tallinn ...


In [4]:
people.shape

(59071, 2)

# Explore the dataset and check out the text it contains

## Exploring the entry for president Obama

In [5]:
obama = people[people.index == 'Barack Obama'].copy()

In [6]:
obama

Unnamed: 0_level_0,URI,text
name,Unnamed: 1_level_1,Unnamed: 2_level_1
Barack Obama,<http://dbpedia.org/resource/Barack_Obama>,barack hussein obama ii brk husen bm born augu...


In [7]:
# Show the full article text. The parenthesized single-argument form runs both as the
# Python 2 print statement and as the Python 3 print function.
print(obama['text'].values[0])

barack hussein obama ii brk husen bm born august 4 1961 is the 44th and current president of the united states and the first african american to hold the office born in honolulu hawaii obama is a graduate of columbia university and harvard law school where he served as president of the harvard law review he was a community organizer in chicago before earning his law degree he worked as a civil rights attorney and taught constitutional law at the university of chicago law school from 1992 to 2004 he served three terms representing the 13th district in the illinois senate from 1997 to 2004 running unsuccessfully for the united states house of representatives in 2000in 2004 obama received national attention during his campaign to represent illinois in the united states senate with his victory in the march democratic party primary his keynote address at the democratic national convention in july and his election to the senate in november he began his presidential campaign in 2007 and after

## Exploring the entry for actor George Clooney

In [8]:
# Single-row frame for the George Clooney article, then its full text.
clooney = people[people.index == 'George Clooney']
# Parenthesized form works under both Python 2 and Python 3.
print(clooney['text'].values[0])

george timothy clooney born may 6 1961 is an american actor writer producer director and activist he has received three golden globe awards for his work as an actor and two academy awards one for acting and the other for producingclooney made his acting debut on television in 1978 and later gained wide recognition in his role as dr doug ross on the longrunning medical drama er from 1994 to 1999 for which he received two emmy award nominations while working on er he began attracting a variety of leading roles in films including the superhero film batman robin 1997 and the crime comedy out of sight 1998 in which he first worked with a director who would become a longtime collaborator steven soderbergh in 1999 clooney took the lead role in three kings a wellreceived war satire set during the gulf warin 2001 clooneys fame widened with the release of his biggest commercial success the heist comedy oceans eleven the first of the film trilogy a remake of the 1960 film with frank sinatra as da

# Get the word counts for Obama article

In [9]:
# Fit a bag-of-words vocabulary on the Obama article alone (default settings, so
# stopwords such as "the" are kept).
count_vectorizer    = CountVectorizer()
# The sparse count matrix returned by fit_transform is stored in the single-row frame;
# the sorting cell below reads it back through obama['word_count'].values.
obama['word_count'] = count_vectorizer.fit_transform(obama['text'].values)

## Showing the features (i.e. words)

In [10]:
# List the vocabulary learned from the Obama article (Python 2 and 3 compatible print).
print(count_vectorizer.get_feature_names())

[u'13th', u'1961', u'1992', u'1996', u'1997', u'20', u'2000in', u'2004', u'2007', u'2008', u'2009', u'2010', u'2011', u'2012', u'2012obama', u'2013', u'44th', u'63', u'act', u'address', u'administration', u'affordable', u'afghanistan', u'african', u'after', u'against', u'american', u'americans', u'and', u'arms', u'as', u'ask', u'at', u'attention', u'attorney', u'august', u'barack', u'before', u'began', u'bin', u'bm', u'born', u'briefs', u'brk', u'budget', u'by', u'californias', u'called', u'campaign', u'care', u'chicago', u'civil', u'clinton', u'close', u'columbia', u'combat', u'community', u'constitutional', u'consumer', u'continued', u'control', u'convention', u'court', u'creation', u'cuba', u'current', u'death', u'debate', u'debt', u'defeated', u'defeating', u'defense', u'degree', u'delegates', u'democratic', u'district', u'doddfrank', u'domestic', u'dont', u'down', u'during', u'earning', u'economic', u'election', u'elementary', u'ended', u'ending', u'equality', u'federal', u'filed'

## Sort the word counts for the Obama article

In [11]:
# Pair each vocabulary term with its total count in the article, then rank by frequency.
vocab  = list(count_vectorizer.get_feature_names())
counts = obama['word_count'].values.sum(axis=0).toarray()[0]

freq_distribution = Counter()
freq_distribution.update(dict(zip(vocab, counts)))
print(freq_distribution.most_common(10))

[(u'the', 40), (u'in', 30), (u'and', 21), (u'of', 18), (u'to', 14), (u'his', 11), (u'obama', 9), (u'act', 8), (u'he', 7), (u'law', 6)]


Most common words include uninformative words like "the", "in", "and",...

# Compute TF-IDF for the corpus 

To give more weight to informative words, we weigh them by their TF-IDF scores.

In [12]:
# Fit TF-IDF on the text of every article (default vectorizer settings).
tfidf_vectorizer = TfidfVectorizer()
# list() iterates the returned sparse matrix so that each article's entry is stored
# as that article's value in the new 'tfidf' column.
people['tfidf']  = list(tfidf_vectorizer.fit_transform(people['text']))

In [13]:
people.shape

(59071, 3)

In [14]:
len(tfidf_vectorizer.vocabulary_)

548429

## Examine the TF-IDF for the Obama article

In [15]:
obama = people[people.index == 'Barack Obama'].copy()

In [16]:
obama['tfidf'].values[0]

<1x548429 sparse matrix of type '<type 'numpy.float64'>'
	with 270 stored elements in Compressed Sparse Row format>

In [17]:
# vocab[col] is the term that belongs to matrix column `col`.
vocab = tfidf_vectorizer.get_feature_names()

# Sparse 1 x n_terms row holding the TF-IDF weights of the Obama article.
response = obama['tfidf'].values[0]
# (term, weight) for every non-zero column, heaviest first.
term_weights = [(vocab[col], response[0, col]) for col in response.nonzero()[1]]
term_weights = sorted(term_weights, key=itemgetter(1), reverse=True)
# Parenthesized form works under both Python 2 and Python 3.
print(term_weights[:10])

[(u'obama', 0.36501758981877808), (u'the', 0.27932274000236151), (u'act', 0.2490890416206761), (u'in', 0.20967299876631698), (u'iraq', 0.15180855532927304), (u'and', 0.14673880270062417), (u'law', 0.14468744228550123), (u'control', 0.13185717906932251), (u'of', 0.1262048162788276), (u'us', 0.12283397315748205)]


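Words with the highest TF-IDF are more informative, although very frequent stopwords such as "the", "in", "and" and "of" still appear in the top ten because they occur so often in this article.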

# Eliminating stopwords

In [18]:
# Same word-count ranking as before, but with English stopwords removed by the vectorizer.
count_vectorizer    = CountVectorizer(stop_words='english')
word_count_matrix   = count_vectorizer.fit_transform(obama['text'].values)
# Kept so the frame still carries the column, exactly as before.
obama['word_count'] = word_count_matrix

vocab  = list(count_vectorizer.get_feature_names())
# Sum the sparse matrix directly instead of reading it back out of the DataFrame:
# the round trip through obama['word_count'].values only worked because the frame
# has exactly one row.
counts = np.asarray(word_count_matrix.sum(axis=0)).ravel()

freq_distribution = Counter(dict(zip(vocab, counts)))
print(freq_distribution.most_common(10))

[(u'obama', 9), (u'act', 8), (u'law', 6), (u'military', 4), (u'iraq', 4), (u'control', 4), (u'president', 4), (u'democratic', 4), (u'2011', 3), (u'school', 3)]


In [19]:
# Re-fit TF-IDF over the whole corpus with English stopwords removed and overwrite
# the per-article 'tfidf' column with the new rows.
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
people['tfidf']  = list(tfidf_vectorizer.fit_transform(people['text']))

# vocab[col] is the term that belongs to matrix column `col`.
vocab = tfidf_vectorizer.get_feature_names()

# Re-select Obama so the row carries the freshly computed 'tfidf' value.
obama        = people[people.index == 'Barack Obama'].copy()
response     = obama['tfidf'].values[0]
# (term, weight) for every non-zero column, heaviest first.
term_weights = [(vocab[col], response[0, col]) for col in response.nonzero()[1]]
term_weights = sorted(term_weights, key=itemgetter(1), reverse=True)
# Parenthesized form works under both Python 2 and Python 3.
print(term_weights[:10])

[(u'obama', 0.41349455267538882), (u'act', 0.28216985897150365), (u'iraq', 0.1719698240003707), (u'law', 0.16390297589574321), (u'control', 0.14936876138866909), (u'ordered', 0.13863257254400477), (u'military', 0.1353676447383487), (u'democratic', 0.12979169658143577), (u'response', 0.12482116078603717), (u'involvement', 0.12482116078603717)]


# Manually compute distances between a few people

Let's manually compare the distances between the articles for a few famous people.  

In [20]:
clinton = people[people.index == 'Bill Clinton'].copy()

In [21]:
beckham = people[people.index == 'David Beckham'].copy()

## Is Obama closer to Clinton than to Beckham?

We will find that the article about president Obama is closer to the one about former president Clinton than that of footballer David Beckham.

In [22]:
cosine_similarity(obama['tfidf'].values[0], clinton['tfidf'].values[0])

array([[ 0.18896718]])

In [23]:
cosine_similarity(obama['tfidf'].values[0], beckham['tfidf'].values[0])

array([[ 0.02556581]])

# Build a nearest neighbor model for document retrieval

We now create a nearest-neighbors model and apply it to document retrieval.  

In [24]:
# Vectorize the whole corpus into one sparse matrix, then build a model that
# answers 10-nearest-neighbour queries against it.
tfidf_matrix = tfidf_vectorizer.fit_transform(people['text'])
nbrs         = NearestNeighbors(n_neighbors=10)
nbrs         = nbrs.fit(tfidf_matrix)

In [25]:
def get_closest_neighbors(name, k=None):
    """Return the articles nearest to ``name`` according to the TF-IDF model.

    Parameters
    ----------
    name : str
        Index label of the query article in ``people``.
    k : int, optional
        Number of neighbours to return. ``None`` (the default) keeps the
        ``n_neighbors`` value that ``nbrs`` was constructed with.

    Returns
    -------
    pandas.DataFrame
        Columns ``distance`` and ``name``, one row per neighbour, in the order
        produced by ``nbrs.kneighbors``.

    Raises
    ------
    KeyError
        If ``name`` is not a label of ``people.index``.
    """
    row                = people.index.get_loc(name)
    distances, indices = nbrs.kneighbors(tfidf_matrix.getrow(row), n_neighbors=k)
    positions          = indices.flatten()
    # Translate row positions to names straight from the index; this avoids
    # rebuilding the whole frame with reset_index() on every call.
    names_similar      = people.index.values[positions]
    return pd.DataFrame({'distance': distances.flatten(), 'name': names_similar})

# Applying the nearest-neighbors model for retrieval

## Who is closest to Obama?

In [26]:
get_closest_neighbors('Barack Obama')

Unnamed: 0,distance,name
0,0.0,Barack Obama
1,1.165145,Joe Biden
2,1.207369,Samantha Power
3,1.21964,Hillary Rodham Clinton
4,1.222509,Eric Stern (politician)
5,1.236178,Robert Gibbs
6,1.243057,Henry Waxman
7,1.244667,Jesse Lee (politician)
8,1.248296,Eric Holder
9,1.251607,Joe the Plumber


As we can see, president Obama's article is closest to the one about his vice-president Biden, and those of other politicians.  

## Other examples of document retrieval

In [27]:
get_closest_neighbors('Taylor Swift')

Unnamed: 0,distance,name
0,0.0,Taylor Swift
1,1.183004,Carrie Underwood
2,1.187754,Al Swift
3,1.193938,Ed Sheeran
4,1.197285,Tim McGraw
5,1.199189,Kelly Clarkson
6,1.19979,Adele
7,1.204965,Bill Swift
8,1.207081,Dolly Parton
9,1.208139,Joss Stone


In [28]:
get_closest_neighbors('Angelina Jolie')

Unnamed: 0,distance,name
0,0.0,Angelina Jolie
1,1.173973,Brad Pitt
2,1.241878,Keith Jolie
3,1.25319,Jodie Foster
4,1.254573,Maggie Smith
5,1.259312,Jessica Chastain
6,1.26016,Anne Hathaway
7,1.262106,Nicole Kidman
8,1.262143,Barry Voight
9,1.263898,Billy Bob Thornton


In [29]:
get_closest_neighbors('Arnold Schwarzenegger')

Unnamed: 0,distance,name
0,0.0,Arnold Schwarzenegger
1,1.259683,Bonnie Garcia
2,1.263233,Paul Grant (bodybuilder)
3,1.283846,Gray Davis
4,1.284463,James Tramel
5,1.2851,Abel Maldonado
6,1.29324,Bruce McPherson
7,1.294107,Charlene Zettel
8,1.301621,Russell Gould
9,1.301828,David Israel


# Quiz questions

### Answer 1

In [30]:
elton = people[people.index == 'Elton John'].copy()

In [31]:
elton

Unnamed: 0_level_0,URI,text,tfidf
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Elton John,<http://dbpedia.org/resource/Elton_John>,sir elton hercules john cbe born reginald kenn...,"(0, 96562)\t0.0131480794741\n (0, 704)\t0.0..."


In [32]:
# Show the full article text (parenthesized form works under Python 2 and Python 3).
print(elton['text'].values[0])

sir elton hercules john cbe born reginald kenneth dwight 25 march 1947 is an english singer songwriter composer pianist record producer and occasional actor he has worked with lyricist bernie taupin as his songwriter partner since 1967 they have collaborated on more than 30 albums to datein his fivedecade career elton john has sold more than 300 million records making him one of the bestselling music artists in the world he has more than fifty top 40 hits including seven consecutive no 1 us albums 58 billboard top 40 singles 27 top 10 four no 2 and nine no 1 for 31 consecutive years 19702000 he had at least one song in the billboard hot 100 his single something about the way you look tonightcandle in the wind 1997 sold over 33 million copies worldwide and is the bestselling single of all time he has received six grammy awards five brit awards winning two awards for outstanding contribution to music and the first brits icon in 2013 for his lasting impact on british culture an academy aw

In [33]:
# Re-fits the shared count_vectorizer on the Elton John article only, so its
# vocabulary now comes from this single document.
elton['word_count'] = count_vectorizer.fit_transform(elton['text'].values)

In [34]:
# List the vocabulary learned from the Elton John article (Python 2 and 3 compatible print).
print(count_vectorizer.get_feature_names())

[u'10', u'100', u'1947', u'1967', u'19702000', u'1976', u'1980s', u'1988', u'1992', u'1994', u'1996', u'1997', u'1998', u'200', u'2002', u'2004', u'2005', u'2008', u'2012he', u'2013', u'2014', u'21', u'25', u'27', u'30', u'300', u'31', u'33', u'40', u'49', u'58', u'abbey', u'academy', u'actor', u'aids', u'albums', u'alltime', u'announced', u'annual', u'artist', u'artists', u'authors', u'award', u'awards', u'began', u'bernie', u'bestselling', u'billboard', u'bisexual', u'born', u'brit', u'british', u'brits', u'buckingham', u'career', u'cbe', u'center', u'champion', u'charitable', u'civil', u'collaborated', u'commander', u'composer', u'composers', u'concert', u'consecutive', u'continues', u'contribution', u'copies', u'culture', u'datein', u'david', u'december', u'diamond', u'diana', u'disney', u'dwight', u'elizabeth', u'elton', u'empire', u'england', u'english', u'entered', u'era', u'established', u'events', u'fame', u'fellow', u'fight', u'film', u'fivedecade', u'foundation', u'funeral',

In [35]:
# Pair each vocabulary term with its total count in the article, then rank by frequency.
vocab  = list(count_vectorizer.get_feature_names())
counts = elton['word_count'].values.sum(axis=0).toarray()[0]

freq_distribution = Counter()
freq_distribution.update(dict(zip(vocab, counts)))
print(freq_distribution.most_common(10))
# Top three word counts from the printed ranking:
#   john = 7
#   award = 5
#   billboard = 4

[(u'john', 7), (u'award', 5), (u'billboard', 4), (u'elton', 3), (u'million', 3), (u'awards', 3), (u'100', 3), (u'british', 3), (u'music', 3), (u'academy', 3)]


In [36]:
elton['tfidf'].values[0]

<1x548115 sparse matrix of type '<type 'numpy.float64'>'
	with 198 stored elements in Compressed Sparse Row format>

In [37]:
# vocab[col] is the term that belongs to matrix column `col`.
vocab = tfidf_vectorizer.get_feature_names()

# Sparse 1 x n_terms row holding the TF-IDF weights of the Elton John article.
response = elton['tfidf'].values[0]
# (term, weight) for every non-zero column, heaviest first.
term_weights = [(vocab[col], response[0, col]) for col in response.nonzero()[1]]
term_weights = sorted(term_weights, key=itemgetter(1), reverse=True)
# Parenthesized form works under both Python 2 and Python 3.
print(term_weights[:10])
# TF-IDF ranking from the printed output: billboard > john > elton

[(u'billboard', 0.220815373479789), (u'john', 0.21708234783751851), (u'elton', 0.21217449599445887), (u'furnish', 0.20819360547352062), (u'songwriters', 0.13727795403933182), (u'award', 0.13644562687929401), (u'aids', 0.12707713850247651), (u'million', 0.12495534247002375), (u'100', 0.12360741560195804), (u'palace', 0.12242653494397877)]


### Answer 2

In [38]:
victoria = people[people.index == 'Victoria Beckham'].copy()

In [39]:
paul = people[people.index == 'Paul McCartney'].copy()

In [40]:
cosine_similarity(elton['tfidf'].values[0], victoria['tfidf'].values[0])

array([[ 0.03407023]])

In [41]:
cosine_similarity(elton['tfidf'].values[0], paul['tfidf'].values[0])

array([[ 0.18991373]])

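Elton John está mais próximo de Paul McCartney do que de Victoria Beckham: a similaridade de cosseno é maior (0,190 contra 0,034), provavelmente porque ambos são músicos e seus artigos compartilham vocabulário musical.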

### Answer 3

##### Modelo com TF-IDF





In [42]:
# Vectorize the whole corpus into one sparse matrix, then build a model that
# answers 10-nearest-neighbour queries against it.
tfidf_matrix = tfidf_vectorizer.fit_transform(people['text'])
nbrs         = NearestNeighbors(n_neighbors=10)
nbrs         = nbrs.fit(tfidf_matrix)

In [43]:
tfidf_matrix

<59071x548115 sparse matrix of type '<type 'numpy.float64'>'
	with 8078359 stored elements in Compressed Sparse Row format>

In [44]:
def get_closest_neighbors(name, k=None):
    """Return the articles nearest to ``name`` according to the TF-IDF model.

    This re-declares, and therefore shadows, the function of the same name
    defined earlier in the notebook.

    Parameters
    ----------
    name : str
        Index label of the query article in ``people``.
    k : int, optional
        Number of neighbours to return. ``None`` (the default) keeps the
        ``n_neighbors`` value that ``nbrs`` was constructed with.

    Returns
    -------
    pandas.DataFrame
        Columns ``distance`` and ``name``, one row per neighbour, in the order
        produced by ``nbrs.kneighbors``.

    Raises
    ------
    KeyError
        If ``name`` is not a label of ``people.index``.
    """
    row                = people.index.get_loc(name)
    distances, indices = nbrs.kneighbors(tfidf_matrix.getrow(row), n_neighbors=k)
    positions          = indices.flatten()
    # Translate row positions to names straight from the index; this avoids
    # rebuilding the whole frame with reset_index() on every call.
    names_similar      = people.index.values[positions]
    return pd.DataFrame({'distance': distances.flatten(), 'name': names_similar})

##### Modelo com Word Count





In [45]:
# Raw word counts for every article; this rebinds count_vectorizer to a fresh instance.
# NOTE(review): no stop_words argument is passed here, whereas the TF-IDF vectorizer
# used for the other model was built with stop_words='english' — confirm that this
# asymmetry between the two models is intended.
count_vectorizer    = CountVectorizer()
wc_matrix = count_vectorizer.fit_transform(people['text'])

In [46]:
wc_matrix

<59071x548429 sparse matrix of type '<type 'numpy.int64'>'
	with 10244028 stored elements in Compressed Sparse Row format>

In [47]:
# Nearest-neighbour model over the raw word-count matrix, configured for 10 neighbours per query.
nbrs_wc = NearestNeighbors(n_neighbors=10).fit(wc_matrix)

In [48]:
def get_closest_neighbors_wc(name, k=None):
    """Return the articles nearest to ``name`` according to the word-count model.

    Parameters
    ----------
    name : str
        Index label of the query article in ``people``.
    k : int, optional
        Number of neighbours to return. ``None`` (the default) keeps the
        ``n_neighbors`` value that ``nbrs_wc`` was constructed with.

    Returns
    -------
    pandas.DataFrame
        Columns ``distance`` and ``name``, one row per neighbour, in the order
        produced by ``nbrs_wc.kneighbors``.

    Raises
    ------
    KeyError
        If ``name`` is not a label of ``people.index``.
    """
    row                = people.index.get_loc(name)
    distances, indices = nbrs_wc.kneighbors(wc_matrix.getrow(row), n_neighbors=k)
    positions          = indices.flatten()
    # Translate row positions to names straight from the index; this avoids
    # rebuilding the whole frame with reset_index() on every call.
    names_similar      = people.index.values[positions]
    return pd.DataFrame({'distance': distances.flatten(), 'name': names_similar})

##### Usando o modelo com TF-IDF





In [49]:
get_closest_neighbors('Elton John')

Unnamed: 0,distance,name
0,0.0,Elton John
1,1.186742,Rod Stewart
2,1.19651,Sting (musician)
3,1.203888,George Michael
4,1.204019,Phil Collins
5,1.218337,Kelly Clarkson
6,1.221504,Usher (entertainer)
7,1.223884,Adele
8,1.229677,Rihanna
9,1.232415,Bryan Adams


In [50]:
get_closest_neighbors('Victoria Beckham')

Unnamed: 0,distance,name
0,0.0,Victoria Beckham
1,1.072772,David Beckham
2,1.264277,Stephen Dow Beckham
3,1.274599,Caroline Rush
4,1.276112,Angelique Westerhof
5,1.280018,Wal%C3%A9 Adeyemi
6,1.280657,Colin McDowell
7,1.280896,Zurain Imam
8,1.281365,Mel B
9,1.281901,Yuliya Polishchuk


##### Usando o modelo com Word Count





In [51]:
get_closest_neighbors_wc('Elton John')

Unnamed: 0,distance,name
0,0.0,Elton John
1,27.748874,Roger Daltrey
2,27.92848,Rod Stewart
3,28.913665,John Ronane
4,29.223278,Matthew Kalman
5,29.223278,Brad Fiedel
6,29.274562,Robert E. Lerner
7,29.274562,Robbie Williams
8,29.291637,Michael White (author)
9,29.444864,Alex Scott (actor)


In [52]:
get_closest_neighbors_wc('Victoria Beckham')

Unnamed: 0,distance,name
0,0.0,Victoria Beckham
1,22.538855,Kelly Bell
2,23.366643,Rhian Samuel
3,23.366643,Rikke Karlsson
4,23.388031,Marie Brassard
5,23.49468,Renee Nele
6,23.49468,Hilary Alexander
7,23.515952,Yeojin Bae
8,23.515952,Ali Hewson
9,23.579652,Gillian Mann
