# Document retrieval from Wikipedia data

In [None]:
import turicreate

# Load some text data from Wikipedia

In [None]:
# Load the Wikipedia people dataset from the 'people_wiki.sframe' path.
people = turicreate.SFrame('people_wiki.sframe')

In [None]:
# Display the loaded SFrame.
people

# Explore data

## Taking a look at the entry for President Obama

In [None]:
# Select the rows whose 'name' column equals 'Barack Obama'.
obama = people[people['name'] == 'Barack Obama']

In [None]:
# Display the selected Obama row(s).
obama

In [None]:
# Display the 'text' column of the Obama selection.
obama['text']

## Explore the entry for actor George Clooney

In [None]:
# Select the rows whose 'name' equals 'George Clooney', then display their
# 'text' column.
clooney = people[people['name'] == 'George Clooney']
clooney['text']

# Word counts for Obama article

In [None]:
# Count the words in the Obama text and store the result in a new
# 'word_count' column on the Obama selection.
obama['word_count'] = turicreate.text_analytics.count_words(obama['text'])

In [None]:
# Display the Obama selection, now including the 'word_count' column.
obama

In [None]:
# Print the 'word_count' column of the Obama selection.
print(obama['word_count'])

## Find most common words in Obama article

In [None]:
# Stack the 'word_count' column into separate 'word' and 'count' columns and
# display the result (nothing is assigned here).
obama.stack('word_count', new_column_name=['word', 'count'])

In [None]:
# Keep only the 'word_count' column, then stack it into 'word' / 'count'
# columns so the table contains nothing but the word counts.
obama_word_count_table = (
    obama[['word_count']]
    .stack('word_count', new_column_name=['word', 'count'])
)

In [None]:
# Display the word/count table built above.
obama_word_count_table

In [None]:
# Display the table ordered by 'count', largest first. The sorted result is
# not assigned to a name.
obama_word_count_table.sort('count', ascending=False)

# Compute TF-IDF for the entire corpus of articles

In [None]:
# Count the words of every article's text and store the result in a new
# 'word_count' column on the full dataset.
people['word_count'] = turicreate.text_analytics.count_words(people['text'])

In [None]:
# Display the dataset, now including the 'word_count' column.
people

In [None]:
# Compute TF-IDF from the 'text' column of the whole corpus and store it in a
# new 'tfidf' column.
people['tfidf'] = turicreate.text_analytics.tf_idf(people['text'])

In [None]:
# Display the dataset, now including the 'tfidf' column.
people

## Examine the TF-IDF for the Obama article

In [None]:
# Re-select Obama from `people`: the earlier `obama` selection was taken
# before the 'tfidf' column was added to `people`.
obama = people[people['name'] == 'Barack Obama']

# Stack the 'tfidf' column into 'word' / 'tfidf' columns and display them
# ordered by 'tfidf', largest first.
(
    obama[['tfidf']]
    .stack('tfidf', new_column_name=['word', 'tfidf'])
    .sort('tfidf', ascending=False)
)

## Examine the TF-IDF for Clooney

In [None]:
# Re-select Clooney from `people` so the selection carries the 'tfidf'
# column that was added after the first selection.
clooney = people[people['name'] == 'George Clooney']

# Stack the 'tfidf' column into 'word' / 'tfidf' columns and display them
# ordered by 'tfidf', largest first.
(
    clooney[['tfidf']]
    .stack('tfidf', new_column_name=['word', 'tfidf'])
    .sort('tfidf', ascending=False)
)

# Manually evaluate the distance between certain people's articles

In [None]:
# Select the rows for the two people Obama will be compared against.
clinton = people[people['name'] == 'Bill Clinton']
beckham = people[people['name'] == 'David Beckham']

## Is Obama closer to Clinton or to Beckham?

In [None]:
# Cosine distance between the first (index 0) 'tfidf' entries of the Obama
# and Clinton selections.
turicreate.distances.cosine(obama['tfidf'][0], clinton['tfidf'][0])

In [None]:
# Cosine distance between the first (index 0) 'tfidf' entries of the Obama
# and Beckham selections.
turicreate.distances.cosine(obama['tfidf'][0], beckham['tfidf'][0])

# Apply nearest neighbors for retrieval of Wikipedia articles

## Build the NN model

In [None]:
# Build a nearest-neighbors model over the whole corpus, using the 'tfidf'
# column as the only feature and 'name' as the label of each row.
#
# The distance is set explicitly to 'cosine' so retrieval uses the same
# measure as the manual Obama/Clinton/Beckham comparison in this notebook.
# When `distance` is omitted it defaults to 'auto', which picks a distance
# from the feature column's type instead.
# NOTE(review): for dict-typed columns 'auto' appears to select a set-overlap
# (Jaccard) distance that ignores the TF-IDF weights; confirm against the
# Turi Create documentation.
knn_model = turicreate.nearest_neighbors.create(
    people,
    features=['tfidf'],
    label='name',
    distance='cosine',
)

## Use model for retrieval... for example, who is closest to Obama?

In [None]:
# Query the model with the Obama selection and display the returned neighbors.
knn_model.query(obama)

## Other examples of retrieval

In [None]:
# Select the rows whose 'name' column equals 'Taylor Swift'.
swift = people[people['name'] == 'Taylor Swift']

In [None]:
# Query the model with the Taylor Swift selection and display the returned neighbors.
knn_model.query(swift)

In [None]:
# Select the rows whose 'name' column equals 'Angelina Jolie'.
jolie = people[people['name'] == 'Angelina Jolie']

In [None]:
# Query the model with the Angelina Jolie selection and display the returned neighbors.
knn_model.query(jolie)

In [None]:
# Select the rows whose 'name' column equals 'Arnold Schwarzenegger'.
arnold = people[people['name'] == 'Arnold Schwarzenegger']

In [None]:
# Query the model with the Arnold Schwarzenegger selection and display the returned neighbors.
knn_model.query(arnold)