# Document retrieval from Wikipedia data

In [None]:
# Install Turi Create into the notebook runtime (no version is pinned here).
!pip install turicreate



In [None]:
import os.path
from google.colab import drive

# Mount Google Drive at /content/drive. The presence of
# "/content/drive/My Drive/" is used as the "already mounted" signal so that
# re-running this cell does not mount a second time.
if not os.path.isdir("/content/drive/My Drive/"):
  drive.mount('/content/drive')
else:
  print("Google Drive already mounted")

Google Drive already mounted


In [None]:
import os.path
import urllib.request
import tarfile
import zipfile
import gzip
from shutil import copy

def fetch_remote_datafile(filename, remote_url):
  """Download `remote_url` into the workspace as `./<filename>` unless it is already there.

  The payload is first written to `./<filename>.part` and only renamed to its
  final name once the download has finished. The existence check below is the
  only validation performed, so writing directly to the final name would let an
  interrupted transfer leave a truncated file that later runs treat as complete.

  Args:
    filename: name of the file to create in the current working directory.
    remote_url: URL handed to `urllib.request.urlretrieve`.

  Raises:
    Whatever `urllib.request.urlretrieve` raises; the partial download is
    removed before the exception propagates.
  """
  target_path = "./" + filename
  if os.path.isfile(target_path):
    print("already have " + filename + " in workspace")
    return

  print("fetching " + filename + " from " + remote_url + "...")
  partial_path = target_path + ".part"
  try:
    urllib.request.urlretrieve(remote_url, partial_path)
  except BaseException:
    # BaseException on purpose: interrupting the cell (KeyboardInterrupt) must
    # also clean up the half-written file.
    if os.path.isfile(partial_path):
      os.remove(partial_path)
    raise
  # Rename only after a complete download so the final name never points at partial data.
  os.replace(partial_path, target_path)

def cache_datafile_in_drive(filename):
  """Back up a workspace file to the Colab Notebooks folder on Google Drive.

  Prints a message and does nothing when the file is missing from the
  workspace or a file of the same name already exists in the Drive folder;
  otherwise copies `./<filename>` into that folder.
  """
  workspace_copy = "./" + filename
  if not os.path.isfile(workspace_copy):
    print("cannot cache " + filename + ", it is not in workspace")
    return

  drive_dir = "/content/drive/My Drive/Colab Notebooks/"
  if os.path.isfile(drive_dir + filename):
    print("" + filename + " has already been stored in Google Drive")
    return

  print("copying " + filename + " to " + drive_dir)
  copy(workspace_copy, drive_dir)
  

def load_datafile_from_drive(filename, remote_url=None):
  """Make `filename` available in the workspace, trying the cheapest source first.

  Order of preference: a copy already in the workspace, then a copy in the
  Colab Notebooks folder on Google Drive, then a download from `remote_url`
  (when one is given). If none applies, an error message is printed.
  """
  drive_copy = "/content/drive/My Drive/Colab Notebooks/" + filename

  if os.path.isfile("./" + filename):
    print("already have " + filename + " in workspace")
    return

  if os.path.isfile(drive_copy):
    print("have " + filename + " in Google Drive, copying to workspace...")
    copy(drive_copy, ".")
    return

  if remote_url is not None:
    fetch_remote_datafile(filename, remote_url)
    return

  print("error: you need to manually download " + filename + " and put in drive")
    
def extract_datafile(filename, expected_extract_artifact=None):
  """Unpack an archive that sits in the workspace (current working directory).

  Handled by extension (the text after the last dot of `filename`):
    * zip            -> every member is extracted into the workspace
    * tar, *.tar.gz  -> extracted into the workspace
    * gz (plain)     -> decompressed to `filename` without its ".gz" suffix
    * csv            -> nothing to do
    * anything else  -> reported as not extractable

  Args:
    filename: archive name, relative to the current working directory.
    expected_extract_artifact: optional path of a file or directory that the
      extraction is known to produce; when it already exists the extraction is
      skipped.
  """
  import shutil  # local import: the notebook only imports `copy` from shutil

  if expected_extract_artifact != None and (
      os.path.isfile(expected_extract_artifact) or os.path.isdir(expected_extract_artifact)):
    print("files in " + filename + " have already been extracted")
    return
  if not os.path.isfile("./" + filename):
    print("error: cannot extract " + filename + ", it is not in the workspace")
    return

  name_parts = filename.split('.')
  # A name without any dot has no extension; without this guard a file called
  # e.g. "gz" would be treated as gzip and then fail on name_parts[-2].
  extension = name_parts[-1] if len(name_parts) > 1 else ""

  if extension == "zip":
    print("extracting " + filename + "...")
    # Context manager closes the archive even if a member fails to extract.
    with zipfile.ZipFile(filename) as archive:
      for member in archive.namelist():
        print("    extracting file", member)
        archive.extract(member, "./")
  elif extension == "tar" or (extension == "gz" and name_parts[-2] == "tar"):
    print("extracting " + filename + "...")
    # NOTE: extractall() trusts the member paths stored in the archive; only
    # use this on archives from a trusted source.
    with tarfile.open(filename) as archive:
      archive.extractall()
  elif extension == "gz":
    print("extracting " + filename + "...")
    # Stream the decompressed bytes instead of loading the whole file in memory.
    with gzip.open(filename, 'rb') as compressed, \
         open('.'.join(name_parts[:-1]), 'wb') as extracted:
      shutil.copyfileobj(compressed, extracted)
  elif extension == "csv":
    print("do not need to extract csv")
  else:
    print("cannot extract " + filename)
      
def load_cache_extract_datafile(filename, expected_extract_artifact=None, remote_url=None):
  """Obtain `filename`, unpack it, then back it up to Google Drive.

  Runs the three helpers in order: `load_datafile_from_drive`,
  `extract_datafile`, `cache_datafile_in_drive`.
  """
  load_datafile_from_drive(filename, remote_url=remote_url)
  extract_datafile(filename, expected_extract_artifact=expected_extract_artifact)
  cache_datafile_in_drive(filename)
  
# Obtain people_wiki.sframe.zip (workspace, then Drive, then the URL below),
# unpack it to people_wiki.sframe and cache the zip in Drive.
# NOTE(review): the URL path ends in "image_train_data.zip", not
# "people_wiki.sframe.zip" - this looks like the link for a different dataset;
# confirm before relying on the remote fallback.
# NOTE(review): the URL carries Expires/Signature/Key-Pair-Id query parameters,
# so it is presumably a time-limited signed link; verify it still works and
# prefer a stable source.
load_cache_extract_datafile("people_wiki.sframe.zip", "people_wiki.sframe", "https://d3c33hcgiwev3.cloudfront.net/_UfOzO0jEeiAgQrXx6bp4g_fd80e000ed2311e8a0546b27d475e197_image_train_data.zip?Expires=1582416000&Signature=DzAF-HZwqcLeyAWemCRLejRIWTOWOfIyRF1zRYVYVnM5F3CZYsQWeRcHjPoJ3be8bwq-NNcS-yZWHc93ciJUr0zIQ0JgSM2nWWLfjfaqL8j6T10JcXM4L4yv43O0WDF8Y8URi2q5E33SVqLDDCyZnGFsj8ph0OL~kXOKOvzA6d8_&Key-Pair-Id=APKAJLTNE6QMUY6HBC5A")

already have people_wiki.sframe.zip in workspace
files in people_wiki.sframe.zip have already been extracted
people_wiki.sframe.zip has already been stored in Google Drive


# Load some text data from Wikipedia

In [None]:
import turicreate
# Load the people_wiki.sframe artifact from the workspace into an SFrame.
people = turicreate.SFrame('./people_wiki.sframe')


In [None]:
people

URI,name,text
<http://dbpedia.org/resou rce/Digby_Morrell> ...,Digby Morrell,digby morrell born 10 october 1979 is a former ...
<http://dbpedia.org/resou rce/Alfred_J._Lewy> ...,Alfred J. Lewy,alfred j lewy aka sandy lewy graduated from ...
<http://dbpedia.org/resou rce/Harpdog_Brown> ...,Harpdog Brown,harpdog brown is a singer and harmonica player who ...
<http://dbpedia.org/resou rce/Franz_Rottensteiner> ...,Franz Rottensteiner,franz rottensteiner born in waidmannsfeld lower ...
<http://dbpedia.org/resou rce/G-Enka> ...,G-Enka,henry krvits born 30 december 1974 in tallinn ...
<http://dbpedia.org/resou rce/Sam_Henderson> ...,Sam Henderson,sam henderson born october 18 1969 is an ...
<http://dbpedia.org/resou rce/Aaron_LaCrate> ...,Aaron LaCrate,aaron lacrate is an american music producer ...
<http://dbpedia.org/resou rce/Trevor_Ferguson> ...,Trevor Ferguson,trevor ferguson aka john farrow born 11 november ...
<http://dbpedia.org/resou rce/Grant_Nelson> ...,Grant Nelson,grant nelson born 27 april 1971 in london ...
<http://dbpedia.org/resou rce/Cathy_Caruth> ...,Cathy Caruth,cathy caruth born 1955 is frank h t rhodes ...


# Explore data

## Taking a look at the entry for President Obama

In [None]:
# Keep only the row(s) whose name is exactly 'Barack Obama'.
obama = people[people['name'] == 'Barack Obama']

In [None]:
obama

URI,name,text
<http://dbpedia.org/resou rce/Barack_Obama> ...,Barack Obama,barack hussein obama ii brk husen bm born august ...


In [None]:
obama['text']

dtype: str
Rows: ?
['barack hussein obama ii brk husen bm born august 4 1961 is the 44th and current president of the united states and the first african american to hold the office born in honolulu hawaii obama is a graduate of columbia university and harvard law school where he served as president of the harvard law review he was a community organizer in chicago before earning his law degree he worked as a civil rights attorney and taught constitutional law at the university of chicago law school from 1992 to 2004 he served three terms representing the 13th district in the illinois senate from 1997 to 2004 running unsuccessfully for the united states house of representatives in 2000in 2004 obama received national attention during his campaign to represent illinois in the united states senate with his victory in the march democratic party primary his keynote address at the democratic national convention in july and his election to the senate in november he began his presidential campa

## Explore the entry for actor George Clooney

In [None]:
# Same lookup for George Clooney; the last expression displays the article text.
clooney = people[people['name'] == 'George Clooney']
clooney['text']

dtype: str
Rows: ?
['george timothy clooney born may 6 1961 is an american actor writer producer director and activist he has received three golden globe awards for his work as an actor and two academy awards one for acting and the other for producingclooney made his acting debut on television in 1978 and later gained wide recognition in his role as dr doug ross on the longrunning medical drama er from 1994 to 1999 for which he received two emmy award nominations while working on er he began attracting a variety of leading roles in films including the superhero film batman robin 1997 and the crime comedy out of sight 1998 in which he first worked with a director who would become a longtime collaborator steven soderbergh in 1999 clooney took the lead role in three kings a wellreceived war satire set during the gulf warin 2001 clooneys fame widened with the release of his biggest commercial success the heist comedy oceans eleven the first of the film trilogy a remake of the 1960 film wit

# Word counts for Obama article

In [None]:
# Count the words of the Obama article text and store the result in a new
# 'word_count' column on the Obama slice.
obama['word_count'] = turicreate.text_analytics.count_words(obama['text'])

In [None]:
# Select the Elton John article and add its word counts as a 'word_count' column.
Elton = people[people['name'] == 'Elton John']
Elton['word_count'] = turicreate.text_analytics.count_words(Elton['text'])

In [None]:
# Unpack the word_count column into (word, count) rows, most frequent first,
# then show the top three.
Elton_word_count_table = (
  Elton[['word_count']]
  .stack('word_count', new_column_name=['word', 'count'])
  .sort('count', ascending=False)
)
Elton_word_count_table[0:3]

word,count
the,27.0
in,18.0
and,15.0


In [None]:
# Compute TF-IDF over the text of every article and store it as a 'tfidf'
# column on the full people SFrame.
people['tfidf'] = turicreate.text_analytics.tf_idf(people['text'])

In [None]:
# Re-select Elton John from `people` so the slice carries the 'tfidf' column.
Elton = people[people['name'] == 'Elton John']
# Note: the value column is labelled 'count' here, but it holds TF-IDF weights.
Elton_tfidf_table = (
  Elton[['tfidf']]
  .stack('tfidf', new_column_name=['word', 'count'])
  .sort('count', ascending=False)
)
Elton_tfidf_table[0:3]

word,count
furnish,18.38947183999428
elton,17.482320270031995
billboard,17.30368095754203


In [None]:
# Select the two articles that will be compared against Elton John.
Victoria = people[people['name'] == 'Victoria Beckham']
McCartney = people[people['name'] == 'Paul McCartney']


In [None]:
# Cosine distance between the TF-IDF entries of the first (index 0) row of the
# Elton John and Victoria Beckham slices.
turicreate.distances.cosine(Elton['tfidf'][0],Victoria['tfidf'][0])

0.9567006376655429

In [None]:
# Same comparison, Elton John vs. Paul McCartney.
turicreate.distances.cosine(Elton['tfidf'][0],McCartney['tfidf'][0])

0.8250310029221779

In [None]:
# Nearest-neighbour model over the 'tfidf' column, labelled by article name,
# with cosine distance requested explicitly.
knn_model = turicreate.nearest_neighbors.create(people,features=['tfidf'],label='name',distance='cosine')

In [None]:
# Nearest neighbours of the Elton John article under the TF-IDF / cosine model.
knn_model.query(Elton)

query_label,reference_label,distance,rank
0,Elton John,-2.220446049250313e-16,1
0,Rod Stewart,0.7172196678927374,2
0,George Michael,0.7476009989692848,3
0,Sting (musician),0.7476719544306141,4
0,Phil Collins,0.7511932487904706,5


In [None]:
# Nearest neighbours of the Victoria Beckham article under the TF-IDF / cosine model.
knn_model.query(Victoria)

query_label,reference_label,distance,rank
0,Victoria Beckham,1.1102230246251563e-16,1
0,David Beckham,0.5481696102632145,2
0,Stephen Dow Beckham,0.7849867068283364,3
0,Mel B,0.8095855234085036,4
0,Caroline Rush,0.81982642291868,5


In [None]:
# Add word counts for every article to `people`, then re-select the three
# articles so the slices include the new 'word_count' column.
people['word_count'] = turicreate.text_analytics.count_words(people['text'])
Victoria = people[people['name'] == 'Victoria Beckham']
McCartney = people[people['name'] == 'Paul McCartney']
Elton = people[people['name'] == 'Elton John']

In [None]:
# Rebuild the model on raw word counts (cosine distance). This rebinds
# knn_model, so the TF-IDF model built earlier is no longer reachable by name.
knn_model = turicreate.nearest_neighbors.create(people,features=['word_count'],label='name',distance='cosine')

In [None]:
# Nearest neighbours of Victoria Beckham under the word-count / cosine model.
knn_model.query(Victoria)

query_label,reference_label,distance,rank
0,Victoria Beckham,-2.220446049250313e-16,1
0,Mary Fitzgerald (artist),0.2073070361150499,2
0,Adrienne Corri,0.2145097827875479,3
0,Beverly Jane Fry,0.2174664687407927,4
0,Raman Mundair,0.2176954749915048,5


In [None]:
# Nearest neighbours of Elton John under the word-count / cosine model.
knn_model.query(Elton)

query_label,reference_label,distance,rank
0,Elton John,2.220446049250313e-16,1
0,Cliff Richard,0.1614241525896703,2
0,Sandro Petrone,0.1682254275104111,3
0,Rod Stewart,0.168327165587061,4
0,Malachi O'Doherty,0.177315545978884,5


In [None]:
# Plain-text dump of the word counts computed for the Obama article.
print(obama['word_count'])

## Find most common words in Obama article

In [None]:
# Unpack the 'word_count' column of the Obama slice into 'word' and 'count'
# columns (result is displayed, not stored).
obama.stack('word_count',new_column_name=['word','count'])

In [None]:
# Same unpacking, but starting from only the 'word_count' column, and kept in
# a variable for the cells below.
obama_word_count_table = obama[['word_count']].stack('word_count', new_column_name = ['word','count'])

In [None]:
obama_word_count_table

In [None]:
# Show the words ordered by descending count; the sorted result is displayed
# only, obama_word_count_table itself is not reassigned.
obama_word_count_table.sort('count',ascending=False)

# Compute TF-IDF for the entire corpus of articles

In [None]:
# NOTE(review): this repeats the identical 'word_count' assignment made in an
# earlier cell of this notebook; it recomputes the same column.
people['word_count'] = turicreate.text_analytics.count_words(people['text'])

In [None]:
people

In [None]:
# NOTE(review): this repeats the identical 'tfidf' assignment made in an
# earlier cell of this notebook; it recomputes the same column.
people['tfidf'] = turicreate.text_analytics.tf_idf(people['text'])

In [None]:
people

## Examine the TF-IDF for the Obama article

In [None]:
# Re-select Obama from `people` so the slice carries the 'tfidf' column, then
# list his words by descending TF-IDF weight.
obama = people[people['name'] == 'Barack Obama']
(
  obama[['tfidf']]
  .stack('tfidf', new_column_name=['word', 'tfidf'])
  .sort('tfidf', ascending=False)
)

## Examine the TF-IDF for Clooney

In [None]:
# Re-select Clooney from `people` so the slice carries the 'tfidf' column,
# then list his words by descending TF-IDF weight.
clooney = people[people['name'] == 'George Clooney']
(
  clooney[['tfidf']]
  .stack('tfidf', new_column_name=['word', 'tfidf'])
  .sort('tfidf', ascending=False)
)

# Manually evaluate the distance between certain people's articles

In [None]:
# Select the two articles that Obama's article will be compared against.
clinton = people[people['name'] == 'Bill Clinton']
beckham = people[people['name'] == 'David Beckham']

## Is Obama closer to Clinton or to Beckham?

In [None]:
# Cosine distance between the TF-IDF entries of the first (index 0) row of the
# Obama and Clinton slices.
turicreate.distances.cosine(obama['tfidf'][0],clinton['tfidf'][0])

In [None]:
# Same comparison, Obama vs. David Beckham.
turicreate.distances.cosine(obama['tfidf'][0],beckham['tfidf'][0])

# Apply nearest neighbors for retrieval of Wikipedia articles

## Build the NN model

In [None]:
# Rebuild the model on the 'tfidf' column, rebinding knn_model once more.
# NOTE(review): no `distance` is passed here, unlike the earlier models which
# request 'cosine', so the library chooses its default metric - confirm which
# one that is before comparing these distances with the cosine results.
knn_model = turicreate.nearest_neighbors.create(people,features=['tfidf'],label='name')

## Use model for retrieval... for example, who is closest to Obama?

In [None]:
# Articles closest to the Obama article under the model built just above.
knn_model.query(obama)

## Other examples of retrieval

In [None]:
# Select the Taylor Swift article to use as a query.
swift = people[people['name'] == 'Taylor Swift']

In [None]:
# Articles closest to the Taylor Swift article.
knn_model.query(swift)

In [None]:
# Select the Angelina Jolie article to use as a query.
jolie = people[people['name'] == 'Angelina Jolie']

In [None]:
# Articles closest to the Angelina Jolie article.
knn_model.query(jolie)

In [None]:
# Select the Arnold Schwarzenegger article to use as a query.
arnold = people[people['name'] == 'Arnold Schwarzenegger']

In [None]:
# Articles closest to the Arnold Schwarzenegger article.
knn_model.query(arnold)