In [1]:
# Initialization: import Turi Create and load the people_wiki SFrame from
# its on-disk directory (path is relative to this notebook's location).
import turicreate as tc
people = tc.SFrame('../../data/people_wiki.sframe/')

In [4]:
# Both feature columns are derived from the same raw article text.
article_text = people['text']

# 'word_count': output of count_words for each article's text
people['word_count'] = tc.text_analytics.count_words(article_text)

# 'tfidf': output of tf_idf for each article's text
people['tfidf'] = tc.text_analytics.tf_idf(article_text)

# Compare top words according to word counts and to TF-IDF

In [17]:
# Keep only the row(s) whose name is exactly 'Elton John'
elton = people[people['name'] == 'Elton John']

# Expand each feature column into a long table with one (word, value) row
# per entry, so the entries can be sorted by value.
elton_counts = elton[['word_count']].stack(
    'word_count', new_column_name=['word', 'count']
)
elton_weights = elton[['tfidf']].stack(
    'tfidf', new_column_name=['word', 'tfidf']
)

# Top 3 words by word count
elton_counts.sort('count', ascending=False).print_rows(num_rows=3)
# Top 3 words by TF-IDF
elton_weights.sort('tfidf', ascending=False).print_rows(num_rows=3)

+------+-------+
| word | count |
+------+-------+
| the  |  27.0 |
|  in  |  18.0 |
| and  |  15.0 |
+------+-------+
[255 rows x 2 columns]

+-----------+--------------------+
|    word   |       tfidf        |
+-----------+--------------------+
|  furnish  | 18.38947183999428  |
|   elton   | 17.482320270031995 |
| billboard | 17.30368095754203  |
+-----------+--------------------+
[255 rows x 2 columns]



# Measuring distance

In [33]:
# TF-IDF entry of the first row selected for Elton John; it is the common
# left-hand operand of both comparisons below.
elton_tfidf = elton['tfidf'][0]

# Cosine distance between the articles on 'Elton John' and 'Victoria Beckham'
beckham = people[people['name'] == 'Victoria Beckham']
elton_beckham_distance = tc.distances.cosine(elton_tfidf, beckham['tfidf'][0])
print('Elton-Beckham\t:', elton_beckham_distance)

# Cosine distance between the articles on 'Elton John' and 'Paul McCartney'
macartney = people[people['name'] == 'Paul McCartney']
elton_mccartney_distance = tc.distances.cosine(elton_tfidf, macartney['tfidf'][0])
print('Elton-MaCartney\t:', elton_mccartney_distance)

Elton-Beckham	: 0.9567006376655429
Elton-MaCartney	: 0.8250310029221779


# Building nearest neighbors models with different input features and setting the distance metric

In [34]:
def _cosine_nn_model(feature_column):
    """Create a nearest-neighbors model over `people` from one feature column.

    The model is labeled by the 'name' column and uses cosine distance.
    """
    return tc.nearest_neighbors.create(
        people,
        features=[feature_column],
        label='name',
        distance='cosine',
    )


# Nearest-neighbors model using TF-IDF as the feature
tfidf_model = _cosine_nn_model('tfidf')
# Nearest-neighbors model using word counts as the feature
wc_model = _cosine_nn_model('word_count')

In [35]:
def _second_ranked_label(model, query_rows):
    """Return the 'reference_label' of the result row at index 1.

    Index 0 is skipped on purpose; presumably that row is the query article
    matching itself, since it is also part of the reference data (confirm
    against the full query output if in doubt).
    """
    matches = model.query(query_rows)
    return matches[1]['reference_label']


# Closest other article to each person, under each feature representation
closest_to_elton_using_wc = _second_ranked_label(wc_model, elton)
closest_to_elton_using_tfidf = _second_ranked_label(tfidf_model, elton)
closest_to_beckham_using_wc = _second_ranked_label(wc_model, beckham)
closest_to_beckham_using_tfidf = _second_ranked_label(tfidf_model, beckham)

In [37]:
# Report each nearest-neighbor result as "<variable name><TAB>: <label>",
# in the same order the values were computed.
closest_report = (
    ('closest_to_elton_using_wc\t:', closest_to_elton_using_wc),
    ('closest_to_elton_using_tfidf\t:', closest_to_elton_using_tfidf),
    ('closest_to_beckham_using_wc\t:', closest_to_beckham_using_wc),
    ('closest_to_beckham_using_tfidf\t:', closest_to_beckham_using_tfidf),
)
for heading, neighbor_name in closest_report:
    print(heading, neighbor_name)

closest_to_elton_using_wc	: Cliff Richard
closest_to_elton_using_tfidf	: Rod Stewart
closest_to_beckham_using_wc	: Mary Fitzgerald (artist)
closest_to_beckham_using_tfidf	: David Beckham
