In [56]:
import pandas as pd
import numpy as np
# Widen column display to 500 characters so the article text column is not cut short.
pd.options.display.max_colwidth = 500

# Load some text data: Wikipedia pages about people

In [2]:
# Read the people dataset (path is relative to the notebook's working directory).
df = pd.read_csv(filepath_or_buffer="people_wiki.csv")

In [11]:
# Peek at the first three rows to see which columns the file provides.
df.head(n=3)

Unnamed: 0,URI,name,text
0,<http://dbpedia.org/resource/Digby_Morrell>,Digby Morrell,digby morrell born 10 october 1979 is a former australian rules footballer who played with the kangaroos and carlton in the australian football league aflfrom western australia morrell played his early senior football for west perth his 44game senior career for the falcons spanned 19982000 and he was the clubs leading goalkicker in 2000 at the age of 21 morrell was recruited to the australian football league by the kangaroos football club with its third round selection in the 2001 afl rookie...
1,<http://dbpedia.org/resource/Alfred_J._Lewy>,Alfred J. Lewy,alfred j lewy aka sandy lewy graduated from university of chicago in 1973 after studying psychiatry pharmacology and ophthalmology he is a full professor and vicechair of the department of psychiatry at ohsu oregon health science university and holds an md and phd prior to moving to oregon in 1981 lewy was at the national institute of mental health nimh in bethesda maryland working with senior colleague thomas wehr in oregon he has worked closely with robert l sack as of december 2005 he had...
2,<http://dbpedia.org/resource/Harpdog_Brown>,Harpdog Brown,harpdog brown is a singer and harmonica player who has been active in canadas blues scene since 1982 hailing from vancouver he crossed tens of thousands of miles playing club dates and festivals in canada the northwestern united states and germanyover the years he has issued seven cds in 1995 his home is where the harp is won the muddy award for the best nw blues release from the cascade blues association in portland oregon as well that year it was nominated for a canadian juno for the best ...


In [5]:
# Number of rows (one per article) in the dataset.
df.shape[0]

59071

# Explore the dataset and check out the text it contains

In [16]:
# Keep only the row(s) whose `name` column is exactly "Barack Obama".
is_obama = df["name"] == "Barack Obama"
obama = df.loc[is_obama]

# Show the article text for the selected row(s).
obama["text"]

35817    barack hussein obama ii brk husen bm born august 4 1961 is the 44th and current president of the united states and the first african american to hold the office born in honolulu hawaii obama is a graduate of columbia university and harvard law school where he served as president of the harvard law review he was a community organizer in chicago before earning his law degree he worked as a civil rights attorney and taught constitutional law at the university of chicago law school from 1992 to ...
Name: text, dtype: object

In [14]:
# Article text for the row(s) named "George Clooney", for comparison.
df.loc[df["name"] == "George Clooney", "text"]

38514    george timothy clooney born may 6 1961 is an american actor writer producer director and activist he has received three golden globe awards for his work as an actor and two academy awards one for acting and the other for producingclooney made his acting debut on television in 1978 and later gained wide recognition in his role as dr doug ross on the longrunning medical drama er from 1994 to 1999 for which he received two emmy award nominations while working on er he began attracting a variety...
Name: text, dtype: object

# Get the word counts for the Obama article

In [24]:
from sklearn.feature_extraction.text import CountVectorizer
from collections import Counter

In [20]:
# Learn a vocabulary from the Obama text only and count each token's occurrences.
cv = CountVectorizer()
wc = cv.fit_transform(obama["text"])

In [25]:
# Map every vocabulary term to its count in the first (Obama) row of `wc`.
# `cv.vocabulary_` maps term -> column index, so the count is read by column.
first_row_counts = wc.toarray()[0]
c = Counter({term: first_row_counts[col] for term, col in cv.vocabulary_.items()})

# Sort the word counts for the Obama article

In [36]:
# The ten terms with the highest raw counts, largest first.
top_n = 10
c.most_common(top_n)

[('the', 40),
 ('in', 30),
 ('and', 21),
 ('of', 18),
 ('to', 14),
 ('his', 11),
 ('obama', 9),
 ('act', 8),
 ('he', 7),
 ('law', 6)]

# Compute TF-IDF for the corpus

In [76]:
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords

In [92]:
# Drop NLTK's English stop words before weighting.
english_stop_words = stopwords.words("english")

# norm=None is passed so that rows are not length-normalised by the vectorizer.
tfidf = TfidfVectorizer(stop_words=english_stop_words, norm=None)

# Fit on the whole corpus; row i of `data` is built from row i of `df`.
data = tfidf.fit_transform(df["text"])

In [93]:
# Invert the fitted vocabulary (term -> column) into column -> term so that
# matrix columns can be translated back into words.
term_to_col = tfidf.vocabulary_
tfidf_voc_inv = dict(zip(term_to_col.values(), term_to_col.keys()))

# Examine the TF-IDF for the Obama article

In [94]:
# Positional row number(s) of the "Barack Obama" article, used to index `data`.
obama_labels = df.loc[df["name"] == "Barack Obama"].index
bobama_ix = df.index.get_indexer_for(obama_labels)

In [95]:
# Build a {term: TF-IDF weight} Counter for the Obama article.
# Slice the article's row(s) out of the corpus matrix once, instead of
# re-running a fancy-index on the full matrix for every non-zero term.
obama_tfidf = data[bobama_ix, :]

# nonzero() returns (row indices, column indices); only the columns are needed.
nonzero_cols = obama_tfidf.nonzero()[1]

# Weights are read from the first selected row, matching the earlier `[0, 0]` lookup.
c = Counter({tfidf_voc_inv[col]: obama_tfidf[0, col] for col in nonzero_cols})

In [96]:
# The ten terms with the largest TF-IDF weight, largest first.
top_n = 10
c.most_common(top_n)

[('obama', 52.277113834307315),
 ('act', 35.674051187909924),
 ('iraq', 21.741727931276476),
 ('law', 20.721855882367674),
 ('control', 18.884330378434285),
 ('us', 17.592044264666814),
 ('ordered', 17.526980051210632),
 ('military', 17.114203144108135),
 ('democratic', 16.409249536745939),
 ('involvement', 15.780836746511332)]

# Manually compute distances between a few people

In [113]:
# Bill Clinton: matching row(s), then the positional row number(s) of those
# index labels within `df`.
clinton = df.loc[df["name"] == "Bill Clinton"]
clinton_ix = df.index.get_indexer_for(clinton.index)

# David Beckham: same lookup.
beckham = df.loc[df["name"] == "David Beckham"]
beckham_ix = df.index.get_indexer_for(beckham.index)

# Is Obama closer to Clinton than to Beckham?

In [114]:
from sklearn.metrics import pairwise_distances

In [115]:
# Cosine distance between the Obama and Clinton TF-IDF rows.
obama_tfidf_row = data[bobama_ix, :]
clinton_tfidf_row = data[clinton_ix, :]
pairwise_distances(obama_tfidf_row, clinton_tfidf_row, metric="cosine")

array([[ 0.80338951]])

In [116]:
# Cosine distance between the Obama and Beckham TF-IDF rows.
obama_tfidf_row = data[bobama_ix, :]
beckham_tfidf_row = data[beckham_ix, :]
pairwise_distances(obama_tfidf_row, beckham_tfidf_row, metric="cosine")

array([[ 0.97175843]])