In [216]:
# Import all of the things you need to import!
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import re
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer

pd.options.display.max_columns = 30
%matplotlib inline

# Homework 14 (or so): TF-IDF text analysis and clustering

Hooray, we kind of figured out how text analysis works! Some of it is still magic, but at least the **TF** and **IDF** parts make a little sense. Kind of. Somewhat.

No, just kidding, we're *professionals* now.

## Investigating the Congressional Record

The [Congressional Record](https://en.wikipedia.org/wiki/Congressional_Record) is more or less what happened in Congress every single day. Speeches and all that. A good large source of text data, maybe?

Let's pretend it's totally secret but we just got it leaked to us in a data dump, and we need to check it out. It was leaked from [this page here](http://www.cs.cornell.edu/home/llee/data/convote.html).

In [217]:
# If you'd like to download it through the command line...
!curl -O http://www.cs.cornell.edu/home/llee/data/convote/convote_v1.1.tar.gz

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 9607k  100 9607k    0     0   598k      0  0:00:16  0:00:16 --:--:--  548k


In [218]:
# And then extract it through the command line...
!tar -zxf convote_v1.1.tar.gz

You can explore the files if you'd like, but we're going to get the ones from `convote_v1.1/data_stage_one/development_set/`. It's a bunch of text files.

In [219]:
# glob finds files matching a certain filename pattern
import glob

# Give me all the text files
paths = glob.glob('convote_v1.1/data_stage_one/development_set/*')
paths[:5]

['convote_v1.1/data_stage_one/development_set/052_400011_0327014_DON.txt',
 'convote_v1.1/data_stage_one/development_set/052_400011_0327025_DON.txt',
 'convote_v1.1/data_stage_one/development_set/052_400011_0327044_DON.txt',
 'convote_v1.1/data_stage_one/development_set/052_400011_0327046_DON.txt',
 'convote_v1.1/data_stage_one/development_set/052_400011_1479036_DON.txt']

In [220]:
len(paths)

702

So great, we have 702 of them. Now let's import them.

In [221]:
speeches = []
for path in paths:
    with open(path) as speech_file:
        speech = {
            'pathname': path,
            'filename': path.split('/')[-1],
            'content': speech_file.read()
        }
    speeches.append(speech)
speeches_df = pd.DataFrame(speeches)
speeches_df.head()

Unnamed: 0,content,filename,pathname
0,"mr. chairman , i thank the gentlewoman for yie...",052_400011_0327014_DON.txt,convote_v1.1/data_stage_one/development_set/05...
1,"mr. chairman , i want to thank my good friend ...",052_400011_0327025_DON.txt,convote_v1.1/data_stage_one/development_set/05...
2,"mr. chairman , i rise to make two fundamental ...",052_400011_0327044_DON.txt,convote_v1.1/data_stage_one/development_set/05...
3,"mr. chairman , reclaiming my time , let me mak...",052_400011_0327046_DON.txt,convote_v1.1/data_stage_one/development_set/05...
4,"mr. chairman , i thank my distinguished collea...",052_400011_1479036_DON.txt,convote_v1.1/data_stage_one/development_set/05...


In class we had the `texts` variable. For the homework can just do `speeches_df['content']` to get the same sort of list of stuff.

**Take a look at the contents of the first 5 speeches**

In [222]:
All_speeches = speeches_df['content']
First_five_speeches = speeches_df['content'].head(5)
First_five_speeches

0    mr. chairman , i thank the gentlewoman for yie...
1    mr. chairman , i want to thank my good friend ...
2    mr. chairman , i rise to make two fundamental ...
3    mr. chairman , reclaiming my time , let me mak...
4    mr. chairman , i thank my distinguished collea...
Name: content, dtype: object

# Doing our analysis

Use the `sklearn` package and a plain boring `CountVectorizer` to get a list of all of the tokens used in the speeches. If it won't list them all, that's ok! Make a dataframe with those terms as columns.

**Be sure to include English-language stopwords**

In [223]:
count_vectorizer = CountVectorizer(stop_words='english')

In [224]:
speech_tokens = count_vectorizer.fit_transform(All_speeches)

In [225]:
count_vectorizer.get_feature_names()

['000',
 '00007',
 '018',
 '050',
 '092',
 '10',
 '100',
 '106',
 '107',
 '108',
 '108th',
 '109th',
 '10th',
 '11',
 '110',
 '114',
 '117',
 '118',
 '11th',
 '12',
 '120',
 '121',
 '122',
 '123',
 '125',
 '128',
 '12898',
 '13',
 '13279',
 '1332',
 '1335',
 '1344',
 '135',
 '138',
 '14',
 '140',
 '143',
 '144',
 '145',
 '149',
 '1498',
 '14th',
 '15',
 '150',
 '1520',
 '153',
 '155',
 '159',
 '16',
 '160',
 '162',
 '163',
 '165',
 '1671',
 '1675',
 '17',
 '170',
 '1700',
 '174',
 '178',
 '1787',
 '17th',
 '18',
 '180',
 '1800',
 '1800s',
 '181',
 '1812',
 '1855',
 '186',
 '1868',
 '18th',
 '19',
 '190',
 '1907',
 '1922',
 '1927',
 '1930',
 '1940s',
 '1950s',
 '196',
 '1960',
 '1960s',
 '1964',
 '1965',
 '1967',
 '1970s',
 '1971',
 '1972',
 '1973',
 '1974',
 '1976',
 '1979',
 '198',
 '1980s',
 '1981',
 '1982',
 '1983',
 '1984',
 '1985',
 '1986',
 '1987',
 '1988',
 '1989',
 '1990',
 '1990s',
 '1991',
 '1992',
 '1993',
 '1994',
 '1995',
 '1996',
 '1997',
 '1998',
 '1999',
 '19th',
 '1st'

In [226]:
All_tokens = pd.DataFrame(speech_tokens.toarray(), columns=count_vectorizer.get_feature_names())

In [227]:
#All_tokens

Okay, it's **far** too big to even look at. Let's try to get a list of features from a new `CountVectorizer` that only takes the top 100 words.

In [228]:
count_vectorizer_100 = CountVectorizer(max_features=100, stop_words='english')

In [229]:
speech_tokens_top100 = count_vectorizer_100.fit_transform(speeches_df['content'])

Now let's push all of that into a dataframe with nicely named columns.

In [230]:
Top_100_tokens = pd.DataFrame(speech_tokens_top100.toarray(), columns=count_vectorizer_100.get_feature_names())
Top_100_tokens.head()

Unnamed: 0,000,11,act,allow,amendment,america,american,amp,association,balance,based,believe,bipartisan,chairman,children,...,teachers,thank,think,time,today,trade,united,urge,vote,want,way,work,year,years,yield
0,0,1,3,0,0,0,3,0,0,0,0,1,0,3,0,...,0,1,3,3,2,0,1,0,0,1,1,0,0,0,1
1,0,0,1,1,1,0,0,0,0,1,0,0,0,2,0,...,0,1,0,2,2,0,0,0,1,1,3,0,1,0,0
2,0,0,0,0,0,0,1,0,0,0,0,0,0,2,0,...,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1
3,0,0,0,0,0,1,0,0,0,1,0,0,0,2,0,...,0,0,0,2,0,0,1,0,1,1,1,0,0,0,0
4,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,...,0,1,0,1,0,0,0,0,2,0,0,0,0,0,2


Everyone seems to start their speeches with "mr chairman" - how many speeches are there total, and many don't mention "chairman" and how many mention neither "mr" nor "chairman"?

In [231]:
speeches_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 702 entries, 0 to 701
Data columns (total 3 columns):
content     702 non-null object
filename    702 non-null object
pathname    702 non-null object
dtypes: object(3)
memory usage: 16.5+ KB


In [237]:
Top_100_tokens['No_chairman'] = Top_100_tokens['chairman'] == 0
Top_100_tokens[Top_100_tokens['No_chairman'] == True].count().head(1)

000    250
dtype: int64

In [238]:
Top_100_tokens['no_mr'] = Top_100_tokens['mr'] == 0
Top_100_tokens[Top_100_tokens['no_mr'] == True].count().head(1)

000    79
dtype: int64

What is the index of the speech thank is the most thankful, a.k.a. includes the word 'thank' the most times?

In [239]:
Top_100_tokens['thank'].sort_values(ascending=False).head(1)

577    9
Name: thank, dtype: int64

If I'm searching for `China` and `trade`, what are the top 3 speeches to read according to the `CountVectoriser`?

In [240]:
Top_100_tokens['china trade'] = Top_100_tokens['china'] + Top_100_tokens['trade']

In [241]:
Top_100_tokens['china trade'].sort_values(ascending=False).head(3)

379    92
399    36
345    27
Name: china trade, dtype: int64

Now what if I'm using a `TfidfVectorizer`?

In [247]:
idf_vectorizer = TfidfVectorizer(stop_words='english', use_idf=True)
Top_100_tokens_idf = idf_vectorizer.fit_transform(All_speeches)
idf_df = pd.DataFrame(Top_100_tokens_idf.toarray(), columns=idf_vectorizer.get_feature_names())
idf_df['china trade'] = idf_df['china'] + idf_df['trade']

In [248]:
idf_df['china trade'].sort_values(ascending=False).head(3)

402    0.909362
345    0.863658
317    0.857680
Name: china trade, dtype: float64

**What's the content of the speeches?** Here's a way to get them:

In [251]:
# index 0 is the first speech, which was the first one imported.
paths[402]

'convote_v1.1/data_stage_one/development_set/421_400387_2010045_DMN.txt'

In [253]:
# Pass that into 'cat' using { } which lets you put variables in shell commands
# that way you can pass the path to cat
!cat {paths[577]}

mr. chairman , i just wanted to remind the house that faith-based organizations can and do sponsor federally funded head start programs . 
any sponsor who will agree not to discriminate in employment , if they can sponsor a program with the discrimination amendment , they can sponsor the program without that amendment if they would agree not to discriminate . 
what we are talking about is discrimination . 
some people want to discriminate against catholics , jews , muslims , african americans . 
we had this discussion in the 1960s , and the consensus back then was that discrimination in employment was so offensive that we made it illegal . 
the victim needs to be protected and the weight of the federal government will fall down on the side of the victim . 
the vote was not unanimous . 
some people did not like it then ; they do not like it now . 
and we are discussing where should the weight of the government be , with the victim or with somebody trying to discriminate . 
this

**Now search for something else!** Another two terms that might show up. `elections` and `chaos`? Whatever you thnik might be interesting.

In [278]:
All_tokens['chaos'] = All_tokens['chaos'].sort_values(ascending=False) >= 1
All_tokens[All_tokens['chaos'] == True].count().head(1)

000    3
dtype: int64

# Enough of this garbage, let's cluster

Using a **simple counting vectorizer**, cluster the documents into **eight categories**, telling me what the top terms are per category.

Using a **term frequency vectorizer**, cluster the documents into **eight categories**, telling me what the top terms are per category.

Using a **term frequency inverse document frequency vectorizer**, cluster the documents into **eight categories**, telling me what the top terms are per category.

In [None]:
#simple counting vectorizer,

In [291]:
from sklearn.cluster import KMeans
number_of_clusters = 8
km = KMeans(n_clusters=number_of_clusters)


In [292]:
count_vectorizer = CountVectorizer(stop_words='english')
X = count_vectorizer.fit_transform(All_speeches)
km.fit(X)

KMeans(copy_x=True, init='k-means++', max_iter=300, n_clusters=8, n_init=10,
    n_jobs=1, precompute_distances='auto', random_state=None, tol=0.0001,
    verbose=0)

In [293]:
print("Top terms per cluster:")
order_centroids = km.cluster_centers_.argsort()[:, ::-1]
terms = count_vectorizer.get_feature_names()
for i in range(number_of_clusters):
    top_ten_words = [terms[ind] for ind in order_centroids[i, :5]]
    print("Cluster {}: {}".format(i, ' '.join(top_ten_words)))

Top terms per cluster:
Cluster 0: start head children program amendment
Cluster 1: head start religious rights civil
Cluster 2: nbsp amp lt gt trade
Cluster 3: mr chairman time gentleman amendment
Cluster 4: association national restaurant contractors chamber
Cluster 5: church financial embezzlement says churches
Cluster 6: rule 11 rules federal 420
Cluster 7: house mr elections time states


In [None]:
# term frequency vectorizer,

In [296]:
vectorizer = TfidfVectorizer(use_idf=True, stop_words='english')
X = vectorizer.fit_transform(All_speeches)

In [297]:
number_of_clusters = 8
km = KMeans(n_clusters=number_of_clusters)
km.fit(X)

KMeans(copy_x=True, init='k-means++', max_iter=300, n_clusters=8, n_init=10,
    n_jobs=1, precompute_distances='auto', random_state=None, tol=0.0001,
    verbose=0)

In [360]:
print("Top terms per cluster:")
order_centroids = km.cluster_centers_.argsort()[:, ::-1]
terms = count_vectorizer.get_feature_names()
for i in range(number_of_clusters):
    top_ten_words = [terms[ind] for ind in order_centroids[i, :10]]
    print("Cluster {}: {}".format(i, ' '.join(top_ten_words)))

Top terms per cluster:
Cluster 0: harry said just like hermione eyes know time didn don
Cluster 1: harry said hermione just like know time looked asked ron


In [302]:
#term frequency inverse document frequency vectorizer

In [357]:
def oh_tokenizer(str_input):
    words = re.sub(r"[^A-Za-z0-9\-]", " ", str_input).lower().split()
    return words

l2_vectorizer = TfidfVectorizer(use_idf=True, stop_words='english', tokenizer=oh_tokenizer) 
X = l2_vectorizer.fit_transform(speeches_df['content'])
l2_df = pd.DataFrame(X.toarray(), columns=l2_vectorizer.get_feature_names())
for i in range(number_of_clusters):
    top_ten_words = [l2_df[ind] for ind in order_centroids[i, :9]]
    print("Cluster {}: {}".format(i, ' '.join(top_ten_words)))

KeyError: 18966

**Which one do you think works the best?**

Not sure. The last one **term frequency inverse** I can't get to work. So I am going with number 2.

# Harry Potter time

I have a scraped collection of Harry Potter fanfiction at https://github.com/ledeprogram/courses/raw/master/algorithms/data/hp.zip.

I want you to read them in, vectorize them and cluster them. Use this process to find out **the two types of Harry Potter fanfiction**. What is your hypothesis?

In [309]:
!curl -O https://github.com/ledeprogram/courses/raw/master/algorithms/data/hp.zip

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100   149  100   149    0     0    131      0  0:00:01  0:00:01 --:--:--   131


In [314]:
!unzip hp.zip

unzip:  cannot find or open hp.zip, hp.zip.zip or hp.zip.ZIP.


In [326]:
import glob

In [333]:
paths = glob.glob('hp/*.txt')

In [334]:
paths[:5]

['hp/10001898.txt',
 'hp/10004131.txt',
 'hp/10004927.txt',
 'hp/10007980.txt',
 'hp/10010343.txt']

In [331]:
len(paths)

1874

In [335]:
Harry_Potter_fiction = []
for path in paths:
    with open(path) as Harry_file:
        speech = {
            'pathname': path,
            'filename': path.split('/')[-1],
            'content': Harry_file.read()
        }
    Harry_Potter_fiction.append(speech)
Harry_df = pd.DataFrame(Harry_Potter_fiction)
Harry_df.head()

Unnamed: 0,content,filename,pathname
0,Prologue: The MissionDisclaimer: All character...,10001898.txt,hp/10001898.txt
1,BlackDisclaimer: I do not own Harry PotterAuth...,10004131.txt,hp/10004131.txt
2,"Chapter 1""I'm pregnant.""""""""Mum please say some...",10004927.txt,hp/10004927.txt
3,"Author's Note: Hey, just so you know, this is ...",10007980.txt,hp/10007980.txt
4,Disclaimer: I do not own Harry Potter and frie...,10010343.txt,hp/10010343.txt


In [337]:
All_of_Harry = Harry_df['content']

In [339]:
All_of_Harry.head()

0    Prologue: The MissionDisclaimer: All character...
1    BlackDisclaimer: I do not own Harry PotterAuth...
2    Chapter 1"I'm pregnant.""""Mum please say some...
3    Author's Note: Hey, just so you know, this is ...
4    Disclaimer: I do not own Harry Potter and frie...
Name: content, dtype: object

# Term Frequency Vectorizer

In [363]:
vectorizer = TfidfVectorizer(use_idf=True, stop_words='english')
X = vectorizer.fit_transform(All_of_Harry)

In [364]:
# KMeans clustering is a method of clustering.
from sklearn.cluster import KMeans

number_of_clusters = 2
km = KMeans(n_clusters=number_of_clusters)
km.fit(X)

KMeans(copy_x=True, init='k-means++', max_iter=300, n_clusters=2, n_init=10,
    n_jobs=1, precompute_distances='auto', random_state=None, tol=0.0001,
    verbose=0)

In [365]:
print("Top terms per cluster:")
order_centroids = km.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names()
for i in range(number_of_clusters):
    top_ten_words = [terms[ind] for ind in order_centroids[i, :10]]
    print("Cluster {}: {}".format(i, ' '.join(top_ten_words)))

Top terms per cluster:
Cluster 0: lily james sirius remus said harry just eyes potter peter
Cluster 1: harry hermione draco said just ron like ginny know eyes


# Simple Counting Vectorizer

In [347]:
from sklearn.cluster import KMeans
number_of_clusters = 2
km = KMeans(n_clusters=number_of_clusters)

In [354]:
count_vectorizer = CountVectorizer(stop_words='english')
X = count_vectorizer.fit_transform(All_of_Harry)
km.fit(X)

KMeans(copy_x=True, init='k-means++', max_iter=300, n_clusters=2, n_init=10,
    n_jobs=1, precompute_distances='auto', random_state=None, tol=0.0001,
    verbose=0)

In [361]:
print("Top terms per cluster:")
order_centroids = km.cluster_centers_.argsort()[:, ::-1]
terms = count_vectorizer.get_feature_names()
for i in range(number_of_clusters):
    top_ten_words = [terms[ind] for ind in order_centroids[i, :10]]
    print("Cluster {}: {}".format(i, ' '.join(top_ten_words)))

Top terms per cluster:
Cluster 0: harry said just like hermione eyes know time didn don
Cluster 1: harry said hermione just like know time looked asked ron
