In [2]:
import requests
import nltk, re, pprint, io, json
import matplotlib.pyplot as plt
import pandas as pd

In [3]:
def build_query_url(page):
    # Build query
    queryUrl = "http://en.wikipedia.org/w/api.php/?action=query"
    title = "titles=%s" % page 
    content = "prop=extracts&exlimit=max&explaintext"
    dataformat = "format=json"
    query = "%s&%s&%s&%s" % (queryUrl, title, content, dataformat)
    return query

def get_content(url):
    # Send request and parse response
    json_response = requests.get(url).json()
    pages = json_response['query']['pages']
    key = next(iter(pages.keys()))
    content = pages[key]['extract']
    return content
    
def fetch_content(page):
    url = build_query_url(page)
    content = get_content(url)
    return content

def save_to_file(content, page_name):
    filename = 'congress115/%s.txt' % page_name
    f = open(filename, "a")
    f.write(content)
    f.close()  

In [4]:
# Create a dataframe which contains page names for the 115th congress
url_h115 = 'https://raw.githubusercontent.com/suneman/socialgraphs2018/master/files/data_US_congress/H115.csv'
df = pd.read_csv(url_h115)
page_names = df.WikiPageName

In [32]:
%%time
# Fetch each wikipage and save to a txt file
for page_name in page_names:
    content = fetch_content(page_name)
    save_to_file(content, page_name)

CPU times: user 15.9 s, sys: 1.46 s, total: 17.4 s
Wall time: 3min 4s


## Exercises
### TF-IDF
**Explain in your own words the point of TF-IDF.**
* What does TF stand for?
* What does IDF stand for?

Answer:
* TFIDF (term frequency–inverse document frequency), is a numerical statistic that is intended to reflect how important a word is to a document in a collection or corpus. It is often used as a weighting factor in searches of information retrieval, text mining, and user modeling. The tf–idf value increases proportionally to the number of times a word appears in the document and is offset by the number of documents in the corpus that contain the word, which helps to adjust for the fact that some words appear more frequently in general. Tf–idf is one of the most popular term-weighting schemes today.

* Variations of the tf–idf weighting scheme are often used by search engines as a central tool in scoring and ranking a document's relevance given a user query. tf–idf can be successfully used for stop-words filtering in various subject fields, including text summarization and classification.

### Tokenizing the Wikipedia Pages
We want to find out which words are important for each party, so we're going to create two large documents, one for the Democratic and one for the Republican party. Tokenize the pages, and combine the tokens into one long list including all the pages of the members of the same party. Remember the bullets below for success.
* Exclude the congress members names (since we're interested in the words, not the names).
* Exclude punctuation.
* Exclude stop words (if you don't know what stop words are, go back and read NLPP1e again).
* Exclude numbers (since they're difficult to interpret in the word cloud).
* Set everything to lower case.

*Note that none of the above has to be perfect. It might not be easy to remove all representatives names. And there's some room for improvisation. You can try using stemming. In my own first run the results didn't look so nice, because some pages are very detailed and repeat certain words again and again and again, whereas other pages are very short. For that reason, I decided to use the unique set of words from each page rather than each word in proportion to how it's actually used on that page. Choices like that are up to you.
Now, we're ready to calculate the TF for each word. Use the method of your choice to find the top 5 terms within each party.*

In [5]:
from nltk.corpus import stopwords

In [6]:
import string
all_words = " ".join(page_names)
all_tokens = nltk.word_tokenize(all_words)

# Only take words with a capital start letter
# Replace underscores with whitespaces
# Remove the word "politician" from the names
names = [
    name.replace("_", " ").replace("politician", "") 
    for name in all_tokens 
    if name[0] in string.ascii_uppercase
]

In [8]:
out = "\n".join(names)
f = open("member_names_pretty.txt", "w")
f.write(out)
f.close()

In [54]:
def clean_document(document, unwanted_sentences):
    '''
    INPUT: document (String), unwanted_sentences (String list)
    OUTPUT: stems (String list)
    '''
    # Remove bad sentences, ex full names from document
    for s in unwanted_sentences:
        document = document.replace(s, "")
    
    # Tokenize all words (no digits, no punctuation)
    tokens = re.findall(r'[a-zA-Z]+', document)
    
    # Stemming    
    stemmer = nltk.LancasterStemmer()
    tokens = [stemmer.stem(t) for t in tokens]
    
    # Stop-word filtering
    tokens = [
        word for word in tokens 
        if word not in stopwords.words('english')
    ]
    
    return tokens  

In [109]:
def build_token_matrix(party):
    token_matrix = []
    names = df[df.Party == party].WikiPageName
    for name in names:
        document = io.open('congress115/%s.txt' % name, 'r').read()
        cleaned = clean_document(document, page_names)
        token_matrix.append(cleaned)
    return token_matrix

In [110]:
### !!! TAKES A LONG TIME!!! :( !!!
republican_token_matrix = build_token_matrix("Republican")
democratic_token_matrix = build_token_matrix("Democratic")

In [124]:
### Serialization
import pickle
filehandler = open("republican_token_matrix.obj","wb")
pickle.dump(republican_token_matrix, filehandler)
filehandler.close()

filehandler = open("democratic_token_matrix.obj","wb")
pickle.dump(democratic_token_matrix, filehandler)
filehandler.close()

In [137]:
file = open("republican_token_matrix.obj",'rb')
republican_token_matrix = pickle.load(file)
file.close()

file = open("democratic_token_matrix.obj",'rb')
democratic_token_matrix = pickle.load(file)
file.close()

In [138]:
## Flatten matrices
republican_tokens = [word for document in republican_token_matrix for word in document]
democratic_tokens = [word for document in democratic_token_matrix for word in document]

* Now, we're ready to calculate the TF for each word. Use the method of your choice to find the top 5 terms within each party.

For TF I will be using the simplest variation, i.e. just the frequency of the word.

$tf(t,d) = f_{t,d}$

In [139]:
republican_fd = nltk.FreqDist(republican_tokens)
democratic_fd = nltk.FreqDist(democratic_tokens)

In [140]:
republican_fd.most_common(5)

[('stat', 3464),
 ('congress', 3083),
 ('vot', 3029),
 ('elect', 2896),
 ('hous', 2603)]

In [141]:
democratic_fd.most_common(5)

[('congress', 3003),
 ('stat', 2557),
 ('elect', 2483),
 ('hous', 2062),
 ('democr', 1970)]

In [162]:
# Extracting TF's
tf_repub = list(
    map(lambda tup: tup[1], republican_fd.most_common())
)

tf_demo = list(
    map(lambda tup: tup[1], democratic_fd.most_common())
)

In [161]:
tf_demo

array(<map object at 0x1377669b0>, dtype=object)

* Next, we calculate IDF for every word.
  * What base logarithm did you use? Is that important?
  
$idf(t,D) = log \left( \frac{N}{df_t} \right)$

In [142]:
republican_most_common_ordered = list(
    map(lambda tup: tup[0], republican_fd.most_common())
)

democratic_most_common_ordered = list(
    map(lambda tup: tup[0], democratic_fd.most_common())
)

In [144]:
import numpy as np
n_repub = len(republican_most_common_ordered)
n_demo = len(democratic_most_common_ordered)

In [145]:
df_republican = np.zeros(n_repub)
for i, token in enumerate(republican_most_common_ordered):
    for document_tokens in republican_token_matrix:
        if token in document_tokens: 
            df_republican[i] += 1

In [148]:
df_democratic = np.zeros(n_demo)
for i, token in enumerate(democratic_most_common_ordered):
    for document_tokens in democratic_token_matrix:
        if token in document_tokens: 
            df_democratic[i] += 1

* Now we are able to calculate the idf's

In [155]:
# Republican
idf_repub = np.log(n_repub / df_republican)

# Democratic
idf_demo = np.log(n_demo / df_democratic)

**Calculating $TF \cdot IDF$**

In [165]:
tf_repub = np.array(tf_repub)
tf_demo = np.array(tf_demo)

In [167]:
tfidf_repub = np.multiply(tf_repub, idf_repub)

In [168]:
tfidf_repub.size

11038

In [174]:
np.multiply(tf_repub, idf_repub)

array([1.31482023e+04, 1.17020518e+04, 1.15093240e+04, ...,
       9.30909914e+00, 9.30909914e+00, 9.30909914e+00])

In [178]:
np.multiply(np.arange(1,10,2), np.arange(0,9,2))

array([ 0,  6, 20, 42, 72])