# Script set-up

## Importing packages and setting the working directory

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
from nltk import RegexpTokenizer
from nltk.corpus import stopwords
from nltk import FreqDist
import os

In [None]:
# The memo files are opened and the CSV exports are written with relative
# paths, so they all resolve against this folder.
memo_dir = "/Users/alliesaizan/Documents/Memos"
os.chdir(memo_dir)

## Read in the memos

In [None]:
# Load each memo as a list of lines. The with-blocks guarantee the file
# handles are closed even if reading raises (e.g. on a decoding error).

# Read in the Republican memo
with open("nunes_memo.txt", mode="r", encoding="utf-8") as f:
    repub_memo = f.readlines()

# Read in the Democrat memo
with open("dems_memo.txt", mode="r", encoding="utf-8") as f:
    dems_memo = f.readlines()

# Data cleaning and word frequencies

## Clean the memo text

In [None]:
# Join every line of each memo into one string. str.join accepts the list
# directly, so no intermediate copy of it is needed.

# Republicans
repub_memo2 = "".join(repub_memo)

# Democrats
dems_memo2 = "".join(dems_memo)

# The per-line lists are not needed once the joined strings exist
del repub_memo, dems_memo

In [None]:
# Word_tokenize
# Build the tokenizer and the stop-word set once. Evaluating
# stopwords.words("english") for every token repeats the same lookup, and a
# set gives constant-time membership tests instead of scanning a list.
word_tokenizer = RegexpTokenizer(r'\w+')
english_stopwords = set(stopwords.words("english"))


def _clean_tokens(text):
    """Return the lower-cased word tokens of *text* that are worth counting.

    Tokens are runs of word characters. A token is dropped when it is a
    single character long or when its lower-cased form is an English
    stop word.
    """
    cleaned = []
    for token in word_tokenizer.tokenize(text):
        lowered = token.lower()
        # The length check is applied to the token as it appears in the text
        if len(token) > 1 and lowered not in english_stopwords:
            cleaned.append(lowered)
    return cleaned


repub_memo_tokenized = _clean_tokens(repub_memo2)
dems_memo_tokenized = _clean_tokens(dems_memo2)

In [None]:
# Rebuild each memo as a single space-separated string of its cleaned tokens.
# The list order is fixed: index 0 is the Republican memo, index 1 the
# Democrat memo.
republican_document = " ".join(repub_memo_tokenized)
democrat_document = " ".join(dems_memo_tokenized)
documents = [republican_document, democrat_document]

## Word frequencies in each memo

In [None]:
# Produce word frequencies: count every cleaned token per memo, then keep
# the most frequent (word, count) pairs.
repub_counts = FreqDist(repub_memo_tokenized)
dem_counts = FreqDist(dems_memo_tokenized)

# NOTE(review): the cut-offs differ (20 words for the Republican memo, 21 for
# the Democrat memo) - confirm whether this asymmetry is intentional.
repub_frequencies = repub_counts.most_common(20)
dem_frequencies = dem_counts.most_common(21)

In [None]:
# Stack the word frequencies from both memos into one DataFrame, tagging
# every row with the party whose memo it came from.
frequency_columns = ["frequent_words", "word_frequency"]

frequencies1 = pd.DataFrame(repub_frequencies, columns=frequency_columns)
frequencies1['party'] = "Republican"

frequencies2 = pd.DataFrame(dem_frequencies, columns=frequency_columns)
frequencies2['party'] = "Democrat"

# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat
# is its replacement. As with append's default, each frame keeps its own
# index, so the combined frame's index restarts at 0 for the second party.
frequencies = pd.concat([frequencies1, frequencies2])

# Term Frequency-Inverse Document Frequency

In [None]:
# Create the TF-IDF vector
# min_df is given as 1 rather than 0: as a document count the two select the
# same vocabulary (every term occurs in at least one document), but recent
# scikit-learn releases reject an integer min_df below 1.
tfidf = TfidfVectorizer(
    norm='l2',
    min_df=1,
    use_idf=True,
    smooth_idf=False,
    sublinear_tf=True,
    # Split on runs of word characters, overriding the default token pattern
    tokenizer=RegexpTokenizer(r'\w+').tokenize,
)

# Learn the vocabulary and IDF weights from the documents and return the
# document-term matrix, one row per document
tfidf_representation = tfidf.fit_transform(documents)

In [None]:
# Generate a dense matrix from the TF-IDF matrix (it's currently sparse) and
# keep only its first row - the scores of the first document passed to
# fit_transform - as a plain Python list.
dense_matrix = tfidf_representation.todense()
first_document_row = dense_matrix[0]
# Row-slicing a dense matrix still yields a 2-D object, so tolist() returns a
# one-element list of rows; unwrap it to get the flat list of scores.
dense = first_document_row.tolist()[0]

In [None]:
# Use the dense scores to find the 20 highest-scoring terms, print them and
# collect them in a {term: score} dict for export.
temp_dict = {}

# Look the vocabulary up once instead of once per scored term.
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# and fall back only on releases that predate get_feature_names_out().
try:
    feature_names = tfidf.get_feature_names_out()
except AttributeError:
    feature_names = tfidf.get_feature_names()

# (column index, score) pairs for every term with a positive score
phrase_scores = [(word_id, score) for word_id, score in enumerate(dense) if score > 0]

# Highest score first; the sort is stable, so tied terms keep column order
sorted_phrase_scores = sorted(phrase_scores, key=lambda pair: pair[1], reverse=True)

for word_id, score in sorted_phrase_scores[:20]:
    phrase = feature_names[word_id]
    print('{0: <20} {1}'.format(phrase, score))
    temp_dict[phrase] = score

In [None]:
# Export the data to CSV for Tableau plotting
# One row per collected TF-IDF term: the term and its score
tfidf_rows = list(temp_dict.items())
results = pd.DataFrame(tfidf_rows, columns=['tfidf_words', 'tfidf_freq'])

# Both files are written relative to the current working directory, with the
# DataFrame index included as the first column
results.to_csv("tfidf.csv")
frequencies.to_csv("word_frequencies.csv")