In [None]:
#script to calculate the cosine distances between keywords of NYT and Reuters articles
import pandas as pd
import numpy as np
import operator
from collections import Counter
import pickle
import re
import nltk
from sklearn.externals import joblib
from sklearn import feature_extraction
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
import scipy.spatial.distance

#this function tokenizes a given sentence/article and keeps only the tokens that contain letters
def token(text):
    """Return the lowercase word tokens of ``text`` that contain a letter.

    The text is split into sentences with ``nltk.sent_tokenize`` and each
    sentence into words with ``nltk.word_tokenize``. Every word is
    lowercased, and tokens without at least one ASCII letter (e.g. bare
    punctuation or digits) are discarded.
    """
    lowered = []
    for sentence in nltk.sent_tokenize(text):
        for word in nltk.word_tokenize(sentence):
            lowered.append(word.lower())
    # a token survives only if the regex finds a letter somewhere in it
    return [tok for tok in lowered if re.search('[a-zA-Z]', tok)]

#makes a single list out of list of lists with string elements in it
def flatten(foo):
    """Lazily yield the leaf items of an arbitrarily nested iterable.

    Any element that exposes ``__iter__`` is descended into recursively,
    except strings, which are yielded whole instead of being split into
    characters.
    """
    for item in foo:
        is_leaf = isinstance(item, str) or not hasattr(item, '__iter__')
        if is_leaf:
            yield item
        else:
            yield from flatten(item)

#function to get rid of parenthesized text and trailing white spaces in the keywords
def rid_paranthesis_trailing_wspace(words):
    """Drop everything from the first "(" onward and strip trailing whitespace.

    Args:
        words: list of keyword strings. The list is updated in place.

    Returns:
        The same list object, with each entry cut at its first "(" (when it
        has one) and right-stripped.
    """
    for i, word in enumerate(words):
        # partition() yields the whole string as the head when there is no
        # "(", so one expression covers both cases. Unlike the regex
        # "(.*?)(" approach it also works when the keyword contains a
        # newline before the parenthesis.
        words[i] = word.partition("(")[0].rstrip()
    return words

#function to get rid of the / character in the reuters keywords. treat words separated by this character as two words
def rid_special_char(words):
    """Split every keyword containing "/" into separate keywords.

    Args:
        words: list of keyword strings. It is not modified.

    Returns:
        A new flat list in which each keyword containing "/" is replaced by
        its "/"-separated parts, in order; other keywords are kept as they
        are. Whitespace around the parts is not stripped here.
    """
    # Build a fresh list instead of assigning the split results back into
    # `words`, so the caller's list is not left holding nested lists.
    split_words = [word.split('/') if "/" in word else word for word in words]
    return list(flatten(split_words))

#this one makes a sentence out of a list of keywords
def make_sentence_out_of_list(words_list):
    """Join the keywords in ``words_list`` into one space-separated string."""
    return ' '.join(words_list)

#read the reuters and nyt dataframes
nyt_unique = pd.read_csv('nyt_uniques_topicsLabeled.csv')
#keep only the articles whose Section is not 'Other'. reset_index(drop=True)
#returns a new frame with a fresh 0..n-1 index, which avoids running in-place
#operations on a filtered slice of nyt_unique
nyt_labeled_only = nyt_unique.loc[nyt_unique['Section'] != 'Other'].reset_index(drop=True)

#NOTE: pickle.load can execute arbitrary code; only load pickle files you trust
#the "with" block makes sure the file handle is closed after loading
with open("20180516-20180621_reuters_unique.pkl", "rb") as reuters_file:
    reuters_unique = pickle.load(reuters_file)
reuters_unique = reuters_unique.reset_index(drop=True)

#there is one row that doesn't have keywords for NYT.
#Might as well manually fix that so we don't run into indexing issues.
#a single .loc call is used because chained assignment (df.Keywords[92] = ...)
#may write to a temporary object and leave the dataframe unchanged
nyt_labeled_only.loc[92, 'Keywords'] = 'nan'

#for reuters, when we read from the pkl, the keywords are already in "list" format. in NYT this is not the case
#for one reason or the other. this line of code puts the keywords in NYT in a list, split by a comma
nyt_labeled_only['keyword list'] = nyt_labeled_only['Keywords'].apply(
    lambda x: x.split(',') if np.all(pd.notnull(x)) else x
)

#nyt has "null" values, so we manually make sentences out of lists of keywords by putting an "if" statement
#(pd.notnull returns an array for a list, hence the np.all around it)
nyt_labeled_only['sentences'] = nyt_labeled_only['keyword list'].apply(
    lambda words_list: ' '.join(words_list) if np.all(pd.notnull(words_list)) else words_list
)

#only reuters needs the two cleaning functions below. we first split the keywords on "/",
#then get rid of parenthesized text and whitespace (because whitespace remains after
#applying the "rid_special_char" function)
for cleaner in (rid_special_char, rid_paranthesis_trailing_wspace):
    reuters_unique.loc[:, 'keywords'] = reuters_unique.keywords.apply(cleaner)

#turn each keyword list into one sentence. this can be applied directly bc there are no null values.
reuters_unique['sentences'] = reuters_unique['keywords'].apply(make_sentence_out_of_list)

#merge multi-word names into single tokens (literal, case-sensitive replacement):
#north korea -> 'nkorea', donald trump -> 'dtrump'
#'nothkorea' is kept exactly as written; presumably a misspelling that occurs in the reuters data
reuters_replacements = (
    ('north korea', 'nkorea'),
    ('northkorea', 'nkorea'),
    ('nothkorea', 'nkorea'),
    ('donald trump', 'dtrump'),
)
for old, new in reuters_replacements:
    reuters_unique.loc[:, 'sentences'] = reuters_unique.sentences.str.replace(old, new, regex=False)

nyt_replacements = (
    ('north korea', 'nkorea'),
    ('donald trump', 'dtrump'),
)
for old, new in nyt_replacements:
    nyt_labeled_only.loc[:, 'sentences'] = nyt_labeled_only.sentences.str.replace(old, new, regex=False)

#put the nyt and reuters keywords into one big list
nyt_list = nyt_labeled_only['sentences'].tolist()
reuters_list = reuters_unique['sentences'].tolist()
#remember how many NYT documents there are BEFORE the lists are merged; this is
#the row where the combined matrix is split back apart (instead of a hard-coded 1400)
n_nyt = len(nyt_list)
#from here on nyt_list holds the NYT sentences followed by the Reuters sentences
nyt_list.extend(reuters_list)

#put the keywords into a term frequency matrix (no idf weighting, l2-normalized rows,
#1- to 3-grams over the tokens produced by token())
vectorizer = TfidfVectorizer(use_idf=False, norm='l2', tokenizer=token, ngram_range=(1, 3))
tfidf_matrix = vectorizer.fit_transform(nyt_list)

#separate the NYT and Reuters matrices. You will calculate distances between the
#elements of these matrices
nyt_keywords_tfidf = tfidf_matrix[:n_nyt, :].toarray()
reuters_keywords_tfidf = tfidf_matrix[n_nyt:, :].toarray()

#calculate the cosine distance between every NYT row and every Reuters row.
#the result has one row per NYT document and one column per Reuters document
keyword_distances = scipy.spatial.distance.cdist(nyt_keywords_tfidf, reuters_keywords_tfidf, 'cosine')
#dump into pickle; the "with" block closes (and flushes) the file
with open('keyword_distances.pkl', 'wb') as qf:
    pickle.dump(keyword_distances, qf)

#add keyword distances to your big df
#NOTE: pickle.load can execute arbitrary code; only load pickle files you trust
with open("big_df.pkl", "rb") as big_df_file:
    big_df = pickle.load(big_df_file)
#ravel() lays the matrix out row by row (all Reuters distances of NYT doc 0, then doc 1, ...)
#without rebinding the module-level flatten() helper that rid_special_char relies on.
#assumes big_df rows follow that same NYT-major order with a default 0..n-1 index -- TODO confirm
big_df['keyword_distances'] = pd.Series(keyword_distances.ravel())
with open('big_df.pkl', 'wb') as wf:
    pickle.dump(big_df, wf)