In [2]:
#import all you need
import numpy as np
import pandas as pd
import nltk
from sklearn.externals import joblib
import re
import codecs
from sklearn import feature_extraction
import os
from bs4 import BeautifulSoup
import urllib
from nltk.stem.snowball import SnowballStemmer 


# load the NYT and Reuters article tables from csv into pandas dataframes
nyt_uniques = pd.read_csv(
    'C:/Users/aditi/OneDrive/Desktop/Media_Bias/20180516-20180621_nyt_unique.csv'
)
reuters_uniques = pd.read_csv(
    'C:/Users/aditi/OneDrive/Desktop/Media_Bias/20180516-20180621_reuters_unique.csv'
)

# english Snowball stemmer shared by the tokenizer functions defined below
stemmer = SnowballStemmer("english")

#define a function that both tokenizes and stems a given text. this will help us with the tfidf calculations
def token_stem(text):
    """Tokenize ``text`` and return the list of stemmed tokens.

    The text is split into sentences and then into words with NLTK. Tokens
    that contain no ASCII letter (e.g. pure punctuation or numbers) are
    discarded, and every remaining token is stemmed with the module-level
    ``stemmer``.
    """
    stems = []
    for sentence in nltk.sent_tokenize(text):
        for word in nltk.word_tokenize(sentence):
            # only tokens with at least one letter survive; this drops punctuation
            if re.search('[a-zA-Z]', word):
                stems.append(stemmer.stem(word))
    return stems

#define a function that only tokenizes a given text. this won't help us with calculations, but will help us if 
#we ever need to look up a stemmed word and see what the original version was. mostly redundant :) 
def token(text):
    """Tokenize ``text`` and return the lower-cased tokens, without stemming.

    Same sentence/word tokenization and letter filter as ``token_stem``; the
    only difference is that tokens are lower-cased instead of stemmed.
    """
    lowered_words = (
        word.lower()
        for sentence in nltk.sent_tokenize(text)
        for word in nltk.word_tokenize(sentence)
    )
    # keep only tokens that contain at least one letter (drops punctuation)
    return [word for word in lowered_words if re.search('[a-zA-Z]', word)]



# pull the article texts out of each dataframe as plain python lists
nyt_article_list = nyt_uniques['article'].tolist()
reuters_article_list = reuters_uniques['article'].tolist()

# NLTK's english stopword list. note: the vectorizer below is configured with
# stop_words='english' (scikit-learn's built-in list), not with this variable
stopwords = nltk.corpus.stopwords.words('english')


# tf-idf vectorizer that turns each article into a vector of term weights.
# it tokenizes/stems with token_stem, builds 1- to 3-grams, ignores terms found
# in more than 90% of the documents, caps the vocabulary at 200000 features and
# l2-normalizes every document vector
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(
    tokenizer=token_stem,
    stop_words='english',
    ngram_range=(1, 3),
    max_df=0.9,
    max_features=200000,
    use_idf=True,
    norm='l2',
)


#create a big list that is the union of the nyt articles and the reuters articles. you need to vectorize them together 
nyt_article_list.extend(reuters_article_list)


%time tfidf_matrix = vectorizer.fit_transform(nyt_article_list) #fit the vectorizer into the big list of articles

nyt_tfidf=tfidf_matrix[0:3406,:]
reuters_tfidf=tfidf_matrix[3406:,:]

# take a small test subset (first 10 NYT rows, first 100 Reuters rows) of the
# tf-idf matrices and convert it to dense arrays, because
# scipy.spatial.distance can only work with arrays
nyt_tfidf_array_test = nyt_tfidf[:10, :].toarray()
reuters_tfidf_array_test = reuters_tfidf[:100, :].toarray()


# cosine distances between the reuters and the nyt articles. note this takes a
# lot of time! the result is a matrix where each row corresponds to a reuters
# article, each column corresponds to a NYT article and each entry is the
# distance between the two articles
import scipy.spatial.distance
distances = scipy.spatial.distance.cdist(
    reuters_tfidf_array_test,
    nyt_tfidf_array_test,
    metric='cosine',
)



#next chunk of code is basically putting the whole thing into the csv format that we have for the NYT hero articles and
#the reuters articles. cleaning and indexing and all.

#keep only the (reuters, nyt) pairs whose distance is at most this threshold
max_distance = 1

#row positions (reuters) and column positions (nyt) of the kept pairs; computed once and reused for the values
our_indices = np.where(distances <= max_distance)
reuters_indices = our_indices[0].tolist()
nyt_indices = our_indices[1].tolist()
our_values = pd.Series(distances[our_indices])

#columns copied from each source dataframe into the output
output_fields = ['article', 'date', 'keywords']

#np.where returns positions, so select rows with .iloc (position-based) rather than label-based lookup.
#reset_index() keeps each row's original dataframe index as the first column and renumbers the rows 0..k-1
#so that the three pieces line up side by side in the concat below
reuters_part = reuters_uniques[output_fields].iloc[reuters_indices].reset_index()
nyt_part = nyt_uniques[output_fields].iloc[nyt_indices].reset_index()

distances_df = pd.concat([reuters_part, nyt_part, our_values], axis=1)
distances_df.columns = ['index_reuters', 'article_reuters', 'date_reuters', 'keywords_reuters',
                        'index_nyt', 'article_nyt', 'date_nyt', 'keywords_nyt',
                        'distance']
distances_df.to_csv('distances_nyt_Reuters.csv')

Wall time: 10min 44s
