In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import documents as docs
import textcomparisons as tc
import random
import time

# Reference timestamp for the "Setup time" message printed at the end of this cell
start = time.time()

data_folder = "data"
# File names encode index ranges in steps of 5000, followed by one shorter final range
article_files = ["articles2019-06-01_%d-%d.csv" % (lo, lo + 5000) for lo in range(0, 100000, 5000)]
article_files.append("articles2019-06-01_100000-100755.csv")

def readArticles(path):
    """ Loads the article csv found at path into a DataFrame and appends
    a "doc" column filled with None, which is later used to cache the
    Document-processed version of each article """
    return pd.read_csv(path).assign(doc = None)

# "login", 
# Phrases whose presence in a short article hints at a paywall / sign-up prompt
keywords = [
    "subscription", "subscribe", "full access", "digital access", "sign up",
    "unlimited access", "unlimited digital access", "log in", "sign up",
]
def keywordsin(str):
    """ Returns True if at least one entry of keywords occurs as a
    substring of the given text, False otherwise. The match is
    case-sensitive, so callers should lowercase the text first """
    return any(phrase in str for phrase in keywords)

def isPaywall(i, to_doc = True):
    """ Scores the article in row i of the global article_df for paywall
    likelihood and reports whether it looks paywalled.

    The score is the sum of two heuristics on the lowercased article text:
      * 0.5 if the text is shorter than 500 characters
      * 1   if the text is shorter than 1000 characters and contains one
            of the paywall keywords (see keywordsin)
    The score is written to article_df.loc[i, "paywall"]. It replaces the
    stored value instead of being added to it, so scoring the same row
    several times (as this notebook does) no longer inflates the score.

    i:      row label of article_df (rows are looked up with .loc)
    to_doc: when True and no Document is cached for the row yet, one is
            built and stored in article_df.loc[i, "doc"]
    Returns True if the score is greater than 0
    """
    raw_text = article_df.loc[i, "text"]
    text = raw_text.lower()

    score = 0.0
    if len(text) < 500:
        score += 0.5
    if len(text) < 1000 and keywordsin(text):
        score += 1
    article_df.loc[i, "paywall"] = score

    if to_doc and article_df.loc[i, "doc"] is None:
        # Build the Document from the original text (not the lowercased copy used
        # for keyword matching) so it matches the Documents created by dict_by_ids
        article_df.loc[i, "doc"] = docs.Document(raw_text, clean = False)
    return score > 0

def dict_by_ids(df, ids):
    """ Builds an {article id: Document} dictionary for the requested ids.

    For each id the rows of df whose "id" column equals it are selected.
    If the first matching row already holds a Document in its "doc"
    column, that Document is reused; otherwise a Document is created from
    the row's "text" and written back to the "doc" column of the matching
    rows, so that df acts as a cache across calls.
    """
    documents = {}
    for article_id in ids:
        matches = df["id"] == article_id
        cached = df.loc[matches, "doc"].iloc[0]
        if cached is not None:
            documents[article_id] = cached
            continue
        # Nothing cached yet: process the text and remember the result in df
        built = docs.Document(df.loc[matches, "text"].iloc[0], clean = False)
        df.loc[matches, "doc"] = built
        documents[article_id] = built
    return documents

def subsetmat(mat, inds):
    ''' Picks the rows and columns listed in inds out of the (symmetric)
    matrix mat and returns them as a new float array of shape
    (len(inds), len(inds)), where entry (a, b) equals mat[inds[a], inds[b]]
    '''
    size = len(inds)
    picked = np.fromiter(
        (mat[row, col] for row in inds for col in inds),
        dtype = float,
        count = size * size,
    )
    return picked.reshape(size, size)

# Load the cached article frame when it can be read, otherwise rebuild it from the raw csv files.
# NOTE(review): unpickling can execute arbitrary code -- only read pickle files you created yourself.
try:
    article_df = pd.read_pickle("article_df_20190601")
except Exception as err:
    # Exception instead of a bare except, so Ctrl-C / SystemExit are not swallowed;
    # the reason for the fallback is reported rather than silently hidden
    print("Could not read cached article_df (%r); rebuilding from csv files" % (err,))
    article_df = [readArticles(os.path.join(data_folder, file)) for file in article_files]
    article_df = pd.concat(article_df)
    article_df = article_df.reset_index(drop = True)

    # Float column: isPaywall writes fractional scores (e.g. 0.5) into it
    article_df["paywall"] = 0.0

# Articles per event in a single pass: value_counts drops NaN events and
# sort_index keeps the events in ascending order
event_counts = article_df["event"].value_counts().sort_index()
events = event_counts.index.tolist()
n = event_counts.tolist()
print("Event sizes: ", n)

# Resume from partially filled results when available, otherwise start an empty table
try:
    results_df = pd.read_csv("results_20190601_clusters_temp.csv")
except Exception as err:
    print("Could not read saved results (%r); starting a fresh results table" % (err,))
    results_df = pd.DataFrame(list(zip(events, n)), columns = ["event", "n"])
    results_df["unique25"] = np.nan
    results_df["unique75"] = np.nan
    results_df["n_good"] = np.nan
    results_df["unique25_good"] = np.nan
    results_df["unique75_good"] = np.nan

ac = tc.ArticleComparisons(thresh_jaccard = .5, thresh_same_sent = .9, thresh_same_doc = .25)
print("Setup time: %d seconds" % np.round(time.time() - start))

Event sizes:  [3175, 7333, 3280, 790, 750, 479, 1267, 2399, 269, 464, 187, 543, 580, 272, 1027, 377, 286, 391, 997, 269, 907, 272, 287, 351, 131, 333, 275, 503, 1067, 135, 691, 225, 123, 241, 263, 108, 140, 70, 239, 365, 172, 79, 244, 68, 68, 54, 69, 172, 172, 165]
Setup time: 10 seconds


In [2]:
# Position in events / row of results_df for the event studied in the cells below
i = 23
sample = np.array(article_df.loc[article_df["event"] == events[i], "id"])
article_dict = dict_by_ids(article_df, sample)
# good_inds holds positions within sample of the articles not flagged as paywalled.
# NOTE(review): isPaywall looks rows up by index label while sample holds "id"
# values -- this assumes id equals the row label; confirm
good_inds = [pos for pos, article_id in enumerate(sample) if not isPaywall(article_id)]
results_df.loc[i, "n_good"] = len(good_inds)

In [None]:
# Cluster the event's articles once, then record the proportion of unique clusters
# at both document thresholds, for all articles and for the non-paywalled subset
clustering = ac.cluster_articles(article_dict, plot = False)
for thresh, label in ((0.25, "unique25"), (0.75, "unique75")):
    results_df.loc[i, label] = ac.prop_unique_clusters(thresh_same_doc = thresh)
    results_df.loc[i, label + "_good"] = ac.prop_unique_clusters(thresh_same_doc = thresh, inds = good_inds)
results_df.iloc[i, :]

0 / 351 done, 0.0 seconds elapsed


In [None]:
# Positions (in article_dict order) of documents whose get_bow_sentences() is empty
print(np.where(np.array([len(document.get_bow_sentences()) for document in article_dict.values()]) == 0))

In [None]:
# Article id stored at position 198 of article_dict (its keys are the ids passed to dict_by_ids)
list(article_dict.keys())[198]

In [None]:
# Inspect the row at integer position 57742 of article_df
article_df.iloc[57742, :]

In [None]:
# Display the whole id -> Document mapping built for the current sample
article_dict

In [None]:
# Split the sample into paywalled ("bad") and regular ("good") articles.
# isPaywall is evaluated once per article and the flags are reused for every list.
paywalled = [isPaywall(article_id) for article_id in sample]
bad_ids = [article_id for article_id, flagged in zip(sample, paywalled) if flagged]
good_ids = [article_id for article_id, flagged in zip(sample, paywalled) if not flagged]
# Positions within sample of the paywalled articles (counterpart of good_inds);
# this name was used below without ever being defined, which raised a NameError
bad_inds = [pos for pos, flagged in enumerate(paywalled) if flagged]
# The loop variable is not called i, so the event index i set earlier is not overwritten
for bad_id in bad_ids:
    print(bad_id, "\n", article_dict[bad_id], "\n")
ac.prop_unique_clusters(inds = bad_inds)
# ac.display_mat(jsm_b, xlabs = bad_inds)

In [None]:
# Score matrix and clustering restricted to the non-paywalled articles
article_dict_good = dict_by_ids(article_df, good_ids)
jsm = ac.jac_score_mat(article_dict_good)
hc = ac.cluster_articles(plot = True)
# ac.display_mat(jsm, xlabs = list(article_dict.keys()))
# Display at most the first 20 articles. The cap keeps subsetmat from indexing
# past the matrix when fewer than 20 good articles exist
# (assumes jsm has one row/column per entry of article_dict_good -- TODO confirm)
n_subset = min(20, len(article_dict_good))
ac.display_mat(subsetmat(jsm, list(range(n_subset))), xlabs = list(article_dict_good.keys())[0:n_subset])

In [None]:
# Compare two hand-picked articles of the current sample
id1, id2 = 1201, 21618
print("Good articles: %d" % id1, id1 in good_ids, id2, id2 in good_ids)
match_matrix = ac.get_match_matrix(article_dict[id1], article_dict[id2])
ac.display_mat(match_matrix)
ac.print_sentence_matches()

In [None]:
# Number of characters in the string form of the Document stored for article id1
len(str(article_dict[id1]))

In [None]:
# Print every article id of the sample followed by its Document
for doc, document in article_dict.items():
    print(doc, document)

In [None]:
# Measure how long the cached article frame takes to load from its pickle
start = time.time()
tmp = pd.read_pickle("article_df_20190601")
elapsed = time.time() - start
print(elapsed)
# pickle: 3.31s, csv = 26s, pickle = 12.7s

In [None]:
# Display the frame loaded from the pickle in the timing cell
tmp