# News Corpus Generator

This notebook is used data loading for the news headline dataset, which ends up being the most challenging dataset we evaluate against because of the need to understand semantic similarity a little more robustly.

In [1]:
import pandas as pd

In [2]:
news_df = pd.read_csv("./data/news-aggregator/newsCorpora.csv", sep="\t", names=["text", "url", "source", "?", "cluster_id", "url_2", "article_id"])
news_df.head()

Unnamed: 0,text,url,source,?,cluster_id,url_2,article_id
1,"Fed official says weak data caused by weather,...",http://www.latimes.com/business/money/la-fi-mo...,Los Angeles Times,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.latimes.com,1394470370698
2,Fed's Charles Plosser sees high bar for change...,http://www.livemint.com/Politics/H2EvwJSK2VE6O...,Livemint,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.livemint.com,1394470371207
3,US open: Stocks fall after Fed official hints ...,http://www.ifamagazine.com/news/us-open-stocks...,IFA Magazine,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.ifamagazine.com,1394470371550
4,"Fed risks falling 'behind the curve', Charles ...",http://www.ifamagazine.com/news/fed-risks-fall...,IFA Magazine,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.ifamagazine.com,1394470371793
5,Fed's Plosser: Nasty Weather Has Curbed Job Gr...,http://www.moneynews.com/Economy/federal-reser...,Moneynews,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.moneynews.com,1394470372027


In [3]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from tqdm import tqdm

catalog_id = 0

catalog_ids = []
catlaog_texts = []

queries_input_texts = []
queries_match_ids = []
queries_judgments = []

def process_data_df(data_df):
    global catalog_id
    for i, g in tqdm(data_df.groupby("cluster_id")):
        if len(g.index) < 3:
            continue
        original_text = g["text"].values[-1]
        candidate_texts = g["text"].iloc[1:].values

        vectorizer = CountVectorizer().fit_transform([original_text] + list(candidate_texts))
        vectors = vectorizer.toarray()

        csim = cosine_similarity(vectors)
        similarity_scores = csim[0,1:]
        similarity_scores[similarity_scores <= 0.2] += 1

        least_similar_index = np.argmin(similarity_scores)
        positive_text = candidate_texts[least_similar_index]
        negative_text = (news_df[news_df["cluster_id"] != i].sample(n=1)["text"].values[0])

        catalog_ids.append(catalog_id)
        catlaog_texts.append(positive_text)
        catalog_id += 1

        catalog_ids.append(catalog_id)
        catlaog_texts.append(negative_text)
        catalog_id += 1

        queries_input_texts.append(original_text)
        queries_match_ids.append(catalog_id-2)
        queries_judgments.append(True)

        queries_input_texts.append(original_text)
        queries_match_ids.append(catalog_id-1)
        queries_judgments.append(False)

    queries_df = pd.DataFrame({
        "input_text": queries_input_texts,
        "match_id": queries_match_ids,
        "judgment": queries_judgments
    })

    catalog_df = pd.DataFrame({
        "catalog_id": catalog_ids,
        "text": catlaog_texts,
    })
    
    return catalog_df, queries_df

In [4]:
groups = news_df['cluster_id'].unique()

np.random.shuffle(groups)

train_idx = int(len(groups) * 0.6)
valid_idx = int(len(groups) * 0.8)

train_groups = groups[:train_idx]
valid_groups = groups[train_idx:valid_idx]
test_groups = groups[valid_idx:]

train_df = news_df[news_df['cluster_id'].isin(train_groups)]
validation_df = news_df[news_df['cluster_id'].isin(valid_groups)]
test_df = news_df[news_df['cluster_id'].isin(test_groups)]

In [5]:
train_catalog_df, train_queries_df = process_data_df(train_df)
val_catalog_df, val_queries_df = process_data_df(validation_df)
test_catalog_df, test_queries_df = process_data_df(test_df)

100%|██████████| 4338/4338 [04:45<00:00, 15.20it/s]
100%|██████████| 1446/1446 [01:34<00:00, 15.34it/s]
100%|██████████| 1446/1446 [01:34<00:00, 15.31it/s]


In [6]:
train_catalog_df["text"] = train_catalog_df["text"].str.replace(" ...", "", regex=False).str.strip()
val_catalog_df["text"] = val_catalog_df["text"].str.replace(" ...", "", regex=False).str.strip()
test_catalog_df["text"] = test_catalog_df["text"].str.replace(" ...", "", regex=False).str.strip()

train_queries_df["input_text"] = train_queries_df["input_text"].str.replace(" ...", "", regex=False).str.strip()
val_queries_df["input_text"] = val_queries_df["input_text"].str.replace(" ...", "", regex=False).str.strip()
test_queries_df["input_text"] = test_queries_df["input_text"].str.replace(" ...", "", regex=False).str.strip()

In [7]:
train_catalog_df.to_csv("data/news-aggregator/train_catalog.csv", index=False)
train_queries_df.to_csv("data/news-aggregator/train_queries.csv", index=False)
val_catalog_df.to_csv("data/news-aggregator/val_catalog.csv", index=False)
val_queries_df.to_csv("data/news-aggregator/val_queries.csv", index=False)
test_catalog_df.to_csv("data/news-aggregator/test_catalog.csv", index=False)
test_queries_df.to_csv("data/news-aggregator/test_queries.csv", index=False)

In [8]:
train_catalog_df.head()["text"].values

array(["The Key to the Dow's Bull Market? Energy Stocks",
       'Dollar tumbles against yen',
       'Google Drive price cuts signal start of Cloud Storage Wars - What will Dropbox',
       'Gov. Cuomo Unveils Plan To Combat HIV, AIDS',
       'China Credit Gauge Declines as Officials Seek to Tame Debt Boom'],
      dtype=object)

In [9]:
train_queries_df.head()["input_text"].values

array(['Linn Energy (LINE) to Acquire US non-Core Oil, Gas Assets of Devon Energy',
       'Linn Energy (LINE) to Acquire US non-Core Oil, Gas Assets of Devon Energy',
       'Western Digital My Cloud EX2 Review: Cloud Storage Simplicity',
       'Western Digital My Cloud EX2 Review: Cloud Storage Simplicity',
       'China central bank to get tough on shadow financing'],
      dtype=object)