# Preprocess

## Load data

In [1]:
import pandas as pd

In [2]:
with open('data/newsCorpora.csv', 'r') as file:
    content = file.readlines()
data = [line.split('\t') for line in content]
df = pd.DataFrame(data=data, columns=['ID', 'TITLE', 'URL', 'PUBLISHER', 'CATEGORY', 'STORY', 'HOSTNAME', 'TIMESTAMP'])
df.set_index('ID', inplace=True, drop=True)
df.head()

Unnamed: 0_level_0,TITLE,URL,PUBLISHER,CATEGORY,STORY,HOSTNAME,TIMESTAMP
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,"Fed official says weak data caused by weather,...",http://www.latimes.com/business/money/la-fi-mo...,Los Angeles Times,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.latimes.com,1394470370698\n
2,Fed's Charles Plosser sees high bar for change...,http://www.livemint.com/Politics/H2EvwJSK2VE6O...,Livemint,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.livemint.com,1394470371207\n
3,US open: Stocks fall after Fed official hints ...,http://www.ifamagazine.com/news/us-open-stocks...,IFA Magazine,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.ifamagazine.com,1394470371550\n
4,"Fed risks falling 'behind the curve', Charles ...",http://www.ifamagazine.com/news/fed-risks-fall...,IFA Magazine,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.ifamagazine.com,1394470371793\n
5,Fed's Plosser: Nasty Weather Has Curbed Job Gr...,http://www.moneynews.com/Economy/federal-reser...,Moneynews,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.moneynews.com,1394470372027\n


In [3]:
new_df = df[df['PUBLISHER'].apply(lambda row: True if row in ["Reuters", "Huffington Post", "Businessweek", "Contactmusic.com", "Daily Mail"] else False)]

## Check dup

In [4]:
new_df.describe()

Unnamed: 0,TITLE,URL,PUBLISHER,CATEGORY,STORY,HOSTNAME,TIMESTAMP
count,13356,13356,13356,13356,13356,13356,13356
unique,13104,13356,5,4,4579,6,13356
top,Deals of the day- Mergers and acquisitions,http://in.reuters.com/article/2014/03/10/eu-ba...,Reuters,b,d_yCfTJxDUFGs_MQrL1DnBRuBd_eM,in.reuters.com,1394470501755\n
freq,7,1,3904,5627,31,2558,1


In [5]:
# remove duplicate
new_df = new_df.drop_duplicates(subset=['TITLE', 'CATEGORY'])
new_df.describe()

Unnamed: 0,TITLE,URL,PUBLISHER,CATEGORY,STORY,HOSTNAME,TIMESTAMP
count,13113,13113,13113,13113,13113,13113,13113
unique,13104,13113,5,4,4527,6,13113
top,From IPO To Rental Boyfriends: Everything You ...,http://in.reuters.com/article/2014/03/10/eu-ba...,Reuters,b,d_yCfTJxDUFGs_MQrL1DnBRuBd_eM,in.reuters.com,1394470501755\n
freq,2,1,3795,5523,30,2502,1


In [6]:
for record in new_df[new_df.duplicated(subset=['TITLE'])][['TITLE', 'CATEGORY']].to_dict(orient='records'):
    print(f'{record["CATEGORY"]} :: {record["TITLE"]}')

t :: From IPO To Rental Boyfriends: Everything You Need To Know About China's  ...
t :: Google Will Deliver All The Costco Groceries You Want For $5
t :: Sunday Roundup
e :: UPDATE 1-New York Times publisher denies sexism, calls Abramson bad  ...
e :: Jill Abramson Backs Out Of Brandeis Commencement Ceremony, Will Still  ...
m :: Angelina Jolie Effect: Doctors warn over worrying rise in double mastectomies
e :: US STOCKS-Futures drop as Iraq turmoil continues
t :: Same engineer designed switches on 5.95 million recalled GM cars
t :: UPDATE 2-Twitter names former Goldman executive Noto as CFO


In [7]:
new_df[df['TITLE'] == "Jill Abramson Backs Out Of Brandeis Commencement Ceremony, Will Still  ..."][['TITLE', 'CATEGORY']].to_dict(orient='records')

  new_df[df['TITLE'] == "Jill Abramson Backs Out Of Brandeis Commencement Ceremony, Will Still  ..."][['TITLE', 'CATEGORY']].to_dict(orient='records')


[{'TITLE': 'Jill Abramson Backs Out Of Brandeis Commencement Ceremony, Will Still  ...',
  'CATEGORY': 'b'},
 {'TITLE': 'Jill Abramson Backs Out Of Brandeis Commencement Ceremony, Will Still  ...',
  'CATEGORY': 'e'}]

In [8]:
remove_list = [
    {'TITLE': "From IPO To Rental Boyfriends: Everything You Need To Know About China's  ...", 'CATEGORY': 't'},
    {'TITLE': 'Google Will Deliver All The Costco Groceries You Want For $5', 'CATEGORY': 't'},
    {'TITLE': 'Sunday Roundup', 'CATEGORY': 't'},
    {'TITLE': 'UPDATE 1-New York Times publisher denies sexism, calls Abramson bad  ...', 'CATEGORY': 'b'},
    {'TITLE': 'Jill Abramson Backs Out Of Brandeis Commencement Ceremony, Will Still  ...', 'CATEGORY': 'b'},
    {'TITLE': 'Angelina Jolie Effect: Doctors warn over worrying rise in double mastectomies', 'CATEGORY': 'e'},
    {'TITLE': 'US STOCKS-Futures drop as Iraq turmoil continues', 'CATEGORY': 'e'},
    {'TITLE': 'Same engineer designed switches on 5.95 million recalled GM cars', 'CATEGORY': 'b'},
    {'TITLE': 'UPDATE 2-Twitter names former Goldman executive Noto as CFO', 'CATEGORY': 't'}
]

def check(row: pd.Series) -> bool:
    for record in remove_list:
        if (record['TITLE'] == row['TITLE']) and (record['CATEGORY'] == row['CATEGORY']):
            return False
    return True

In [9]:
new_df = new_df[new_df.apply(lambda row: check(row), axis=1)]
new_df.describe()

Unnamed: 0,TITLE,URL,PUBLISHER,CATEGORY,STORY,HOSTNAME,TIMESTAMP
count,13104,13104,13104,13104,13104,13104,13104
unique,13104,13104,5,4,4526,6,13104
top,Europe reaches crunch point on banking union,http://in.reuters.com/article/2014/03/10/eu-ba...,Reuters,b,d_yCfTJxDUFGs_MQrL1DnBRuBd_eM,in.reuters.com,1394470501755\n
freq,1,1,3791,5520,30,2501,1


## Train, Val, Test Split

In [10]:
from sklearn.model_selection import train_test_split

# stratify sampling
train_df, test_df = train_test_split(new_df, test_size=0.1, random_state=42, stratify=new_df[['CATEGORY']])
train_df, val_df = train_test_split(train_df, test_size=0.1, random_state=42, stratify=train_df[['CATEGORY']])

In [11]:
# new_df = new_df.sample(frac=1, random_state=42)[['TITLE', 'CATEGORY']]
# df_len = len(new_df)
# train_df, val_df, test_df = new_df[:int(0.8*df_len)], new_df[int(0.8*df_len):int(0.9*df_len)], new_df[int(0.9*df_len):]

## Save

In [12]:
category_mapping = {'b': 'business', 't': 'science_and_technology', 'e': 'entertainment', 'm': 'health'}

In [13]:
def write(df: pd.DataFrame, path: str) -> None:
    records = df.to_dict(orient='records')
    with open(path, "w") as file:
        for record in records:
            file.write("\t".join([record['TITLE'], category_mapping[record['CATEGORY']]]) + '\n')

write(train_df, 'data/train.txt')
write(val_df, 'data/valid.txt')
write(test_df, 'data/test.txt')