Dataset
==

In [1]:
%matplotlib inline
%reload_ext autoreload
%autoreload 2

Unable to revert mtime: /Library/Fonts


In [29]:
#load needed packages
import pandas as pd
import random 
import os
from dataclasses import dataclass, field
from typing import List

In [3]:
#make directory for data 
# DATA_PATH = 'data/'
# os.makedirs(DATA_PATH, exist_ok=True)
# RAW_DATA = f'{DATA_PATH}raw_data/'
# os.makedirs(RAW_DATA, exist_ok=True)

## 1. Loading Dataset


### 1.1. Download Dataset
First we download and extract the gigaword dataset (~3M) [here](https://drive.google.com/file/d/0B6N7tANPyVeBNmlSX19Ld2xDU1E/view?usp=sharing)

Meantime using the BBC dataset (7M) here: https://www.kaggle.com/pariza/bbc-news-summary/download

Data downloaded and stored under `data/BBC News Summary`

In [4]:
#download data (CNN)
# !curl --header "Host: doc-0c-3o-docs.googleusercontent.com" --header "User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36" --header "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8" --header "Accept-Language: en-US,en;q=0.9" --header "Cookie: AUTH_s4a6oitvorbtivfcjm2pefm907l1ntir=08690680304265769485|1531648800000|pm1jthdhng09cikkb0pkdcqlod4d76p8" --header "Connection: keep-alive" "https://doc-0c-3o-docs.googleusercontent.com/docs/securesc/d1n9duui70mcvt9ph3953bv4foh1d3fm/pb1h3k6beg14nfv16poorm9mr6bl90e3/1531656000000/03129501499031348422/08690680304265769485/0B6N7tANPyVeBNmlSX19Ld2xDU1E?e=download" -o "summary.tar.gz" -L



In [5]:
#extract files 
# !tar -xzf summary.tar.gz -C {RAW_DATA} && mv summary.tar.gz {RAW_DATA}
# !gunzip {RAW_DATA}sumdata/train/*.*.txt.gz 

### 1.2. Import Datasets

In [40]:
@dataclass()
class BBCNewsDataReader:
    base_folder: str
    exclusion: list = field(default_factory=list)
    
    @property
    def news_articles_folder(self):
        return self.base_folder + '/News Articles'
    
    @property
    def summaries_folder(self):
        return self.base_folder + '/Summaries'
    
    @property
    def categories(self):
        exclusion_folders = lambda x: x not in [".DS_Store"] + self.exclusion
        return filter(exclusion_folders, os.listdir(self.news_articles_folder))
    
    def to_df(self):
        df = pd.DataFrame(columns=['article', 'summary', 'category', 'filename'])
        for article_folder, summary_folder in self.__category_folders():
            category = article_folder.split('/')[-1]
            for filename in os.listdir(article_folder):
                article = self.__read_file(f'{article_folder}/{filename}')
                summary = self.__read_file(f'{summary_folder}/{filename}')
                df = df.append({'article': article, 'summary': summary, 'category': category, 'filename': filename}, ignore_index=True)
        return df
                
            
    def __category_folders(self):
        return [
            (f'{self.news_articles_folder}/{category}', f'{self.summaries_folder}/{category}') for category in self.categories
        ]
    
    
    def __read_file(self, filepath):
        with open(filepath) as file:
            return file.read()


In [41]:
data = BBCNewsDataReader(
        base_folder='data/BBC News Summary',
        exclusion=['entertainment', 'tech', 'sport', 'politics'] # remove these to read all data
    ).to_df()

In [42]:
data.head()

Unnamed: 0,article,summary,category,filename
0,UK economy facing 'major risks'\n\nThe UK manu...,"""Despite some positive news for the export sec...",business,289.txt
1,Aids and climate top Davos agenda\n\nClimate c...,"At the same time, about 100,000 people are exp...",business,504.txt
2,Asian quake hits European shares\n\nShares in ...,The unfolding scale of the disaster in south A...,business,262.txt
3,India power shares jump on debut\n\nShares in ...,"Shares in India's largest power producer, Nati...",business,276.txt
4,Lacroix label bought by US firm\n\nLuxury good...,LVMH said the French designer's haute couture ...,business,510.txt


In [None]:
# Add Train, Test and Validation Split

In [10]:
#join validation sentence pairs together into dataframe
val = pd.concat([pd.read_csv(f'{RAW_DATA}sumdata/train/valid.article.filter.txt', sep="\n"), 
                  pd.read_csv(f'{RAW_DATA}sumdata/train/valid.title.filter.txt', sep="\n")], axis=1)
val.columns = ["article", "title"]


In [10]:
#join validation sentence pairs together into dataframe
# val = pd.concat([pd.read_csv(f'{RAW_DATA}sumdata/train/valid.article.filter.txt', sep="\n"), 
#                   pd.read_csv(f'{RAW_DATA}sumdata/train/valid.title.filter.txt', sep="\n")], axis=1)
# val.columns = ["article", "title"]


In [13]:
len(data), len(val)

(3803956, 189650)

## 2. Preprocessing Dataset

### 2.1. Looking at the data

In [26]:
for i in range(5):
    r = random.randint(0,50)
    display(data.iloc[r]['article'])
    display(data.iloc[r]['title'])


'new zealand share prices closed #.## percent higher monday in subdued trading ahead of a us holiday , dealers said .'

'new zealand stocks close #.## percent higher'

"kenyan police have a mounted a ##-hour patrol in a village where us presidential candidate barack obama 's grandmother lives after robbers made a botched attempt to rob her solar panel , an official said ."

"kenyan police offer obama 's grandmother security after robbery"

'an israeli military helicopter with a two-man crew crashed near the northern town of afula and burst into flames on wednesday , army radio said .'

'military helicopter crashes in israel'

"bolivian president evo morales on wednesday declared the us ambassador to la paz `` persona non grata , '' accusing the envoy of encouraging the breakup of the country by promoting separatism ."

'bolivia president orders us envoy expelled'

'the algerian cabinet chaired by president abdelaziz bouteflika on sunday adopted the #### finance bill predicated on an oil price of ## dollars a barrel and a growth rate of #.# percent , it was announced here .'

'algeria adopts #### finance bill with oil put at ## dollars a barrel'

### Save data in tabular format

In [11]:
PROCESSED_DATA_PATH = f'{DATA_PATH}processed_data/'
os.makedirs(PROCESSED_DATA_PATH, exist_ok=True)

In [12]:
#save train, val datasets 
data.to_csv(f'{PROCESSED_DATA_PATH}train_ds.csv', index=None)
val.to_csv(f'{PROCESSED_DATA_PATH}valid_ds.csv', index=None)

### Create Sample Data

In [5]:
SAMPLE_DATA_PATH = f'{DATA_PATH}sample_data/'
os.makedirs(SAMPLE_DATA_PATH, exist_ok=True)

In [6]:
sample_train = data.sample(80000)
sample_val = val.sample(20000)

In [7]:
len(sample_train), len(sample_val)

(80000, 20000)

In [8]:
#save sample train, val, and test datasets 
sample_train.to_csv(f'{SAMPLE_DATA_PATH}train_ds.csv', index=None)
sample_val.to_csv(f'{SAMPLE_DATA_PATH}valid_ds.csv', index=None)

### Create Smaller Sample Data

In [9]:
sample_train_ = data.sample(64)
sample_val_ = val.sample(16)

In [10]:
len(sample_train_), len(sample_val_)

(64, 16)

In [11]:
#save sample train, val, and test datasets 
sample_train_.to_csv(f'{SAMPLE_DATA_PATH}train_ds_.csv', index=None)
sample_val_.to_csv(f'{SAMPLE_DATA_PATH}valid_ds_.csv', index=None)

In [2]:
import pandas as pd
DATA_PATH = 'data/'
PROCESSED_DATA_PATH = f'{DATA_PATH}processed_data/'
data = pd.read_csv(f'{PROCESSED_DATA_PATH}train_ds.csv')
val = pd.read_csv(f'{PROCESSED_DATA_PATH}valid_ds.csv')
