# Text Classification
Data comes from BBC articles which are available to download from Kaggle [here](https://www.kaggle.com/shivamkushwaha/bbc-full-text-document-classification).

In [28]:
import os
from collections import defaultdict
from pathlib import Path
import pandas as pd
import numpy as np

# display multiple outputs in same cell
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [45]:
path_folder = '../data/bbc_classification/bbc'

# list of folders for each BBC article
list_folders = os.listdir(path = path_folder)
list_folders = [e for e in list_folders if e not in {'.DS_Store', 'README.TXT'}]

# nested list of files within each folder
list_files = []
for folder in list_folders:
    files = os.listdir(path = path_folder + '/' + folder + '/')
    list_files.append(files)
    
# intialise dictionary with keys from list_folders an values from list_files
dict_files = dict(zip(list_folders, list_files))

Now have a dictionary, `dict_files` whose:
 - **Keys**: The folder name the data files belong to
 - **Values**: The data files within each of the sub-folders

What we want to do now is load in each text file as a *single-entry* and assign it the `key` from `dict_files` as the classification. This enables us to create a labelled dataset.

In [46]:
results = defaultdict(list)
df_bbc = pd.DataFrame(columns = ['category', 'article_text'])
for folder in list_folders:
    path_dir = path_folder + '/' + folder
    print(path_dir)
    for file in Path(path_dir).iterdir():
        with open(file, 'rb') as file_open:
            results['category'] = folder
            results['article_text'].append(file_open.read())
    df = pd.DataFrame(results)
    df_bbc = df_bbc.append(df)

../data/bbc_classification/bbc/entertainment
../data/bbc_classification/bbc/business
../data/bbc_classification/bbc/sport
../data/bbc_classification/bbc/politics
../data/bbc_classification/bbc/tech


In [47]:
df_bbc[df_bbc['category'] == 'entertainment']

Unnamed: 0,category,article_text
0,entertainment,b'Musicians to tackle US red tape\n\nMusicians...
1,entertainment,"b'U2\'s desire to be number one\n\nU2, who hav..."
2,entertainment,b'Rocker Doherty in on-stage fight\n\nRock sin...
3,entertainment,"b""Snicket tops US box office chart\n\nThe film..."
4,entertainment,b'Ocean\'s Twelve raids box office\n\nOcean\'s...
...,...,...
381,entertainment,b'Bookmakers back Aviator for Oscar\n\nThe Avi...
382,entertainment,b'Scissor Sisters triumph at Brits\n\nUS band ...
383,entertainment,b'Spears seeks aborted tour payment\n\nSinger ...
384,entertainment,b'Angels \'favourite funeral song\'\n\nAngels ...
