## Load Data

In [11]:
import gensim
import nltk
import numpy as np
import pandas as pd
import re
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from sklearn.decomposition import NMF
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import Counter
import seaborn as sns
import matplotlib.pyplot as plt

## Read the BBC News train and test CSV files from the kaggle_data directory
## (train first, then test) and show the training frame.
bbc_train, bbc_test = map(
    pd.read_csv,
    ('kaggle_data/BBC News Train.csv', 'kaggle_data/BBC News Test.csv'),
)
display(bbc_train)

Unnamed: 0,ArticleId,Text,Category
0,1833,worldcom ex-boss launches defence lawyers defe...,business
1,154,german business confidence slides german busin...,business
2,1101,bbc poll indicates economic gloom citizens in ...,business
3,1976,lifestyle governs mobile choice faster bett...,tech
4,917,enron bosses in $168m payout eighteen former e...,business
...,...,...,...
1485,857,double eviction from big brother model caprice...,entertainment
1486,325,dj double act revamp chart show dj duo jk and ...,entertainment
1487,1590,weak dollar hits reuters revenues at media gro...,business
1488,1587,apple ipod family expands market apple has exp...,tech


## Data Cleaning

In [2]:
## The BBC news text is already in lower case.
## As a first cleaning step, strip every character except lower-case letters and hyphens from each token,
## then keep only tokens that are longer than one character and are not English stop words.
match_non_alpha = re.compile(r'[^-a-z]+')

def clean_words(text):
    """Return `text` reduced to its cleaned word tokens, joined by single spaces.

    Each whitespace-separated token has every character other than a-z and '-'
    removed (so digits, punctuation and any upper-case letters are dropped).
    A token is kept only if what remains is longer than one character and is
    not in scikit-learn's ENGLISH_STOP_WORDS.

    Parameters
    ----------
    text : str
        Raw article text.

    Returns
    -------
    str
        The surviving tokens in their original order, separated by one space
        (an empty string if nothing survives).
    """
    cleaned = []
    ## split() with no argument breaks on any run of whitespace. Splitting on ' ' alone
    ## left tabs/newlines inside a token, where the regex then deleted them and fused
    ## the two neighbouring words into one.
    for token in text.split():
        new_token = match_non_alpha.sub('', token)
        ## Drop empty and single-letter leftovers as well as English stop words
        if len(new_token) > 1 and new_token not in ENGLISH_STOP_WORDS:
            cleaned.append(new_token)
    return ' '.join(cleaned)
        
## function to count words in text
def count_words(text):
    """Return the number of whitespace-separated words in `text`.

    Uses str.split() without a separator so that empty text counts as 0 words
    and runs of consecutive spaces do not produce phantom empty "words"
    (splitting on ' ' counted '' as 1 word and 'a  b' as 3).
    """
    return len(text.split())

## Add a 'CleanedText' column to the train and test frames by running
## clean_words over each article's 'Text', then show both frames.
for frame in (bbc_train, bbc_test):
    frame['CleanedText'] = frame['Text'].apply(clean_words)
display(bbc_train, bbc_test)

Unnamed: 0,ArticleId,Text,Category,CleanedText
0,1833,worldcom ex-boss launches defence lawyers defe...,business,worldcom ex-boss launches defence lawyers defe...
1,154,german business confidence slides german busin...,business,german business confidence slides german busin...
2,1101,bbc poll indicates economic gloom citizens in ...,business,bbc poll indicates economic gloom citizens maj...
3,1976,lifestyle governs mobile choice faster bett...,tech,lifestyle governs mobile choice faster better ...
4,917,enron bosses in $168m payout eighteen former e...,business,enron bosses payout eighteen enron directors a...
...,...,...,...,...
1485,857,double eviction from big brother model caprice...,entertainment,double eviction big brother model caprice holb...
1486,325,dj double act revamp chart show dj duo jk and ...,entertainment,dj double act revamp chart dj duo jk joel taki...
1487,1590,weak dollar hits reuters revenues at media gro...,business,weak dollar hits reuters revenues media group ...
1488,1587,apple ipod family expands market apple has exp...,tech,apple ipod family expands market apple expande...


Unnamed: 0,ArticleId,Text,CleanedText
0,1018,qpr keeper day heads for preston queens park r...,qpr keeper day heads preston queens park range...
1,1319,software watching while you work software that...,software watching work software monitor keystr...
2,1138,d arcy injury adds to ireland woe gordon d arc...,arcy injury adds ireland woe gordon arcy ruled...
3,459,india s reliance family feud heats up the ongo...,india reliance family feud heats ongoing publi...
4,1020,boro suffer morrison injury blow middlesbrough...,boro suffer morrison injury blow middlesbrough...
...,...,...,...
730,1923,eu to probe alitalia state aid the european ...,eu probe alitalia state aid european commissio...
731,373,u2 to play at grammy awards show irish rock ba...,play grammy awards irish rock band play live g...
732,1704,sport betting rules in spotlight a group of mp...,sport betting rules spotlight group mps peers ...
733,206,alfa romeos to get gm engines fiat is to sto...,alfa romeos gm engines fiat stop making six-cy...


## Using TF-IDF to extract features from each document

In [10]:
%%time

## TF-IDF hyperparameters which may be tuned

NUMBER_OF_CATEGORIES = 5  ## We know that we need to infer one of 5 news categories
MAX_FEATURES = 5000  ## Build the vocabulary from only the top max_features terms, ordered by term frequency across the corpus.
MIN_DF = 3  ## Minimum document frequency. Ignore terms that appear in fewer than this number of documents.
MAX_DF = 0.85  ## Maximum document frequency. Ignore terms that appear in more than this proportion of all documents.
NGRAM_RANGE = (1, 2)  ## (min_n, max_n) n-gram sizes: single words and two-word sequences

## Create the tf-idf vectors for all the train and test documents as one corpus
tfidf_vectorizer = TfidfVectorizer(
    min_df=MIN_DF,
    max_df=MAX_DF,
    max_features=MAX_FEATURES,
    ngram_range=NGRAM_RANGE,
)


## Train documents come first, test documents after, so rows can be split back by position below.
## NOTE(review): the vectorizer is fitted on train and test text together, so the vocabulary
## and IDF weights also reflect the test articles.
train_docs = list(bbc_train['CleanedText'])
test_docs = list(bbc_test['CleanedText'])
all_docs = train_docs + test_docs

train_len = len(train_docs)
test_len = len(test_docs)
all_docs_len = len(all_docs)
print(f'# train docs = {train_len}')
print(f'# test docs = {test_len}')
print(f'# all docs = {all_docs_len}')

## Learn the vocabulary / IDF weights and vectorize the corpus in a single pass
tfidf = tfidf_vectorizer.fit_transform(all_docs)

# Save the feature names for later to create topic summaries
tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()

## fit_transform already returned the document-term matrix for all_docs, so reuse it
## instead of vectorizing the same corpus a second time with transform().
document_features = tfidf
print(f'{document_features.shape=}')

## The first train_len rows are the training articles; the remaining rows are the test articles
train_features = document_features[:train_len, :]
test_features = document_features[train_len:, :]
print(f'{train_features.shape=}')
print(f'{test_features.shape=}')




# train docs = 1490
# test docs = 735
# all docs = 2225
document_features.shape=(2225, 5000)
train_features.shape=(1490, 5000)
test_features.shape=(735, 5000)
CPU times: total: 2.02 s
Wall time: 2.02 s


## Random Forest Model

In [29]:
%%time
from sklearn.ensemble import RandomForestClassifier

## Tunable parameters
N_ESTIMATORS = 15000  ## number of trees in the forest
MAX_DEPTH = 15  ## maximum depth allowed for each tree
N_JOBS = -1  ## -1 asks scikit-learn to use all available processors
RANDOM_STATE = None  ## None leaves the forest unseeded (the OOB score then varies between runs); set an int to make a run repeatable

model = RandomForestClassifier(
    n_estimators=N_ESTIMATORS,
    max_depth=MAX_DEPTH,
    oob_score=True,  ## compute the out-of-bag accuracy estimate that is read from model.oob_score_ below
    n_jobs=N_JOBS,
    random_state=RANDOM_STATE,
)

## Fit on the TF-IDF features of the training articles against their category labels
model.fit(train_features, bbc_train['Category'])

display(model.oob_score_)

0.946979865771812

CPU times: total: 1min 26s
Wall time: 29.9 s


In [34]:
%%time
from sklearn.ensemble import RandomForestClassifier

## Tunable parameters
N_ESTIMATORS = 15000  ## number of trees in the forest
MAX_DEPTH = 15  ## maximum depth allowed for each tree
N_JOBS = -1  ## -1 asks scikit-learn to use all available processors
RANDOM_STATE = None  ## None leaves the forest unseeded (the OOB score then varies between runs); set an int to make a run repeatable

model = RandomForestClassifier(
    n_estimators=N_ESTIMATORS,
    max_depth=MAX_DEPTH,
    oob_score=True,  ## compute the out-of-bag accuracy estimate that is read from model.oob_score_ below
    n_jobs=N_JOBS,
    random_state=RANDOM_STATE,
)

## Fit on the TF-IDF features of the training articles against their category labels
model.fit(train_features, bbc_train['Category'])

display(model.oob_score_)

0.9463087248322147

CPU times: total: 1min 28s
Wall time: 30.6 s


In [35]:
%%time
from sklearn.ensemble import RandomForestClassifier

## Tunable parameters
N_ESTIMATORS = 20000  ## number of trees in the forest
MAX_DEPTH = 15  ## maximum depth allowed for each tree
N_JOBS = -1  ## -1 asks scikit-learn to use all available processors
RANDOM_STATE = None  ## None leaves the forest unseeded (the OOB score then varies between runs); set an int to make a run repeatable

model = RandomForestClassifier(
    n_estimators=N_ESTIMATORS,
    max_depth=MAX_DEPTH,
    oob_score=True,  ## compute the out-of-bag accuracy estimate that is read from model.oob_score_ below
    n_jobs=N_JOBS,
    random_state=RANDOM_STATE,
)

## Fit on the TF-IDF features of the training articles against their category labels
model.fit(train_features, bbc_train['Category'])

display(model.oob_score_)

0.9483221476510068

CPU times: total: 1min 55s
Wall time: 40.3 s


In [36]:
%%time
from sklearn.ensemble import RandomForestClassifier

## Tunable parameters
N_ESTIMATORS = 15000  ## number of trees in the forest
MAX_DEPTH = 20  ## maximum depth allowed for each tree
N_JOBS = -1  ## -1 asks scikit-learn to use all available processors
RANDOM_STATE = None  ## None leaves the forest unseeded (the OOB score then varies between runs); set an int to make a run repeatable

model = RandomForestClassifier(
    n_estimators=N_ESTIMATORS,
    max_depth=MAX_DEPTH,
    oob_score=True,  ## compute the out-of-bag accuracy estimate that is read from model.oob_score_ below
    n_jobs=N_JOBS,
    random_state=RANDOM_STATE,
)

## Fit on the TF-IDF features of the training articles against their category labels
model.fit(train_features, bbc_train['Category'])

display(model.oob_score_)

0.9550335570469799

CPU times: total: 1min 45s
Wall time: 34 s


In [39]:
%%time
from sklearn.ensemble import RandomForestClassifier

## Tunable parameters
N_ESTIMATORS = 15000  ## number of trees in the forest
MAX_DEPTH = 20  ## maximum depth allowed for each tree
N_JOBS = -1  ## -1 asks scikit-learn to use all available processors
RANDOM_STATE = None  ## None leaves the forest unseeded (the OOB score then varies between runs); set an int to make a run repeatable

model = RandomForestClassifier(
    n_estimators=N_ESTIMATORS,
    max_depth=MAX_DEPTH,
    oob_score=True,  ## compute the out-of-bag accuracy estimate that is read from model.oob_score_ below
    n_jobs=N_JOBS,
    random_state=RANDOM_STATE,
)

## Fit on the TF-IDF features of the training articles against their category labels
model.fit(train_features, bbc_train['Category'])

display(model.oob_score_)

0.9523489932885906

CPU times: total: 1min 48s
Wall time: 33.8 s


In [37]:
%%time
from sklearn.ensemble import RandomForestClassifier

## Tunable parameters
N_ESTIMATORS = 15000  ## number of trees in the forest
MAX_DEPTH = 20  ## maximum depth allowed for each tree
N_JOBS = -1  ## -1 asks scikit-learn to use all available processors
RANDOM_STATE = None  ## None leaves the forest unseeded (the OOB score then varies between runs); set an int to make a run repeatable

model = RandomForestClassifier(
    n_estimators=N_ESTIMATORS,
    max_depth=MAX_DEPTH,
    oob_score=True,  ## compute the out-of-bag accuracy estimate that is read from model.oob_score_ below
    n_jobs=N_JOBS,
    random_state=RANDOM_STATE,
)

## Fit on the TF-IDF features of the training articles against their category labels
model.fit(train_features, bbc_train['Category'])

display(model.oob_score_)

0.9523489932885906

CPU times: total: 1min 49s
Wall time: 33.8 s


In [38]:
%%time
from sklearn.ensemble import RandomForestClassifier

## Tunable parameters
N_ESTIMATORS = 15000  ## number of trees in the forest
MAX_DEPTH = 25  ## maximum depth allowed for each tree
N_JOBS = -1  ## -1 asks scikit-learn to use all available processors
RANDOM_STATE = None  ## None leaves the forest unseeded (the OOB score then varies between runs); set an int to make a run repeatable

model = RandomForestClassifier(
    n_estimators=N_ESTIMATORS,
    max_depth=MAX_DEPTH,
    oob_score=True,  ## compute the out-of-bag accuracy estimate that is read from model.oob_score_ below
    n_jobs=N_JOBS,
    random_state=RANDOM_STATE,
)

## Fit on the TF-IDF features of the training articles against their category labels
model.fit(train_features, bbc_train['Category'])

display(model.oob_score_)

0.9530201342281879

CPU times: total: 2min 8s
Wall time: 36.3 s


In [30]:
%%time
from sklearn.ensemble import RandomForestClassifier

## Tunable parameters
N_ESTIMATORS = 20000  ## number of trees in the forest
MAX_DEPTH = 20  ## maximum depth allowed for each tree
N_JOBS = -1  ## -1 asks scikit-learn to use all available processors
RANDOM_STATE = None  ## None leaves the forest unseeded (the OOB score then varies between runs); set an int to make a run repeatable

model = RandomForestClassifier(
    n_estimators=N_ESTIMATORS,
    max_depth=MAX_DEPTH,
    oob_score=True,  ## compute the out-of-bag accuracy estimate that is read from model.oob_score_ below
    n_jobs=N_JOBS,
    random_state=RANDOM_STATE,
)

## Fit on the TF-IDF features of the training articles against their category labels
model.fit(train_features, bbc_train['Category'])

display(model.oob_score_)

0.9536912751677852

CPU times: total: 2min 24s
Wall time: 46.1 s


In [33]:
%%time
from sklearn.ensemble import RandomForestClassifier

## Tunable parameters
N_ESTIMATORS = 20000  ## number of trees in the forest
MAX_DEPTH = 20  ## maximum depth allowed for each tree
N_JOBS = -1  ## -1 asks scikit-learn to use all available processors
RANDOM_STATE = None  ## None leaves the forest unseeded (the OOB score then varies between runs); set an int to make a run repeatable

model = RandomForestClassifier(
    n_estimators=N_ESTIMATORS,
    max_depth=MAX_DEPTH,
    oob_score=True,  ## compute the out-of-bag accuracy estimate that is read from model.oob_score_ below
    n_jobs=N_JOBS,
    random_state=RANDOM_STATE,
)

## Fit on the TF-IDF features of the training articles against their category labels
model.fit(train_features, bbc_train['Category'])

display(model.oob_score_)

0.9530201342281879

CPU times: total: 2min 24s
Wall time: 46.2 s


In [31]:
%%time
from sklearn.ensemble import RandomForestClassifier

## Tunable parameters
N_ESTIMATORS = 20000  ## number of trees in the forest
MAX_DEPTH = 25  ## maximum depth allowed for each tree
N_JOBS = -1  ## -1 asks scikit-learn to use all available processors
RANDOM_STATE = None  ## None leaves the forest unseeded (the OOB score then varies between runs); set an int to make a run repeatable

model = RandomForestClassifier(
    n_estimators=N_ESTIMATORS,
    max_depth=MAX_DEPTH,
    oob_score=True,  ## compute the out-of-bag accuracy estimate that is read from model.oob_score_ below
    n_jobs=N_JOBS,
    random_state=RANDOM_STATE,
)

## Fit on the TF-IDF features of the training articles against their category labels
model.fit(train_features, bbc_train['Category'])

display(model.oob_score_)

0.9530201342281879

CPU times: total: 2min 36s
Wall time: 51.6 s


In [32]:
%%time
from sklearn.ensemble import RandomForestClassifier

## Tunable parameters
N_ESTIMATORS = 25000  ## number of trees in the forest
MAX_DEPTH = 20  ## maximum depth allowed for each tree
N_JOBS = -1  ## -1 asks scikit-learn to use all available processors
RANDOM_STATE = None  ## None leaves the forest unseeded (the OOB score then varies between runs); set an int to make a run repeatable

model = RandomForestClassifier(
    n_estimators=N_ESTIMATORS,
    max_depth=MAX_DEPTH,
    oob_score=True,  ## compute the out-of-bag accuracy estimate that is read from model.oob_score_ below
    n_jobs=N_JOBS,
    random_state=RANDOM_STATE,
)

## Fit on the TF-IDF features of the training articles against their category labels
model.fit(train_features, bbc_train['Category'])

display(model.oob_score_)

0.9530201342281879

CPU times: total: 2min 56s
Wall time: 59.6 s


In [49]:
%%time
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
def random_subset_training(
    features, labels,
    n_tries=5, label_subset_fractions=(0.5, 0.2, 0.1, 0.05), n_estimators=20000, max_depth=20,
    random_state=None):
    """Measure random-forest accuracy when only a fraction of the labelled data is used for training.

    For every fraction in `label_subset_fractions`, `n_tries` random splits are drawn with
    `train_test_split(features, labels, train_size=fraction)`. A new RandomForestClassifier
    is fitted on the training part of each split and scored with `accuracy_score` on the
    held-out remainder.

    Parameters
    ----------
    features :
        Feature matrix, passed to `train_test_split` together with `labels`.
    labels :
        Target labels aligned with the rows of `features`.
    n_tries : int
        Number of independent random splits evaluated per fraction.
    label_subset_fractions : iterable of float
        Values passed as `train_size` to `train_test_split`, one group of tries each.
    n_estimators, max_depth :
        Forwarded to RandomForestClassifier.
    random_state : int or None, optional
        None (default) leaves every split and forest unseeded, as before this parameter
        existed. An int makes the whole experiment repeatable while still giving each
        try its own, different split seed and forest seed.

    Returns
    -------
    pandas.DataFrame
        Columns 'label_fraction' and 'accuracy'; one row per (fraction, try) in the
        order they were evaluated. Empty (but with both columns) if nothing was run.

    Notes
    -----
    RandomForestClassifier, train_test_split and accuracy_score are expected to have
    been imported already by the notebook.
    """
    ## One generator for the whole experiment: drawing fresh seeds per try keeps the
    ## tries different from each other even when the experiment itself is seeded.
    seed_source = None if random_state is None else np.random.default_rng(random_state)

    ## Collect plain records and build the frame once at the end, rather than
    ## concatenating many one-row frames onto an empty one.
    records = []
    for fraction in label_subset_fractions:
        for _ in range(n_tries):
            if seed_source is None:
                split_seed = model_seed = None
            else:
                split_seed, model_seed = (
                    int(seed) for seed in seed_source.integers(0, 2**32 - 1, size=2)
                )
            X_train, X_test, y_train, y_test = train_test_split(
                features, labels, train_size=fraction, random_state=split_seed)
            model = RandomForestClassifier(
                n_estimators=n_estimators,
                max_depth=max_depth,
                n_jobs=-1,
                random_state=model_seed,
            )
            model.fit(X_train, y_train)
            score = accuracy_score(y_test, model.predict(X_test))
            records.append({'label_fraction': fraction, 'accuracy': score})
    return pd.DataFrame(records, columns=['label_fraction', 'accuracy'])

## Run the label-fraction experiment with 10 tries per fraction on the training
## features, show the per-try accuracies and save them to CSV.
subset_training = random_subset_training(
    features=train_features,
    labels=bbc_train['Category'],
    n_tries=10,
)
display(subset_training)
subset_training.to_csv('./subset_training.csv')

Unnamed: 0,label_fraction,accuracy
0,0.5,0.950336
1,0.5,0.943624
2,0.5,0.946309
3,0.5,0.938255
4,0.5,0.930201
5,0.5,0.94094
6,0.5,0.939597
7,0.5,0.931544
8,0.5,0.948993
9,0.5,0.947651


CPU times: total: 41min 45s
Wall time: 21min 57s


In [59]:
## Mean accuracy for each label fraction, rounded to three decimals
grouped = (
    subset_training
    .groupby('label_fraction')['accuracy']
    .agg('mean')
    .round(3)
)

In [60]:
## Show the per-fraction mean accuracies, then save them as a two-column
## table (label_fraction, accuracy) without the row index.
display(grouped)
grouped.reset_index().to_csv(
    path_or_buf='data/AccuracyVersusFractionOfLabelsUsed',
    index=False,
)

label_fraction
0.05    0.741
0.10    0.865
0.20    0.930
0.50    0.942
Name: accuracy, dtype: float64

In [63]:
## Reload the saved summary and show it from the largest label fraction to the smallest
df = pd.read_csv('data/AccuracyVersusFractionOfLabelsUsed')
display(df.sort_values('label_fraction', ascending=False))

Unnamed: 0,label_fraction,accuracy
3,0.5,0.942
2,0.2,0.93
1,0.1,0.865
0,0.05,0.741
