In [1]:
!pip install xgboost --quiet
!pip install keras --quiet

In [2]:
import os
import glob
import pandas as pd
import numpy as np
import operator 
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler 
from sklearn import metrics
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

import gensim
import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
from itertools import combinations
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer

In [3]:
def check_missing_vals(dframe) -> list:
    return sorted([(c, dframe[c].dtype, dframe[c].isnull().sum()) 
                        for c in dframe.columns if dframe[c].isnull().sum() > 0], 
                      key=lambda x: x[1], reverse=True
                )

def metric_report(yTest, yPred):
    print(f'accuracy: {accuracy_score(yTest, yPred)}')
    print('')
    print("Classification report: \n\n", metrics.classification_report(yTest, yPred))
    print('')
    confusion_matrix = metrics.confusion_matrix(yTest, yPred)
    print("Confusion matrix: \n\n", confusion_matrix)

    
def clean_txt(txtCol: pd.Series) -> pd.Series:
    sw = set(stopwords.words('English'))

    text = txtCol.apply(gensim.utils.simple_preprocess, min_len=3)
    text = text.apply(lambda s: [w for w in s if w not in sw]) 
    text = text.apply(lambda s: [SnowballStemmer("english", ignore_stopwords=True).stem(w) for w in s])
    text = text.apply(lambda s: ['_'.join(x) for x in nltk.bigrams(s)] + s)
    
    return text

In [4]:
def get_data(path, exclude) -> pd.DataFrame:
    """
    get all txt data, put in list of dicts and return a dataframe
    """
    data = []

    for file in os.listdir(path):
        if file not in (exclude):
            full_filename = os.path.join(path, file)
            for news in os.listdir(full_filename):
                with open(os.path.join(full_filename, news), 'rb') as txt_file:
                    data.append({'NewsText': txt_file.read(), 'NewsType': file})

    return pd.DataFrame(data)

import pickle

def picklefy(txtCol: pd.DataFrame) -> pd.DataFrame:
    if [f for f in os.listdir('.') if f.endswith('p')]:
        return pickle.load(open('tfidf.p','rb'))
    return pickle.dump(clean_txt(txtCol['NewsText']), open('tfidf.p', 'wb')) 

In [5]:
# preemtively drop duplicates
df = get_data('bbc/', 'README.TXT').drop_duplicates(); df.head(2)

Unnamed: 0,NewsText,NewsType
0,b'Musicians to tackle US red tape\n\nMusicians...,entertainment
1,"b'U2\'s desire to be number one\n\nU2, who hav...",entertainment


In [6]:
df.describe()

Unnamed: 0,NewsText,NewsType
count,2127,2127
unique,2127,5
top,b'Musicians to tackle US red tape\n\nMusicians...,sport
freq,1,505


In [7]:
# vectorize
tfidf = TfidfVectorizer(sublinear_tf=True, 
                        min_df=5, norm='l2', 
                        encoding='latin-1', 
                        #ngram_range=(1, 2),
                        stop_words='english')

In [8]:
cleanTxtCol = picklefy(df).apply(lambda x: ' '.join(i for i in x))

features = tfidf.fit_transform(cleanTxtCol).toarray() 
labels = df.NewsType 

In [32]:
from sklearn.preprocessing import LabelEncoder

LE = LabelEncoder()

X = features
y = LE.fit_transform(labels)

# train/test/split
X_train, X_test, y_train, y_test = train_test_split(
                                            X,y, test_size=0.2
                                    )

In [42]:
le_name_mapping = dict(zip(LE.classes_, LE.transform(LE.classes_)))
print(le_name_mapping)

{'business': 0, 'entertainment': 1, 'politics': 2, 'sport': 3, 'tech': 4}


In [10]:
# decision tree

from sklearn import tree

clf = tree.DecisionTreeClassifier()

In [11]:
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
metric_report(y_test, y_pred)

accuracy: 0.8215962441314554

Classification report: 

               precision    recall  f1-score   support

           0       0.77      0.78      0.77        90
           1       0.79      0.74      0.76        72
           2       0.83      0.90      0.86        84
           3       0.93      0.87      0.90       108
           4       0.76      0.79      0.78        72

    accuracy                           0.82       426
   macro avg       0.82      0.82      0.81       426
weighted avg       0.82      0.82      0.82       426


Confusion matrix: 

 [[70  5  9  1  5]
 [ 8 53  2  1  8]
 [ 2  0 76  2  4]
 [ 6  3  4 94  1]
 [ 5  6  1  3 57]]


In [12]:
import xgboost as xgb
from xgboost import XGBClassifier
xg_clf = XGBClassifier(random_state=42, use_label_encoder=False)

In [13]:
xgb_clf = xg_clf.fit(X_train, y_train)

y_pred = xgb_clf.predict(X_test)
metric_report(y_test, y_pred)

accuracy: 0.9530516431924883

Classification report: 

               precision    recall  f1-score   support

           0       0.92      0.94      0.93        90
           1       0.99      0.93      0.96        72
           2       0.93      0.95      0.94        84
           3       0.98      0.99      0.99       108
           4       0.94      0.93      0.94        72

    accuracy                           0.95       426
   macro avg       0.95      0.95      0.95       426
weighted avg       0.95      0.95      0.95       426


Confusion matrix: 

 [[ 85   0   4   0   1]
 [  1  67   2   1   1]
 [  2   0  80   1   1]
 [  0   0   0 107   1]
 [  4   1   0   0  67]]


In [14]:
# check size
print(len(X),len(y))

2127 2127


In [15]:
from scipy import stats
from sklearn.model_selection import RandomizedSearchCV, KFold

In [22]:
from sklearn.metrics import f1_score

In [25]:
import time

clf_xgb = XGBClassifier(objective = 'multi:softmax')
param_dist = {'n_estimators': stats.randint(150, 500),
              'learning_rate': stats.uniform(0.01, 0.07),
              #'subsample': stats.uniform(0.3, 0.7),
              'max_depth': [3, 4, 5, 6, 7, 8, 9],
              #'colsample_bytree': stats.uniform(0.5, 0.45),
              #'min_child_weight': [1, 2, 3]
             }

clf = RandomizedSearchCV(clf_xgb, 
                         param_distributions = param_dist,
                         n_iter = 1, 
                         scoring = 'f1', 
                         error_score = 0, 
                         verbose = 3, 
                         n_jobs = -1)

folds = KFold(n_splits = 3, shuffle = True)

n = 1
estimators = []
results = np.zeros(len(X_train))
score = 0.0
for train_index, test_index in folds.split(X_train):
    print(n)
    x_train, X_test = X[train_index,:], X[test_index,:]
    y_train, y_test = y[train_index], y[test_index]
    clf.fit(X_train, y_train)

    estimators.append(clf.best_estimator_)
    results[test_index] = clf.predict(X_test)
    score += f1_score(y_test, results[test_index], average='macro')
    print(n, time.perf_counter())
    n+=1
score /= numFolds

1
Fitting 5 folds for each of 1 candidates, totalling 5 fits




1 7457.620751742
2
Fitting 5 folds for each of 1 candidates, totalling 5 fits




2 8087.104672213
3
Fitting 5 folds for each of 1 candidates, totalling 5 fits




3 9024.106877413


In [27]:
clf.best_params_

{'max_depth': 6}

In [28]:
xg_clf = XGBClassifier(random_state=42, use_label_encoder=False, **clf.best_params_)

In [29]:
xg_clf

XGBClassifier(base_score=None, booster=None, colsample_bylevel=None,
              colsample_bynode=None, colsample_bytree=None,
              enable_categorical=False, gamma=None, gpu_id=None,
              importance_type=None, interaction_constraints=None,
              learning_rate=None, max_delta_step=None, max_depth=6,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=None, num_parallel_tree=None,
              predictor=None, random_state=42, reg_alpha=None, reg_lambda=None,
              scale_pos_weight=None, subsample=None, tree_method=None,
              use_label_encoder=False, validate_parameters=None,
              verbosity=None)

In [33]:
xgb_clf = xg_clf.fit(X_train, y_train)

y_pred = xgb_clf.predict(X_test)
metric_report(y_test, y_pred)

accuracy: 0.9553990610328639

Classification report: 

               precision    recall  f1-score   support

           0       0.95      0.96      0.96       108
           1       0.93      0.93      0.93        61
           2       0.97      0.92      0.94        91
           3       0.97      0.99      0.98        97
           4       0.94      0.96      0.95        69

    accuracy                           0.96       426
   macro avg       0.95      0.95      0.95       426
weighted avg       0.96      0.96      0.96       426


Confusion matrix: 

 [[104   2   1   0   1]
 [  1  57   2   0   1]
 [  1   2  84   2   2]
 [  1   0   0  96   0]
 [  2   0   0   1  66]]


In [53]:
for row_index, (input, prediction, label) in enumerate(zip (X_test, y_pred, y_test)):
    if label == 4:
        if prediction != label:
            print('Row', row_index, 'has been classified as', get_dict_value(prediction), 'and should be ', get_dict_value(label))

Row 172 has been classified as sport and should be  tech
Row 188 has been classified as business and should be  tech
Row 253 has been classified as business and should be  tech


In [39]:
print(df.iloc[7,:])

NewsText    b'Fockers retain film chart crown\n\nComedy Me...
NewsType                                        entertainment
Name: 7, dtype: object


In [43]:
print(le_name_mapping)

{'business': 0, 'entertainment': 1, 'politics': 2, 'sport': 3, 'tech': 4}


In [48]:
def get_dict_value(index_value):
    return list(le_name_mapping.keys())[list(le_name_mapping.values()).index(index_value)]