In [1]:
!pip install xgboost --quiet
!pip install keras --quiet

In [2]:
import os
import glob
import pandas as pd
import numpy as np
import operator 
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler 
from sklearn import metrics
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

import gensim
import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
from itertools import combinations
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer

In [3]:
def check_missing_vals(dframe) -> list:
    return sorted([(c, dframe[c].dtype, dframe[c].isnull().sum()) 
                        for c in dframe.columns if dframe[c].isnull().sum() > 0], 
                      key=lambda x: x[1], reverse=True
                )

def metric_report(yTest, yPred):
    print(f'accuracy: {accuracy_score(yTest, yPred)}')
    print('')
    print("Classification report: \n\n", metrics.classification_report(yTest, yPred))
    print('')
    confusion_matrix = metrics.confusion_matrix(yTest, yPred)
    print("Confusion matrix: \n\n", confusion_matrix)

    
def clean_txt(txtCol: pd.Series) -> pd.Series:
    sw = set(stopwords.words('English'))

    text = txtCol.apply(gensim.utils.simple_preprocess, min_len=3)
    text = text.apply(lambda s: [w for w in s if w not in sw]) 
    text = text.apply(lambda s: [SnowballStemmer("english", ignore_stopwords=True).stem(w) for w in s])
    text = text.apply(lambda s: ['_'.join(x) for x in nltk.bigrams(s)] + s)
    
    return text

In [4]:
def get_data(path, exclude) -> pd.DataFrame:
    """
    get all txt data, put in list of dicts and return a dataframe
    """
    data = []

    for file in os.listdir(path):
        if file not in (exclude):
            full_filename = os.path.join(path, file)
            for news in os.listdir(full_filename):
                with open(os.path.join(full_filename, news), 'rb') as txt_file:
                    data.append({'NewsText': txt_file.read(), 'NewsType': file})

    return pd.DataFrame(data)

import pickle

def picklefy(txtCol: pd.DataFrame) -> pd.DataFrame:
    if [f for f in os.listdir('.') if f.endswith('p')]:
        return pickle.load(open('tfidf.p','rb'))
    return pickle.dump(clean_txt(txtCol['NewsText']), open('tfidf.p', 'wb')) 

In [5]:
# preemtively drop duplicates
df = get_data('bbc/', 'README.TXT').drop_duplicates(); df.head(2)

Unnamed: 0,NewsText,NewsType
0,b'Ad sales boost Time Warner profit\r\n\r\nQua...,business
1,b'Dollar gains on Greenspan speech\r\n\r\nThe ...,business


In [7]:
df.head(2)

Unnamed: 0,NewsText,NewsType
0,b'Ad sales boost Time Warner profit\r\n\r\nQua...,business
1,b'Dollar gains on Greenspan speech\r\n\r\nThe ...,business


In [8]:
# vectorize
tfidf = TfidfVectorizer(sublinear_tf=True, 
                        min_df=5, norm='l2', 
                        encoding='latin-1', 
                        #ngram_range=(1, 2),
                        stop_words='english')

In [14]:
#cleanTxtCol = picklefy(df).apply(lambda x: ' '.join(i for i in x))

features = tfidf.fit_transform(df.NewsText).toarray() 
labels = df.NewsType 

In [16]:
from sklearn.preprocessing import LabelEncoder

LE = LabelEncoder()

X = features
y = LE.fit_transform(labels)

# train/test/split
X_train, X_test, y_train, y_test = train_test_split(
                                            X,y, test_size=0.2
                                    )

In [17]:
# RandomForest
from sklearn.ensemble import RandomForestClassifier
clf=RandomForestClassifier(max_leaf_nodes=7, n_estimators=50, random_state=1)

In [18]:
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
metric_report(y_test, y_pred)

accuracy: 0.8873239436619719

Classification report: 

               precision    recall  f1-score   support

           0       0.82      0.99      0.90       104
           1       0.98      0.72      0.83        78
           2       0.99      0.89      0.94        82
           3       0.82      0.98      0.89        99
           4       0.96      0.78      0.86        63

    accuracy                           0.89       426
   macro avg       0.91      0.87      0.88       426
weighted avg       0.90      0.89      0.89       426


Confusion matrix: 

 [[103   0   0   1   0]
 [  8  56   1  11   2]
 [  8   0  73   1   0]
 [  2   0   0  97   0]
 [  4   1   0   9  49]]


In [22]:
from sklearn.model_selection import GridSearchCV
params = { 
              'criterion':['gini','entropy'],
              #'max_features':['auto', 'sqrt'],
              #'max_depth': np.arange(3,15),
              #'max_leaf_nodes': np.arange(3, 15),
        }
rfc_f = RandomForestClassifier()
gs_rfc_f = GridSearchCV(clf, params , cv=5)
gs_rfc_f.fit(X_train, y_train)
gs_rfc_f.best_params_

{'criterion': 'gini'}

In [24]:
rfc_f = RandomForestClassifier(n_estimators=100,**gs_rfc_f.best_params_)

In [25]:
rfc_f.fit(X_train, y_train)
y_pred = rfc_f.predict(X_test)
metric_report(y_test, y_pred)

accuracy: 0.9507042253521126

Classification report: 

               precision    recall  f1-score   support

           0       0.93      0.96      0.95       104
           1       0.99      0.91      0.95        78
           2       0.92      0.96      0.94        82
           3       0.96      0.98      0.97        99
           4       0.97      0.92      0.94        63

    accuracy                           0.95       426
   macro avg       0.95      0.95      0.95       426
weighted avg       0.95      0.95      0.95       426


Confusion matrix: 

 [[100   0   3   1   0]
 [  2  71   2   2   1]
 [  2   0  79   0   1]
 [  1   0   1  97   0]
 [  2   1   1   1  58]]
