In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input tree and print the full path of every file found.
for dirpath, dirnames, filenames in os.walk('/kaggle/input'):
    full_paths = [os.path.join(dirpath, filename) for filename in filenames]
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/learn-ai-bbc/BBC News Train.csv
/kaggle/input/learn-ai-bbc/BBC News Sample Solution.csv
/kaggle/input/learn-ai-bbc/BBC News Test.csv


In [2]:
# Shell cross-check of the Python listing above: print every path (directories
# and files) under /kaggle/input. The '.*' regex matches any path.
!find /kaggle/input -regex '.*'

/kaggle/input
/kaggle/input/learn-ai-bbc
/kaggle/input/learn-ai-bbc/BBC News Train.csv
/kaggle/input/learn-ai-bbc/BBC News Sample Solution.csv
/kaggle/input/learn-ai-bbc/BBC News Test.csv


# Comparing [some] Unsupervised- and Supervised-Classification on BBC News Articles

## Goal
- Answer the question - "Are Matrix Factorization and SVD the same?"
- Use some libraries/frameworks that I am unfamiliar with - Scikit-Learn, NLTK, spaCy.
- Compare at least one unsupervised and at least one supervised algorithm!
- Write a notebook that is concise, contains lots of figures, and allows others to learn from it and build upon it.

## Outline
- EDA
- Unsupervised Classification
    - Libraries: sklearn, nltk, spacy
    - Tokenize -> Lemmatize -> Remove extra words (e.g., stop words) -> NMF
    - Just for fun, add PCA and SVD to compare
- Supervised Classification - Include the labels!!!
    - SVM, RF, LogReg, LSTM, Ensembling, NaiveBayes? 

In [3]:
################################################################################
################################################################################
################################################################################

from sklearn.base import BaseEstimator, TransformerMixin, ClassifierMixin, ClusterMixin
from sklearn.model_selection import train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.metrics import accuracy_score, balanced_accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.decomposition import NMF
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.svm import SVC
from itertools import permutations
import pandas as pd
import numpy as np
import spacy
import os

## TODO: 
## x Create data reading, merging, and dataframe transforming function
## x Create spaCy transformer class for pipeline
## x Create NMF transformer class for pipeline
## x Create classifying step (permute and find the best match) for the pipeline
## x Add score method to the classifier (will be exposed at the end of the pipeline)
## x Add a todo to further test/explore how the feature_out method/attribute work.

## Outline: 
## - USL
##   - NMF categorization
##     - Pipeline := (extract, transform (TFIDF embedding), NMF (factorization), classifying (permute for best mapping))
##     - Pipeline last component (classifier) implement sklearn.metrics.classification_report and sklearn.metrics.confusion_matrix
##     - Further Test/Explore: ??? How the feature_out thing works.
## - SL
##   - LinearSVC vs SGD(hinge-loss), LogisticRegression vs SGD(log_loss) - Interested in looking at the differences in their optimization methods since SGD is just an optimization method.
##   - Pipeline := (same as prior but replace the last classifier)

## Helper functions
def limitlessPandas(func):
    """Wrap ``func`` so it runs with pandas display truncation switched off.

    While the wrapped function executes, ``display.max_colwidth`` and
    ``display.max_rows`` are set to ``None`` (no limit); the previous option
    values are restored when the call finishes, even if it raises.

    Args:
        func (callable): Function to run inside the option context, e.g.
            ``display`` or ``print``.

    Returns:
        callable: A wrapper that forwards all positional and keyword
        arguments to ``func`` and returns whatever ``func`` returns.
    """
    import functools  # Local import so this helper does not depend on the notebook's import cell

    @functools.wraps(func)  # Keep the wrapped function's name and docstring
    def wrapper(*args, **kwargs):
        with pd.option_context("display.max_colwidth", None, "display.max_rows", None):
            # Hand the result back instead of silently discarding it
            return func(*args, **kwargs)
    return wrapper

## Transformers / Classifiers
class Dataloader():
    """Load the BBC News CSVs and expose them as sklearn-ready ``X`` / ``y`` lists.

    The loader hides the CSV layout from the rest of the notebook: it reads
    the training file and, optionally, the test file joined with its label
    file on ``ArticleId``, then stacks everything into a single dataframe.
    It also offers a ``train_test_split()`` method so the split is always
    taken from the same loaded data.

    NOTE(review): the labels for the test articles come from
    "BBC News Sample Solution.csv". If that file is only a submission
    template rather than ground truth (TODO confirm against the dataset
    description), those labels are not real and will distort every accuracy
    computed on the merged data. Pass ``include_test_set=False`` to work with
    the labelled training file only.

    Attributes:
        data_dir (str): Directory the three CSV files are read from.
        include_test_set (bool): Whether the test file (joined with its label
            file) is appended to the training data.
        X (list): List of article texts to be fed into a pipeline.
        y (list): List of category labels, one per entry of ``X``.

    Methods:
        train_test_split(**kwargs): Takes keyword-only arguments accepted by
            sklearn.model_selection.train_test_split()
    """

    # Location and names of the CSV files of the Kaggle dataset.
    DEFAULT_DATA_DIR = '/kaggle/input/learn-ai-bbc'
    TRAIN_FILENAME = 'BBC News Train.csv'
    TEST_FILENAME = 'BBC News Test.csv'
    TEST_LABELS_FILENAME = 'BBC News Sample Solution.csv'

    def __init__(self, data_dir=None, include_test_set=True):
        """Read the CSV files and populate ``X`` and ``y``.

        Args:
            data_dir (str, optional): Directory containing the CSV files.
                Defaults to the Kaggle input directory of this dataset.
            include_test_set (bool): If True (default, the original
                behaviour), append the test articles labelled via the
                solution file. If False, only the training file is used.
        """
        self.data_dir = self.DEFAULT_DATA_DIR if data_dir is None else data_dir
        self.include_test_set = include_test_set
        self._full_dataframe = self._read_csv_and_merge()
        self.X, self.y = self._dataframe_to_X_y(self._full_dataframe)

    def _read_csv_and_merge(self, ):
        """Read the CSVs and merge them into one single dataframe.

        Returns:
            pd.DataFrame: The training rows, followed (when
            ``include_test_set`` is True) by the labelled test rows.
        """
        df = pd.read_csv(os.path.join(self.data_dir, self.TRAIN_FILENAME))
        if not self.include_test_set:
            return df

        df_testSet = pd.read_csv(os.path.join(self.data_dir, self.TEST_FILENAME))
        df_testSetSolution = pd.read_csv(os.path.join(self.data_dir, self.TEST_LABELS_FILENAME))

        ## Database-like join of the test articles with their labels
        selected_columns = ['ArticleId', 'Text', 'Category']
        mergedDf = df_testSet.merge(
            right=df_testSetSolution,
            how='inner',  # Articles without a label row are dropped by the inner join
            on='ArticleId'
        )
        mergedDf = mergedDf[selected_columns]  # Reorder the columns

        ## Stack the training rows on top of the labelled test rows
        return pd.concat([df, mergedDf], axis=0, ignore_index=True)

    def _dataframe_to_X_y(self, dataframe:pd.DataFrame):
        """Convert the dataframe to the X and y lists expected by the sklearn functions."""
        X = dataframe['Text'].to_list()      # Data to a Python list
        y = dataframe['Category'].to_list()  # Labels to a Python list

        return X, y

    def train_test_split(self, **kwargs):
        """Train/test split the loaded data. Takes keyword-only arguments.

        Returns:
            The result of ``sklearn.model_selection.train_test_split`` applied
            to ``self.X`` and ``self.y``, i.e. X_train, X_test, y_train, y_test.
        """
        return train_test_split(self.X, self.y, **kwargs)



class SpaCy_transformer(TransformerMixin, BaseEstimator):
    """Transformer that cleans and lemmatizes documents with a pretrained spaCy model.

    Each document is run through spaCy's "en_core_web_sm" pipeline. Stop
    words, currency symbols, whitespace, punctuation and single-letter
    alphabetic tokens are dropped; every remaining token is replaced by its
    lemma and the lemmas are joined back into one space-separated string.

    The intent is to reduce the number of word variants per document before
    a CountVectorizer and TfidfTransformer are applied. This could be
    detrimental to NMF clustering accuracy, since information is discarded.

    TODO:
        - [ ] Add param to choose whether to lemmatize

    Attributes:
        n_process (int or None): Number of worker processes handed to
            ``nlp.pipe``. ``None`` means "use ``os.cpu_count()``".

    Methods:
        fit(X, y=None): No-op, returns ``self``.
        transform(X): Returns the list of cleaned, lemmatized documents.
    """

    def __init__(self, *, n_process=None, ):
        self.n_process = os.cpu_count() if n_process is None else n_process

    def fit(self, X, y=None):  # Nothing is learned from the data, so `y` is not needed
        """Do nothing and return ``self`` (kept for pipeline compatibility)."""
        return self

    def transform(self, X):
        """Filter out unwanted tokens and reduce each word to its lemma (canonical form).

        Args:
            X (iterable of str): Raw documents.

        Returns:
            list of str: One space-joined string of lemmas per input document.
        """
        # Resolve the worker count here as well: `n_process` can still be None
        # if os.cpu_count() returned None or it was reset through set_params().
        n_process = self._resolve_n_process()

        nlp = spacy.load("en_core_web_sm")  # The trained model used

        doc_list = []
        for doc in nlp.pipe(X, n_process=n_process):
            lemma_list = [token.lemma_ for token in doc if self._keep_token(token)]
            doc_list.append(" ".join(lemma_list))

        return doc_list

    def _resolve_n_process(self):
        """Return a concrete worker count, falling back to the CPU count, then to 1."""
        n_process = self.n_process
        if n_process is None:
            n_process = os.cpu_count()
        return 1 if n_process is None else n_process

    @staticmethod
    def _keep_token(token):
        """Return True when a spaCy token should be kept in the output."""
        # Exclude stop words, currency symbols, whitespace and punctuation
        if token.is_stop or token.is_currency or token.is_space or token.is_punct:
            return False
        # Exclude single letters such as "s" - the dataset seems to have been
        # pre-processed so that the possessive "'s" was left as a lone "s"
        if token.is_alpha and len(token.text) == 1:
            return False
        return True

    ## The TransformerMixin defines fit_transform() and delegates to fit() and transform()
    #def fit_transform(self, X, y=None): 
    #    self.fit(X, y)
    #    return self.transform(X)
    



class Classifier_permute_for_best_mapping(ClassifierMixin, BaseEstimator):
    """Pure NMF classifier: argmax over the NMF features, then permute for the best label mapping.

    The input is expected to be an N x K matrix (e.g. the output of an NMF
    with K components). Each row is assigned the index of its largest
    column. Because those indices carry no label meaning, every permutation
    of the category names is tried against the true labels and the
    permutation giving the highest accuracy is kept as the mapping.

    There are no weights learned in ``fit()``; the only state is the mapping
    found when true labels are passed to ``predict()`` / ``fit_predict()``.

    Attributes:
        categories (list): Category names to permute.
        normalize (bool): Passed to ``accuracy_score``; True gives a fraction,
            False a count of correct predictions.
        is_fitted_ (bool): Set by ``fit()``.
        mapping_permute_results (pd.DataFrame): Every tried mapping with its
            score, best first. Set once labels have been provided.
        best_mapping (dict): Category name -> predicted integer index.
        best_accuracy_score (float): Score of ``best_mapping`` on the data it
            was found on.
    """

    # Category names used when `categories` is None (order fixes the permutation order).
    _DEFAULT_CATEGORIES = ('business', 'tech', 'politics', 'sport', 'entertainment')

    def __init__(self, categories=None, normalize=None):
        self.categories = list(self._DEFAULT_CATEGORIES) if categories is None else categories
        self.normalize = True if normalize is None else normalize
        self._is_predictted = False  # Private flag: has predict() been called at least once?
        #self.best_mapping = dict()
        #self.mapping_permute_results = pd.DataFrame()
        #self.best_accuracy_score = np.float32()

    def fit(self, X, y=None):
        """Mark the estimator as fitted; nothing is learned here.

        Returns:
            self, following the sklearn estimator convention.
        """
        self.is_fitted_ = True
        return self

    def predict(self, X, y=None, **kwargs):
        """Return, for each row of an N x K matrix, the index of its largest column.

        Args:
            X (array-like of shape (N, K)): Per-category scores of each sample.
            y (sequence of str, optional): True labels. When given, the best
                label mapping is searched and stored on the estimator.
            **kwargs: Forwarded to ``_find_best_mapping`` (``categories``,
                ``normalize``).

        Returns:
            np.ndarray of shape (N,): Integer category index per sample.
        """
        X = np.asarray(X)
        assert X.ndim == 2, 'The training vector has to have 2-dimensions, thus shape of NxD.'
        y_pred = np.argmax(X, axis=1)

        ## Having ran fit() then predict() is the same as running fit_predict()
        self._is_predictted = True

        ## Search for the best mapping only when the true labels are provided.
        ## `is not None` (rather than `!= None`) also works for array-like labels.
        if y is not None:
            self._find_best_mapping(y_true=y, y_pred=y_pred, **kwargs)
        return y_pred

    def fit_predict(self, X, y=None, **kwargs):
        """Run ``fit(X)`` followed by ``predict(X, y, **kwargs)``."""
        self.fit(X)
        return self.predict(X, y, **kwargs)

    def remap_prediction_to_string(self, y_pred):
        """Remap int y_pred categorical label back to string using the best mapping.

        Raises:
            AssertionError: If no mapping has been found yet.
        """
        assert self._has_mapping(), "(`fit()` and `predict()`) or `fit_predict()` has to be called first."

        ## Invert the mapping: integer index -> category name
        index_to_category = {index: category for category, index in self.best_mapping.items()}
        return [index_to_category[idx] for idx in y_pred]

    def score(self, X, y, **kwargs):
        """Return the accuracy of the predictions for ``X`` against ``y``.

        If no mapping has been found yet, one is searched on ``(X, y)`` and
        its score returned. Otherwise the stored mapping is applied to ``y``
        and the accuracy is computed on the data that was passed in, instead
        of returning a value cached from an earlier call.
        """
        if not self._has_mapping():
            self.fit_predict(X, y, **kwargs)
            return self.best_accuracy_score

        y_pred = self.predict(X)
        y_true = [self.best_mapping[category] for category in y]
        normalize = self._resolve_normalize(kwargs.get("normalize"))
        return accuracy_score(y_true=y_true, y_pred=y_pred, normalize=normalize)

    def _has_mapping(self):
        """Return True once fit, predict and a mapping search have all happened."""
        return (
            getattr(self, "is_fitted_", False)
            and getattr(self, "_is_predictted", False)
            and hasattr(self, "best_mapping")
        )

    def _resolve_normalize(self, normalize=None):
        """Pick the explicit value, else the estimator's setting, else True."""
        if normalize is not None:
            return normalize
        return True if self.normalize is None else self.normalize

    def _resolve_categories(self, categories=None):
        """Pick the explicit value, else the estimator's setting, else the defaults."""
        if categories is not None:
            return categories
        if self.categories is not None:
            return self.categories
        return list(self._DEFAULT_CATEGORIES)

    def _find_best_mapping(self, y_true, y_pred, categories=None, normalize=None, **kwargs):
        """Permute the categories to find the best mapping.

        Every permutation of the categories is turned into a
        ``{category: index}`` mapping, the true labels are encoded with it and
        scored against ``y_pred``. Results are stored on the estimator in
        ``mapping_permute_results``, ``best_mapping`` and
        ``best_accuracy_score``.
        """
        categories = self._resolve_categories(categories)
        normalize = self._resolve_normalize(normalize)
        y_true = list(y_true)  # Materialise once; it is re-read for every permutation

        track_acc_score = []
        track_mapping = []

        for perm in permutations(categories, len(categories)):
            mapping = {category: index for index, category in enumerate(perm)}
            true_mapped = [mapping[category] for category in y_true]
            acc_score = accuracy_score(y_true=true_mapped, y_pred=y_pred, normalize=normalize)

            track_acc_score.append(acc_score)
            track_mapping.append(mapping)

        result = pd.DataFrame({"Mapping": track_mapping,
                               "AccuracyScore": track_acc_score})
        result = result.sort_values(by="AccuracyScore", ascending=False)
        result = result.reset_index()  # Keeps the permutation number in an "index" column

        self.mapping_permute_results = result
        self.best_mapping = self.mapping_permute_results.Mapping.iloc[0]
        self.best_accuracy_score = self.mapping_permute_results.AccuracyScore.iloc[0]

        return

In [4]:
################################################################################
### Let's do some NMF Predictions! - Unsupervised
################################################################################

## NOTES: 
##   - Holding out one entry as test set to approximate not doing train/test split.
##   - The reason for not doing a train/test split is that using NMF and then
##     permuting the y_true labels for the max accuracy score behaves like a
##     clustering method (e.g. k-means) or a PCA dimensionality reduction.
##     They are all a form of ***transductive inference***: there is no
##     generalization learned by training or "fitting" (sklearn term).


################################################################################
## METHOD 1: Using the pipeline
################################################################################

## Load the data
## NOTE: We are not splitting the train/test data because using NMF this way 
##   is essentially a clustering method, which is transductive.
dataloader = Dataloader()
X_train, X_test, y_train, y_test = dataloader.train_test_split(test_size=1, random_state=550)  # Essentially not splitting

## Create the pipeline
pipeline = Pipeline([
    #('lemmatizer',      SpaCy_transformer()),  # Disabling lemmatizer - Too slow~~
    ('countVectorizer',  CountVectorizer()),
    ('tfidfTransformer', TfidfTransformer()),
    ('nmf',              NMF(n_components=5, random_state=550)),  # Seeded so a re-run reproduces the factorization
    ('classifier',       Classifier_permute_for_best_mapping())
])

## RUN the pipeline
predictions = pipeline.fit_predict(X_train, y_train)

## Get the results
last_component       = pipeline[-1]
permutation_results  = last_component.mapping_permute_results
best_mapping         = last_component.best_mapping
best_accuracy_score  = last_component.best_accuracy_score
remapped_predictions = last_component.remap_prediction_to_string(predictions) # Remap the predictions: integers -> strings

## Print the results
print( f"Best mapping:        {best_mapping}", end='\n'*2 )
print( f"Best accuracy score: {best_accuracy_score}", end='\n'*2 )

print( "NMF Permute and Find best Mapping Results: " )
limitlessPandas(display)(permutation_results.head())

Best mapping:        {'business': 0, 'politics': 1, 'sport': 2, 'tech': 3, 'entertainment': 4}

Best accuracy score: 0.6573741007194245

NMF Permute and Find best Mapping Results: 


Unnamed: 0,index,Mapping,AccuracyScore
0,8,"{'business': 0, 'politics': 1, 'sport': 2, 'tech': 3, 'entertainment': 4}",0.657374
1,22,"{'business': 0, 'entertainment': 1, 'sport': 2, 'tech': 3, 'politics': 4}",0.473471
2,9,"{'business': 0, 'politics': 1, 'sport': 2, 'entertainment': 3, 'tech': 4}",0.465378
3,2,"{'business': 0, 'tech': 1, 'sport': 2, 'politics': 3, 'entertainment': 4}",0.464928
4,50,"{'politics': 0, 'business': 1, 'sport': 2, 'tech': 3, 'entertainment': 4}",0.439299


In [5]:
# ################################################################################
# ## TESTING randomSearchCV
# ################################################################################

# param_grid = {
# #     "countVectorizer__ngram_range": [(1,1), (1, 2), (1, 3), (1, 4), (1, 5)],
#     "tfidfTransformer__norm": ["l1", "l2"],
# #     "tfidfTransformer__use_idf": [True, False],
# #     "tfidfTransformer__sublinear_tf": [True, False],
# #     "nmf__init": ['random', 'nndsvd', 'nndsvda', 'nndsvdar'], 
# #     "nmf__l1_ratio": np.linspace(0, 1, 5)
# }

# randomSearchCV = RandomizedSearchCV(
#     estimator=pipeline, 
#     param_distributions=param_grid, 
#     n_jobs=-1, 
#     error_score='raise', 
#     cv=5, 
#     return_train_score=True)

# randomSearchCV.fit(X_train, y_train)

# randomSearchCV.cv_results_
# randomSearchCV.best_score_
# randomSearchCV.best_params_

In [6]:
################################################################################
## Method 2 - The step-wise functional programming way
################################################################################

## Load the data
## NOTE: We are not splitting the train/test data because using NMF this way 
##   is essentially a clustering method, which is transductive.
dataloader = Dataloader()
X_train, X_test, y_train, y_test = dataloader.train_test_split(test_size=1, random_state=550)  # Essentially not splitting

## Instantiate pipeline components
lemmatizer       = SpaCy_transformer()  # Instantiated but NOT applied below - lemmatizing is too slow~
countVectorizer  = CountVectorizer()
tfidfTransformer = TfidfTransformer()
nmf              = NMF(n_components=5, random_state=550)  # Seeded so a re-run reproduces the factorization
classifier       = Classifier_permute_for_best_mapping()

## Run through the pipeline step-by-step
temp = countVectorizer.fit_transform(X_train)   # Raw text -> token count matrix
temp = tfidfTransformer.fit_transform(temp)     # Counts -> TF-IDF weights
temp = nmf.fit_transform(temp)                  # TF-IDF -> 5 NMF components per document
predictions = classifier.fit_predict(temp, y_train)

## Get the results
last_component       = classifier
permutation_results  = last_component.mapping_permute_results
best_mapping         = last_component.best_mapping
best_accuracy_score  = last_component.best_accuracy_score
remapped_predictions = last_component.remap_prediction_to_string(predictions) # Remap the predictions: integers -> strings

## Print the results
print( f"Best mapping:        {best_mapping}", end='\n'*2 )
print( f"Best accuracy score: {best_accuracy_score}", end='\n'*2 )

print( "NMF Permute and Find best Mapping Results: " )
limitlessPandas(display)(permutation_results.head())

Best mapping:        {'business': 0, 'politics': 1, 'sport': 2, 'tech': 3, 'entertainment': 4}

Best accuracy score: 0.6573741007194245

NMF Permute and Find best Mapping Results: 


Unnamed: 0,index,Mapping,AccuracyScore
0,8,"{'business': 0, 'politics': 1, 'sport': 2, 'tech': 3, 'entertainment': 4}",0.657374
1,22,"{'business': 0, 'entertainment': 1, 'sport': 2, 'tech': 3, 'politics': 4}",0.473471
2,9,"{'business': 0, 'politics': 1, 'sport': 2, 'entertainment': 3, 'tech': 4}",0.465378
3,2,"{'business': 0, 'tech': 1, 'sport': 2, 'politics': 3, 'entertainment': 4}",0.464928
4,50,"{'politics': 0, 'business': 1, 'sport': 2, 'tech': 3, 'entertainment': 4}",0.439299


In [7]:
################################################################################
## EXPLORE THIS LATER
################################################################################

# class SVC(SVC): 
#     def __init__(self, *args, **kwargs): 
#         super().__init__(*args, **kwargs) # Only need this if I want to init additional things in the subclass
        
#     def fit_predict(self, X, y=None):   # Override the method
#         self.super().fit(X)
#         return self.super().predict(X, y)

In [8]:
################################################################################
## Supervised Learning - Support Vector Classifier (SVC) - type of SVM
################################################################################

## Load the data and train/test split
## NOTE: test_size=1 holds out a single article, so this is effectively no split.
dataloader = Dataloader()
X_train, X_test, y_train, y_test = dataloader.train_test_split(test_size=1, random_state=550)

## IGNORE - Subsetting dataset for debugging
# end=10
# X_train = X_train[:end]
# X_test = X_test[:end]
# y_train = y_train[:end]
# y_test = y_test[:end]

## Create the pipeline
pipeline = Pipeline([
    #('lemmatizer', SpaCy_transformer()),  # Disabling lemmatizer - Too slow
    ('countVectorizer', CountVectorizer()),    # has fit(), transform(), and fit_transform() - [type of transformer] - vectorizer
    ('tfidfTransformer', TfidfTransformer()),   # has fit(), transform(), and fit_transform() - transformer
    ('nmf', NMF(n_components=5, random_state=550)),  # has fit(), transform() and fit_transform() - transformer; seeded for reproducible re-runs
    ('svc', SVC()),
])


## Fit the pipeline
pipeline.fit(X_train, y_train)  # Training set

## Get the results
## NOTE: scored on the same data the pipeline was fitted on, so this is the
##   TRAINING accuracy, not an estimate of generalization.
score = pipeline.score(X_train, y_train)

## Print the results
print( f"Accuracy score: {score}", end='\n'*2 )

Accuracy score: 0.6928956834532374



In [9]:
## Hyper-parameter candidates for the randomized search over the SVC pipeline.
param_grid = {
    "countVectorizer__stop_words": ['english', None], 
    "countVectorizer__ngram_range": [(1, 1), (1, 2), (1, 3), (1, 4), (1, 5)], 
    "tfidfTransformer__norm": ['l1', 'l2'], 
    "tfidfTransformer__sublinear_tf": [True, False], 
    "nmf__n_components": [5], #np.arange(5, 20, 5, dtype=int),
    # SVC requires C to be strictly positive. The first grid point (C=0) is
    # dropped; with error_score='raise' drawing it would abort the whole search.
    "svc__C": np.linspace(0, 50, 50)[1:], 
}

searchCV = RandomizedSearchCV(
    estimator=pipeline, 
    param_distributions=param_grid,
    n_iter=10, 
    cv=5,
    verbose=4, 
    return_train_score=True,
    n_jobs=-1,
    error_score='raise',
    random_state=550,  # Seeded so the same 10 candidates are drawn on every run
)

searchCV.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits




In [10]:
# Tabulate the per-candidate fit times, parameters and train/test scores of the search.
pd.DataFrame.from_dict(searchCV.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_tfidfTransformer__sublinear_tf,param_tfidfTransformer__norm,param_svc__C,param_nmf__n_components,param_countVectorizer__stop_words,param_countVectorizer__ngram_range,...,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
0,2.608279,0.097339,0.401493,0.027546,False,l2,16.326531,5,english,"(1, 1)",...,0.691085,0.022852,3,0.695334,0.70489,0.70489,0.696459,0.710112,0.702337,0.005605
1,2.517072,0.117545,0.40235,0.013763,True,l1,6.122449,5,english,"(1, 1)",...,0.705475,0.016093,1,0.703204,0.711074,0.71276,0.705453,0.716292,0.709756,0.004794
2,24.926927,0.757961,1.238559,0.177019,False,l2,36.734694,5,,"(1, 2)",...,0.680744,0.022954,4,0.697021,0.712198,0.708263,0.699269,0.710674,0.705485,0.006164
3,41.416993,2.641357,1.319584,0.111116,True,l2,46.938776,5,english,"(1, 4)",...,0.606986,0.046281,7,0.709949,0.717257,0.658797,0.709949,0.658989,0.690988,0.026341
4,40.136922,1.929528,1.241136,0.067119,True,l2,23.469388,5,english,"(1, 4)",...,0.636656,0.059538,6,0.707701,0.714446,0.653176,0.707701,0.660112,0.688627,0.026322
5,42.969335,0.797944,1.84021,0.158186,True,l2,32.653061,5,,"(1, 3)",...,0.680301,0.02305,5,0.706015,0.711074,0.709387,0.708825,0.717416,0.710543,0.003803
6,12.633237,0.74793,0.709599,0.021937,True,l2,17.346939,5,english,"(1, 2)",...,0.701881,0.015437,2,0.707139,0.717257,0.713322,0.706015,0.714607,0.711668,0.004361
7,85.349189,11.090499,2.610479,0.138058,True,l1,43.877551,5,,"(1, 5)",...,0.251784,0.028345,10,0.387296,0.395166,0.395166,0.383924,0.414045,0.395119,0.010438
8,61.225397,13.69822,1.627414,0.063302,True,l1,35.714286,5,,"(1, 3)",...,0.459095,0.02854,9,0.618887,0.603148,0.5638,0.555368,0.62809,0.593859,0.029222
9,48.388607,8.319969,1.499802,0.194526,False,l1,26.530612,5,,"(1, 3)",...,0.51306,0.02973,8,0.605958,0.617201,0.642496,0.647555,0.617978,0.626237,0.015999
