# Read In Dataset ([RCV1](http://scikit-learn.org/stable/datasets/rcv1.html))
The RCV1 dataset is available through scikit-learn's `fetch_rcv1` loader, which downloads it the first time it is requested.

In [1]:
""" Trace Cell """
import time

# Notebook-level tracing flag.
# NOTE(review): no code visible in this notebook reads `trace` - confirm whether it is still needed.
trace = True

def ex_time(prev_time, message):
    """Print how long a step took and return the timestamp used for the report.

    Args:
        prev_time: Start of the measured interval, as returned by ``time.time()``.
        message: Short description of the step; printed after "Time to".

    Returns:
        The ``time.time()`` reading taken for this report, so callers can chain
        measurements with ``last_time = ex_time(last_time, ...)``.
    """
    now = time.time()
    print(f'Time to {message}: {now - prev_time:0.4f} seconds')
    return now

In [2]:
""" Get Dataset Cell

    Dependencies:
        • Trace Cell """

from sklearn.datasets import fetch_rcv1

def get_rcv1():
    """Fetch RCV1 through ``fetch_rcv1`` and print how long the call took.

    Returns:
        The object returned by ``fetch_rcv1()``.
    """
    started = time.time()

    dataset = fetch_rcv1()
    ex_time(started, 'fetch')

    return dataset

# Notebook-level dataset handle; later cells read rcv1.data, rcv1.target,
# rcv1.sample_id and rcv1.target_names from it.
rcv1 = get_rcv1()

Time to fetch: 4.3509 seconds


## Dataset Attributes
   1. __data__ - a scipy [compressed sparse row (CSR) matrix](https://docs.scipy.org/doc/scipy-0.14.0/reference/generated/scipy.sparse.csr_matrix.html). Non-zero values are cosine-normalized, log TF-IDF vectors. The shape is (num_samples, num_features).
   2. __target__ - a scipy CSR sparse matrix. Maps each sample to its relevant categories (sometimes referred to as labels). The shape is (num_samples, num_categories).
   3. __sample_id__ - a [numpy n-dimensional array](https://docs.scipy.org/doc/numpy-1.14.0/reference/generated/numpy.ndarray.html) (ndarray) associating each sample's ID to its sample number
   4. __target_names__ - an ndarray of target names (can be thought of as topics or categories) corresponding to the category mapping in __(2)__. Each sample belongs to n categories, where 1 $\le$ n $\le$ 13.
   5. __description__ - a description of the dataset
   
The shape attribute gives the dimensions of the matrix.

In [3]:
""" RCV1 Attributes Cell

    Dependencies:
        • Get Dataset Cell
        • Trace Cell """

def get_nonzero(matrix):
    """Return the percentage of cells in ``matrix`` that are non-zero.

    Args:
        matrix: Object exposing a two-entry ``shape`` and a ``count_nonzero()``
            method (this notebook passes ``rcv1.data`` and ``rcv1.target``).

    Returns:
        A percentage between 0 and 100. A matrix with zero cells yields 0.0
        rather than raising ``ZeroDivisionError``.
    """
    start = time.time()

    total_cells = matrix.shape[0] * matrix.shape[1]
    if total_cells == 0:
        # Nothing to count; avoid dividing by zero
        nonzero = 0.0
    else:
        nonzero = (matrix.count_nonzero() / total_cells) * 100

    # Sanity check: a percentage must lie in [0, 100]
    assert 0 <= nonzero <= 100

    # The timing helper is called only for its printed report
    ex_time(start, 'calc nonzero')
    return nonzero

def print_rcv1_attributes():
    """Print shape and type details for the attributes of the global ``rcv1``.

    ``data`` and ``target`` also report their non-zero percentage through
    ``get_nonzero``, whose own timing line appears just before the percentage.
    ``sample_id`` and ``target_names`` report the type of the element at index 3.
    """
    # Matrix attributes: shape, dtype, container type and density
    for attr in ('data', 'target'):
        matrix = getattr(rcv1, attr)
        print(f'{attr}:')
        print(f'      shape: {matrix.shape}')
        print(f'  data type: {matrix.dtype}')
        print(f' array type: {type(matrix)}')
        density = get_nonzero(matrix)
        print(f'    nonzero: {density:0.4f}%\n')

    # Array attributes: the final section is printed without a trailing blank line
    for attr, trailer in (('sample_id', '\n'), ('target_names', '')):
        values = getattr(rcv1, attr)
        print(f'{attr}:')
        print(f'      shape: {values.shape}')
        print(f'  data type: {type(values[3])}')
        print(f' array type: {type(values)}{trailer}')

# Print the attribute summary for the dataset held in the notebook-level rcv1
print_rcv1_attributes()

data:
      shape: (804414, 47236)
  data type: float64
 array type: <class 'scipy.sparse.csr.csr_matrix'>
Time to calc nonzero: 0.0107 seconds
    nonzero: 3.1463%

target:
      shape: (804414, 103)
  data type: uint8
 array type: <class 'scipy.sparse.csr.csr_matrix'>
Time to calc nonzero: 0.0049 seconds
    nonzero: 3.1463%

sample_id:
      shape: (804414,)
  data type: <class 'numpy.uint32'>
 array type: <class 'numpy.ndarray'>

target_names:
      shape: (103,)
  data type: <class 'str'>
 array type: <class 'numpy.ndarray'>


# Document Classification
## [Naive Bayes](http://scikit-learn.org/stable/modules/naive_bayes.html)
A supervised learning algorithm based on Bayes' theorem. This method assumes that every pair of features is conditionally independent given the class.

Bayes Theorem:
\begin{equation}
P(y | x_1, . . ., x_n) = \frac{P(y) P(x_1, . . ., x_n | y)} {P(x_1, . . ., x_n)} \hspace{3.6cm} (1)
\end{equation}

This can be further simplified by using the assumption that each pair of features is independent, shown in eq 2.1. Then, since $P(x_1, ..., x_n)$ is constant for a given input, we can use the proportionality in eq 2.2.

\begin{align}
P(y | x_1, . . ., x_n) &= \frac{P(y) \prod_{i = 1}^{n} P(x_i | y)} {P(x_1, . . ., x_n)} \hspace{3cm}(2.1)\\
                 &\propto P(y) \prod_{i = 1}^{n} P(x_i | y)   \hspace{3.5cm}(2.2)
\end{align}

This gives us our classification rule, shown in eq 3.

\begin{equation}
\hat{y} = arg \max_y P(y) \prod_{i=1}^{n} P(x_i | y) \hspace{5.4cm} (3)
\end{equation}

SciKit Learn supports several Naive Bayes implementations. 
1. __Gaussian Naive Bayes__ - In this implementation the likelihood of the features is assumed to be Gaussian. Gaussian distributions are more commonly referred to as the normal distribution or bell curve. This class does not support sparse matrices.
2. __Multinomial Naive Bayes__ - In this implementation the likelihood of the features is assumed to follow a multinomial distribution. Typically used in text classification. This class does support sparse matrices.
3. __Bernoulli Naive Bayes__ - This implementation assumes the data follows multivariate Bernoulli distributions. Multiple features are allowed but each one is assumed to be a binary variable.

Because of these restrictions only Multinomial Naive Bayes will be appropriate for the RCV1 dataset. There is an additional step required because Naive Bayes does not typically predict multiple labels. This means an additional strategy must be implemented. One commonly used strategy is known as one-vs-all ([one-vs-the-rest](http://scikit-learn.org/stable/modules/generated/sklearn.multiclass.OneVsRestClassifier.html#sklearn.multiclass.OneVsRestClassifier) in scikit learn). This strategy works by training one binary classifier for each label.

In [4]:
""" Fit Classifier Cell 

    Dependencies:
        • Trace Cell """

from sklearn.naive_bayes import MultinomialNB
from sklearn.multiclass import OneVsRestClassifier

def make_fit_NB(data, target, num_jobs=-1):
    """Build a one-vs-rest Multinomial Naive Bayes classifier and fit it.

    Args:
        data: Training samples passed to ``fit``.
        target: Training labels passed to ``fit``.
        num_jobs: Forwarded to ``OneVsRestClassifier`` as ``n_jobs``.

    Returns:
        The fitted ``OneVsRestClassifier``. The training time is printed.
    """
    started = time.time()

    # One MultinomialNB per label, wrapped by the one-vs-rest strategy
    model = OneVsRestClassifier(estimator=MultinomialNB(), n_jobs=num_jobs)
    model.fit(data, target)

    ex_time(started, 'train')
    return model

In [None]:
### Dependent on make_fit_NB (Fit Classifier Cell) and rcv1 (Get Dataset Cell)
mult_NB_clf = make_fit_NB(rcv1.data, rcv1.target)

## Testing
When testing a machine learning algorithm it is difficult to determine how a classifier performs by hand, so several metrics are commonly used to evaluate their performance.

1. __accuracy__ - The fraction of samples whose labels are predicted correctly. For multilabel data scikit-learn's `accuracy_score` counts a sample as correct only when its entire label set matches.
2. __precision__ - The ratio of true positives to true positives and false positives.
3. __recall__ - The ratio of true positives to true positives and false negatives.
4. __average precision__ - The weighted mean of the precision achieved at each recall threshold.

The term 'micro' average refers to calculating a metric globally, by pooling the true positives, false positives and false negatives of every label, so labels that occur more often contribute proportionally more. This is most appropriate for the RCV1 dataset because the labels do not occur with the same frequency. Information on additional averaging methods can be found [here](http://scikit-learn.org/stable/modules/generated/sklearn.metrics.average_precision_score.html#sklearn.metrics.average_precision_score). 

In [5]:
""" Scoring Methods Cell
    
    Dependencies: 
        • Trace Cell """

# For scoring methods
from sklearn.metrics import accuracy_score, average_precision_score, f1_score, precision_score, recall_score

def predict_clf(clf, data):
    """Run ``clf.predict`` on ``data`` and print how long the call took.

    Args:
        clf: Object exposing a ``predict`` method.
        data: Samples handed to ``clf.predict``.

    Returns:
        Whatever ``clf.predict(data)`` returns.
    """
    started = time.time()

    predicted = clf.predict(data)
    ex_time(started, 'predict')

    return predicted


def get_scores(target, prediction):
    """Score ``prediction`` against ``target`` with five metrics.

    Args:
        target: True labels; must provide ``toarray()``.
        prediction: Predicted labels; must provide ``toarray()``.

    Returns:
        A list ordered as: accuracy, micro F1, micro average precision,
        micro recall, micro precision.
    """
    micro = {'average': 'micro'}
    return [
        accuracy_score(target, prediction),
        f1_score(target, prediction, **micro),
        # This metric is given dense arrays, unlike the other four
        average_precision_score(target.toarray(), prediction.toarray(), **micro),
        recall_score(target, prediction, **micro),
        precision_score(target, prediction, **micro),
    ]

def print_scores(scores):
    """Print the five scores produced by ``get_scores``, one labelled line each.

    Args:
        scores: Sequence ordered as accuracy, micro F1, micro average
            precision, micro recall, micro precision.

    Raises:
        IndexError: If ``scores`` holds fewer than five entries.
    """
    labels = (
        'Accuracy',
        'Micro-averaged F1',
        'Average precision score, micro-averaged over all classes',
        'Micro averaged recall',
        # The fifth score is precision; it was previously mislabelled as recall
        'Micro averaged precision',
    )
    for position, label in enumerate(labels):
        print(f'{label}: {scores[position]:0.4f}')

## [K-Folds Cross-Validation](http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.KFold.html)

To reduce the bias introduced by a single train/test split it is common practice to use K-Folds cross-validation, typically with k = 10. This splits the dataset into k segments. Of these k segments, k - 1 are used to train a classifier while the remaining one is used to test it. This is repeated until every segment has been used as the testing segment once, and the scores are averaged. 

In [7]:
""" kFold Cell
    
    Dependencies:
        • Scoring Methods Cell 
        • Trace Cell """

# For train/test split
from sklearn.model_selection import KFold

def k_fold_and_score_NB(data, target, k):
    """Cross-validate the Multinomial Naive Bayes classifier with K-Folds.

    For every fold a classifier is trained with ``make_fit_NB`` on the training
    rows, evaluated with ``get_scores`` on the held-out rows, and the five
    scores are averaged over the ``k`` folds.

    Args:
        data: Sample matrix; rows are selected with index arrays.
        target: Label matrix with one row per row of ``data``.
        k: Number of folds.

    Returns:
        List of the five averaged scores, in ``get_scores`` order. The averages
        are also printed with ``print_scores``.
    """
    # Initialize train/test split
    kf = KFold(n_splits=k)

    total_scores = [0.0] * 5
    # Iterate over each train/test split. The function arguments are used
    # rather than the notebook-level rcv1, so any dataset can be passed in.
    for train_index, test_index in kf.split(data):
        X_train, X_test = data[train_index], data[test_index]
        y_train, y_test = target[train_index], target[test_index]

        # Train classifier
        split_clf = make_fit_NB(X_train, y_train)

        # Generate prediction on test data
        prediction = predict_clf(split_clf, X_test)

        fold_scores = get_scores(y_test, prediction)
        total_scores = [running + fold
                        for running, fold in zip(total_scores, fold_scores)]

    avg_scores = [total / k for total in total_scores]
    # Sanity check: every averaged metric must lie in [0, 1]
    for score in avg_scores:
        assert 0.0 <= score <= 1.0

    print_scores(avg_scores)
    return avg_scores

In [None]:
# 10-fold cross-validation of the Naive Bayes classifier on rcv1.data / rcv1.target
k_fold_and_score_NB(rcv1.data, rcv1.target, 10)

# Notes for tomorrow

## SciKit

+ [multiclass and multilabel algorithms](http://scikit-learn.org/stable/modules/multiclass.html)
+ [precision-recall](http://scikit-learn.org/stable/auto_examples/model_selection/plot_precision_recall.html#sphx-glr-auto-examples-model-selection-plot-precision-recall-py)
+ [average_precision_score](http://scikit-learn.org/stable/modules/generated/sklearn.metrics.average_precision_score.html#sklearn.metrics.average_precision_score)
+ [precision_recall_curve](http://scikit-learn.org/stable/modules/generated/sklearn.metrics.precision_recall_curve.html#sklearn.metrics.precision_recall_curve)
+ [accuracy_score](http://scikit-learn.org/stable/modules/generated/sklearn.metrics.accuracy_score.html#sklearn.metrics.accuracy_score)

## Markdown

+ [math](http://jupyter-notebook.readthedocs.io/en/stable/examples/Notebook/Typesetting%20Equations.html)
+ [general](https://github.com/adam-p/markdown-here/wiki/Markdown-Cheatsheet)


## [Support Vector Machines (SVM)](http://scikit-learn.org/stable/modules/svm.html)

A supervised learning method that works by separating the samples into categories divided by a hyperplane, or set of hyperplanes. Since the hyperplane that separates them is not typically unique, SVM finds the hyperplane with the maximum distance from the nearest samples of any category.

SVMs use kernel functions to compute the similarity between data points. Some of the most common are:

1. linear
2. polynomial
3. radial basis function (rbf)
4. sigmoid

The kernel used can significantly impact the accuracy of a classifier. More information on kernels is provided [here](http://scikit-learn.org/stable/modules/metrics.html)

Additional source on [SVM](http://dx.doi.org/10.1038/nbt1206-1565)

In [11]:
from sklearn.svm import SVC
from sklearn.multiclass import OneVsRestClassifier

def make_fit_SVM(data, target, num_jobs=-1):
    """Build a one-vs-rest SVC classifier and fit it.

    Args:
        data: Training samples passed to ``fit``.
        target: Training labels passed to ``fit``.
        num_jobs: Forwarded to ``OneVsRestClassifier`` as ``n_jobs``, in the
            same way ``make_fit_NB`` does. It was previously accepted but ignored.

    Returns:
        The fitted ``OneVsRestClassifier``. The training time is printed.
    """
    last_time = time.time()

    # Initialize classifier
    classifier = OneVsRestClassifier(SVC(), n_jobs=num_jobs)

    # Train classifier
    classifier.fit(data, target)
    ex_time(last_time, 'train')
    return classifier

In [17]:
def k_fold_and_score_SVM(data, target, k, max_folds=1):
    """Cross-validate the SVM classifier with K-Folds.

    For every evaluated fold a classifier is trained with ``make_fit_SVM`` on
    the training rows and scored with ``get_scores`` on the held-out rows. The
    scores are averaged over the folds that were actually evaluated.

    Args:
        data: Sample matrix; rows are selected with index arrays.
        target: Label matrix with one row per row of ``data``.
        k: Number of folds the dataset is split into.
        max_folds: Upper bound on the number of folds trained and scored. The
            default of 1 keeps the earlier behaviour of stopping after the
            first fold; pass ``None`` to evaluate all ``k`` folds.

    Returns:
        List of the five averaged scores, in ``get_scores`` order. The averages
        are also printed with ``print_scores``.

    Raises:
        ValueError: If ``max_folds`` is given and is smaller than 1.
    """
    if max_folds is not None and max_folds < 1:
        raise ValueError('max_folds must be at least 1 or None')

    # Initialize train/test split
    kf = KFold(n_splits=k)

    total_scores = [0.0] * 5
    folds_run = 0
    # Iterate over each train/test split. The function arguments are used
    # rather than the notebook-level rcv1, so any dataset can be passed in.
    for train_index, test_index in kf.split(data):
        if max_folds is not None and folds_run >= max_folds:
            break

        X_train, X_test = data[train_index], data[test_index]
        y_train, y_test = target[train_index], target[test_index]

        # Train classifier
        basic_SVM = make_fit_SVM(X_train, y_train)

        # Generate prediction on test data with the classifier trained above
        prediction = predict_clf(basic_SVM, X_test)

        fold_scores = get_scores(y_test, prediction)
        total_scores = [running + fold
                        for running, fold in zip(total_scores, fold_scores)]
        folds_run += 1

    # Average over the folds that ran; dividing by k would shrink the scores
    # whenever fewer than k folds are evaluated.
    avg_scores = [total / folds_run for total in total_scores]
    # Sanity check: every averaged metric must lie in [0, 1]
    for score in avg_scores:
        assert 0.0 <= score <= 1.0

    print_scores(avg_scores)
    return avg_scores

In [None]:
# Cross-validate the SVM classifier on rcv1.data / rcv1.target with a 10-way split
k_fold_and_score_SVM(rcv1.data, rcv1.target, 10)