# Data: Preprocessing and Evaluation
## subsection of _Text Classification_

* Data Retrieval
* Data Preprocessing and Normalization
* Building Train and Test Datasets
* Feature Engineering Techniques
    1. Traditional
    2. Advanced
* Classification Models
    1. Multinomial Naive Bayes
    2. Logistic Regression
    3. Support Vector Machines
    4. Ensemble Models
    5. Random Forest
    6. Gradient Boosting Machines
* Evaluating Classification Models
    1. Confusion Matrix
    2. Performance Metrics

# Data Retrieval

In [1]:
from sklearn.datasets import fetch_20newsgroups
import numpy as np
import text_normalizer as tn
import matplotlib.pyplot as plt
import pandas as pd
import nltk
import warnings
warnings.filterwarnings('ignore')

%matplotlib inline

# Fetch the 20 Newsgroups corpus (subset='all'), shuffled, with headers,
# footers and quoted replies removed from each post.
data = fetch_20newsgroups(
    subset='all',
    shuffle=True,
    remove=('headers', 'footers', 'quotes'),
)
# Lookup from integer target id to newsgroup name.
data_labels_map = {label_id: name for label_id, name in enumerate(data.target_names)}

In [None]:
# Assemble the raw documents, their numeric labels and the matching label
# names into a single dataframe.
corpus = data.data
target_labels = data.target
target_names = [data_labels_map[label_id] for label_id in target_labels]

data_df = pd.DataFrame({
    'Article': corpus,
    'Target Label': target_labels,
    'Target Name': target_names,
})
print(data_df.shape)
data_df.head(10)

# Data Preprocessing and Normalization

In [None]:
# Count the documents whose text is empty once surrounding whitespace is stripped.
is_blank = data_df['Article'].str.strip().eq('')
total_nulls = int(is_blank.sum())
print("Empty documents:", total_nulls)

In [None]:
# Keep only the documents that still contain text after stripping whitespace.
# .copy() makes the result an independent frame, so columns added to it later
# are not written onto a slice of the original (SettingWithCopyWarning).
data_df = data_df[data_df.Article.str.strip() != ''].copy()
data_df.shape

In [None]:
# Start from NLTK's English stopword list.
stopword_list = nltk.corpus.stopwords.words('english')

# just to keep negation if any in bi-grams
for negation in ('no', 'not'):
    stopword_list.remove(negation)

# normalize our corpus
normalization_options = dict(
    html_stripping=True,
    contraction_expansion=True,
    accented_char_removal=True,
    text_lower_case=True,
    text_lemmatization=True,
    text_stemming=False,
    special_char_removal=True,
    remove_digits=True,
    stopword_removal=True,
    stopwords=stopword_list,
)
norm_corpus = tn.normalize_corpus(corpus=data_df['Article'], **normalization_options)

data_df['Clean Article'] = norm_corpus

# view sample data
column_order = ['Article', 'Clean Article', 'Target Label', 'Target Name']
data_df = data_df[column_order]
data_df.head(10)

In [None]:
# 'Clean Article' is already attached when the corpus is normalized, so it is
# not re-assigned here: re-assigning would fail with a length mismatch if this
# cell were re-run after rows have been dropped from the frame.
# Turn every cell that is empty or whitespace-only into NaN so that those rows
# can be removed with dropna().
data_df = data_df.replace(r'^(\s?)+$', np.nan, regex=True)
data_df.info()

In [None]:
# Discard every row that has a missing value, then renumber the index from 0.
rows_without_nan = data_df.dropna()
data_df = rows_without_nan.reset_index(drop=True)
data_df.info()

In [None]:
# Persist the cleaned dataframe to disk, leaving the index out of the file.
data_df.to_csv(path_or_buf='clean_newsgroups.csv', index=False)

In [None]:
# Reload the cleaned dataset from disk.
# keep_default_na=False stops pandas from converting text such as "nan", "null"
# or "NA" into NaN: a cleaned document consisting only of such a token would
# otherwise come back as a missing value and break downstream vectorization.
data_df = pd.read_csv('clean_newsgroups.csv', keep_default_na=False)

# Building Train and Test Datasets

In [None]:
from sklearn.model_selection import train_test_split

# Split the cleaned documents, numeric labels and label names in one call so
# all three stay aligned row-for-row; 33% of the rows go to the test set.
split_inputs = [np.array(data_df[column])
                for column in ('Clean Article', 'Target Label', 'Target Name')]
(train_corpus, test_corpus,
 train_label_nums, test_label_nums,
 train_label_names, test_label_names) = train_test_split(*split_inputs,
                                                         test_size=0.33,
                                                         random_state=42)

train_corpus.shape, test_corpus.shape

In [None]:
from collections import Counter

# Per-class document counts in each split. These stay as Counters rather than
# plain dicts: a Counter returns 0 for a label it never saw, so a class missing
# from one split shows up with a zero count instead of raising KeyError.
trd = Counter(train_label_names)
tsd = Counter(test_label_names)

# Train labels first (in first-seen order), followed by any label that occurs
# only in the test split.
all_labels = list(trd) + [label for label in tsd if label not in trd]

(pd.DataFrame([[label, trd[label], tsd[label]] for label in all_labels],
              columns=['Target Label', 'Train Count', 'Test Count'])
 .sort_values(by=['Train Count', 'Test Count'],
              ascending=False))

# Feature Engineering Techniques

_go over the following_

    1. Traditional Feature Engineering Models
    2. Advanced Feature Engineering Models

# Classification Models

_go over the following models_

    1. Multinomial Naive Bayes
    2. Logistic Regression
    3. Support Vector Machines
    4. Ensemble Models
    5. Random Forest
    6. Gradient Boosting Machines

# Evaluating Classification Models

## Confusion Matrix

In [None]:
from sklearn import linear_model
import model_evaluation_utils as meu

# NOTE(review): X_train, y_train, X_test and y_test are not defined in any
# cell visible in this notebook -- confirm where they come from. The
# classes=[0, 1] argument below suggests a binary-labelled dataset; verify.

# train and build model
logistic = linear_model.LogisticRegression().fit(X_train, y_train)

# predict on test data and view confusion matrix
y_pred = logistic.predict(X_test)
meu.display_confusion_matrix(true_labels=y_test, predicted_labels=y_pred, classes=[0, 1])

## Performance Metrics

In [None]:
from sklearn.metrics import confusion_matrix

# TP/TN/FP/FN are not defined by any earlier cell, so derive them from the
# model's predictions. With labels=[0, 1] the flattened 2x2 confusion matrix is
# ordered tn, fp, fn, tp (class 1 is the positive class).
TN, FP, FN, TP = (int(count) for count in
                  confusion_matrix(y_test, y_pred, labels=[0, 1]).ravel())

# compute accuracy on model predictions
fw_acc = round(meu.metrics.accuracy_score(y_true=y_test, y_pred=y_pred), 5)
mc_acc = round((TP + TN) / (TP + TN + FP + FN), 5)
print('Framework Accuracy:', fw_acc)
print('Manually Computed Accuracy:', mc_acc)

In [None]:
from sklearn.metrics import confusion_matrix

# TP and FP are not defined by any earlier cell, so derive them from the
# model's predictions. With labels=[0, 1] the flattened 2x2 confusion matrix is
# ordered tn, fp, fn, tp (class 1 is the positive class).
TN, FP, FN, TP = (int(count) for count in
                  confusion_matrix(y_test, y_pred, labels=[0, 1]).ravel())

# compute precision on model predictions
fw_prec = round(meu.metrics.precision_score(y_true=y_test, y_pred=y_pred), 5)
# Precision is reported as 0.0 when the model predicted no positives at all.
mc_prec = round(TP / (TP + FP), 5) if (TP + FP) else 0.0
print('Framework Precision:', fw_prec)
print('Manually Computed Precision:', mc_prec)

In [None]:
from sklearn.metrics import confusion_matrix

# The manual recall (mc_rec) is not computed by any earlier cell, so F1 is
# derived here directly from the confusion-matrix counts. With labels=[0, 1]
# the flattened 2x2 matrix is ordered tn, fp, fn, tp (class 1 is positive).
TN, FP, FN, TP = (int(count) for count in
                  confusion_matrix(y_test, y_pred, labels=[0, 1]).ravel())

# compute f1-score on model predictions
fw_f1 = round(meu.metrics.f1_score(y_true=y_test, y_pred=y_pred), 5)
# 2*TP / (2*TP + FP + FN) is the harmonic mean of precision and recall; using
# the raw counts avoids compounding the rounding of the intermediate metrics.
f1_denominator = 2 * TP + FP + FN
mc_f1 = round((2 * TP) / f1_denominator, 5) if f1_denominator else 0.0
print('Framework F1-Score:', fw_f1)
print('Manually Computed F1-Score:', mc_f1)