# Text Classification

In [5]:
import os
import re
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# %matplotlib inline

In [6]:
# Current working directory; later used as the base of the CSV output path.
path = os.getcwd()

In [7]:
## Download Reuters Data
## https://archive.ics.uci.edu/ml/datasets/Reuters-21578+Text+Categorization+Collection
# !curl -O https://archive.ics.uci.edu/ml/machine-learning-databases/reuters21578-mld/reuters21578.tar.gz
# !tar xzvf reuters21578.tar.gz

### Text-Classification using word2vec
 - [Tutorial1](http://nadbordrozd.github.io/blog/2016/05/20/text-classification-with-word2vec/)<br>
 - [Tutorial2](https://datawarrior.wordpress.com/2016/10/12/short-text-categorization-using-deep-neural-networks-and-word-embedding-models/)

### Multi-Label Classification
 - [Sklearn package](https://scikit-learn.org/stable/modules/generated/sklearn.multiclass.OneVsRestClassifier.html)
 - [NLTK Reuters data](https://miguelmalvarez.com/2015/03/20/classifying-reuters-21578-collection-with-python-representing-the-data/)
 - [Kaggle guide](https://www.kaggle.com/reiinakano/basic-nlp-bag-of-words-tf-idf-word2vec-lstm)

# Data Cleaning

In [None]:
# Normalise every target and feature column: each run of characters outside
# a-z / A-Z (digits and punctuation included) becomes a single space, then the
# text is lowercased and trimmed.
non_letters = '[^a-zA-Z]+'
for col in target + features:
    letters_only = df[col].replace(non_letters, ' ', regex=True)
    df[col] = letters_only.str.lower().str.strip()

In [None]:
# Merge all feature text into a single 'doc' field (missing features count as '').
doc = df[features].fillna('').apply(' '.join, axis=1)
# Collapse whitespace runs. Assign the result back instead of calling
# replace(..., inplace=True) on df['doc']: the in-place form acts on a temporary
# under pandas Copy-on-Write and silently leaves df unchanged.
doc = doc.replace(r'\s+', ' ', regex=True)
# Docs with no content (exactly '' or a lone space) become NaN so that they can
# be dropped later.
df['doc'] = doc.replace(['', ' '], np.nan)

# Flag (not drop) short docs as errors, unless they mention ransomware.
# Rows already flagged, and rows whose doc is NaN, are left untouched.
MIN_DOC_LEN = 14
is_short = df['doc'].str.len() < MIN_DOC_LEN
mentions_ransomware = df['doc'].str.contains('ransomware', na=False)
df.loc[(df['errors'] != 1) & ~mentions_ransomware & is_short, 'errors'] = 1

In [None]:
# Row count and the share of rows that carry a value in the first target
# column, followed by the per-class counts.
label_col = target[0]
labelled_share = df[label_col].value_counts().sum() / len(df)
print('%d %.2f have data' % (len(df), labelled_share))
df[label_col].value_counts()

# Training, Test and Prediction Data

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Discard rows whose merged text ('doc') is NaN; this mutates df in place.
df.dropna(subset=['doc'], axis=0, inplace=True)

In [None]:
# Partition the frame on whether the first target column is populated:
# unlabelled rows are scored later, labelled rows are used for train/test.
has_label = df[target[0]].notnull()
predict = df.loc[~has_label].copy()
withlabel = df.loc[has_label].copy()
len(predict), len(withlabel), len(df)

In [None]:
# Share of labelled rows falling into each breach type.
breach_type_counts = withlabel['er_breach_type'].value_counts()
breach_type_counts / len(withlabel)

In [None]:
# 50/50 split of the labelled docs, stratified on breach type so both halves
# keep the same class mix; random_state fixes the shuffle.
breach_labels = withlabel['er_breach_type']
X_train, X_test, y_train, y_test = train_test_split(
    withlabel['doc'],
    breach_labels,
    test_size=0.5,
    random_state=9,
    stratify=breach_labels,
)

In [None]:
# Sanity check: sizes of the train and test partitions.
len(X_train), len(X_test)

# Classification

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB
# NOTE(review): TF and CV are not referenced by any cell visible here (each
# pipeline constructs its own instances) — confirm before removing.
TF = TfidfTransformer()
CV = CountVectorizer()

from sklearn import metrics
#http://scikit-learn.org/stable/modules/generated/sklearn.metrics.classification_report.html#sklearn.metrics.classification_report

# Naive Bayes

In [None]:
# model param
# Grid keys use the '<step>__<param>' form and refer to the step names in NB_pipe.
NB_params = {'vect__ngram_range': [(1, 1), (1, 2)],
             # Real booleans: the strings 'True'/'False' are both truthy, so the
             # old grid never switched IDF off (and newer scikit-learn rejects
             # non-bool values for use_idf).
             'tfidf__use_idf': (True, False),
             'model__alpha': (1e-1, 1e-3)}

# Bag-of-words counts -> tf-idf weighting -> multinomial Naive Bayes.
NB_pipe = Pipeline([('vect', CountVectorizer()),
                    ('tfidf', TfidfTransformer()),
                    ('model', MultinomialNB())])

# Exhaustive search over NB_params using cross-validation on the training split.
gs_NB = GridSearchCV(NB_pipe, param_grid=NB_params, n_jobs=2)
gs_NB = gs_NB.fit(X_train, y_train)

In [None]:
# Best cross-validated score and parameter set, then accuracy on the held-out
# test split.
print(gs_NB.best_score_, gs_NB.best_params_)
NBcv_predict = gs_NB.predict(X_test)
NB_accuracy = (NBcv_predict == y_test).mean()
print('NB-CV accuracy: %.3f' % NB_accuracy)

In [None]:
# Per-class report for the Naive Bayes predictions on the test split.
print(metrics.classification_report(y_test, NBcv_predict))

# SVM

In [None]:
# model param
# Grid keys use the '<step>__<param>' form and refer to the step names in SVM_pipe.
SVM_params = {'vect__ngram_range': [(1, 1), (1, 2), (1, 3)],
              # Real booleans: the strings 'True'/'False' are both truthy, so the
              # old grid never switched IDF off (and newer scikit-learn rejects
              # non-bool values for use_idf).
              'tfidf__use_idf': (True, False),
              'model__alpha': (1e-2, 1e-4),
              'model__penalty': ('l2', 'elasticnet'),
              'model__max_iter': (10, 30, 100)}

# Counts -> tf-idf -> linear SVM trained with SGD (hinge loss). The alpha,
# penalty and max_iter given here are only defaults; the grid overrides them.
SVM_pipe = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('model', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, max_iter=5, random_state=42)),])

# Exhaustive search over SVM_params on all available cores.
gs_SVM = GridSearchCV(SVM_pipe, param_grid=SVM_params, n_jobs=-1)
gs_SVM = gs_SVM.fit(X_train, y_train)

In [None]:
# Best cross-validated score and parameter set, then accuracy on the held-out
# test split.
print(gs_SVM.best_score_, gs_SVM.best_params_)
SVMcv_predict = gs_SVM.predict(X_test)
SVM_accuracy = (SVMcv_predict == y_test).mean()
print('SVM-CV accuracy: %.3f' % SVM_accuracy)

In [None]:
# Per-class report for the SVM predictions on the test split.
print(metrics.classification_report(y_test, SVMcv_predict))

# Logit

In [None]:
import sklearn

# Grid keys use the '<step>__<param>' form and refer to the step names in LOG_pipe.
LOG_params = {'vect__ngram_range': [(1, 1), (1, 2)],
              # Real booleans: the strings 'True'/'False' are both truthy, so the
              # old grid never switched IDF off (and newer scikit-learn rejects
              # non-bool values for use_idf).
              'tfidf__use_idf': (True, False),
              'model__alpha': (1e-1, 1e-3)}

# scikit-learn 1.1 renamed SGDClassifier's logistic loss from 'log' to
# 'log_loss' and later removed the old spelling; pick whichever the installed
# version understands.
_sk_version = tuple(int(part) for part in re.match(r'(\d+)\.(\d+)', sklearn.__version__).groups())
LOG_loss = 'log_loss' if _sk_version >= (1, 1) else 'log'

# Counts -> tf-idf -> logistic regression trained with SGD.
# NOTE(review): max_iter=5 is not part of the grid, so every candidate trains
# for only five epochs — confirm this is intended.
LOG_pipe = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('model', SGDClassifier(loss=LOG_loss, penalty='l2', alpha=1e-3, max_iter=5, random_state=42)),])

# Exhaustive search over LOG_params on all available cores.
gs_LOG = GridSearchCV(LOG_pipe, param_grid=LOG_params, n_jobs=-1)
gs_LOG = gs_LOG.fit(X_train, y_train)

In [None]:
# Best cross-validated score and parameter set, then accuracy on the held-out
# test split.
print(gs_LOG.best_score_, gs_LOG.best_params_)
LOGcv_predict = gs_LOG.predict(X_test)
LOG_accuracy = (LOGcv_predict == y_test).mean()
print('LOG-CV accuracy: %.3f' % LOG_accuracy)

In [None]:
# Per-class report for the logistic-regression predictions on the test split.
print(metrics.classification_report(y_test, LOGcv_predict))

# ANN

In [None]:
from sklearn.neural_network import MLPClassifier

# Grid keys use the '<step>__<param>' form and refer to the step names in ANN_pipe.
ANN_params = {'vect__ngram_range': [(1, 1), (1, 2)],
              # Real booleans: the strings 'True'/'False' are both truthy, so the
              # old grid never switched IDF off (and newer scikit-learn rejects
              # non-bool values for use_idf).
              'tfidf__use_idf': (True, False),
              'model__hidden_layer_sizes': [(3,), (3, 3), (3, 3, 3)],
              'model__alpha': (1e-1, 1e-3)}

# Counts -> tf-idf -> small multi-layer perceptron. random_state is fixed, as
# for the SGD models, so weight initialisation is repeatable between runs.
ANN_pipe = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('model', MLPClassifier(tol=1e-4, max_iter=200, random_state=42)),])

# Exhaustive search over ANN_params on all available cores.
gs_ANN = GridSearchCV(ANN_pipe, param_grid=ANN_params, n_jobs=-1)
gs_ANN = gs_ANN.fit(X_train, y_train)

In [None]:
# Best cross-validated score and parameter set, then accuracy on the held-out
# test split.
print(gs_ANN.best_score_, gs_ANN.best_params_)
ANNcv_predict = gs_ANN.predict(X_test)
ANN_accuracy = (ANNcv_predict == y_test).mean()
print('ANN-CV accuracy: %.3f' % ANN_accuracy)

In [None]:
# NOTE(review): stray scratch expression (evaluates to 0.0); looks safe to delete.
(0.)

In [None]:
# Per-class report for the neural-network predictions on the test split.
# References:
# http://scikit-learn.org/stable/modules/model_evaluation.html#multilabel-ranking-metrics
# http://scikit-learn.org/stable/modules/generated/sklearn.metrics.classification_report.html#sklearn.metrics.classification_report
print(metrics.classification_report(y_test, ANNcv_predict))

# Save Trained Models

In [None]:
# joblib is a standalone package; the sklearn.externals alias was removed in
# scikit-learn 0.23, so fall back to it only on old installations.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# Destination directory for the fitted grid-search objects.
# NOTE(review): hard-coded absolute Windows path — consider deriving it from a
# configurable base directory.
model = r'P:\MyWork\cass-cyber\models\\'
joblib.dump(gs_NB, model + 'gs_NB_1.0.pkl')
joblib.dump(gs_LOG, model + 'gs_LOG_1.0.pkl')
joblib.dump(gs_SVM, model + 'gs_SVM_1.0.pkl')
joblib.dump(gs_ANN, model + 'gs_ANN_1.0.pkl')

# To restore a model later (joblib.load unpickles, so only load trusted files):
# trained_model = joblib.load(model + 'gs_SVM_1.0.pkl')

# Results

In [None]:
# Side-by-side table of each test document, its true label and the prediction
# of every model (prediction arrays are attached positionally).
column_order = ['doc', 'target', 'NB', 'SVM', 'LOG', 'ANN']
results = pd.DataFrame({'doc': X_test, 'target': y_test}).assign(
    NB=NBcv_predict,
    SVM=SVMcv_predict,
    LOG=LOGcv_predict,
    ANN=ANNcv_predict,
)

results = results[column_order].copy()
results.head()

In [None]:
# Row counts: unlabelled rows to score, labelled rows, and the full frame.
len(predict), len(withlabel), len(df)

In [None]:
# Labelled rows keep their observed class; 'breach_method' records that the
# value is an actual label. (The earlier np.nan placeholder assignments were
# dead stores, immediately overwritten, and have been removed.)
withlabel['breach_method'] = 'actual'
withlabel['P-breach_type'] = withlabel['er_breach_type']

# Unlabelled rows get the SVM grid-search prediction. Prediction does not work
# if doc is null, and there is nothing to score when no unlabelled rows exist,
# so the model call is skipped for an empty frame.
predict['breach_method'] = 'model'
if len(predict):
    predict['P-breach_type'] = gs_SVM.predict(predict['doc'])
else:
    predict['P-breach_type'] = np.array([], dtype=object)

# Stack both parts and keep only the key plus the two derived columns.
out = pd.concat([withlabel, predict])
out = out[['uid', 'breach_method', 'P-breach_type']].copy()
len(out)

In [None]:
# Preview the uid / breach_method / P-breach_type table.
out.head()

In [None]:
# Left-join the columns of `out` onto the full frame by uid, keeping every df row.
# NOTE(review): re-running this cell merges again and yields suffixed duplicate
# columns, because df is overwritten with its own merge result.
df = df.merge(out, how='left', on='uid')
df.head()

In [None]:
# Write the enriched frame as a pipe-delimited CSV under <cwd>/data/interim.
# os.path.join inserts the separator that plain concatenation with
# os.getcwd() omitted (the old code produced '<cwd>data\interim\...').
prediction_csv = os.path.join(path, 'data', 'interim', '04_breach_prediction.csv')
df.to_csv(prediction_csv, sep='|', encoding='utf-8', index=False)

# Score

In [None]:
from sklearn.metrics import roc_curve, auc
from sklearn.preprocessing import label_binarize

In [None]:
# Distinct labels present in the test split, in order of first appearance.
classes = [label for label in y_test.unique()]
classes

In [None]:
# Indicator encoding of the true test labels, columns ordered as in `classes`.
# NOTE(review): per the scikit-learn docs, label_binarize returns a single
# column when there are only two classes, which the per-class indexing in the
# ROC loop would not handle — confirm there are at least three classes.
Test = label_binarize(y_test, classes=classes)
Test.shape

In [None]:
# Indicator encoding of the SVM's predicted labels, columns ordered as in `classes`.
# NOTE(review): these are hard 0/1 predictions rather than decision scores, so a
# ROC curve built from them has a single operating point — consider
# gs_SVM.decision_function(X_test) if a full curve is wanted.
SVM = label_binarize(SVMcv_predict, classes=classes)
SVM.shape

In [None]:
# One-vs-rest ROC curve and AUC for each class, drawn on one shared axis.
# Requires matplotlib.pyplot to be imported as plt.
fig, axs = plt.subplots(1, 1, figsize=(7, 7))
fpr = dict()
tpr = dict()
roc_auc = dict()

for i, c in enumerate(classes):
    # Column i of Test / SVM holds the indicator for class c.
    fpr[i], tpr[i], _ = roc_curve(Test[:, i], SVM[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

    # str() keeps the label formatting working for non-string class values.
    axs.plot(fpr[i], tpr[i], label='%s AUC:%.2f' % (str(c).upper(), roc_auc[i]))

# Axis decoration does not depend on the class, so apply it once after the loop.
axs.set_xlim(0, 1)
axs.set_ylim(0, 1)
axs.set_xlabel('False positive rate')
axs.set_ylabel('True positive rate')
axs.set_title('SVM one-vs-rest ROC')
axs.legend();