## Tax Risk

In [2]:
from __future__ import print_function

from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.layers import Embedding
from keras.layers import LSTM
from keras.layers import Conv1D, MaxPooling1D
from keras.preprocessing.text import Tokenizer
from sklearn.model_selection import train_test_split
from sklearn import metrics

import pandas as pd
import numpy as np

In [108]:
# Embedding
max_features = 1000
maxlen = 500
embedding_size = 64

# Convolution
kernel_size = 5
filters = 64
pool_size = 4

# LSTM
lstm_output_size = 70

# Training
batch_size = 30
epochs = 1

### Read data

In [125]:
# train_df = pd.read_csv('private/sorted_train.csv')
# test_df = pd.read_csv('private/sorted_test.csv')
df = pd.read_csv('private/tax_risk_sorted_and_merged.csv')
df.head()

Unnamed: 0,text,company,year,file_name,dummy1,dummy2,dummy3
0,- creasing freight rates. The purchase o...,a.p. moller-maersk,2007,A.P. Moller-Maersk_2007.TXT,1,0,0
1,"delivery from banks, export credit organisat...",a.p. moller-maersk,2008,A.P. Moller-Maersk_2008.TXT,1,0,0
2,\x92s envi- ronmental impact and its social r...,a.p. moller-maersk,2009,A.P. Moller-Maersk_2009.TXT,1,1,1
3,global human rights standards associated wi...,a.p. moller-maersk,2011,A.P. Moller-Maersk_2011.TXT,1,1,1
4,Human rights It is a priority for the Group ...,a.p. moller-maersk,2012,A.P. Moller-Maersk_2012.TXT,1,1,1


In [127]:
df = df.reindex(np.random.permutation(df.index))
df.head()

Unnamed: 0,text,company,year,file_name,dummy1,dummy2,dummy3
1552,economic cycle. The discount factor is dete...,fresenius,2011,Fresenius_2011(2).TXT,0,0,0
1566,"circumstances, there are adequate liquid re...",fresnillo,2012,Fresnillo_2012.TXT,1,1,0
2691,the estimated weighted average cost of capit...,merlin entertainments,2009,Merlin Entertainments_2009.TXT,0,0,0
4533,subsequent planning years. Cash flows beyon...,vopak,2012,Vopak_2012.TXT,0,0,0
1398,"were to leave, Eurofins may have difficulty ...",eurofins scientific,2014,Eurofins Scientific_2014.TXT,1,1,0


In [128]:
offset = int(len(df) * 0.75)
train_df = df[:offset]
test_df = df[offset:]
print("Training set size: {}".format(len(train_df)))
print("Test set size: {}".format(len(test_df)))

Training set size: 3506
Test set size: 1169


In [130]:
print("Training set: ")
print("Negative class size: {}".format(len(train_df[(train_df['dummy3'] == 0)])))
print("Positive class size: {}".format(len(train_df[(train_df['dummy3'] == 1)])))

Training set: 
Negative class size: 3064
Positive class size: 442


In [131]:
new_train = train_df[(train_df['dummy3'] == 0)][:473].append(train_df[(train_df['dummy3'] == 1)], ignore_index=True)

In [133]:
new_train.head()
train_df = new_train.copy()
print("Training set: ")
print("Negative class size: {}".format(len(train_df[(train_df['dummy3'] == 0)])))
print("Positive class size: {}".format(len(train_df[(train_df['dummy3'] == 1)])))

Training set: 
Negative class size: 473
Positive class size: 442


In [134]:
print("Training set: ")
print("Negative class size: {}".format(len(train_df[(train_df['dummy3'] == 0)])))
print("Positive class size: {}".format(len(train_df[(train_df['dummy3'] == 1)])))

Training set: 
Negative class size: 473
Positive class size: 442


In [135]:
print("Test set: ")
print("Negative class size: {}".format(len(test_df[(test_df['dummy3'] == 0)])))
print("Positive class size: {}".format(len(test_df[(test_df['dummy3'] == 1)])))

Test set: 
Negative class size: 1014
Positive class size: 155


In [136]:
tok = Tokenizer(num_words=max_features, split=' ')
tok.fit_on_texts(train_df['text'].values)
X_train = tok.texts_to_sequences(train_df['text'].values)
Y_train = pd.get_dummies(train_df['dummy3']).values

X_test = tok.texts_to_sequences(test_df['text'].values)
Y_test = pd.get_dummies(test_df['dummy3']).values

X_train = sequence.pad_sequences(X_train, maxlen=maxlen)
X_test = sequence.pad_sequences(X_test, maxlen=maxlen)

### Build a model

In [None]:
model = Sequential()
model.add(Embedding(max_features, embedding_size, 
                    input_length=maxlen))
model.add(Dropout(0.25))
model.add(Conv1D(filters,
                 kernel_size,
                 padding='valid',
                 activation='relu',
                 strides=1))
model.add(MaxPooling1D(pool_size=pool_size))
model.add(LSTM(lstm_output_size))
model.add(Dense(2))
model.add(Activation('sigmoid'))

model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
print("Model summary: ")
print(model.summary())

print('Train...')
model.fit(X_train, Y_train,
          batch_size=batch_size,
          epochs=epochs)

score, acc = model.evaluate(X_train, Y_train, batch_size=batch_size)
print("Score on train set: ", score)
print("Accuracy on train set: ", acc)

score, acc = model.evaluate(X_test, Y_test, batch_size=batch_size)
print("Score on test set: ", score)
print("Accuracy on test set: ", acc)

In [116]:
Y_pred = model.predict(X_test)
Y_pred = (Y_pred > 0.5).astype(int)

In [117]:
tp = 0
tn = 0
fp = 0
fn = 0

# [0, 1] positive, i.e, tax risk exists
# [1, 0] negative, no tax risk
for i in range(len(Y_test)):
    # positive
    if np.argmax(Y_test[i]) == 1:
        # positive
        if np.argmax(Y_pred[i]) == 1:
            tp += 1
        # negative
        else:
            fn += 1
    # negative        
    else:
        # negative
        if np.argmax(Y_pred[i] == 0):
            tn +=1
        # positive
        else:
            fp += 1
print("True positive: ", tp)
print("False positive: ", fp)
print("True negative: ", tn)
print("False negative: ", fn)
cm = [
    [tn, fp],
    [fn, tp]
]
precision = tp/(tp + fp)
recall = tp/(tp + fn)
f1_score = 2 * precision * recall / (precision + recall)
accuracy = (tp + tn) / (tp + tn + fp + fn)
print("Precision: %.4lf"%precision)
print("Recall: %.4lf"%recall)
print("F1-score: %.4lf"%f1_score)
print("Accuracy: %.4lf"%accuracy)

True positive:  0
False positive:  0
True negative:  1045
False negative:  124


ZeroDivisionError: division by zero

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
def plot_confusion_matrix(cm, classes):
    """Plot confusion matrix from metrics.confusion_matrix numpy.ndarray
    
    Args:
        cm: numpy.ndarray confusion matrix which is returned from metrics.confusion matrix
        classes: list of class names
    """
    if len(classes) == 2:
        # the count of true negatives is C_{0,0}, 
        # false negatives is C_{1,0}, 
        # true positives is C_{1,1}, 
        # false positives is C_{0,1}.
        classes = [classes[0]+": (- class)", classes[1]+": (+ class)"]
        
    cm_df = pd.DataFrame(cm, index=classes, columns=classes)
    fig = plt.figure()
    heatmap = sns.heatmap(cm_df, annot=True, fmt="d", cmap="Blues")
    heatmap.xaxis.set_ticklabels(heatmap.xaxis.get_ticklabels())
    heatmap.yaxis.set_ticklabels(heatmap.yaxis.get_ticklabels())
    plt.xlabel("PREDICTED LABEL")
    plt.ylabel("TRUE LABEL")

def get_scores(cm):
    """Return precision, recall, accuracy, f1-score from confusion matrix.
       the count of true negatives is cm[0,0]
       the count of false negatives is cm[1, 0]
       the count of true positives is cm[1, 1]
       the count of false positives is cm[0, 1]
    
    Args:
        cm: numpy.ndarray confusion matrix which is returned from metrics.confusion matrix
    
    Returns:
        scores: dictionary of scores
    """
    tn, fp, fn, tp = cm.ravel()
    
    scores = {}
    scores['precision'] = tp/(tp + fp)
    scores['recall'] = tp/(tp + fn)
    scores['accuracy'] = (tp + tn)/(tp + tn + fp + fn)
    scores['f1'] = 2*scores['precision']*scores['recall']/(scores['precision'] + scores['recall'])
    
    return scores    

In [None]:
plot_confusion_matrix(cm, ['0', '1'])