In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from string import digits
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.externals import joblib
import shutil
import numpy as np
import seaborn as sns 
import matplotlib.pyplot as plt

In [None]:
# Set properties
N_FEATURES = 1024 * 1  # n_features given to HashingVectorizer (width of the hashed feature matrix)
HIDDEN_LAYER_SIZES = 40  # hidden_layer_sizes given to MLPClassifier
MAX_ITER = 300  # max_iter given to MLPClassifier

In [None]:
# Import CSV
# Read the comma-separated sample file (path is relative to the notebook's
# working directory) and print a summary of the resulting frame.
data_set = pd.read_csv("data/data_sample.csv", delimiter=",")
data_set.info()

In [None]:
# Preview the first 10 rows of the loaded data.
data_set.head(10)

In [None]:
# Summary statistics of the data before duplicates are removed.
data_set.describe()

In [None]:
# Remove duplicates
# Rebinds `data_set` to a frame without fully duplicated rows, then prints
# the summary again so the new row count can be compared with the raw one.
data_set = data_set.drop_duplicates()
data_set.info()

In [None]:
# Summary statistics after duplicate rows were dropped.
data_set.describe()

In [None]:
# Split data to features (X) & labels (y)
# The 'trade' column is the label; every remaining column is a feature.
y = data_set['trade']
X = data_set.drop(columns=['trade'])

In [None]:
# Preview the first 5 feature rows.
X.head(5)

In [None]:
# Preview the first 5 labels.
y.head(5)

In [None]:
# Split to train/test
# 20 % of the rows are held out for testing; the fixed seed makes the
# shuffled split repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=True,
    random_state=12345,
)

In [None]:
# Summary statistics of the training features.
X_train.describe()

In [None]:
# Summary statistics of the test features.
X_test.describe()

In [None]:
# Convert the training features to a NumPy array.
# `DataFrame.get_values()` was deprecated in pandas 0.25 and removed in 1.0;
# `.values` returns the same array and works on every pandas version.
X_train_array = X_train.values


In [None]:
# Inspect one training row (still a 1-D array of that row's column values).
X_train_array[17]

In [None]:
# Inspect the first column value of that same training row.
X_train_array[17][0]

In [None]:
# Collapse the 2-D (rows x columns) array into a 1-D array of values.
# NOTE(review): this keeps one element per sample only if X has exactly one
# feature column — confirm against the CSV schema.
X_train_array = X_train_array.flatten()
X_train_array[17]

In [None]:
# Convert the test features to a flat 1-D NumPy array.
# `DataFrame.get_values()` was deprecated in pandas 0.25 and removed in 1.0;
# `.values` returns the same array and works on every pandas version.
X_test_array = X_test.values.flatten()

In [None]:
# Normalize case to lowercase
# From here on both arrays are plain Python lists of strings.
X_train_array = [text.lower() for text in X_train_array]
X_test_array = [text.lower() for text in X_test_array]
X_train_array[17]

In [None]:
# Remove digits

# Translation table that deletes every character listed in string.digits.
remove_digits = str.maketrans('', '', digits)
X_train_array = [text.translate(remove_digits) for text in X_train_array]
X_test_array = [text.translate(remove_digits) for text in X_test_array]

X_train_array[17]

In [None]:
# Remove special characters

# Deletion table for the punctuation that is stripped from every name.
# Only the characters listed here are removed; anything else (letters,
# spaces, '-', '&', ...) is kept. This is exactly what the previous
# per-character filter did: its `isalnum()` and `== ' '` tests could never
# change the outcome because none of the listed characters is alphanumeric
# or a space.
remove_special_characters = str.maketrans('', '', '[]/,(%):+!._?"*\';')

X_train_array = [name.translate(remove_special_characters) for name in X_train_array]
X_test_array = [name.translate(remove_special_characters) for name in X_test_array]
X_train_array[17]

In [None]:
# Replace '-' to space

# Hyphens become spaces so hyphenated parts are separated by whitespace.
X_train_array = [text.replace('-', ' ') for text in X_train_array]
X_test_array = [text.replace('-', ' ') for text in X_test_array]
X_train_array[17]

In [None]:
# Feature hashing
# Map each cleaned text to a vector with N_FEATURES columns. `transform` is
# called without a prior `fit`, and the same vectorizer instance is used for
# both splits so train and test share one feature space.
hv = HashingVectorizer(n_features=N_FEATURES, alternate_sign = False, norm=None)
X_train_after_hashing = hv.transform(X_train_array)
X_test_after_hashing = hv.transform(X_test_array)

In [None]:
# Shape of the hashed training matrix.
print(X_train_after_hashing.shape)

In [None]:
# Print the hashed feature row of training sample 10.
print(X_train_after_hashing[10, :])

In [None]:
# Build the neural-network classifier.
# `random_state` pins the weight initialisation and shuffling so a fresh
# "Restart & Run All" reproduces the same model; the seed is the same value
# already used for the train/test split.
algorithm = MLPClassifier(
    hidden_layer_sizes=HIDDEN_LAYER_SIZES,
    max_iter=MAX_ITER,
    random_state=12345,
)

In [None]:
# Train the classifier on the hashed training features and their labels.
algorithm.fit(X_train_after_hashing, y_train.values)

In [None]:
# max 2624
# NOTE(review): presumably 2624 is the largest valid `idx` for the test
# split — confirm against len(y_test).

# Spot check: print the predicted label and the true label of one test sample.
idx = 543
print(algorithm.predict(X_test_after_hashing[idx]))
print(y_test.values[idx])

In [None]:
# Predicted class probabilities for test sample `idx`.
algorithm.predict_proba(X_test_after_hashing[idx])

In [None]:
# Score the classifier on the whole test split (unweighted samples) and display it.
score = algorithm.score(X_test_after_hashing, y_test, sample_weight=None)
score

In [None]:
def print_confusion_matrix(confusion_matrix, class_names, figsize=(20, 14), fontsize=14):
    """Plot a row-normalised confusion matrix as an annotated heatmap.

    Every row is divided by its own sum, so a cell shows the fraction of the
    samples of the row's class that were assigned to the column's class.
    Rows whose sum is zero are drawn as all zeros instead of NaN.

    Args:
        confusion_matrix: Array-like of counts with one row and one column
            per entry of ``class_names``.
        class_names: Labels used for both heatmap axes, in the same order as
            the rows/columns of the matrix.
        figsize: ``(width, height)`` forwarded to ``plt.figure``.
        fontsize: Font size of the tick labels on both axes.

    Returns:
        None. The heatmap is drawn on a newly created matplotlib figure.

    Raises:
        ValueError: Propagated unchanged from pandas or seaborn, e.g. when
            the matrix shape does not match ``class_names``.
    """
    counts = np.asarray(confusion_matrix, dtype=float)

    # Normalise per true class. A row total of 0 is replaced by 1 so that the
    # division leaves that row at 0.0 rather than producing NaN.
    row_totals = counts.sum(axis=1, keepdims=True)
    row_totals[row_totals == 0] = 1.0
    normalized = counts / row_totals

    df_cm = pd.DataFrame(normalized, index=class_names, columns=class_names)

    plt.figure(figsize=figsize)
    # Errors from seaborn are deliberately not re-wrapped: the values are
    # floats formatted with ".3f", so the original message is the accurate one.
    heatmap = sns.heatmap(df_cm, annot=True, fmt=".3f")
    heatmap.yaxis.set_ticklabels(heatmap.yaxis.get_ticklabels(), rotation=0, ha='right', fontsize=fontsize)
    heatmap.xaxis.set_ticklabels(heatmap.xaxis.get_ticklabels(), rotation=45, ha='right', fontsize=fontsize)
    plt.ylabel('True label')
    plt.xlabel('Predicted label')


In [None]:
# Predict every test sample.
predictions = algorithm.predict(X_test_after_hashing)
# Classes that occur in the test labels, in order of first appearance; they
# define the rows/columns of the confusion matrix.
labels = y_test.unique()

# `labels` has to be passed by keyword: scikit-learn deprecated passing it
# positionally in 0.23 and raises an error from 1.0 on.
c = confusion_matrix(y_test, predictions, labels=labels)

In [None]:
# Draw the normalised confusion matrix, labelling both axes with the test classes.
print_confusion_matrix(c, class_names=labels)

In [None]:
# Show every row of the data set whose 'trade' label equals 16.
print(data_set.loc[data_set['trade'] == 16])