In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split, ShuffleSplit, GridSearchCV
from sklearn.preprocessing import OrdinalEncoder
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.metrics import roc_curve, auc
# import warnings 
# warnings.filterwarnings("ignore")

# 1) Load and clean the data

In [None]:
# Load the five yearly files (2014-2018); the first CSV column is used as the index.
DATA_DIR = '../input/200-financial-indicators-of-us-stocks-20142018'
df_2014 = pd.read_csv(f'{DATA_DIR}/2014_Financial_Data.csv', index_col=0)
df_2015 = pd.read_csv(f'{DATA_DIR}/2015_Financial_Data.csv', index_col=0)
df_2016 = pd.read_csv(f'{DATA_DIR}/2016_Financial_Data.csv', index_col=0)
df_2017 = pd.read_csv(f'{DATA_DIR}/2017_Financial_Data.csv', index_col=0)
df_2018 = pd.read_csv(f'{DATA_DIR}/2018_Financial_Data.csv', index_col=0)

# DataFrame.append was removed in pandas 2.0; pd.concat stacks the frames the same way.
df = pd.concat([df_2014, df_2015, df_2016, df_2017, df_2018])
print(df.shape)

# Drop rows with no information
df = df.dropna(how='all')

# Separate the target, then drop every column related to the classification.
# Only '2015 PRICE VAR [%]' is named explicitly in the original code; the other yearly
# files are assumed to carry the same column with a different year prefix (TODO confirm
# against the CSV headers). Matching on the suffix removes all of them so that none of
# these target-derived columns can survive as a feature.
class_data = df.loc[:, ['Class']]
price_var_cols = [col for col in df.columns if str(col).endswith('PRICE VAR [%]')]
df = df.drop(columns=['Class'] + price_var_cols)

# Find count and percent of nan-values, zero-values (one row per column of df)
total_nans = df.isnull().sum().sort_values(ascending=False)
percent_nans = (df.isnull().sum()/df.isnull().count() * 100).sort_values(ascending=False)
total_zeros = df.isin([0]).sum().sort_values(ascending=False)
percent_zeros = (df.isin([0]).sum()/df.isin([0]).count() * 100).sort_values(ascending=False)
df_nans = pd.concat([total_nans, percent_nans], axis=1, keys=['Total NaN', 'Percent NaN'])
df_zeros = pd.concat([total_zeros, percent_zeros], axis=1, keys=['Total Zeros', 'Percent Zeros'])

# NaN threshold: the chosen quantile of the per-column NaN percentages.
# The percentage column is read by name instead of unpacking into `_`,
# which would shadow IPython's "last output" variable.
test_nan_level = 0.5
print(df_nans.quantile(test_nan_level))
thresh_nan = df_nans['Percent NaN'].quantile(test_nan_level)

# Zero threshold: the chosen quantile of the per-column zero percentages.
test_zeros_level = 0.6
print(df_zeros.quantile(test_zeros_level))
thresh_zeros = df_zeros['Percent Zeros'].quantile(test_zeros_level)

# Clean dataset applying thresholds for both zero values, nan-values
print(f'INITIAL NUMBER OF VARIABLES: {df.shape[1]}')
print()
print(df.shape)

# Columns whose NaN percentage exceeds the threshold are removed first.
# (Positional `axis` in DataFrame.drop was removed in pandas 2.0, hence `columns=`.)
nan_heavy_cols = df_nans[df_nans['Percent NaN'] > thresh_nan].index
df_reduce1 = df.drop(columns=nan_heavy_cols)
print(f'NUMBER OF VARIABLES AFTER NaN THRESHOLD {thresh_nan:.2f}%: {df_reduce1.shape[1]}')
print()
print(df_reduce1.shape)

# Apply the zero threshold only to the columns that survived the NaN filter.
df_zeros_postnan = df_zeros.drop(nan_heavy_cols, axis=0)
zero_heavy_cols = df_zeros_postnan[df_zeros_postnan['Percent Zeros'] > thresh_zeros].index
df_reduce2 = df_reduce1.drop(columns=zero_heavy_cols)
print(f'NUMBER OF VARIABLES AFTER Zeros THRESHOLD {thresh_zeros:.2f}%: {df_reduce2.shape[1]}')
print(df_reduce2.shape)

# Replace nan-values with mean value of column, considering each sector individually.
# The result of groupby(...).transform no longer contains the 'Sector' grouping column.
df_reduce2 = df_reduce2.groupby(['Sector']).transform(lambda x: x.fillna(x.mean()))
print(df_reduce2.shape)

# Cut outliers: values above the 97th / below the 3rd percentile of their column
# are replaced by that percentile.
top_quantiles = df_reduce2.quantile(0.97)
outliers_top = (df_reduce2 > top_quantiles)

low_quantiles = df_reduce2.quantile(0.03)
outliers_low = (df_reduce2 < low_quantiles)

df_reduce2 = df_reduce2.mask(outliers_top, top_quantiles, axis=1)
df_reduce2 = df_reduce2.mask(outliers_low, low_quantiles, axis=1)
print(df_reduce2.shape)

df_out = df_reduce2
# Print information about dataset
print("dataset information")
df_out.info()
print("dataset describe")
print(df_out.describe(include = 'all'))
# Report the actual number of remaining variables instead of a hard-coded count.
print(f"dataset shape - now with {df_out.shape[1]} viable variables")
print(df_out.shape)
print(class_data.shape)


In [None]:
# Correlation heatmap of the MinMax-scaled columns for the rows labelled Class == 1 (winners).
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

# Attach the labels to the feature table so rows can be selected by class.
# Later cells rely on df_out carrying this 'Class' column.
df_out['Class'] = class_data
winner_rows = df_out.loc[df_out['Class'] == 1]
winners_scaled = scaler.fit_transform(winner_rows.to_numpy())
df_winners = pd.DataFrame(
    winners_scaled,
    index=winner_rows.index,
    columns=winner_rows.columns,
)
print(df_winners.shape)

winners_corr = df_winners.corr()
fig, ax = plt.subplots(figsize=(20, 15))
sns.heatmap(
    winners_corr,
    ax=ax,
    annot=False,
    square=True,
    vmin=-1,
    vmax=1,
    center=0,
    cbar_kws={"shrink": .5},
)
plt.show()

In [None]:
# Correlation heatmap of the MinMax-scaled columns for the rows labelled Class == 0 (losers).
# Reuses the `scaler` object created in an earlier cell; it is refit here on the loser rows.
loser_rows = df_out.loc[df_out['Class'] == 0]
print(loser_rows.shape)
losers_scaled = scaler.fit_transform(loser_rows.to_numpy())
df_losers = pd.DataFrame(
    losers_scaled,
    index=loser_rows.index,
    columns=loser_rows.columns,
)

losers_corr = df_losers.corr()
fig, ax = plt.subplots(figsize=(20, 15))
sns.heatmap(
    losers_corr,
    ax=ax,
    annot=False,
    square=True,
    vmin=-1,
    vmax=1,
    center=0,
    cbar_kws={"shrink": .5},
)
plt.show()

# 2) Default Multilayer Perceptron

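Train the default `MLPClassifier` twice, once on MinMax-scaled features and once on the unscaled features, and compare the two classification reports.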

In [None]:
# Baseline: default MLPClassifier evaluated with and without MinMax scaling.
from sklearn.preprocessing import MinMaxScaler

# Remove the label column that the correlation cells added to df_out.
# errors='ignore' keeps this cell re-runnable once the column is already gone.
df_out.drop(columns=['Class'], inplace=True, errors='ignore')

x = df_out
y = class_data

# Fixed seed so the split (and therefore the reports below) is reproducible.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=1)

mlp = MLPClassifier(max_iter=10000, random_state=1)

# MinMaxScaler 0 to 1
print("MinMax Scaler")
# Fit the scaler on the training data only and reuse it for the test data.
# Refitting on x_test would scale the test set with its own min/max, so the
# same raw value would map to different inputs at train and test time.
scaler = MinMaxScaler().fit(x_train)
x_scaled_train = scaler.transform(x_train)
x_scaled_test = scaler.transform(x_test)

mlp.fit(x_scaled_train, y_train.values.ravel())
y_predict_mlp = mlp.predict(x_scaled_test)

print("Confusion matrix")
print(confusion_matrix(y_test, y_predict_mlp))

print("Report")
print(classification_report(y_test, y_predict_mlp))

# Without scaler: refit the same estimator on the raw features
# (fit() starts from scratch, so the scaled fit above does not carry over).
print("Without scaler")
mlp.fit(x_train, y_train.values.ravel())
y_predict_mlp = mlp.predict(x_test)

print("Confusion matrix")
print(confusion_matrix(y_test, y_predict_mlp))

print("Report")
print(classification_report(y_test, y_predict_mlp))

# 3) Tune Multilayer Perceptron

In [None]:
# Hyper-parameter search for the MLP using a single shuffled 80/20 split of the training data.
cv = ShuffleSplit(n_splits=1, test_size=0.2, random_state=1)

# `learning_rate` is only used by the 'sgd' solver, so it is varied for that solver alone.
# Crossing it with 'lbfgs' and 'adam' would only repeat equivalent fits.
activations = ['identity', 'logistic', 'tanh', 'relu']
param_grid = [
    {'activation': activations,
     'solver': ['lbfgs', 'adam']},
    {'activation': activations,
     'solver': ['sgd'],
     'learning_rate': ['constant', 'invscaling', 'adaptive']},
]

# Grid search cross validation
gridSearch = GridSearchCV(MLPClassifier(max_iter=6000), param_grid, cv=cv,
                          scoring='accuracy', verbose=2)

# Search on the MinMax-scaled training features: the tuned model is trained on scaled
# data later in the notebook, so the parameters are selected on the same representation.
gridSearch.fit(x_scaled_train, y_train.values.ravel())
print('Score: ', gridSearch.best_score_)
print('Parameters: ', gridSearch.best_params_)

#RESULT
print('')

In [None]:
# Compare random seeds for the tuned MLP on a hold-out slice of the scaled training data.
# The split is seeded so every model seed is judged on the same train/tune partition
# and the comparison can be reproduced.
X_train_s, X_tune, y_train_s, y_tune = train_test_split(
    x_scaled_train, y_train, test_size=0.2, random_state=1)

for i in range(15):
    # Use the tuned configuration that the final model is trained with.
    # Previously this estimator was immediately overwritten by a default
    # MLPClassifier, so the seeds were compared on a different model.
    mlp = MLPClassifier(activation='identity'
                    , solver='adam'
                    , learning_rate='constant'
                    , max_iter = 6000
                    , random_state=i
                   )

    mlp.fit(X_train_s, y_train_s.values.ravel())
    y_predict_mlp = mlp.predict(X_tune)
    print(i)
    print(classification_report(y_tune, y_predict_mlp))

# Result recorded by the original author: i = 1 (re-check after re-running this cell)



In [None]:
# Final MLP with the hyper-parameters and seed selected in the tuning cells.
mlp = MLPClassifier(activation='identity'
                    , solver='adam'
                    , learning_rate='constant'
                    , max_iter = 6000
                    , random_state=1
                   )


print("MinMax Scaler")
# Fit the scaler on the training data only and apply that same transformation to the
# test data. Refitting on x_test would scale the test set with its own min/max, which
# is inconsistent with what the model saw during training.
scaler = MinMaxScaler().fit(x_train)
x_scaled_train = scaler.transform(x_train)
x_scaled_test = scaler.transform(x_test)

mlp.fit(x_scaled_train, y_train.values.ravel())
y_predict_mlp = mlp.predict(x_scaled_test)

print("Confusion matrix")
print(confusion_matrix(y_test, y_predict_mlp))

print("Report")
print(classification_report(y_test, y_predict_mlp))

# 4) K-Nearest Neighbour 

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Fit a 2-nearest-neighbour classifier on the scaled training features.
knn_train_labels = y_train.to_numpy().ravel()
knn = KNeighborsClassifier(n_neighbors=2).fit(x_scaled_train, knn_train_labels)

# Score the scaled test features and summarise per-class metrics.
y_predict_knn = knn.predict(x_scaled_test)
knn_report = classification_report(y_test, y_predict_knn)
print("Report")
print(knn_report)