In [None]:
import sys
import os
import numpy as np # linear algebra
import pandas as pd # data processing
import seaborn as sns # statistical data visualization
import matplotlib.pyplot as plt # data visualization
from pandas import Series, DataFrame
import pickle

# Deserialize the two pickled inputs: s1 from the "_10" file, s9 from the "_90" file.
# NOTE(review): pickle.load can execute arbitrary code — only load files from a trusted source.
with open('s1_auto_FA_10.pickle', 'rb') as fh_10:
    s1 = pickle.load(fh_10)

with open('s1_auto_FA_90.pickle', 'rb') as fh_90:
    s9 = pickle.load(fh_90)

In [None]:
# Divide every column by the EX_FA_e column, aligning on the row index.
# (assumes s1 and s9 are DataFrames with an EX_FA_e column — TODO confirm)
norm_fa_10 = s1.div(s1["EX_FA_e"], axis="index")
norm_fa_90 = s9.div(s9["EX_FA_e"], axis="index")

In [None]:
# Replace EX_FA_e with a constant string class label: '1' for the "_10" frame, '9' for the "_90" frame.
norm_fa_10_assign = norm_fa_10.assign(**{"EX_FA_e": '1'})
norm_fa_90_assign = norm_fa_90.assign(**{"EX_FA_e": '9'})

In [None]:
# Stack the two labelled frames row-wise into one dataset and show its dimensions.
merge = pd.concat(
    objs=[norm_fa_10_assign, norm_fa_90_assign],
    axis=0,
)
merge.shape

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Lasso, LogisticRegression
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import StandardScaler

# Separate the class label from the features, then hold out 33% of the rows
# for testing. The split is stratified on the label and seeded for repeatability.
y = merge["EX_FA_e"]  # target column
X = merge.drop(columns="EX_FA_e")  # independent columns

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=0, stratify=y
)

X_train.shape, X_test.shape

In [None]:
# Standardize the features using statistics learned from the training split only.
# fit_transform/transform return new arrays and leave their input untouched, so the
# results must be assigned for the scaling to reach the model fitted afterwards.
# The arrays are wrapped back into DataFrames so the original row index and the
# column names (read later via X_train.columns) are preserved.
scaler = StandardScaler()
X_train = pd.DataFrame(
    scaler.fit_transform(X_train),
    index=X_train.index,
    columns=X_train.columns,
)
X_test = pd.DataFrame(
    scaler.transform(X_test),
    index=X_test.index,
    columns=X_test.columns,
)
X_train.shape, X_test.shape

In [None]:
from sklearn.linear_model import LogisticRegressionCV

# L1-penalised logistic regression with 10-fold cross-validation, SAGA solver, fixed seed.
model = LogisticRegressionCV(
    cv=10,
    penalty='l1',
    solver='saga',
    random_state=0,
)  # max_iter=4000
model.fit(X_train, y_train)

In [None]:
# First row of the fitted coefficient matrix: one coefficient per feature column.
importance = model.coef_[0, :]
#print(importance.shape)

In [None]:
# Substrings marking feature names to exclude; joined with '|' and matched case-insensitively.
L = [
    'EX_', 'SK_', 'ex2', 'pp2', '2pp', '3pp', 'tpp',
    'abcpp', '1p', '2p', 'tu', 'tex', 'NAt', 'HCO3E',
]

# Column 0 holds the feature name, column 1 its fitted coefficient.
df = pd.DataFrame(list(zip(X_train.columns, importance)))
is_excluded = df[0].str.contains('|'.join(L), case=False)
df_drop = df[~is_excluded]
df_drop

In [None]:
# Serialize the full (unfiltered) feature/coefficient table to disk.
with open('LR_CV10_auto_fa_df_70.pickle', 'wb') as out_file:
    pickle.dump(df, out_file, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Index the filtered table by feature name (column 0), leaving the coefficient as column 1.
df_idx = df_drop.set_index(keys=0)
print(df_idx.shape)
df_idx

In [None]:
# Serialize the filtered, name-indexed coefficient table to disk.
with open('LR_CV10_auto_fa_df_idx.pickle', 'wb') as out_file:
    pickle.dump(df_idx, out_file, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Keep the 20 rows with the largest coefficient values (signed, not absolute).
fa_20 = df_idx.nlargest(n=20, columns=1)

# Serialize the top-20 table to disk.
with open('LR_CV10_auto_fa_top_20.pickle', 'wb') as out_file:
    pickle.dump(fa_20, out_file, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Horizontal bar chart of the 20 largest coefficients, saved as an SVG file.
top_20 = df_idx.nlargest(n=20, columns=1).plot.barh(
    legend=None,
    xlabel='Feature importance',
)
plt.savefig("LR_CV10_auto_fa_top_20.svg", bbox_inches="tight", dpi=300)

In [None]:
from sklearn import metrics

# Predict the held-out labels and tabulate them against the true labels.
y_pred = model.predict(X_test)

cnf_matrix = metrics.confusion_matrix(y_true=y_test, y_pred=y_pred)
cnf_matrix

In [None]:
# Serialize the confusion matrix to disk.
with open('LR_CV10_auto_fa_cnf_matrix.pickle', 'wb') as out_file:
    pickle.dump(cnf_matrix, out_file, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
from sklearn.metrics import confusion_matrix

# scikit-learn's confusion matrix puts TRUE labels on rows and PREDICTED labels
# on columns, with labels in sorted order. Treating the first label (row/column 0)
# as the "positive" class, the cells are:
#   cm[0, 0] = TP (actual positive, predicted positive)
#   cm[0, 1] = FN (actual positive, predicted negative)
#   cm[1, 0] = FP (actual negative, predicted positive)
#   cm[1, 1] = TN (actual negative, predicted negative)
cm = confusion_matrix(y_test, y_pred)

print('Confusion matrix\n\n', cm)

print('\nTrue Positives(TP) = ', cm[0,0])

print('\nTrue Negatives(TN) = ', cm[1,1])

print('\nFalse Positives(FP) = ', cm[1,0])

print('\nFalse Negatives(FN) = ', cm[0,1])

In [None]:
# Visualize the confusion matrix as a seaborn heatmap.
# scikit-learn's confusion matrix has actual classes on rows and predicted
# classes on columns, so the DataFrame index carries the "Actual" labels and the
# columns carry the "Predict" labels. "Positive" refers to the first (index 0) class.

cm_matrix = pd.DataFrame(data=cm, index=['Actual Positive', 'Actual Negative'],
                                 columns=['Predict Positive', 'Predict Negative'])

cm_matrix_heatmap = sns.heatmap(cm_matrix, annot=True, fmt='d', cmap='YlGnBu')
cm_matrix_heatmap

In [None]:
# Grab the figure that owns the heatmap axes and write it out as an SVG file.
fig_cm_matrix_heatmap = cm_matrix_heatmap.get_figure()
fig_cm_matrix_heatmap.savefig(
    "LR_CV10_auto_fa_cm.svg",
    bbox_inches="tight",
    dpi=300,
)

In [None]:
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

# Accuracy on the data the model was fitted on.
train_acc = model.score(X_train, y_train)
print('Training accuracy:', train_acc)

# Accuracy on the held-out split (predictions are recomputed here).
y_pred = model.predict(X_test)
test_acc = metrics.accuracy_score(y_test, y_pred)
print('Testing accuracy:', test_acc)

In [None]:
from sklearn.metrics import classification_report

# Text summary of per-class metrics on the held-out split.
report_text = classification_report(y_test, y_pred)
print(report_text)

In [None]:
# classification_report expects (y_true, y_pred) in that order; passing them
# reversed swaps precision and recall in the stored report.
class_rep = classification_report(y_test, y_pred)

# Store data (serialize)
with open('LR_CV10_auto_fa_class_rep.pickle', 'wb') as handle:
    pickle.dump(class_rep, handle, protocol=pickle.HIGHEST_PROTOCOL)