In [1]:
import pandas as pd
import dirty_data as d
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt
import numpy as np
import imputation as i
import outlier_detection as od
from utils import encoding_categorical_variables as enc
from sklearn.preprocessing import StandardScaler

In [2]:
# Target column and the predictor columns used throughout the notebook.
name_class = 'disease'
selected_features_only = ['oldpeak', 'cp', 'thal']
selected_features = selected_features_only + [name_class]

# Load the heart dataset, keeping only the predictors and the target.
df = pd.read_csv("../dataset/heart.csv")[selected_features]

# Quality levels, once as a plain list and once as a one-column DataFrame.
perc_quality = [50, 60, 70, 80, 90]
quality = pd.DataFrame(perc_quality)

# One value per classifier name. The visible cells only read the
# 'LogisticRegression' entry, which they pass as LogisticRegression's C.
param = dict(
    DecisionTree=70,
    LogisticRegression=1,
    KNN=8,
    RandomForest=70,
    AdaBoost=90,
    SVC=1,
)

In [4]:
def improve_completeness(df, imp_1, imp_2, imp_3, imp_col_1, imp_col_2, imp_col_3, name_class):
    """Run three imputation passes over the predictor columns of ``df``.

    Works on a copy restricted to the module-level ``selected_features_only``
    columns, so ``df`` itself is not modified. Each pass calls
    ``i.impute(frame, imp_k, imp_col_k)`` and feeds its result into the next.

    Args:
        df: frame containing the ``selected_features_only`` columns and
            ``name_class``.
        imp_1, imp_2, imp_3: second argument of the successive ``i.impute``
            calls (presumably the imputation method - confirm against the
            ``imputation`` module).
        imp_col_1, imp_col_2, imp_col_3: third argument of the matching
            ``i.impute`` call (presumably the column to impute - confirm).
        name_class: name of the target column, copied from ``df`` unchanged.

    Returns:
        The frame returned by the last ``i.impute`` call, with the
        ``name_class`` column from ``df`` attached.
    """
    imputed = df[selected_features_only].copy()

    passes = ((imp_1, imp_col_1), (imp_2, imp_col_2), (imp_3, imp_col_3))
    for method, column in passes:
        imputed = i.impute(imputed, method, column)

    # The target is never imputed; it is re-attached from the input frame.
    imputed[name_class] = df[name_class]
    return imputed

def improve_accuracy(df, od_1, od_2, imp_1, imp_2, imp_3, imp_col_1, imp_col_2, imp_col_3, name_class):
    """Blank out detected outliers in the first two features, then impute.

    ``od.outliers`` is called once for ``selected_features[0]`` with ``od_1``
    and once for ``selected_features[1]`` with ``od_2``, both on the same
    unmasked copy of ``df``. The returned row labels are set to NaN in the
    respective column, and the masked frame is handed to
    ``improve_completeness`` together with the imputation arguments.

    Args:
        df: input frame; it is copied and not modified.
        od_1, od_2: second argument of the two ``od.outliers`` calls
            (presumably the detection method - confirm against the
            ``outlier_detection`` module).
        imp_1, imp_2, imp_3, imp_col_1, imp_col_2, imp_col_3: forwarded
            unchanged to ``improve_completeness``.
        name_class: name of the target column.

    Returns:
        Whatever ``improve_completeness`` returns for the masked frame.
    """
    masked = df.copy()
    first_col, second_col = selected_features[0], selected_features[1]

    # Run both detections before masking anything, so the second detector
    # still sees the first column's original values.
    rows_first = od.outliers(masked, od_1, first_col)
    rows_second = od.outliers(masked, od_2, second_col)

    for column, rows in ((first_col, rows_first), (second_col, rows_second)):
        masked.loc[rows, column] = np.nan

    # `masked` is a copy of `df`, so this rewrites the target column with the
    # same values it already holds.
    masked[name_class] = df[name_class]

    return improve_completeness(
        masked, imp_1, imp_2, imp_3, imp_col_1, imp_col_2, imp_col_3, name_class
    )

In [5]:
# Reference dataset: the same object as `df`, not a copy.
df_100 = df

# Two degraded variants produced by dirty_data.injection, with 0.5 and 0.9 as
# the third argument. In the recorded run each call printed a
# "saved dirty dataset N%" line.
# NOTE(review): the trailing arguments 10 and 1 are passed through as-is;
# confirm their meaning against dirty_data.injection.
df_dirt_50 = d.injection(df_100, name_class, 0.5, 10, 1)
df_dirt_90 = d.injection(df_100, name_class, 0.9, 10, 1)

saved dirty dataset 50%
saved dirty dataset 90%


In [6]:
### quality 90%

# Preprocessing pipeline: encode the predictors, standardise them, then let
# np.nan_to_num turn any remaining NaN into 0.
X = np.nan_to_num(
    StandardScaler().fit_transform(enc(df_dirt_90[selected_features_only]))
)

clf = LogisticRegression(C=param['LogisticRegression'])

# NOTE(review): despite its name, df_dirt_60 holds the estimator returned by
# fit(), not a dataset; the name is kept because other cells may refer to it.
df_dirt_60 = clf.fit(X, df_dirt_90[name_class])

# Fitted coefficients of the logistic model.
importances_1 = df_dirt_60.coef_
importances_1

array([[-0.4685333 ,  0.15027832,  0.30213017,  0.29797536, -0.59553746,
         0.05123067,  0.55093318, -0.00208084, -0.10118001, -0.48928184,
        -0.06433745]])

In [7]:
### quality 100%

clf = LogisticRegression(C=param['LogisticRegression'])

# Encode the predictors, standardise them, then let np.nan_to_num turn any
# remaining NaN into 0.
X = enc(df_100[selected_features_only])
X = StandardScaler().fit_transform(X)
X = np.nan_to_num(X)

# fit() returns the estimator itself, so the fitted model stays in `clf`.
# The result is deliberately not assigned to df_100: doing so replaced the
# DataFrame with the model, which made this cell fail when run a second time
# and left df_100 unusable as data for any later cell.
clf.fit(X, df_100[name_class])

# Fitted coefficients of the logistic model.
importances_2 = clf.coef_
importances_2

array([[-0.8662592 ,  0.23369883,  0.13871986,  0.39692036, -0.58685025,
         0.        ,  0.49274411, -0.03789612, -0.06492551, -0.46589098,
         0.        ]])

In [8]:
### quality 50%

clf = LogisticRegression(C=param['LogisticRegression'])

# Encode the predictors, standardise them, then let np.nan_to_num turn any
# remaining NaN into 0.
X = enc(df_dirt_50[selected_features_only])
X = StandardScaler().fit_transform(X)
X = np.nan_to_num(X)

# Labels are taken from the same frame as the features (df_dirt_50). The
# earlier version read them from df_dirt_90, a copy-paste slip that only
# works when both frames carry identical, identically ordered labels.
# NOTE(review): despite its name, df_dirt_60 holds the estimator returned by
# fit(), not a dataset; the name is kept because other cells may refer to it.
df_dirt_60 = clf.fit(X, df_dirt_50[name_class])

# Fitted coefficients of the logistic model.
importances_3 = df_dirt_60.coef_
importances_3

array([[ 0.07154123,  0.09746743,  0.40671467,  0.40329425, -0.58627628,
        -0.02660781,  0.45011855, -0.01448909,  0.07789161, -0.49134566,
        -0.00935601]])

In [9]:
# Show the three coefficient arrays one per line, in the order the cells
# above computed them (90%, 100%, 50%).
print(importances_1, importances_2, importances_3, sep="\n")


[[-0.4685333   0.15027832  0.30213017  0.29797536 -0.59553746  0.05123067
   0.55093318 -0.00208084 -0.10118001 -0.48928184 -0.06433745]]
[[-0.8662592   0.23369883  0.13871986  0.39692036 -0.58685025  0.
   0.49274411 -0.03789612 -0.06492551 -0.46589098  0.        ]]
[[ 0.07154123  0.09746743  0.40671467  0.40329425 -0.58627628 -0.02660781
   0.45011855 -0.01448909  0.07789161 -0.49134566 -0.00935601]]
