In [14]:
import pandas as pd
import dirty_data as d
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt
import numpy as np
import imputation as i
import outlier_detection as od
from utils import encoding_categorical_variables as enc
from sklearn.preprocessing import StandardScaler

In [15]:
# Name of the target (label) column.
name_class = 'AdoptionLikelihood'

# Feature columns are listed once; the full selection is the features plus the label.
selected_features_only = ['AgeMonths', 'Vaccinated', 'Size']
selected_features = [*selected_features_only, name_class]

# Load the CSV and keep only the selected features and the label column.
df = pd.read_csv("../dataset/pet.csv")[selected_features]

# Quality percentages, available both as a plain list and as a one-column DataFrame.
perc_quality = [70, 75, 80, 85, 90, 95]
quality = pd.DataFrame(perc_quality)

# One numeric setting per classifier name. In the visible cells only the
# 'LogisticRegression' entry is read, where it is passed as `C`.
# NOTE(review): the meaning of the other values is not shown here — confirm.
param = {
    'DecisionTree': 160,
    'LogisticRegression': 1,
    'KNN': 18,
    'RandomForest': 150,
    'AdaBoost': 70,
    'SVC': 1,
}

In [22]:
def improve_completeness(df, imp_1, imp_2, imp_3, imp_col_1, imp_col_2, imp_col_3, name_class):
    """Run three imputation passes over the feature columns and re-attach the label.

    Only the columns listed in the module-level ``selected_features_only`` are
    copied from ``df``; the input frame itself is not modified by this function.

    Parameters
    ----------
    df : DataFrame containing ``selected_features_only`` and ``name_class``.
    imp_1, imp_2, imp_3 : values forwarded as the second argument of
        ``i.impute`` (presumably imputation-method identifiers — confirm
        against the ``imputation`` module).
    imp_col_1, imp_col_2, imp_col_3 : values forwarded as the third argument
        of ``i.impute`` (presumably the column each pass targets — confirm).
    name_class : name of the label column copied from ``df`` into the result.

    Returns
    -------
    The frame returned by the last ``i.impute`` call, with ``name_class`` added.
    """
    features = df[selected_features_only].copy()

    # Passes run strictly in order 1 -> 2 -> 3; each one receives the previous result.
    passes = ((imp_1, imp_col_1), (imp_2, imp_col_2), (imp_3, imp_col_3))
    for method, column in passes:
        features = i.impute(features, method, column)

    # The label is taken from the original frame, never imputed.
    features[name_class] = df[name_class]
    return features

def improve_accuracy(df, od_1, od_2, imp_1, imp_2, imp_3, imp_col_1, imp_col_2, imp_col_3, name_class):
    """Impute the features, then blank out detected outliers in the first two feature columns.

    Steps, in order:
      1. ``improve_completeness`` is applied to a copy of ``df``.
      2. ``od.outliers`` is run on the imputed frame for ``selected_features[0]``
         (with ``od_1``) and ``selected_features[1]`` (with ``od_2``).
      3. The rows returned by each detection are set to NaN in the matching column.
      4. The label column is re-copied from ``df``.

    NOTE(review): outlier cells are blanked *after* imputation, so the returned
    frame can contain NaN values — confirm that callers expect this.

    Parameters
    ----------
    df : input DataFrame; it is not modified by this function.
    od_1, od_2 : values forwarded as the second argument of ``od.outliers``
        (presumably outlier-detection method identifiers — confirm).
    imp_1 .. imp_3, imp_col_1 .. imp_col_3 : forwarded unchanged to
        ``improve_completeness``.
    name_class : name of the label column.

    Returns
    -------
    The cleaned DataFrame with outlier cells replaced by NaN.
    """
    cleaned = improve_completeness(
        df.copy(), imp_1, imp_2, imp_3, imp_col_1, imp_col_2, imp_col_3, name_class
    )

    targets = (selected_features[0], selected_features[1])

    # Both detections run on the same imputed frame before anything is blanked,
    # so the second detection never sees NaNs introduced for the first column.
    flagged = [
        od.outliers(cleaned, method, column)
        for method, column in zip((od_1, od_2), targets)
    ]

    for rows, column in zip(flagged, targets):
        cleaned.loc[rows, column] = np.nan

    cleaned[name_class] = df[name_class]
    return cleaned

In [23]:
# Build the dirty copies in ascending quality order (80% first, then 90%),
# passing the quality as a fraction of 1.
# NOTE(review): the trailing `10, 1` arguments are defined by dirty_data.injection — confirm their meaning.
df_dirt_80, df_dirt_90 = [
    d.injection(df, name_class, pct / 100, 10, 1) for pct in (80, 90)
]

# The untouched frame serves as the 100%-quality reference (same object as `df`, not a copy).
df_100 = df

saved dirty dataset 80%
saved dirty dataset 90%


In [24]:
### quality 90%
# Fit a logistic regression on the 90%-quality data and keep its coefficients.

clf = LogisticRegression(C=param['LogisticRegression'])

# Encode the categorical features, standardise, then replace the NaNs that are
# still present after scaling with 0.
X = enc(df_dirt_90[selected_features_only])
X = StandardScaler().fit_transform(X)
X = np.nan_to_num(X)

# fit() trains `clf` itself, so the coefficients are read from `clf` directly
# instead of binding the estimator to a DataFrame-style name.
clf.fit(X, df_dirt_90[name_class])
importances_1 = clf.coef_
importances_1

array([[ 0.14718874,  0.56118199, -0.46586538, -0.07682637,  0.42788687,
        -0.73473305,  0.36996351, -0.06499622]])

In [25]:
### quality 100%

clf = LogisticRegression(C=param['LogisticRegression'])

X = enc(df_100[selected_features_only])
X = StandardScaler().fit_transform(X)
X = np.nan_to_num(X)

df_100 = clf.fit(X, df_100[name_class])
importances_2 = df_100.coef_
importances_2

array([[ 0.59949847,  0.56655791, -0.56655791,  0.        ,  0.45854287,
        -0.8258967 ,  0.38821502,  0.        ]])

In [26]:
### quality 80%

clf = LogisticRegression(C=param['LogisticRegression'])

X = enc(df_dirt_80[selected_features_only])
X = StandardScaler().fit_transform(X)
X = np.nan_to_num(X)

df_dirt_60 = clf.fit(X, df_dirt_90[name_class])
importances_3 = df_dirt_60.coef_
importances_3

array([[ 0.12444127,  0.58424918, -0.39468597, -0.13256004,  0.3531291 ,
        -0.70637569,  0.3570243 ,  0.03173223]])

In [27]:
# Show the three coefficient arrays one per line, in the order they were
# computed: 90%, 100%, then 80% quality.
print(importances_1, importances_2, importances_3, sep="\n")


[[ 0.14718874  0.56118199 -0.46586538 -0.07682637  0.42788687 -0.73473305
   0.36996351 -0.06499622]]
[[ 0.59949847  0.56655791 -0.56655791  0.          0.45854287 -0.8258967
   0.38821502  0.        ]]
[[ 0.12444127  0.58424918 -0.39468597 -0.13256004  0.3531291  -0.70637569
   0.3570243   0.03173223]]
