In [None]:
# SOLUTION:

# Solution for correcting data quality issues
# Debug data first!

# In this dataset, we have a lot of observations that have missing values
# Missing values are represented using 0s
# We need to impute values; one option is to find out mean for every class and use that as a substitute 
# for missing values
# With these changes, the model F1 score improves from 0.65 to 0.81

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

<h2>Diabetes Binary Classification Dataset</h2>

Input Features:
'preg_count', 'glucose_concentration', 'diastolic_bp', 'triceps_skin_fold_thickness', 'two_hr_serum_insulin', 'bmi', 'diabetes_pedi', 'age' <br>

Target Feature: 
'diabetes_class'<br>

Objective: Predict diabetes_class for given input features<br>

<h4>Data Source: https://archive.ics.uci.edu/ml/datasets/pima+indians+diabetes </h4>

In [None]:
# Output column order: the target comes first, followed by the input features.
columns = [
    'diabetes_class',               # target
    'preg_count',
    'glucose_concentration',
    'diastolic_bp',
    'triceps_skin_fold_thickness',
    'two_hr_serum_insulin',
    'bmi',
    'diabetes_pedi',
    'age',
]

In [None]:
# Load the full dataset; the path is relative to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer='pima_indians_diabetes_all.csv')

In [None]:
# Flag every column that contains at least one NA entry
# (DataFrame.any reduces down each column by default).
df.isna().any()

In [None]:
# Summary statistics for the columns of df, shown with the notebook's rich display.
df.describe()

In [None]:
# Plain-text rendering of the summary statistics.
print(df.describe())

In [None]:
#DWB# Investigating data: pull out the rows whose glucose reading is recorded as 0.
df_0_glucose_cncntrtn = df.loc[df['glucose_concentration'].eq(0)]
print(df_0_glucose_cncntrtn.head())

In [None]:
# Distribution of glucose concentration as loaded (this runs before the imputation cells).
df['glucose_concentration'].hist()
plt.show()

In [None]:
#DWB#  Attempt at mean with both groups combined,
#DWB#+ need an independent copy now (DataFrame.copy is deep by default).
df_no_group = df.copy()
df_no_group['glucose_concentration'].hist()
plt.show()

In [None]:
# Group the samples by target class so that summary statistics (and, later,
# imputation values) can be computed separately for each class.
# Reference: https://stackoverflow.com/questions/19966018/pandas-filling-missing-values-by-mean-in-each-group
group_class = df.groupby(by='diabetes_class')

In [None]:
# First two rows of each diabetes_class group
group_class.head(2)

In [None]:
#DWB# Plain-text version of the first two rows of each group.
print(group_class.head(2))

In [None]:
# Per-class mean of every attribute; the mean values differ between the groups.
group_class.mean()

In [None]:
#DWB# Plain-text version of the per-class attribute means.
print(group_class.mean())

In [None]:
# Number of samples in each diabetes_class value.
df['diabetes_class'].value_counts()

In [None]:
# For each class, fill missing values (recorded as 0) with that class's average.
#
# The average is taken over the non-zero entries only. The zeros are the
# missing-value placeholders themselves, so including them in the mean would
# pull every imputed value toward 0.
# If a class has no non-zero entry in a column, its zeros become NaN.
#
# 'preg_count' is not imputed here -- presumably because 0 is a legitimate
# count there; confirm before adding it to the list.
#
# NOTE(review): this changes the imputed values compared with a mean computed
# over all entries; re-check any previously recorded model metrics.
for column in [
    'glucose_concentration',
    'diastolic_bp',
    'triceps_skin_fold_thickness',
    'two_hr_serum_insulin',
    'bmi',
    'diabetes_pedi',
    'age',
]:
    df[column] = group_class[column].transform(
        lambda x: x.replace(0, x[x != 0].mean())
    )

In [None]:
# Same imputation without grouping: fill missing values (recorded as 0) with
# the overall column average.
#
# The average is taken over the non-zero entries only, because the zeros are
# the missing-value placeholders and would otherwise bias the mean toward 0.
# If a column has no non-zero entry, its zeros become NaN.
#
# The replacement is applied to the column directly. Passing a whole-series
# lambda to Series.transform only worked because the element-wise attempt
# raised and pandas fell back to calling the function on the full series.
for column in [
    'glucose_concentration',
    'diastolic_bp',
    'triceps_skin_fold_thickness',
    'two_hr_serum_insulin',
    'bmi',
    'diabetes_pedi',
    'age',
]:
    values = df_no_group[column]
    df_no_group[column] = values.replace(0, values[values != 0].mean())

In [None]:
# Preview the first rows of df after the class-wise imputation cell.
df.head()

In [None]:
#DWB# Plain-text preview of the first rows of df.
print(df.head())

In [None]:
#DWB# Preview the first rows of the copy imputed without grouping.
df_no_group.head()

In [None]:
#DWB# Plain-text preview of the first rows of df_no_group.
print(df_no_group.head())

In [None]:
#DWB#  Let's see if this made a difference. I'll do histograms,
#DWB#+ below, but I noticed that we do have some differences, e.g.

# Compare a single cell (third row, skin-fold thickness) between the
# class-wise imputed frame and the frame imputed without grouping.
df_val = df.iloc[2].loc['triceps_skin_fold_thickness']
df_no_group_val = df_no_group.iloc[2].loc['triceps_skin_fold_thickness']

print(f"In df,          the value is: {df_val},")
print(f"In df_no_group, the value is: {df_no_group_val}.")
print("Check: The statement, " + 
      f"'They are the same,' is {df_val == df_no_group_val}.")

# Compare the insulin values for index labels 0 through 2.
# .loc takes the slice directly; wrapping it in a list ([0:2]) is a syntax
# error. A label slice is inclusive of both ends, so this selects 0, 1 and 2.
df_vals = df.loc[0:2, 'two_hr_serum_insulin']
df_no_group_vals = df_no_group.loc[0:2, 'two_hr_serum_insulin']

print(df_vals)
print(df_no_group_vals)

In [None]:
#DWB# Glucose distribution in df, i.e. after the class-wise imputation cell.
df['glucose_concentration'].hist()
plt.show()

In [None]:
#DWB# Glucose distribution in df_no_group, i.e. after imputation without grouping.
df_no_group['glucose_concentration'].hist()
plt.show()

In [None]:
# Boolean row masks: diabetic samples are class 1, normal samples are class 0
diabetic = df['diabetes_class'].eq(1)
normal = df['diabetes_class'].eq(0)

In [None]:
# Glucose concentration histogram: diabetic and normal samples overlaid
fig, ax = plt.subplots()
ax.hist(df.loc[diabetic, 'glucose_concentration'], label='diabetic')
ax.hist(df.loc[normal, 'glucose_concentration'], alpha=0.5, label='normal')
ax.set_title('Glucose Concentration')
ax.set_xlabel('Glucose concentration')
ax.set_ylabel('Samples')
ax.legend()
plt.show()

In [None]:
# BMI histogram: diabetic and normal samples overlaid
fig, ax = plt.subplots()
ax.hist(df.loc[diabetic, 'bmi'], label='diabetic')
ax.hist(df.loc[normal, 'bmi'], alpha=0.5, label='normal')
ax.set_title('BMI')
ax.set_xlabel('BMI')
ax.set_ylabel('Samples')
ax.legend()
plt.show()

In [None]:
# Age histogram: diabetic and normal samples overlaid
fig, ax = plt.subplots()
ax.hist(df.loc[diabetic, 'age'], label='diabetic')
ax.hist(df.loc[normal, 'age'], alpha=0.5, label='normal')
ax.set_title('Age')
ax.set_xlabel('Age')
ax.set_ylabel('Samples')
ax.legend()
plt.show()

## Training and Validation Set
### Target Variable as first column followed by input features:
'diabetes_class', 'preg_count', 'glucose_concentration', 'diastolic_bp',
       'triceps_skin_fold_thickness', 'two_hr_serum_insulin', 'bmi',
       'diabetes_pedi', 'age'
### Training, Validation files do not have a column header

In [None]:
# Training = 70% of the data
# Validation = 30% of the data
# Randomize the dataset: shuffle the row order with a fixed seed so the
# split below is reproducible.
np.random.seed(5)
shuffled_rows = list(df.index)
np.random.shuffle(shuffled_rows)
df = df.iloc[shuffled_rows]

In [None]:
# Split sizes: 70% of the rows (rounded down) for training, the remainder for validation.
rows = len(df)
train = int(rows * .7)
test = rows - train

In [None]:
# Show the total row count alongside the training and validation sizes.
rows, train, test

In [None]:
# Write Training Set: the first `train` rows, in `columns` order, without a header row
df.iloc[:train].to_csv(
    'diabetes_train.csv',
    columns=columns,
    header=False,
    index=False,
    index_label='Row',
)

In [None]:
# Write Validation Set: every row after the first `train`, in `columns` order, without a header row
df.iloc[train:].to_csv(
    'diabetes_validation.csv',
    columns=columns,
    header=False,
    index=False,
    index_label='Row',
)

In [None]:
# Write Column List: the column names as one comma-separated line (no trailing newline)
column_line = ','.join(columns)
with open('diabetes_train_column_list.txt', 'w') as f:
    f.write(column_line)