In [107]:
# Start Python Imports
import math, time, random, datetime

# Data Manipulation
import numpy as np
import pandas as pd

# Visualization 
import matplotlib.pyplot as plt
import missingno
import seaborn as sns
plt.style.use('seaborn-whitegrid')

# Import train & test data 
train = pd.read_csv('./Data/train.csv')
test = pd.read_csv('./Data/test.csv')

In [108]:
from word2number import w2n
def getGender(x):
    return 1 if x.lower() == 'm' else (0 if x.lower() == 'f' else (2 if x.lower() == 'other' else ''))

def splitAgeGender(x, ret_age=True):
    if not x or str(x) == 'nan':
        return ''
    
    parts = str(x).replace(',', ' ').split()
    age = gender = ''
    if ret_age:
        try:
            age = w2n.word_to_num(parts[0])
        except ValueError:
            try:
                age = float(parts[0])
            except ValueError:
                pass
        if not age and len(parts) > 1:
            try:
                age = w2n.word_to_num(parts[1])
            except ValueError:
                try:
                    age = float(parts[1])
                except ValueError:
                    pass
        return age
    else:
        gender = getGender(parts[0])
        if gender == '' and len(parts) > 1:
            gender = getGender(parts[1])
        return gender
            
# Added - Job and location
def split_job_livivng(line, job = True):
    if not line or str(line) == 'nan':
        return 'AAA'
    
    part_1, part_2 = line.split('?')[:2:]
    if job:
        if 'gov' in part_1.lower() or 'gov' in part_2.lower():
            part_1 = 'GOVERNMENT'
        elif 'pri' in part_1.lower() or 'pri' in part_2.lower():
            part_1 = 'PRIVATE'
        elif 'bus' in part_1.lower() or 'bus' in part_2.lower() or 'biz' in part_1.lower() or 'biz' in part_2.lower():
            part_1 = 'BUSINESS'
        elif 'parent' in part_1.lower() or 'parent' in part_2.lower():
            part_1 = 'PARENTAL_LEAVE'
        elif 'unemp' in part_1.lower() or 'unemp' in part_2.lower():
            part_1 = 'UNEMPLOYED'
        else:
            part_1 = 'AAA'
        return part_1

    else:    
        if 'city' in part_1.lower() or 'city' in part_2.lower():
            part_2 = 'CITY'
        elif 'remo' in part_1.lower() or 'remo' in part_2.lower():
            part_2 = 'REMOTE'        
        elif part_1 == 'c' or part_2 == 'c':
            part_2 = 'CITY'
        elif part_1 == 'r' or part_2 == 'r':
            part_2 = 'REMOTE'
        else:
            part_2 = 'AAA'
        return part_2

def getSmokeStatus(x):
    x = str(x).lower()
    x = ''.join([i for i in x if i.isalpha()])
    return 1 if 'non' in x else (2 if 'quit' in x else (3 if 'active' in x else 0))

def fixBmi(x):
    x = str(x)
    if x == 'nan' or x == '?' or x == '.':
        x = 0
        
    return float(x)

def discreteBmi(x):
    if x < 0.5:
        return 0
    elif x < 18.5:
        return 1
    elif x < 25:
        return 2
    elif x < 30:
        return 3
    elif x < 35:
        return 4
    elif x < 40:
        return 5
    return 6

def discreteBloodSugar(x):
    if x < 70:
        return 1
    elif x < 120:
        return 2
    elif x < 200:
        return 3
    elif x < 280:
        return 3
    return 4

def cleanBinary(x, flip=False):
    val = x
    try:
        val = int(x)
        if flip:
            val = 1 if val == 1 else 0
        else:
            val = 0 if val == 0 else 1
    except ValueError:
        val = ''
    return val

def checkTreated(x):
    if str(x['TreatmentA']) == 'nan':
        return 0
    return 1 if (x['TreatmentA'] == 1 or x['TreatmentB'] == 1 or x['TreatmentC'] == 1 or x['TreatmentD_2'] == 1) else 0

def bmiMean(x, m):
    if x > 0.5:
        return x
    return m

In [109]:
old_df = test
new_df = pd.DataFrame(old_df)

new_df['sex'] = old_df['sex and age'].apply(lambda x: splitAgeGender(x, False))
new_df['age'] = old_df['sex and age'].apply(lambda x: splitAgeGender(x, True))

new_df['job'] = old_df['job_status and living_area'].apply(lambda x: split_job_livivng(x, True))
new_df['location'] = old_df['job_status and living_area'].apply(lambda x: split_job_livivng(x, False))

In [110]:
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()

job = new_df['job']
job_encoded = encoder.fit_transform(job)

print(new_df['job'].unique())
print(encoder.classes_)
print(job_encoded)

# let's add this array to dataframe...
new_df['job_encoded'] = job_encoded

# ################

location = new_df['location']
location_encoded = encoder.fit_transform(location)

# let's add this array to dataframe...
new_df['location_encoded'] = job_encoded

['PRIVATE' 'GOVERNMENT' 'BUSINESS' 'PARENTAL_LEAVE' 'UNEMPLOYED' 'AAA']
['AAA' 'BUSINESS' 'GOVERNMENT' 'PARENTAL_LEAVE' 'PRIVATE' 'UNEMPLOYED']
[4 2 1 ... 4 4 2]


In [111]:
new_df['smoker_status_2'] = old_df['smoker_status'].apply(getSmokeStatus)

new_df['BMI'] = old_df['BMI'].apply(fixBmi)
new_df['BMI_2'] = new_df['BMI'].apply(discreteBmi)

#new_df_2 = new_df[new_df['stroke_in_2018'].isin(['1', '0'])]
#print(new_df_2['stroke_in_2018'].unique())
new_df_2 = new_df
new_df_2['average_blood_sugar_2'] = new_df_2['average_blood_sugar'].apply(discreteBloodSugar)

new_df_2['high_BP_2'] = new_df_2['high_BP'].apply(cleanBinary)
new_df_2['heart_condition_detected_2017_2'] = new_df_2['heart_condition_detected_2017'].apply(cleanBinary)
new_df_2['married_2'] = new_df_2['married'].apply(cleanBinary)
new_df_2['TreatmentD_2'] = new_df_2['TreatmentD'].apply(cleanBinary)
new_df_2['treated'] = new_df_2.apply(lambda row: checkTreated(row), axis=1)
new_df_3 = new_df_2[['id', 'sex', 'age', 'sex and age', 'high_BP_2', 'heart_condition_detected_2017_2', 'married_2', 'job_status and living_area', 'average_blood_sugar', 'average_blood_sugar_2', 'BMI', 'BMI_2', 'smoker_status', 'smoker_status_2', 'TreatmentA', 'TreatmentB', 'TreatmentC', 'TreatmentD_2', 'treated']]
#new_df_3 = new_df_2[['id', 'sex', 'age', 'sex and age', 'high_BP_2', 'heart_condition_detected_2017_2', 'married_2', 'job_status and living_area', 'average_blood_sugar', 'average_blood_sugar_2', 'BMI', 'BMI_2', 'smoker_status', 'smoker_status_2', 'TreatmentA', 'TreatmentB', 'TreatmentC', 'TreatmentD_2', 'treated', 'stroke_in_2018']]

new_df_3.to_csv('./Data/test_2.csv')

In [112]:
new_df_3 = pd.read_csv('./Data/test_2.csv')
BMI_mean = new_df_3['BMI'].mean()
print(BMI_mean)
new_df_3['BMI_3'] = new_df_3['BMI'].apply(lambda x: bmiMean(x, BMI_mean))
new_df_3.to_csv('./Data/test_2.csv')

29.75997935306263


In [113]:
all_is_null = zip(new_df_3.isnull(), new_df_3.isnull().sum())
for is_null in all_is_null:
    if is_null[1] > 0:
        print(f'{is_null[0]}: {is_null[1]}')

sex: 14
age: 10
sex and age: 10
high_BP_2: 11
heart_condition_detected_2017_2: 5
married_2: 2
job_status and living_area: 10
average_blood_sugar: 11
smoker_status: 2697
TreatmentA: 8302
TreatmentB: 8302
TreatmentC: 8302
TreatmentD_2: 8302


In [114]:
print(new_df_3['sex'].mean())
print(new_df_3['age'].mean())
print(new_df_3['high_BP_2'].mean())
print(new_df_3['married_2'].mean())
print(new_df_3['TreatmentA'].mean())

0.404296875
45.01434542949013
0.09325829792121282
0.6476594768242313
0.1875


In [115]:
new_df_3['sex'].fillna(0, inplace=True)
new_df_3['age'].fillna(new_df_3['age'].mean(), inplace=True)
new_df_3['high_BP_2'].fillna(0, inplace=True)
new_df_3['heart_condition_detected_2017_2'].fillna(0, inplace=True)
new_df_3['married_2'].fillna(1, inplace=True)
new_df_3['TreatmentA'].fillna(0, inplace=True)
new_df_3['TreatmentB'].fillna(0, inplace=True)
new_df_3['TreatmentC'].fillna(0, inplace=True)
new_df_3['TreatmentD_2'].fillna(0, inplace=True)

In [116]:
new_df_3.to_csv('./Data/test_2.csv')