In [168]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score, mean_absolute_error
from sklearn.feature_selection import mutual_info_regression
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split, StratifiedKFold
from feature_engine.wrappers import SklearnTransformerWrapper
from feature_engine.encoding import OneHotEncoder

sns.set()

In [169]:
train = pd.read_csv('train.csv')
rw = pd.read_csv('healthcare-dataset-stroke-data.csv')
test = pd.read_csv('test.csv')

In [170]:
data_raw = pd.concat([train, rw]).drop_duplicates()
data_raw.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,0,Male,28.0,0,0,Yes,Private,Urban,79.53,31.1,never smoked,0
1,1,Male,33.0,0,0,Yes,Private,Rural,78.44,23.9,formerly smoked,0
2,2,Female,42.0,0,0,Yes,Private,Rural,103.0,40.3,Unknown,0
3,3,Male,56.0,0,0,Yes,Private,Urban,64.87,28.8,never smoked,0
4,4,Female,24.0,0,0,No,Private,Rural,73.36,28.8,never smoked,0


In [171]:
data_raw.info()

<class 'pandas.core.frame.DataFrame'>
Index: 20414 entries, 0 to 5109
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 20414 non-null  int64  
 1   gender             20414 non-null  object 
 2   age                20414 non-null  float64
 3   hypertension       20414 non-null  int64  
 4   heart_disease      20414 non-null  int64  
 5   ever_married       20414 non-null  object 
 6   work_type          20414 non-null  object 
 7   Residence_type     20414 non-null  object 
 8   avg_glucose_level  20414 non-null  float64
 9   bmi                20213 non-null  float64
 10  smoking_status     20414 non-null  object 
 11  stroke             20414 non-null  int64  
dtypes: float64(3), int64(4), object(5)
memory usage: 2.0+ MB


In [172]:
data_raw.describe(include='all')

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
count,20414.0,20414,20414.0,20414.0,20414.0,20414,20414,20414,20414.0,20213.0,20414,20414.0
unique,,3,,,,2,5,2,,,4,
top,,Female,,,,Yes,Private,Urban,,,never smoked,
freq,,12440,,,,13738,12677,10236,,,8173,
mean,14877.273636,,41.87051,0.061673,0.031008,,,,93.322256,28.30228,,0.043157
std,16825.306948,,21.756482,0.240567,0.173344,,,,32.476351,7.021765,,0.203215
min,0.0,,0.08,0.0,0.0,,,,55.12,10.3,,0.0
25%,4766.25,,25.0,0.0,0.0,,,,75.22,23.5,,0.0
50%,9511.5,,43.0,0.0,0.0,,,,86.25,27.7,,0.0
75%,14279.75,,58.0,0.0,0.0,,,,99.73,32.2,,0.0


In [173]:
categorical = ['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status','hypertension', 'heart_disease']
numerical = ['age', 'avg_glucose_level', 'bmi','age/bmi','age*bmi','bmi/prime','obesity']
target = ['stroke']
all = categorical + numerical

In [174]:
data_raw.info()

<class 'pandas.core.frame.DataFrame'>
Index: 20414 entries, 0 to 5109
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 20414 non-null  int64  
 1   gender             20414 non-null  object 
 2   age                20414 non-null  float64
 3   hypertension       20414 non-null  int64  
 4   heart_disease      20414 non-null  int64  
 5   ever_married       20414 non-null  object 
 6   work_type          20414 non-null  object 
 7   Residence_type     20414 non-null  object 
 8   avg_glucose_level  20414 non-null  float64
 9   bmi                20213 non-null  float64
 10  smoking_status     20414 non-null  object 
 11  stroke             20414 non-null  int64  
dtypes: float64(3), int64(4), object(5)
memory usage: 2.0+ MB


In [175]:
data = data_raw.copy()
data['bmi'] = data['bmi'].fillna(data['bmi'].median())
data[categorical] = data[categorical].astype('category') 


In [176]:
data['age/bmi'] = data.age / data.bmi
data['age*bmi'] = data.age * data.bmi
data['bmi/prime'] = data.bmi / 25
data['obesity'] = data.avg_glucose_level * data.bmi / 1000


In [177]:
data_raw.info()

<class 'pandas.core.frame.DataFrame'>
Index: 20414 entries, 0 to 5109
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 20414 non-null  int64  
 1   gender             20414 non-null  object 
 2   age                20414 non-null  float64
 3   hypertension       20414 non-null  int64  
 4   heart_disease      20414 non-null  int64  
 5   ever_married       20414 non-null  object 
 6   work_type          20414 non-null  object 
 7   Residence_type     20414 non-null  object 
 8   avg_glucose_level  20414 non-null  float64
 9   bmi                20213 non-null  float64
 10  smoking_status     20414 non-null  object 
 11  stroke             20414 non-null  int64  
dtypes: float64(3), int64(4), object(5)
memory usage: 2.0+ MB


In [178]:
test['age/bmi'] = test.age / test.bmi
test['age*bmi'] = test.age * test.bmi
test['bmi/prime'] = test.bmi / 25
test['obesity'] = test.avg_glucose_level * test.bmi / 1000


In [179]:
data.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke,age/bmi,age*bmi,bmi/prime,obesity
0,0,Male,28.0,0,0,Yes,Private,Urban,79.53,31.1,never smoked,0,0.900322,870.8,1.244,2.473383
1,1,Male,33.0,0,0,Yes,Private,Rural,78.44,23.9,formerly smoked,0,1.380753,788.7,0.956,1.874716
2,2,Female,42.0,0,0,Yes,Private,Rural,103.0,40.3,Unknown,0,1.042184,1692.6,1.612,4.1509
3,3,Male,56.0,0,0,Yes,Private,Urban,64.87,28.8,never smoked,0,1.944444,1612.8,1.152,1.868256
4,4,Female,24.0,0,0,No,Private,Rural,73.36,28.8,never smoked,0,0.833333,691.2,1.152,2.112768


In [180]:
X_tr , X_val, y_tr, y_val = train_test_split(data[all], data[target], test_size=0.2)

In [181]:
from xgboost import XGBClassifier
# create and train the XGBoost classifier
params = {
    'max_depth': 3,
    'eta': 0.1,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'gamma': 0.5,
    'lambda': 1.5,
    'alpha': 0.5,
    'min_child_weight': 5,
    'max_delta_step': 1,
    'objective': 'binary:logistic',
    'eval_metric': 'auc'
}


In [182]:

encoder = OneHotEncoder(drop_last=True, variables= categorical)
scaler = SklearnTransformerWrapper(StandardScaler(), variables= numerical)

In [183]:
X_tr = encoder.fit_transform(X_tr)
X_tr = scaler.fit_transform(X_tr)

X_val = encoder.transform(X_val)
X_val = scaler.transform(X_val)

In [184]:
X_tr

Unnamed: 0,age,avg_glucose_level,bmi,age/bmi,age*bmi,bmi/prime,obesity,gender_Female,gender_Male,ever_married_Yes,work_type_Private,work_type_Self-employed,work_type_Govt_job,work_type_children,Residence_type_Rural,smoking_status_formerly smoked,smoking_status_Unknown,smoking_status_never smoked,hypertension_0,heart_disease_0
3599,0.923726,-0.385520,-0.196667,1.048319,0.593400,-0.196667,-0.376937,1,0,1,1,0,0,0,1,1,0,0,1,1
283,0.740331,1.738067,-0.182335,0.851024,0.452117,-0.182335,1.045775,1,0,1,1,0,0,0,0,0,1,0,1,1
10499,-0.543438,-0.076349,1.422813,-0.865002,-0.130780,1.422813,0.608114,0,1,1,1,0,0,0,1,0,0,1,1,1
1033,-0.360042,0.468160,0.304942,-0.445592,-0.286774,0.304942,0.478963,0,1,0,1,0,0,0,0,0,0,0,1,1
4698,0.052598,-0.792210,-1.128227,0.800273,-0.503833,-1.128227,-0.980692,1,0,1,0,1,0,0,0,1,0,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12974,0.419389,0.597673,0.304942,0.258557,0.430466,0.304942,0.576237,0,1,1,1,0,0,0,1,0,0,0,1,1
4948,-0.497589,0.740107,0.018308,-0.479429,-0.499392,0.018308,0.504893,1,0,1,1,0,0,0,0,0,0,1,1,1
7792,-1.506264,0.668737,-1.572509,-1.198824,-1.505166,-1.572509,-0.515296,0,1,0,0,0,0,1,0,0,1,0,1,1
2303,1.519761,3.880419,0.734892,0.973620,1.755306,0.734892,3.542044,1,0,1,0,0,1,0,1,0,0,0,1,1


In [185]:
X_val

Unnamed: 0,age,avg_glucose_level,bmi,age/bmi,age*bmi,bmi/prime,obesity,gender_Female,gender_Male,ever_married_Yes,work_type_Private,work_type_Self-employed,work_type_Govt_job,work_type_children,Residence_type_Rural,smoking_status_formerly smoked,smoking_status_Unknown,smoking_status_never smoked,hypertension_0,heart_disease_0
131,0.144295,0.014710,0.376600,-0.020127,0.208549,0.376600,0.174010,0,1,1,1,0,0,0,1,0,0,1,1,1
4124,0.465237,-0.572560,-0.039019,0.484595,0.299453,-0.039019,-0.438841,1,0,1,1,0,0,0,0,0,0,1,1,1
10687,0.648633,-0.343374,1.064521,0.121301,1.053333,1.064521,0.200122,1,0,1,1,0,0,0,1,0,0,1,1,1
12582,1.657308,-0.442124,-0.698276,2.343391,0.811847,-0.698276,-0.624389,1,0,1,1,0,0,0,1,0,1,0,1,1
4173,-1.689659,-0.738682,-1.658499,-1.476889,-1.605368,-1.658499,-1.148427,0,1,0,0,0,0,1,0,0,1,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4,-0.818531,-0.611937,0.075635,-0.804570,-0.761973,0.075635,-0.421476,1,0,0,1,0,0,0,1,0,0,1,1,1
3204,0.556935,-0.367062,-0.139340,0.636804,0.324712,-0.139340,-0.339948,0,1,1,1,0,0,0,0,0,0,1,1,1
10195,0.144295,-0.627934,1.465808,-0.382116,0.683193,1.465808,0.104149,1,0,1,0,0,1,0,1,0,0,0,1,1
1000,1.290517,-0.591018,0.118630,1.175076,1.105793,0.118630,-0.389709,0,1,1,1,0,0,0,1,0,0,1,0,1


In [186]:
clf = XGBClassifier(**params)
clf.fit(X_tr, y_tr)

val_preds = clf.predict_proba(X_val)[:, 1]
print(mean_absolute_error(y_val, val_preds))



0.07177955182515561


In [187]:
X_test = test[all].copy()

X_test = encoder.transform(X_test)
X_test = scaler.transform(X_test)

preds = clf.predict_proba(X_test)[:, 1]
output = pd.DataFrame({'id': test['id'], 'stroke' : preds})
output.to_csv('yeah.csv', index= False)
