In [63]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score, mean_absolute_error
from sklearn.feature_selection import mutual_info_regression
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split, StratifiedKFold
from feature_engine.wrappers import SklearnTransformerWrapper
from feature_engine.encoding import OneHotEncoder

sns.set()

In [64]:
# not for test d s 

In [65]:
train = pd.read_csv('../data/train.csv')
rw = pd.read_csv('../data/healthcare-dataset-stroke-data.csv')
test = pd.read_csv('../data/test.csv')

In [66]:
data_raw = pd.concat([train, rw]).drop_duplicates()
data_raw.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,0,Male,28.0,0,0,Yes,Private,Urban,79.53,31.1,never smoked,0
1,1,Male,33.0,0,0,Yes,Private,Rural,78.44,23.9,formerly smoked,0
2,2,Female,42.0,0,0,Yes,Private,Rural,103.0,40.3,Unknown,0
3,3,Male,56.0,0,0,Yes,Private,Urban,64.87,28.8,never smoked,0
4,4,Female,24.0,0,0,No,Private,Rural,73.36,28.8,never smoked,0


In [67]:
data_raw.info()

<class 'pandas.core.frame.DataFrame'>
Index: 20414 entries, 0 to 5109
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 20414 non-null  int64  
 1   gender             20414 non-null  object 
 2   age                20414 non-null  float64
 3   hypertension       20414 non-null  int64  
 4   heart_disease      20414 non-null  int64  
 5   ever_married       20414 non-null  object 
 6   work_type          20414 non-null  object 
 7   Residence_type     20414 non-null  object 
 8   avg_glucose_level  20414 non-null  float64
 9   bmi                20213 non-null  float64
 10  smoking_status     20414 non-null  object 
 11  stroke             20414 non-null  int64  
dtypes: float64(3), int64(4), object(5)
memory usage: 2.0+ MB


In [68]:
data_raw.describe(include='all')

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
count,20414.0,20414,20414.0,20414.0,20414.0,20414,20414,20414,20414.0,20213.0,20414,20414.0
unique,,3,,,,2,5,2,,,4,
top,,Female,,,,Yes,Private,Urban,,,never smoked,
freq,,12440,,,,13738,12677,10236,,,8173,
mean,14877.273636,,41.87051,0.061673,0.031008,,,,93.322256,28.30228,,0.043157
std,16825.306948,,21.756482,0.240567,0.173344,,,,32.476351,7.021765,,0.203215
min,0.0,,0.08,0.0,0.0,,,,55.12,10.3,,0.0
25%,4766.25,,25.0,0.0,0.0,,,,75.22,23.5,,0.0
50%,9511.5,,43.0,0.0,0.0,,,,86.25,27.7,,0.0
75%,14279.75,,58.0,0.0,0.0,,,,99.73,32.2,,0.0


In [69]:
categorical = ['gender', 'ever_married', 'work_type', 'Residence_type', 'smoking_status','hypertension', 'heart_disease']
numerical = ['age', 'avg_glucose_level', 'bmi','age/bmi','age*bmi','bmi/prime','obesity']
target = ['stroke']
all = categorical + numerical

In [70]:
data_raw.info()

<class 'pandas.core.frame.DataFrame'>
Index: 20414 entries, 0 to 5109
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 20414 non-null  int64  
 1   gender             20414 non-null  object 
 2   age                20414 non-null  float64
 3   hypertension       20414 non-null  int64  
 4   heart_disease      20414 non-null  int64  
 5   ever_married       20414 non-null  object 
 6   work_type          20414 non-null  object 
 7   Residence_type     20414 non-null  object 
 8   avg_glucose_level  20414 non-null  float64
 9   bmi                20213 non-null  float64
 10  smoking_status     20414 non-null  object 
 11  stroke             20414 non-null  int64  
dtypes: float64(3), int64(4), object(5)
memory usage: 2.0+ MB


In [71]:
data = data_raw.copy()
data['bmi'] = data['bmi'].fillna(data['bmi'].median())
data[categorical] = data[categorical].astype('category') 


In [72]:
data['age/bmi'] = data.age / data.bmi
data['age*bmi'] = data.age * data.bmi
data['bmi/prime'] = data.bmi / 25
data['obesity'] = data.avg_glucose_level * data.bmi / 1000


In [73]:
data_raw.info()

<class 'pandas.core.frame.DataFrame'>
Index: 20414 entries, 0 to 5109
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 20414 non-null  int64  
 1   gender             20414 non-null  object 
 2   age                20414 non-null  float64
 3   hypertension       20414 non-null  int64  
 4   heart_disease      20414 non-null  int64  
 5   ever_married       20414 non-null  object 
 6   work_type          20414 non-null  object 
 7   Residence_type     20414 non-null  object 
 8   avg_glucose_level  20414 non-null  float64
 9   bmi                20213 non-null  float64
 10  smoking_status     20414 non-null  object 
 11  stroke             20414 non-null  int64  
dtypes: float64(3), int64(4), object(5)
memory usage: 2.0+ MB


In [74]:
test['age/bmi'] = test.age / test.bmi
test['age*bmi'] = test.age * test.bmi
test['bmi/prime'] = test.bmi / 25
test['obesity'] = test.avg_glucose_level * test.bmi / 1000


In [75]:
data.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke,age/bmi,age*bmi,bmi/prime,obesity
0,0,Male,28.0,0,0,Yes,Private,Urban,79.53,31.1,never smoked,0,0.900322,870.8,1.244,2.473383
1,1,Male,33.0,0,0,Yes,Private,Rural,78.44,23.9,formerly smoked,0,1.380753,788.7,0.956,1.874716
2,2,Female,42.0,0,0,Yes,Private,Rural,103.0,40.3,Unknown,0,1.042184,1692.6,1.612,4.1509
3,3,Male,56.0,0,0,Yes,Private,Urban,64.87,28.8,never smoked,0,1.944444,1612.8,1.152,1.868256
4,4,Female,24.0,0,0,No,Private,Rural,73.36,28.8,never smoked,0,0.833333,691.2,1.152,2.112768


In [76]:
X_tr , X_val, y_tr, y_val = train_test_split(data[all], data[target], test_size=0.2)

In [77]:
from xgboost import XGBClassifier
# create and train the XGBoost classifier
params = {
    'max_depth': 3,
    'eta': 0.1,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'gamma': 0.5,
    'lambda': 1.5,
    'alpha': 0.5,
    'min_child_weight': 5,
    'max_delta_step': 1,
    'objective': 'binary:logistic',
    'eval_metric': 'auc'
}

model = XGBClassifier(**params)

In [78]:
# from sklearn.ensemble import RandomForestClassifier

# # create and train the Random Forest classifier
# params = {
#     'n_estimators': 100,
#     'max_depth': 3,
#     'min_samples_split': 3,
#     'min_samples_leaf': 3,
#     'bootstrap': True,
#     'random_state': 42
# }
# model = RandomForestClassifier(**params)


In [79]:

encoder = OneHotEncoder(drop_last=True, variables= categorical)
scaler = SklearnTransformerWrapper(StandardScaler(), variables= numerical)

In [80]:
X_tr = encoder.fit_transform(X_tr)
X_tr = scaler.fit_transform(X_tr)

X_val = encoder.transform(X_val)
X_val = scaler.transform(X_val)

In [81]:
X_tr

Unnamed: 0,age,avg_glucose_level,bmi,age/bmi,age*bmi,bmi/prime,obesity,gender_Male,gender_Female,ever_married_Yes,work_type_Private,work_type_Self-employed,work_type_children,work_type_Govt_job,Residence_type_Rural,smoking_status_never smoked,smoking_status_Unknown,smoking_status_formerly smoked,hypertension_0,heart_disease_0
10261,0.415950,-0.741144,-0.188315,0.520875,0.185119,-0.188315,-0.610419,1,0,1,1,0,0,0,1,1,0,0,1,1
2981,0.922231,3.418706,0.112829,0.825484,0.779795,0.112829,2.489951,0,1,1,1,0,0,0,1,0,1,0,1,1
13166,0.922231,-0.307130,0.170190,0.789075,0.814317,0.170190,-0.161763,0,1,1,0,1,0,0,0,1,0,0,1,1
14280,-0.688664,0.137728,-0.934005,-0.298769,-0.912359,-0.934005,-0.412942,0,1,0,1,0,0,0,0,1,0,0,1,1
14656,-1.793277,-0.149757,-1.407232,-1.655318,-1.654451,-1.407232,-0.790793,0,1,0,0,0,1,0,1,0,1,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4969,0.185822,-0.289162,-0.173975,0.279563,0.003598,-0.173975,-0.302667,0,1,1,0,0,0,1,1,1,0,0,1,1
6292,-1.471098,0.553465,-0.059254,-1.407971,-1.343332,-0.059254,0.329166,1,0,0,1,0,0,0,0,0,0,1,1,1
7606,-1.609175,-0.200252,-1.120428,-1.429345,-1.531953,-1.120428,-0.681142,0,1,0,0,0,1,0,1,0,1,0,1,1
4788,0.185822,-0.042260,2.765765,-0.641754,1.316284,2.765765,1.302887,1,0,1,1,0,0,0,0,0,0,0,1,1


In [82]:
X_val

Unnamed: 0,age,avg_glucose_level,bmi,age/bmi,age*bmi,bmi/prime,obesity,gender_Male,gender_Female,ever_married_Yes,work_type_Private,work_type_Self-employed,work_type_children,work_type_Govt_job,Residence_type_Rural,smoking_status_never smoked,smoking_status_Unknown,smoking_status_formerly smoked,hypertension_0,heart_disease_0
13146,-1.793277,-0.368778,-0.790603,-1.693862,-1.636494,-0.790603,-0.623394,0,1,0,0,0,1,0,1,0,1,0,1,1
1195,-0.872766,-0.861033,-0.589841,-0.661886,-0.956904,-0.589841,-0.830468,0,1,0,1,0,0,0,0,1,0,0,1,1
8743,0.461976,0.188533,-0.130955,0.532119,0.251658,-0.130955,0.039778,1,0,1,0,1,0,0,0,1,0,0,1,1
14989,-1.793277,-0.618468,-1.091748,-1.677039,-1.645264,-1.091748,-0.881634,0,1,0,0,0,1,0,1,0,1,0,1,1
4821,-0.366485,-0.764688,0.198870,-0.416924,-0.326035,0.198870,-0.484577,0,1,1,0,1,0,0,1,1,0,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11249,0.692103,0.019698,0.112829,0.608937,0.577254,0.112829,0.047592,0,1,1,1,0,0,0,1,0,1,0,1,1
4967,-0.366485,0.093738,-0.804944,0.027983,-0.657339,-0.804944,-0.370365,1,0,1,1,0,0,0,1,1,0,0,1,1
4785,-1.701226,0.442560,-1.306850,-1.531488,-1.598074,-1.306850,-0.462654,1,0,0,0,0,1,0,0,0,1,0,1,1
13587,0.277873,0.130912,1.417786,-0.276068,0.820720,1.417786,0.805785,0,1,1,1,0,0,0,1,1,0,0,1,1


In [83]:

model.fit(X_tr, y_tr)

val_preds = model.predict_proba(X_val)[:, 1]
print(mean_absolute_error(y_val, val_preds))



0.07173364194604552


In [84]:
X_test = test[all].copy()

X_test = encoder.transform(X_test)
X_test = scaler.transform(X_test)

preds = model.predict_proba(X_test)[:, 1]
output = pd.DataFrame({'id': test['id'], 'stroke' : preds})
output.to_csv('yeah.csv', index= False)
