# Importing Data and preprocessing

In [None]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

In [None]:
# Load the competition training set and extend it with the stroke == 1 rows
# taken from the stroke-prediction dataset.
df = pd.read_csv("/kaggle/input/playground-series-s3e2/train.csv")
df2 = pd.read_csv("/kaggle/input/stroke-prediction-dataset/healthcare-dataset-stroke-data.csv")
df2 = df2.loc[df2["stroke"] == 1]

# Stack both sources and discard the `id` column.
df = pd.concat([df, df2]).drop(columns=["id"])

# Remove exact duplicate rows and report how many were dropped.
before = len(df)
df = df.drop_duplicates()
after = len(df)
print(before - after, " Removed duplicate rows")
df.head()

In [None]:
# One-hot encode the frame with pd.get_dummies (default arguments) and report
# the column count before and after. The "- 1" presumably leaves out the
# `stroke` target column -- confirm.
n_cols_before = len(df.columns) - 1
print("Number of columns before One-hot encoding: ", n_cols_before)

df = pd.get_dummies(df)

n_cols_after = len(df.columns) - 1
print("Number of columns after One-hot encodeing: ", n_cols_after)


In [None]:
# Drop every row that contains at least one missing value, then display the
# per-column NaN counts as a sanity check.
df = df.dropna(axis=0, how="any")
df.isnull().sum()

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Columns that are rescaled with MinMaxScaler (default settings).
numerical = ["age", "avg_glucose_level", "bmi"]

# NOTE(review): the scaler is fitted on the whole frame, i.e. before the
# train/test split performed in a later cell, so hold-out rows influence the
# scaling parameters. The fitted `scaler` is reused on the competition test
# file further down.
scaler = MinMaxScaler().fit(df[numerical])
df[numerical] = scaler.transform(df[numerical])
df.head()

In [None]:
from sklearn.model_selection import train_test_split

# Separate the `stroke` target from the predictor columns.
labels = df.loc[:, "stroke"]
all_features = df.drop(columns=["stroke"])
all_features.head()

# Keep 80% of the rows for training; the remainder is the local hold-out set.
# random_state pins the shuffle so the split is reproducible.
(train_features, test_features,
 train_labels, test_labels) = train_test_split(
    all_features,
    labels,
    train_size=0.8,
    random_state=5,
)



# Trying each model on its own

## Random Forest model

In [None]:
# Tune a random forest on the training split (features were already min-max
# scaled in an earlier cell) and keep the configuration with the best
# cross-validated ROC AUC.
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.metrics import make_scorer
from sklearn.metrics import get_scorer

treeClassifier = RandomForestClassifier(random_state=42)

# Candidate values explored by the grid search.
hyperparams = {'n_estimators':[120,140,160,100],'min_samples_split':[18,16,22,20]}

# Use scikit-learn's predefined "roc_auc" scorer. The previous
# make_scorer(roc_auc_score, needs_threshold=True) form relies on an argument
# that was deprecated in scikit-learn 1.4 and removed in 1.6; the predefined
# scorer computes the same metric from decision_function / predict_proba.
roc_auc_scorer = get_scorer("roc_auc")

grid_obj = GridSearchCV(treeClassifier, hyperparams, scoring=roc_auc_scorer)
grid_fit = grid_obj.fit(train_features, train_labels)

# Best estimator refitted on the full training split by GridSearchCV.
treeClf = grid_fit.best_estimator_
print(grid_fit.best_params_)


In [None]:
# Evaluate the tuned forest on the local hold-out split; column 1 of
# predict_proba is used as the score passed to ROC AUC.
treePred = treeClf.predict_proba(test_features)
roc_auc_score(y_true=test_labels, y_score=treePred[:, 1])

## Logistic regression model

In [None]:
from sklearn.linear_model import LogisticRegression

# Fit a single logistic regression (C=0.9, up to 300 iterations) on the
# training split and score it on the local hold-out split with ROC AUC.
linearClf = LogisticRegression(C=0.9, max_iter=300).fit(train_features, train_labels)

linearPred = linearClf.predict_proba(test_features)
roc_auc_score(y_true=test_labels, y_score=linearPred[:, 1])


## XGBoost Model

In [None]:
# from xgboost import XGBClassifier
# from sklearn.metrics import roc_auc_score

# xgbClf = XGBClassifier(n_estimators = 500, max_depth = 4,random_state = 5)
# xgbClf.fit(train_features, train_labels)

# xgbPred = xgbClf.predict_proba(test_features)
# roc_auc_score(test_labels,xgbPred[:,1])

In [None]:
# from matplotlib import pyplot as plt
# from sklearn.metrics import roc_curve
# fpr, tpr, _ = roc_curve(test_labels,xgbPred[:,1])
# plt.plot(fpr, tpr, 'b-', label='XGBoost')
# plt.xlabel('False Positive Rate')
# plt.ylabel('True Positive Rate')
# plt.plot([0,1],[0,1],'g--')
# plt.legend()
# plt.show()

## Lasso Regression

In [None]:
# from sklearn.linear_model import Lasso

# clf = Lasso()
# clf.fit(train_features, train_labels)

# Cross-Validation pipeline

In [None]:
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegressionCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, make_scorer, get_scorer

# Train one L1-penalised LogisticRegressionCV per outer fold of the training
# split, score each on its own validation fold with ROC AUC, and keep every
# fitted model in `all_models` so they can be averaged later.
all_models = []
avg_score = 0
cv = KFold(n_splits=5, random_state=5, shuffle=True)

# Predefined "roc_auc" scorer: replaces
# make_scorer(roc_auc_score, needs_threshold=True), whose needs_threshold
# argument was deprecated in scikit-learn 1.4 and removed in 1.6.
roc_auc_scorer = get_scorer("roc_auc")


for i, (train_index, test_index) in enumerate(cv.split(train_features, train_labels)):
    print("Fold number: ", i + 1)
    print()
    model1 = LogisticRegressionCV(scoring=roc_auc_scorer, penalty='l1', solver='saga', max_iter=1000)
    model1.fit(train_features.iloc[train_index], train_labels.iloc[train_index])
    # Column 1 of predict_proba on the validation rows of this fold.
    prediction1 = model1.predict_proba(train_features.iloc[test_index])[:, 1]

#     model2 = RandomForestClassifier( random_state = 42, min_samples_split= 22, n_estimators=120)
#     model2.fit(train_features.iloc[train_index], train_labels.iloc[train_index])
#     prediction2 = model2.predict_proba(train_features.iloc[test_index])[:,1]

    score = roc_auc_score(train_labels.iloc[test_index], prediction1)
#     score += roc_auc_score(train_labels.iloc[test_index], prediction2)
#     score /= 2
    all_models.append(model1)
    print("Score: ", score)
    avg_score += score
    print()

# Divide by the splitter's actual fold count rather than a hard-coded 5, so
# the average stays correct if n_splits is changed above.
avg_score /= cv.get_n_splits()
print("Average_score: ", avg_score)

## Testing on my test data with final predictor

In [None]:
import numpy as np

# For each fold model take column 1 of predict_proba on the hold-out split,
# stack the results one row per model, then transpose and average across
# models to obtain one score per hold-out row.
predictions = np.array(
    [model.predict_proba(test_features)[:, 1] for model in all_models]
).T.mean(axis=1)
predictions.shape

In [None]:
# ROC AUC of the fold-averaged scores on the local hold-out split.
roc_auc_score(y_true=test_labels, y_score=predictions)

In [None]:

# Blend the two stand-alone models: the element-wise mean of column 1 of
# each model's predict_proba output on the hold-out split, kept as a list.
treePred = treeClf.predict_proba(test_features)
linearPred = linearClf.predict_proba(test_features)
final_prediction = list((linearPred[:, 1] + treePred[:, 1]) / 2)

In [None]:
# ROC AUC of the logistic-regression / random-forest blend on the hold-out split.
roc_auc_score(y_true=test_labels, y_score=final_prediction)

## Importing Test data

In [None]:
# Load the sample submission and the competition test file, then apply the
# same preprocessing that was applied to the training frame.
sub = pd.read_csv("/kaggle/input/playground-series-s3e2/sample_submission.csv")
test_data = pd.read_csv("/kaggle/input/playground-series-s3e2/test.csv")
test_data = test_data.drop(["id"], axis=1)
test_data = pd.get_dummies(test_data)

# get_dummies only creates columns for categories present in *this* file, so
# the result can miss columns the models were trained on, contain extra ones,
# or order them differently. Reindex to the training feature columns:
# missing indicator columns are added as 0, unknown ones are dropped, and the
# order matches what the fitted models expect.
test_data = test_data.reindex(columns=all_features.columns, fill_value=0)

# Reuse the scaler fitted on the training frame (transform only, no refit).
test_data[numerical] = scaler.transform(test_data[numerical])
test_data.head()


## Final prediction and submit

In [None]:
# Average column 1 of predict_proba over all fold models for every row of
# the competition test frame, and use that as the submission scores.
predictions = np.array(
    [model.predict_proba(test_data)[:, 1] for model in all_models]
).T.mean(axis=1)
final_prediction = predictions

In [None]:
# Histogram of the values currently held in final_prediction (the trailing
# ";" presumably silences the notebook's echo of hist's return value).
plt.hist(final_prediction);

In [None]:
# NOTE(review): this rebinds final_prediction to the single LogisticRegression
# model's predict_proba column 1, discarding the fold-averaged predictions
# assigned in the earlier cell -- confirm this override is intentional.
final_prediction = linearClf.predict_proba(test_data)[:,1]

In [None]:
# Put the final scores into the `stroke` column of the sample-submission
# frame and write it out without the index column.
sub = sub.assign(stroke=final_prediction)
sub.to_csv("submission.csv", index=False)