# Importing Data and preprocessing

In [None]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

In [None]:
# Load the two CSV sources. Forward slashes keep the relative paths valid on
# Windows, Linux and macOS; the backslash form only resolves on Windows.
df = pd.read_csv("Data/train.csv")
df2 = pd.read_csv("Data/healthcare-dataset-stroke-data.csv")

# Only the rows of the second dataset with stroke == 1 are appended.
df2 = df2[df2["stroke"] == 1]

# ignore_index=True gives the combined frame a fresh 0..n-1 index instead of
# repeating row labels coming from both sources.
df = pd.concat([df, df2], ignore_index=True)
df = df.drop(["id"], axis=1)

# Report how many exact duplicate rows get removed.
before = df.shape[0]
df.drop_duplicates(inplace=True)
after = df.shape[0]
print(before-after , " Removed duplicate rows")
df.head()

In [None]:
# One-hot encode the frame with pandas and print the column count before and
# after. One column is left out of each count (presumably the "stroke" label).
print("Number of columns before One-hot encoding: ", len(df.columns) - 1)
df = pd.get_dummies(data=df)
print("Number of columns after One-hot encodeing: ", len(df.columns) - 1)


In [None]:
# Drop every row that contains a missing value, then show the per-column
# count of remaining missing values.
df = df.dropna(axis=0, how="any")
df.isnull().sum()

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Rescale the continuous columns with a min-max scaler. The fitted scaler is
# kept in `scaler` so it can be re-applied to other frames.
# NOTE(review): the scaler is fitted on the whole frame before the train/test
# split performed further down, so held-out rows influence the scaling range.
numerical = ["age", "avg_glucose_level", "bmi"]
scaler = MinMaxScaler()
scaler.fit(df[numerical])
df[numerical] = scaler.transform(df[numerical])
df.head()

In [None]:
from sklearn.model_selection import train_test_split
# Separate the predictors from the "stroke" label.
all_features = df.drop(["stroke"], axis=1)
labels = df["stroke"]

# Hold out 20% of the rows for evaluation.
# - stratify=labels keeps the label proportions the same in both parts.
# - random_state makes the split (and every score computed from it)
#   reproducible between notebook runs.
train_features, test_features, train_labels, test_labels = train_test_split(
    all_features,
    labels,
    train_size=0.8,
    stratify=labels,
    random_state=42,
)

# Placed last so the notebook actually renders the preview; as a mid-cell
# expression it produced no output.
all_features.head()



# Trying each model on its own

## Decision Tree model

## Logistic regression model

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

# Baseline: one logistic regression fitted on the training split and scored
# by ROC AUC on the held-out split, using the second probability column.
linearClf = LogisticRegression(C=0.9, max_iter=300).fit(train_features, train_labels)
linearPred = linearClf.predict_proba(test_features)
roc_auc_score(y_true=test_labels, y_score=linearPred[:, 1])


# Cross-Validation pipeline

In [None]:
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegressionCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, make_scorer, get_scorer

# Outer K-fold loop over the training split: each fold fits an L1-penalised
# LogisticRegressionCV on the in-fold rows and reports ROC AUC on the
# out-of-fold rows.
n_splits = 5
cv = KFold(n_splits=n_splits, shuffle=True)

# The built-in "roc_auc" scorer replaces
# make_scorer(roc_auc_score, needs_threshold=True): the `needs_threshold`
# argument is deprecated and no longer accepted by recent scikit-learn
# releases, while the named scorer works across versions.
roc_auc_scorer = get_scorer("roc_auc")

fold_scores = []
for i, (train_index, test_index) in enumerate(cv.split(train_features, train_labels)):
    print("Fold number: ", i + 1)
    print()

    fold_train_x = train_features.iloc[train_index]
    fold_train_y = train_labels.iloc[train_index]
    fold_valid_x = train_features.iloc[test_index]
    fold_valid_y = train_labels.iloc[test_index]

    model1 = LogisticRegressionCV(scoring=roc_auc_scorer, penalty='l1',
                                  solver='saga', max_iter=1000)
    model1.fit(fold_train_x, fold_train_y)

    # Second probability column is used as the ranking score for ROC AUC.
    prediction1 = model1.predict_proba(fold_valid_x)[:, 1]
    score = roc_auc_score(fold_valid_y, prediction1)

    print("Score: ", score)
    fold_scores.append(score)
    print()

# NOTE: after the loop `model1` is the estimator fitted in the last fold;
# the later cells that call model1.predict_proba use that estimator.
avg_score = sum(fold_scores) / n_splits
print("Average_score: ", avg_score)

## Testing on my test data with final predictor

In [None]:
import numpy as np
# Second probability column predicted by `model1` for the held-out split.
# (Averaging over several predictors was tried here before and is disabled.)
predictions = np.asarray(model1.predict_proba(test_features))[:, 1]
predictions

In [None]:
# ROC AUC of the `model1` probabilities on the held-out split.
roc_auc_score(y_true=test_labels, y_score=predictions)

In [None]:

# Recompute the baseline classifier's probabilities for the held-out split.
# (The matching call for a `treeClf` model is currently disabled.)
linearPred = linearClf.predict_proba(X=test_features)

In [None]:
# ROC AUC of the baseline classifier on the held-out split, using the second
# probability column. (Scoring a combined `final_prediction` is disabled.)
roc_auc_score(y_true=test_labels, y_score=linearPred[:, 1])

## Importing Test data

In [None]:
# Load the submission template and the test set. Forward slashes keep the
# relative paths valid on every OS; the backslash form only resolves on Windows.
sub = pd.read_csv("Data/sample_submission.csv")
test_data = pd.read_csv("Data/test.csv")
test_data = test_data.drop(["id"], axis=1)
test_data = pd.get_dummies(test_data)

# get_dummies only creates columns for the categories present in *this* file,
# so the result can miss (or add, or reorder) columns relative to the frame
# the models were fitted on. Re-index to the training feature columns:
# categories absent here become all-zero columns, categories unseen during
# training are dropped, and the column order matches what the models expect.
test_data = test_data.reindex(columns=all_features.columns, fill_value=0)

# Re-use the scaler fitted earlier; it is only transformed here, never refitted.
test_data[numerical] = scaler.transform(test_data[numerical])
test_data.head()


## Final prediction and submit

In [None]:
# Probabilities from `model1` for the submission rows.
# predict_proba returns one column per class; the second column is kept,
# matching how the held-out split was scored earlier in the notebook.
# (The previous `predictions.T.mean` stored the bound method itself instead
# of numbers, so the histogram below could not be drawn.)
predictions = np.asarray(model1.predict_proba(test_data))
predictions = predictions[:, 1]
final_prediction = predictions

In [None]:
# Histogram of the predicted probabilities, drawn on an explicit axes object.
fig, ax = plt.subplots()
ax.hist(final_prediction)
ax.set_xlabel("Probability of stroke")
ax.set_ylabel("Number of people")
plt.show()

In [None]:
# Overwrite `final_prediction` with the baseline classifier's second
# probability column; this is the value that gets written to the submission.
final_prediction = linearClf.predict_proba(X=test_data)[:, 1]

In [None]:
# Put the predictions in the "stroke" column of the submission template and
# write it to disk without the index column.
sub = sub.assign(stroke=final_prediction)
sub.to_csv(path_or_buf="submission.csv", index=False)