In [None]:
import numpy as np # Mathematical Calculations
import pandas as pd # Data operations and summaries
import seaborn as sns # Good data visualizations
import matplotlib.pyplot as plt # basic Visualizations
import statsmodels.api as sm # Statistical Models
import warnings
warnings.simplefilter(action='ignore')
from scipy import stats

In [None]:
heart = pd.read_csv("heart.csv")

# Preliminary Checks

In [None]:
len(heart) # Number of Rows in the data

In [None]:
heart.shape # Number of rows and columns

In [None]:
heart.head(10)# Extract the top n observations from the data

In [None]:
heart.tail(10) # Extracts the last n observations

In [None]:
heart.sample(10) # Randomly checking some observations in the middle

In [None]:
heart.sample(frac = 0.01)# Extract randomly some defined percentage of observations

In [None]:
heart.iloc[[590,357,446]]# Try to get the entire details of a selected row

In [None]:
heart.columns # Display the names of the columns
# Advisable to change the names of the columns especially when there is no clarity in the names or when there are spaces in the names

In [None]:
heart.rename(columns={ 'HeartDisease':'Target', 'FastingBS':'Fasting_Blood_Sugar'},inplace=True)# Renames the columns

In [None]:
heart.sort_values(by="MaxHR", ascending=False).head()

In [None]:
heart.sort_values(by=["MaxHR", "Cholesterol"], ascending=[False, True]).head()

In [None]:
heart[(heart["Target"] == 0) | (heart["ChestPainType"] == "ASY")]["Cholesterol"].max() #

In [None]:
heart.info()# Gives information on Missing values, Data Types
# Check whether any of the columns differ in their data type from what is typically expected

In [None]:
heart["Target"] = heart["Target"].astype("object") # CHange the data type of the object
heart["Fasting_Blood_Sugar"] = heart["Fasting_Blood_Sugar"].astype("object")

In [None]:
heart.describe()# Descriptive Statistics of the columns (Numerical columns)

In [None]:
heart.describe(include=['O']) # Basic summary of categorical columns

In [None]:
heart.nunique() # Unique number of values in each column

In [None]:
heart.apply(np.max)

In [None]:
# Skewness (asymmetry) of each numeric column. Non-numeric columns are excluded
# explicitly: recent pandas raises on string columns instead of silently dropping them.
heart.select_dtypes(include="number").skew()

In [None]:
# Kurtosis (tail weight / peakedness relative to a normal distribution) of each
# numeric column; non-numeric columns are excluded explicitly.
heart.select_dtypes(include="number").kurtosis()

In [None]:
# Element-wise formula via a lambda: (x^3 - x^2) / (x + 1).
# Python spells exponentiation `**`; the `^` operator is bitwise XOR.
heart["MaxHR"].apply(lambda x: (x**3 - x**2) / (x + 1)).head()

In [None]:
cat_columns = heart.select_dtypes(include='object').columns
num_columns = heart.select_dtypes(exclude='object').columns

# Display the frequency of each category in the categorical columns
for var in cat_columns: 
    
    print(heart[var].value_counts())

In [None]:
for var in cat_columns:
    # Relative frequency of every category.
    print(heart[var].value_counts(normalize=True))
    # Mean of the target (disease rate) per category. "Target" was cast to object
    # earlier, so it is converted back to a number before averaging; grouping the
    # target by itself is skipped because selecting [var, "Target"] would then
    # yield the same column twice.
    if var != "Target":
        print(
            heart.assign(Target=heart["Target"].astype(float))
            .groupby(var, as_index=False)["Target"]
            .mean()
            .sort_values(by="Target", ascending=False)
        )

In [None]:
cat_columns

In [None]:
for var in num_columns: 
    
    print("The descriptive statistics for ",var, " are ",stats.describe(heart[var]))
    

In [None]:
# Data validation: implausible values are replaced with missing values (NaN).
# Each column is rebuilt with Series.mask and assigned back, so the change is
# written to `heart` itself. The chained form `heart[col][mask] = value` may
# write to a temporary copy and leave `heart` unchanged.

heart['RestingBP'] = heart['RestingBP'].mask(heart['RestingBP'] == 0)
heart['Cholesterol'] = heart['Cholesterol'].mask(heart['Cholesterol'] == 0)
heart['Oldpeak'] = heart['Oldpeak'].mask(heart['Oldpeak'] < 0)
#heart['Oldpeak'] = np.abs(heart['Oldpeak'])
heart['Age'] = heart['Age'].mask(heart['Age'] > 100)

#Winsorization
# Adjust to the minimum or the maximum value

In [None]:
# Any category with fewer than 20 occurrences is replaced with a missing value.

for var in cat_columns:
    level_counts = heart[var].value_counts()
    # The index of value_counts() holds the category labels. Reading it directly
    # avoids relying on the column name that reset_index() produces, which
    # differs between pandas versions.
    rare_levels = level_counts[level_counts < 20].index
    heart[var] = heart[var].mask(heart[var].isin(rare_levels))


In [None]:
# Count the number of missing values in each column
heart.isnull().sum().sort_values(ascending=False)

In [None]:
# In case you want to delete a column
#heart.drop(["Cholesterol"],axis=1, inplace = True)

In [None]:
# Impute missing values: Oldpeak with its minimum, Cholesterol with its median.
# The filled column is assigned back; `fillna(..., inplace=True)` on a selected
# column may act on a temporary object and leave `heart` unchanged.
heart['Oldpeak'] = heart['Oldpeak'].fillna(heart['Oldpeak'].min())
heart['Cholesterol'] = heart['Cholesterol'].fillna(heart['Cholesterol'].median())

In [None]:
heart['ChestPainType'].value_counts().idxmax()

In [None]:
# Replace missing values in a categorical column with its most frequent category (mode).
# The result is assigned back rather than filled in place on a selected column.
heart["ChestPainType"] = heart["ChestPainType"].fillna(heart['ChestPainType'].value_counts().idxmax())

In [None]:
# Remove the missing rows
heart.dropna(axis=0, inplace = True)

In [None]:
# Create new Features
heart['Chol_BP'] = heart['Cholesterol'] / heart['RestingBP']

round(heart['Chol_BP'].mean(),2)

In [None]:
# Usage of Lambda (Alternative defining function separately and use it separately)
print('ST_Slope - Total Count')
heart['ST_Slope'].apply(lambda num:num[:1]).value_counts()[:3]

In [None]:
# One categorical, one numerical column: mean of every numeric column per category.
# numeric_only=True keeps the remaining string columns out of the average.
for var in cat_columns:
    print(heart.groupby(var).mean(numeric_only=True))

In [None]:
print("Variable ----Maximum Value ----- Maximum Value Location")
for var in num_columns: 
    print(var, "-----", heart[var].max(), "-----",heart[var].idxmax())    

In [None]:
# Another filter example: rows whose Cholesterol lies in [100, 200].
# Series.between includes both end points by default, so no `inclusive` flag is needed.
heart[heart['Cholesterol'].between(100, 200)]

In [None]:
from itertools import product
cat1 = heart[cat_columns]
cat2 = heart[cat_columns]
cat_var_prod = list(product(cat1,cat2, repeat = 1))
cat_var_prod

In [None]:
# Cross Tabulations of the categorical variables
for var1 in cat_columns:
    for var2 in cat_columns:
        print(pd.crosstab(heart[var1], heart[var2], normalize=True))


In [None]:
import scipy.stats as ss

# Chi-square test of independence for every ordered pair of distinct categorical columns.
result = []
for var1, var2 in cat_var_prod:
    if var1 != var2:
        contingency = pd.crosstab(heart[var1], heart[var2])
        # chi2_contingency returns (statistic, p-value, dof, expected); element 1 is the p-value.
        p_value = ss.chi2_contingency(contingency)[1]
        result.append((var1, var2, p_value))

# The stored number is a p-value, so the column is named accordingly (it was labelled "coef").
chi_test_output = pd.DataFrame(result, columns = ["var1", "var2", "p_value"])
chi_test_output

In [None]:
# Pairwise correlation between the numerical variables.
# The frame still holds string columns, so the numeric ones are selected explicitly.
heart.select_dtypes(include="number").corr()

In [None]:
def Zscore_outlier(df, threshold=3):
    """Print and return the values whose absolute z-score exceeds `threshold`.

    Parameters
    ----------
    df : iterable of numbers (the notebook passes one numeric column).
    threshold : cut-off applied to |z|; defaults to 3.

    Returns
    -------
    list
        The outlying values, in their original order. Empty when the data
        has zero spread, because a z-score is undefined in that case.
    """
    m = np.mean(df)
    sd = np.std(df)
    if sd == 0:
        # Constant data: dividing by sd would only produce NaN/inf.
        out = []
    else:
        out = [i for i in df if np.abs((i - m) / sd) > threshold]
    print("Outliers:",out)
    return out
    
# The function prints its own findings, so only the column name is printed here
# (printing the function call itself would add a stray extra line per column).
for var in num_columns:
    print(var)
    Zscore_outlier(heart[var])

In [None]:
def iqr_outliers(df, k=1.5):
    """Print and return the values outside the Tukey fences of a pandas Series.

    Parameters
    ----------
    df : pandas Series of numbers.
    k : multiple of the inter-quartile range used for the fences; defaults to 1.5.

    Returns
    -------
    list
        Values below q1 - k*IQR or above q3 + k*IQR, in their original order.
    """
    q1 = df.quantile(0.25)
    q3 = df.quantile(0.75)
    iqr = q3 - q1
    lower_fence = q1 - k * iqr
    upper_fence = q3 + k * iqr
    # A boolean mask replaces the element-by-element Python loop.
    out = df[(df > upper_fence) | (df < lower_fence)].tolist()
    print("Outliers:",out)
    return out
    
# The function prints its own findings; print the column name so the output is labelled.
for var in num_columns:
    print(var)
    iqr_outliers(heart[var])

In [None]:
def Winsorization_outliers(df, lower=0.01, upper=0.99):
    """Print and return the values outside the given percentile band of a pandas Series.

    The function only flags values; it does not clip (winsorize) them.

    Parameters
    ----------
    df : pandas Series of numbers.
    lower, upper : quantiles (between 0 and 1) bounding the accepted band;
        default to the 1st and 99th percentiles.

    Returns
    -------
    list
        Values strictly below the lower or strictly above the upper quantile,
        in their original order.
    """
    low_cut = df.quantile(lower)
    high_cut = df.quantile(upper)
    out = df[(df > high_cut) | (df < low_cut)].tolist()
    print("Outliers:",out)
    return out

# The function prints its own findings; print the column name so the output is labelled.
for var in num_columns:
    print(var)
    Winsorization_outliers(heart[var])

In [None]:
def ZRscore_outlier(df, threshold=3):
    """Print and return the values whose modified (robust) z-score exceeds `threshold`.

    The modified z-score is 0.6745 * (x - median) / MAD, where MAD is the raw
    median absolute deviation.

    Parameters
    ----------
    df : array-like of numbers.
    threshold : cut-off applied to the absolute modified z-score; defaults to 3.

    Returns
    -------
    list
        The outlying values, in their original order. Empty when MAD is 0,
        because the score is undefined in that case.
    """
    med = np.median(df)
    # median_abs_deviation returns the raw MAD (scale=1.0). The removed
    # stats.median_absolute_deviation already applied the 1.4826 normal-consistency
    # scale, so combining it with the 0.6745 factor normalised the score twice.
    mad = stats.median_abs_deviation(df)
    if mad == 0:
        out = []
    else:
        out = [i for i in df if np.abs(0.6745 * (i - med) / mad) > threshold]
    print("Outliers:",out)
    return out
    
# The function prints its own findings; print the column name so the output is labelled.
for var in num_columns:
    print(var)
    ZRscore_outlier(heart[var])

In [None]:
def grubbs_test(x, alpha=0.05):
    """Two-sided Grubbs test for a single outlier.

    Prints the calculated and critical G statistics and the verdict.

    Parameters
    ----------
    x : array-like of numbers; NaN entries are ignored.
    alpha : significance level; defaults to 0.05.

    Returns
    -------
    bool
        True when the calculated statistic reaches the critical value
        ("Outliers Exist"), False otherwise.

    Raises
    ------
    ValueError
        If fewer than three non-missing observations are supplied (the
        t distribution used for the critical value needs n - 2 > 0).
    """
    values = np.asarray(x, dtype=float)
    values = values[~np.isnan(values)]
    n = values.size
    if n < 3:
        raise ValueError("Grubbs test needs at least 3 observations")
    mean_x = values.mean()
    # Grubbs' statistic is defined with the sample standard deviation (ddof=1);
    # stating it explicitly makes the result independent of the input container type.
    sd_x = values.std(ddof=1)
    if sd_x == 0:
        # All values identical: nothing can deviate from the mean.
        print("No outliers\n")
        return False
    g_calculated = np.max(np.abs(values - mean_x)) / sd_x
    print("Grubbs Calculated Value:",g_calculated)
    t_value = stats.t.ppf(1 - alpha / (2 * n), n - 2)
    g_critical = ((n - 1) / np.sqrt(n)) * np.sqrt(np.square(t_value) / (n - 2 + np.square(t_value)))
    print("Grubbs Critical Value:",g_critical)
    if g_critical > g_calculated:
        print("No outliers\n")
        return False
    print("Outliers Exist\n")
    return True
        
# The function prints its own findings; print the column name so the output is labelled.
for var in num_columns:
    print(var)
    grubbs_test(heart[var])

In [None]:
from sklearn.cluster import DBSCAN
def DB_outliers(df, eps=2, min_samples=5):
    """Cluster one numeric column with DBSCAN and report the size of every cluster.

    Parameters
    ----------
    df : pandas Series of numbers; it is reshaped to a single-feature matrix.
    eps : neighbourhood radius passed to DBSCAN, in the units of the column; defaults to 2.
    min_samples : minimum neighbourhood size passed to DBSCAN; defaults to 5.

    Returns
    -------
    pandas Series
        Number of observations per cluster label, largest first. DBSCAN marks
        noise points (the outlier candidates) with the label -1.
    """
    outlier_detection = DBSCAN(eps = eps, metric='euclidean', min_samples = min_samples)
    clusters = outlier_detection.fit_predict(df.values.reshape(-1,1))
    counts = pd.Series(clusters, name='cluster').value_counts().sort_values(ascending=False)
    print(counts)
    return counts

# The function prints its own findings; print the column name so the output is labelled.
for var in num_columns:
    print(var)
    DB_outliers(heart[var])

In [None]:
from sklearn.ensemble import IsolationForest
def Iso_outliers(df):
    """Fit an IsolationForest on one numeric column and print how many rows fall under each predicted label."""
    forest = IsolationForest(random_state = 1, contamination= 'auto')
    # The column is reshaped to a single-feature matrix before fitting.
    labels = forest.fit_predict(df.values.reshape(-1,1))
    label_frame = pd.DataFrame({'cluster': labels})
    label_counts = label_frame['cluster'].value_counts()
    print(label_counts.sort_values(ascending=False))
    
for var in num_columns: 
    Iso_outliers(heart[var])

## Visualizations

In [None]:
for var in num_columns: 
    plt.figure()
    sns.histplot(data = heart, x = var, kde = True, color='teal', alpha=0.6)

In [None]:
# One count plot per categorical column, split by the target.
# The figure size is set on the figure that is actually drawn; a figure created
# before the loop would stay empty because every iteration opens its own.
for var in cat_columns:
    plt.figure(figsize=(15,8))
    sns.countplot(data = heart, x = var,   hue = 'Target')

In [None]:
sns.pairplot(heart, hue='Target')

In [None]:
# Box plot of every numeric column across every categorical column, one panel per target value.
# sns.catplot is a figure-level function that creates its own figure, so no
# plt.figure() call is made here (it would only leave an empty figure behind).
for var in num_columns:
    for i in cat_columns:
        sns.catplot(x=i,y=var,data=heart,kind='box',col='Target',palette='Dark2')

In [None]:
# Correlation heat map of the numeric columns (string columns are excluded explicitly).
sns.heatmap(heart.select_dtypes(include="number").corr(), annot=True)

In [None]:
import statsmodels.api as sm
def qq_plots(df):
    """Draw a normal Q-Q plot of `df` with a standardized reference line.

    Parameters
    ----------
    df : array-like of numbers (the notebook passes one numeric column).
    """
    fig, ax = plt.subplots(figsize=(10, 4))
    # Passing `ax` makes qqplot draw on the sized figure; without it qqplot
    # opens a figure of its own and the one created here stays empty.
    sm.qqplot(df,line='s', ax=ax)
    ax.set_title("Normal QQPlot")
    plt.show()
for var in num_columns: 
    qq_plots(heart[var])
    

## Scikit-Learn

In [None]:
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OrdinalEncoder, MaxAbsScaler
from sklearn.preprocessing import PowerTransformer, QuantileTransformer, OneHotEncoder

In [None]:
X = heart.drop(["Target"],axis = 1)
# "Target" was cast to object for the exploratory part; scikit-learn classifiers
# and metrics expect numeric (or string) labels, so it is cast back to integers.
y = heart['Target'].astype(int)


In [None]:
categorical_columns = X.select_dtypes(include='object').columns
numerical_columns = X.select_dtypes(exclude='object').columns

In [None]:
X = pd.get_dummies(data = X, prefix = categorical_columns, prefix_sep='_',
               columns = categorical_columns,
               drop_first =True,
              dtype='int8')
X

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=101)

In [None]:
col_names = X_train.columns

In [None]:
# You can perform Box Cox or Yeo-Johnson (Non Normal transformation)
# Both transformers are fitted on the training split only and then applied to the test split.
# Passing index=... keeps the original row labels, so the feature frames stay
# aligned with y_train / y_test in any label-based pandas operation.
bctrans = PowerTransformer(method = 'yeo-johnson').fit(X_train)
X_train = pd.DataFrame(bctrans.transform(X_train),columns = col_names, index = X_train.index)
X_test = pd.DataFrame(bctrans.transform(X_test),columns = col_names, index = X_test.index)
scaler = preprocessing.StandardScaler().fit(X_train)
X_train = pd.DataFrame(scaler.transform(X_train),columns = col_names, index = X_train.index)
X_test = pd.DataFrame(scaler.transform(X_test),columns = col_names, index = X_test.index)
X_train.head()

In [None]:
# Unpenalized logistic regression. C is the inverse regularization strength, so
# C=np.inf switches the default L2 penalty off. The string penalty='none' is
# rejected by recent scikit-learn releases.
log_model = LogisticRegression(C = np.inf)
log_model.fit(X_train,y_train)
log_model.coef_

In [None]:
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report,plot_confusion_matrix, cohen_kappa_score

In [None]:
y_pred = log_model.predict(X_test)

In [None]:
y_pred_proba = log_model.predict_proba(X_test)

In [None]:
accuracy_score(y_test,y_pred)

In [None]:
y_test.value_counts()

In [None]:
confusion_matrix(y_test,y_pred)

In [None]:
cohen_kappa_score(y_test,y_pred)

In [None]:
plot_confusion_matrix(log_model,X_test,y_test)

In [None]:
print(classification_report(y_test,y_pred))

In [None]:
log_model.predict_proba(X_train.iloc[0].values.reshape(1, -1))

In [None]:
log_model.predict(X_train.iloc[0].values.reshape(1, -1))

In [None]:
from sklearn.metrics import precision_recall_curve,plot_precision_recall_curve,plot_roc_curve, roc_auc_score, roc_curve

In [None]:
[fpr, tpr, thr] = roc_curve(y_test, y_pred_proba[:,1], pos_label = 1)
fpr, tpr, thr

In [None]:
roc_auc_score(y_test, y_pred_proba[:,1])

In [None]:
plot_precision_recall_curve(log_model,X_test,y_test)

In [None]:
plot_roc_curve(log_model,X_test,y_test)

In [None]:
from sklearn.model_selection import cross_val_score
scores_accuracy = cross_val_score(log_model, X, y, cv=10, scoring='accuracy')
scores_log_loss = cross_val_score(log_model, X, y, cv=10, scoring='neg_log_loss')
scores_auc = cross_val_score(log_model, X, y, cv=10, scoring='roc_auc')

In [None]:
scores_accuracy,scores_log_loss,scores_auc

In [None]:
scores_accuracy.mean(),scores_log_loss.mean(), scores_auc.mean()

In [None]:
logreg100 = LogisticRegression(C=100,  random_state=101)
logreg100.fit(X_train, y_train)
print(logreg100.score(X_train, y_train))
print(logreg100.score(X_test, y_test))

In [None]:
logreg001 = LogisticRegression(C=0.01,  random_state=101)
logreg001.fit(X_train, y_train)
print(logreg001.score(X_train, y_train))
print(logreg001.score(X_test, y_test))

In [None]:
# Hyper parameter tuning
from sklearn.model_selection import GridSearchCV
# liblinear supports both the L1 and the L2 penalty; the default solver rejects 'l1'.
logreg = LogisticRegression(solver = 'liblinear')
# A single dict searches every penalty/C combination. A list of two dicts would
# search the penalties and the C values as two unrelated grids.
parameters = {'penalty':['l1','l2'],
              'C':[1, 10, 100, 1000]}
grid_search = GridSearchCV(estimator = logreg,
                           param_grid = parameters,
                           scoring = 'accuracy',
                           cv = 5,
                           verbose=0)
grid_search.fit(X_train, y_train)


In [None]:
print(grid_search.best_score_)
print(grid_search.best_params_)
print(grid_search.best_estimator_)
print(grid_search.cv_results_)

In [None]:
from sklearn.feature_selection import RFECV, RFE

In [None]:
X_train.shape

In [None]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import SelectPercentile
from sklearn.feature_selection import chi2
from sklearn.feature_selection import f_classif

# chi2 only accepts non-negative features, but X_train has been standardized and
# therefore contains negative values. The ANOVA F-test (also SelectPercentile's
# default score function) is used for both selectors instead.
kbest_selector = SelectKBest(f_classif, k=6).fit(X_train, y_train)
# get_support() is a boolean column mask; indexing X_train with it keeps a
# DataFrame with column names (fit_transform would return a bare array without .head()).
X_new = X_train.loc[:, kbest_selector.get_support()]
print(X_new.head())
percentile_selector = SelectPercentile(percentile = 25).fit(X_train, y_train)
X_new1 = X_train.loc[:, percentile_selector.get_support()]
print(X_new1.head())

In [None]:
# Recursive feature elimination with cross-validation.
# The "accuracy" score is the fraction of correct classifications.
rfecv = RFECV(estimator=LogisticRegression(), step=1, cv=10, scoring='accuracy',min_features_to_select = 5)
rfecv.fit(X_train,y_train)

print("Optimal number of features: %d" % rfecv.n_features_)
# support_ is a mask over the columns the selector was fitted on, i.e. X_train's.
print('Selected features: %s' % list(X_train.columns[rfecv.support_]))
# Mean cross-validated score per number of features (replaces the removed grid_scores_).
rfecv.cv_results_['mean_test_score']

In [None]:
# Mean cross-validated accuracy against the number of features kept.
rfecv_mean_scores = rfecv.cv_results_['mean_test_score']
# The search stops at min_features_to_select, so the x axis starts there rather than at 1.
n_features_tried = range(rfecv.min_features_to_select,
                         rfecv.min_features_to_select + len(rfecv_mean_scores))
plt.figure(figsize=(10,6))
plt.xlabel("Number of features selected")
plt.ylabel("Cross validation score (accuracy)")
plt.plot(n_features_tried, rfecv_mean_scores)
plt.show()

In [None]:
#Selected_features = ['Fasting_Blood_Sugar', 'Chol_BP', 'Sex_M', 'ChestPainType_ATA', 'ChestPainType_NAP', 'ExerciseAngina_Y', 'ST_Slope_Flat', 'ST_Slope_Up']
#X_train = X_train[Selected_features]
#X_test = X_test[Selected_features]
plt.subplots(figsize=(16, 10))
sns.heatmap(X_train.corr(), annot=True, cmap="RdYlGn")
plt.show()

# Using statsmodels

In [None]:
y_test = pd.DataFrame(y_test)
print(type(y_test))
y_train = pd.DataFrame(y_train)
print(type(y_train))
print(type(X_train))

In [None]:
X_train.head()

In [None]:
import statsmodels.api as sm
# statsmodels and scikit-learn both need a numeric response. np.asarray accepts a
# Series, DataFrame or array, so this cell can be re-run safely, and ravel() gives
# the 1-D label vector the estimators expect.
y_train = np.asarray(y_train).astype(int).ravel()
y_test = np.asarray(y_test).astype(int).ravel()
# NOTE(review): no intercept column is added (sm.add_constant); adding one would
# also require adding it to X_test before predicting — confirm the intent.
log_reg = sm.Logit(y_train, X_train)
LogReg_Model = log_reg.fit()


In [None]:
print(LogReg_Model.summary())

In [None]:
LogReg_Model.pvalues[0:17].plot.bar()
plt.axhline(y = 0.05);

In [None]:
LogReg_Model.params[0:16].plot.bar();

In [None]:
# Keep only the coefficients whose p-value is below 0.05.
quant_df_main = {
    name: LogReg_Model.params[name]
    for name in LogReg_Model.params.to_dict().keys()
    if LogReg_Model.pvalues[name] < 0.05
}

quant_df_main

In [None]:
quant_df_main_odds = {k : np.exp(v) for k, v in quant_df_main.items()}
quant_df_main_odds

In [None]:
X_test.head()

In [None]:
yhat = LogReg_Model.predict(X_test)
prediction = list(map(round, yhat))

In [None]:
# confusion matrix
cm = confusion_matrix(y_test, prediction)
cm

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [None]:
vif = pd.DataFrame()
vif['Features'] = X_train.columns
vif['VIF'] = [variance_inflation_factor(X_train.values, i) for i in range(X_train.shape[1])]
vif['VIF'] = round(vif['VIF'], 2)
vif = vif.sort_values(by = "VIF", ascending = False)
vif[vif["VIF"]>10]["Features"]

In [None]:
# Remove the features with VIF > 10 from BOTH splits. Dropping them from the
# training frame alone would leave X_test with columns the later models were
# never fitted on. errors="ignore" lets the cell be re-run.
high_vif_features = vif.loc[vif["VIF"] > 10, "Features"].tolist()
X_train = X_train.drop(columns = high_vif_features, errors = "ignore")
X_test = X_test.drop(columns = high_vif_features, errors = "ignore")

In [None]:
quantile_transformer = preprocessing.QuantileTransformer(random_state=0)
X_train_trans = quantile_transformer.fit_transform(X_train)
X_test_trans = quantile_transformer.transform(X_test)

quantile_transformer = preprocessing.QuantileTransformer(output_distribution='normal', random_state=0)
X_trans = quantile_transformer.fit_transform(X_train)


In [None]:
X_trans

## KNN Classifier

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
knn_model = KNeighborsClassifier(n_neighbors=5)
knn_model.fit(X_train,y_train)

In [None]:
knn_model.score(X_test,y_test)

In [None]:
y_pred = knn_model.predict(X_test)

In [None]:
accuracy_score(y_test,y_pred)

In [None]:
confusion_matrix(y_test,y_pred)

In [None]:
cohen_kappa_score(y_test,y_pred)

In [None]:
y_pred_proba = knn_model.predict_proba(X_test)

In [None]:
roc_auc_score(y_test, y_pred_proba[:,1])

In [None]:
np.round(np.sqrt(X_train.shape[0]),0)

In [None]:
neighbors = np.arange(1, int(np.round(np.sqrt(X_train.shape[0]),0)))
train_accuracy = np.empty(len(neighbors))
test_accuracy = np.empty(len(neighbors))


for i, k in enumerate(neighbors):
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train) 
   
    train_accuracy[i] = knn.score(X_train, y_train)
    test_accuracy[i] = knn.score(X_test, y_test)
    
    # Generate plot
plt.plot(neighbors, test_accuracy, label = 'Testing dataset Accuracy')
plt.plot(neighbors, train_accuracy, label = 'Training dataset Accuracy')
 
plt.legend()
plt.xlabel('n_neighbors')
plt.ylabel('Accuracy')
plt.show()
test_accuracy

In [None]:
n_neighbors = np.arange(1, int(np.round(np.sqrt(X_train.shape[0]),0)))
weights = ['uniform', 'distance']
leaf_size = [15,20,25,30,35,40,45,50]
p = [1,2,3]
hyperparams = {'weights': weights, 'n_neighbors': n_neighbors, 'leaf_size':leaf_size, 'p':p}
gd=GridSearchCV(estimator = KNeighborsClassifier(), param_grid = hyperparams, verbose=True, 
                cv=5, scoring = "roc_auc")
gd.fit(X_train, y_train)
print(gd.best_score_)
print(gd.best_estimator_)


In [None]:
from sklearn.model_selection import RandomizedSearchCV
n_neighbors = np.arange(1, int(np.round(np.sqrt(X_train.shape[0]),0)))
weights = ['uniform', 'distance']
leaf_size = [15,20,25,30,35,40,45,50]
p = [1,2,3]
hyperparams = {'weights': weights, 'n_neighbors': n_neighbors, 'leaf_size':leaf_size, 'p':p}
gd=RandomizedSearchCV(estimator = KNeighborsClassifier(), param_distributions = hyperparams, verbose=True, n_iter = 30,
                cv=5, scoring = "roc_auc",  random_state = 100)
gd.fit(X_train, y_train)
print(gd.best_score_)
print(gd.best_estimator_)


In [None]:
gd.cv_results_

# Create a Pipeline that contains both a StandardScaler and a KNN model

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier

In [None]:
scaler = StandardScaler()
knn = KNeighborsClassifier()

In [None]:
operations = [('scaler',scaler),('knn',knn)]

In [None]:
from sklearn.pipeline import Pipeline
pipe = Pipeline(operations)

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
scale_mean = [True, False]
scale_sd = [True, False]
k_values = list(range(10,20))
p_val = [1,2,3]

In [None]:
param_grid = {'scaler__with_mean':scale_mean,'scaler__with_std':scale_sd,'knn__n_neighbors': k_values, 'knn__p':p_val}
param_grid

In [None]:
full_cv_classifier = GridSearchCV(pipe,param_grid,cv=5,scoring='accuracy')

In [None]:
full_cv_classifier.fit(X_train,y_train)

In [None]:
full_cv_classifier.best_estimator_.get_params()

In [None]:
full_cv_classifier.cv_results_['mean_test_score']

In [None]:
pred = full_cv_classifier.predict(X_test)

In [None]:
confusion_matrix(y_test,pred)

In [None]:
cohen_kappa_score(y_test,pred)

In [None]:
print(classification_report(y_test,pred))

# Support Vector Machines

In [None]:
from sklearn.svm import SVC
from sklearn.svm import NuSVC

In [None]:
model = SVC(kernel='linear', C=1, probability=True, gamma = "auto")
model.fit(X_train, y_train)

In [None]:
Y_pred = model.predict(X_test)
acc_svc = round(model.score(X_train, y_train) * 100, 2)
acc_svc

In [None]:
model = SVC(kernel='linear', C=0.05)
model.fit(X_train, y_train)
Y_pred = model.predict(X_test)
acc_svc = round(model.score(X_train, y_train) * 100, 2)
acc_svc

In [None]:
model = SVC(kernel='rbf', C=1)
model.fit(X_train, y_train)
Y_pred = model.predict(X_test)
acc_svc = round(model.score(X_train, y_train) * 100, 2)
acc_svc

In [None]:
model = SVC(kernel='sigmoid', C=1)
model.fit(X_train, y_train)
Y_pred = model.predict(X_test)
acc_svc = round(model.score(X_train, y_train) * 100, 2)
acc_svc

In [None]:
model = SVC(kernel='poly', C=1, degree = 1)
model.fit(X_train, y_train)
Y_pred = model.predict(X_test)
acc_svc = round(model.score(X_train, y_train) * 100, 2)
acc_svc

In [None]:
model = SVC(kernel='poly', C=1, degree = 2)
model.fit(X_train, y_train)
Y_pred = model.predict(X_test)
acc_svc = round(model.score(X_train, y_train) * 100, 2)
acc_svc

In [None]:
model = SVC(kernel='rbf', C=1,gamma=0.01)
model.fit(X_train, y_train)
Y_pred = model.predict(X_test)
acc_svc = round(model.score(X_train, y_train) * 100, 2)
acc_svc

In [None]:
from sklearn.model_selection import GridSearchCV
SVMC = SVC(probability=True)
svc_param_grid = {'kernel': ['linear', 'poly', 'rbf', 'sigmoid'], 
                  'gamma': [ 0.001, 0.01, 0.1, 1],
                  'degree':[2,3,4,5],
                  'C': [0.01,0.1,1, 10, 50, 100,200,300, 1000]}

#gsSVMC = GridSearchCV(SVMC,param_grid = svc_param_grid, cv=5, scoring="accuracy",  verbose = 1)
gsSVMC=RandomizedSearchCV(estimator = SVMC, param_distributions = svc_param_grid, verbose=True, n_iter = 20,
                cv=5, scoring = "roc_auc",  random_state = 100)

gsSVMC.fit(X_train,y_train)

SVMC_best = gsSVMC.best_estimator_

# Best score
gsSVMC.best_score_

In [None]:
gsSVMC.best_estimator_.get_params()

In [None]:
gsSVMC.cv_results_['mean_test_score']

In [None]:
pred = gsSVMC.predict(X_test)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix
print(confusion_matrix(y_test,pred))
print(classification_report(y_test,pred))

In [None]:
cohen_kappa_score(y_test,pred)

# Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
model = DecisionTreeClassifier()

In [None]:
model.fit(X_train,y_train)

In [None]:
model.decision_path(X_train)

In [None]:
base_pred = model.predict(X_test)

In [None]:
from sklearn.metrics import confusion_matrix,classification_report,plot_confusion_matrix

In [None]:
confusion_matrix(y_test,base_pred)

In [None]:
plot_confusion_matrix(model,X_test,y_test)

In [None]:
print(classification_report(y_test,base_pred))

In [None]:
model.feature_importances_

In [None]:
# Importances are listed in the order of the columns the model was fitted on (X_train).
pd.DataFrame(index=X_train.columns,data=model.feature_importances_,columns=['Feature Importance'])

In [None]:
from sklearn.tree import plot_tree

In [None]:
plt.figure(figsize=(20,15),dpi=300)
# feature_names is passed as a plain list (recent scikit-learn rejects a pandas Index).
plot_tree(model,filled=True,feature_names=list(X_train.columns));

In [None]:
def report_model(model):
    """Print the classification report of a fitted tree on the test split and draw the tree.

    Reads the notebook-level X_test, y_test and X_train.

    Parameters
    ----------
    model : fitted tree estimator accepted by plot_tree.
    """
    model_preds = model.predict(X_test)
    print(classification_report(y_test,model_preds))
    print('\n')
    plt.figure(figsize=(20,15),dpi=300)
    # Node labels come from the columns the trees in this notebook are fitted on
    # (X_train), passed as a plain list.
    plot_tree(model,filled=True,feature_names=list(X_train.columns));

In [None]:
pruned_tree = DecisionTreeClassifier(max_depth=3)
pruned_tree.fit(X_train,y_train)

In [None]:
report_model(pruned_tree)

In [None]:
pruned_tree = DecisionTreeClassifier(max_leaf_nodes=3)
pruned_tree.fit(X_train,y_train)

In [None]:
report_model(pruned_tree)

In [None]:
entropy_tree = DecisionTreeClassifier(criterion='entropy')
entropy_tree.fit(X_train,y_train)

In [None]:
report_model(entropy_tree)

In [None]:
from sklearn.model_selection import GridSearchCV
D_tree = DecisionTreeClassifier()
# max_depth takes an integer or the Python object None (unlimited depth);
# the string 'none' is not a valid value.
D_tree_param_grid = {'criterion': ['gini'],
                  'max_depth': [ None, 4,5,6],
                  'min_samples_split':[2,3,4],
                  'min_samples_leaf': [1,2,3]}

gsDtree=GridSearchCV(estimator = D_tree, param_grid = D_tree_param_grid, verbose=True,
                cv=5, scoring = "roc_auc")

gsDtree.fit(X_train,y_train)

gsDtree_best = gsDtree.best_estimator_

# Best score
gsDtree.best_score_

In [None]:
gsDtree.best_estimator_.get_params()

In [None]:
gsDtree.cv_results_['mean_test_score']

In [None]:
pred = gsDtree.predict(X_test)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix
print(confusion_matrix(y_test,pred))
print(classification_report(y_test,pred))

In [None]:
cohen_kappa_score(y_test,pred)

## Random Forest Classification

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Random forest with 200 trees.
# 'sqrt' names explicitly what 'auto' meant for classifiers; 'auto' is no longer
# accepted by recent scikit-learn releases.
model = RandomForestClassifier(n_estimators=200,max_features='sqrt',criterion='entropy',random_state=100)
model

In [None]:
XYZ = model.fit(X_train,y_train)

In [None]:
preds = model.predict(X_test)

In [None]:
confusion_matrix(y_test,preds)

In [None]:
plot_confusion_matrix(model,X_test,y_test)

In [None]:
model.feature_importances_

In [None]:
# Importances are listed in the order of the columns the model was fitted on (X_train).
pd.DataFrame(index=X_train.columns,data=model.feature_importances_,columns=['Feature Importance'])

In [None]:
# Test error (1 - accuracy) of a random forest as the number of trees grows.
test_error = []

for n in range(25,1000,25):
    # Use n random trees; 'sqrt' is the explicit spelling of the former 'auto'.
    model = RandomForestClassifier(n_estimators=n,max_features='sqrt')
    model.fit(X_train,y_train)
    test_preds = model.predict(X_test)
    test_error.append(1-accuracy_score(test_preds,y_test))


In [None]:
plt.plot(range(25,1000,25),test_error,label='Test Error')
plt.legend()

In [None]:
test_error

In [None]:
from sklearn.model_selection import GridSearchCV
RFC = RandomForestClassifier(random_state=100)
shared_params = {'n_estimators':[200,300,400],
                 'max_features':[2,3,4,5,6],
                 'criterion':['gini','entropy'],
                 'min_samples_split':[2,5,8,12],
                 'min_samples_leaf': [1,2,3,4,5] }
# Out-of-bag scoring is only defined when bootstrap sampling is on, so the two
# settings are searched as separate sub-spaces instead of being crossed freely.
param_grid = [{**shared_params, 'bootstrap':[True], 'oob_score':[True, False]},
              {**shared_params, 'bootstrap':[False], 'oob_score':[False]}]
gsRFtree=RandomizedSearchCV(estimator = RFC, param_distributions = param_grid, verbose=True, n_iter = 20,
                cv=5, scoring = "accuracy")

gsRFtree.fit(X_train,y_train)

gsRFtree_best = gsRFtree.best_estimator_
# NOTE(review): the original stored the forest under the decision-tree name; the
# alias is kept in case later cells read it — confirm and remove if unused.
gsDtree_best = gsRFtree_best

# Best score
gsRFtree.best_score_

In [None]:
print(gsRFtree.best_estimator_.get_params())
print(gsRFtree.cv_results_['mean_test_score'])
pred = gsRFtree.predict(X_test)
print(cohen_kappa_score(y_test,pred))

In [None]:
gsRFtree.cv_results_

In [None]:
print(classification_report(y_test,pred))

In [None]:
plot_confusion_matrix(gsRFtree,X_test,y_test)

# Boosted Trees

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# 'sqrt' is the explicit spelling of what 'auto' meant for this classifier;
# 'auto' is no longer accepted by recent scikit-learn releases.
param_grid = {"n_estimators":[10,20,40,80,160,320],'max_depth':[3,4,5,6], 'max_features':['sqrt']}

In [None]:
gb_model = GradientBoostingClassifier()

In [None]:
grid = GridSearchCV(gb_model,param_grid)

In [None]:
grid.fit(X_train,y_train)

In [None]:
grid.best_params_

In [None]:
predictions = grid.predict(X_test)

In [None]:
plot_confusion_matrix(grid,X_test,y_test)

In [None]:
print(classification_report(y_test,predictions))

In [None]:
grid.best_estimator_.feature_importances_

In [None]:
feat_import = grid.best_estimator_.feature_importances_

In [None]:
# Importances are listed in the order of the columns the model was fitted on (X_train).
imp_feats = pd.DataFrame(index=X_train.columns,data=feat_import,columns=['Importance'])
imp_feats

In [None]:
imp_feats.sort_values("Importance",ascending=False)

In [None]:
imp_feats.describe().transpose()

In [None]:
imp_feats = imp_feats[imp_feats['Importance'] > 0.0005]

In [None]:
imp_feats.sort_values('Importance')

In [None]:
plt.figure(figsize=(14,6),dpi=200)
sns.barplot(data=imp_feats.sort_values('Importance'),x=imp_feats.sort_values('Importance').index,y='Importance')
plt.xticks(rotation=90);

In [None]:
from sklearn.model_selection import GridSearchCV
model = GradientBoostingClassifier()
# 'auto' is dropped from max_features: for this classifier it duplicated 'sqrt'
# and it is no longer accepted by recent scikit-learn releases.
param_grid = {'n_estimators':[10,20,40,80,160,320],
              'learning_rate':[0.01,0.05,0.1,0.2, 0.3],
              'min_samples_split':[2,5,8,12],
              'min_samples_leaf': [1,2,3,4,5] ,
              'max_depth':[2,3,4,5],
              'max_features':['sqrt', 'log2']}
# The estimator defined above is `model`; the name `gbc` was never defined.
gsgbctree=RandomizedSearchCV(estimator = model, param_distributions = param_grid, verbose=True, n_iter = 50,
                cv=5, scoring = "accuracy",  random_state = 100)

gsgbctree.fit(X_train,y_train)

gsDtree_best = gsgbctree.best_estimator_

# Best score
gsgbctree.best_score_

In [None]:
gsgbctree.best_estimator_

In [None]:
pip install xgboost

## XGBOOST

In [None]:
from xgboost import XGBClassifier

In [None]:
model = XGBClassifier(use_label_encoder=False, 
                      booster='gbtree', # boosting algorithm to use, default gbtree, othera: gblinear, dart
                      n_estimators=100, # number of trees, default = 100
                      eta=0.3, # this is learning rate, default = 0.3
                      max_depth=6, # maximum depth of the tree, default = 6
                      gamma = 0, # used for pruning, if gain < gamma the branch will be pruned, default = 0
                      reg_lambda = 1, # regularization parameter, defautl = 1
                      #min_child_weight=0 # this refers to Cover which is also responsible for pruning if not set to 0
                     )


In [None]:
clf = model.fit(X_train, y_train)

In [None]:
# Predict class labels on training data
pred_labels_tr = model.predict(X_train)
# Predict class labels on a test data
pred_labels_te = model.predict(X_test)

In [None]:
score_te = model.score(X_test, y_test)
score_te

In [None]:
score_tr = model.score(X_train, y_train)
score_tr

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

In [None]:
# subsample must be a positive fraction (0 would leave no rows to train on) and a
# learning rate of 0 would add trees that change nothing, so both lists start above 0.
param_grid = {'eta':[0.05,0.2,0.4,0.6,0.8,1],'gamma':[0,1,2,4,8,16],"n_estimators":[1,5,10,20,40,100],'min_child_weight':[0,1,2,4,8,16],'max_depth':range(2,10),'subsample':[0.2,0.4,0.6,0.8,1]}


In [None]:
grid = RandomizedSearchCV(model,param_distributions=param_grid,scoring = 'accuracy',n_iter = 50)
grid.fit(X_train,y_train)
grid.best_params_


In [None]:
y_pred = grid.predict(X_test)
# ROC AUC ranks observations, so it is computed from the predicted probability of
# the positive class rather than from the hard 0/1 predictions.
y_score = grid.predict_proba(X_test)[:, 1]
[round(accuracy_score(y_test,y_pred),3),round(cohen_kappa_score(y_test,y_pred),3),round(roc_auc_score(y_test,y_score),3)]


## Naive bayes

In [None]:
from sklearn.naive_bayes import BernoulliNB
classifier = BernoulliNB()
classifier.fit(X_train, y_train)

In [None]:
y_pred = classifier.predict(X_test)

In [None]:
plot_confusion_matrix(classifier,X_test,y_test)

In [None]:
accuracy_score(y_test, y_pred)

In [None]:
param_grid = {'alpha':[0,0.25,0.5,0.75,1]}

In [None]:
grid = GridSearchCV(classifier,param_grid=param_grid)
grid.fit(X_train,y_train)
grid.best_params_

In [None]:
y_pred = grid.predict(X_test)
# ROC AUC ranks observations, so it is computed from the predicted probability of
# the positive class rather than from the hard 0/1 predictions.
y_score = grid.predict_proba(X_test)[:, 1]
[round(accuracy_score(y_test,y_pred),3),round(cohen_kappa_score(y_test,y_pred),3),round(roc_auc_score(y_test,y_score),3)]


In [None]:
plot_confusion_matrix(classifier,X_test,y_test)

In [None]:
from sklearn.linear_model import Perceptron

In [None]:
model = Perceptron()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [None]:
# The fitted Perceptron is stored in `model`; the name `perceptron` was never defined.
plot_confusion_matrix(model,X_test,y_test)

In [None]:
param_grid = {'penalty':['l2','l1','elasticnet'],
             'alpha':[0.001,0.01,0.1,1,10,100],
              'l1_ratio':[0.05,0.15,0.25,0.5,0.75,0.85,0.95]}


In [None]:
grid = GridSearchCV(model,param_grid=param_grid)
grid.fit(X_train,y_train)
grid.best_params_

In [None]:
y_pred = grid.predict(X_test)
# ROC AUC needs a continuous score. The Perceptron has no predict_proba, so the
# signed distance to the decision boundary is used instead of the hard predictions.
y_score = grid.decision_function(X_test)
[round(accuracy_score(y_test,y_pred),3),round(cohen_kappa_score(y_test,y_pred),3),round(roc_auc_score(y_test,y_score),3)]


In [None]:
from sklearn.neural_network import MLPClassifier

In [None]:
# Baseline multi-layer perceptron with default hyper-parameters, fitted on the training split.
# Note: this rebinds the notebook-wide name `model`.
model = MLPClassifier()
model.fit(X_train, y_train)
# Predict with the MLP that was just fitted. Calling `perceptron.predict` here would
# either fail on a fresh kernel or silently score a different, stale estimator.
y_pred = model.predict(X_test)

In [None]:
# Confusion matrix of the estimator currently bound to `model` on the test split.
# NOTE(review): `plot_confusion_matrix` is not available in recent scikit-learn releases
# (replaced by `ConfusionMatrixDisplay.from_estimator`) - confirm the pinned version.
plot_confusion_matrix(model,X_test,y_test)

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV


In [None]:
# Grid of MLP settings: activation function, learning-rate schedule,
# initial learning rate and whether samples are shuffled.
param_grid = dict(
    activation=['identity', 'logistic', 'tanh', 'relu'],
    learning_rate=['constant', 'invscaling', 'adaptive'],
    learning_rate_init=[0.001, 0.005, 0.01],
    shuffle=[True, False],
)

In [None]:
# Exhaustive search over `param_grid` for the current `model`;
# `fit` returns the search object itself, so construction and fitting are chained.
grid = GridSearchCV(estimator=model, param_grid=param_grid).fit(X_train, y_train)
grid.best_params_

In [None]:
# Test-set predictions of the tuned search object, summarised as
# [accuracy, Cohen's kappa, ROC AUC], each rounded to 3 decimals.
# NOTE(review): ROC AUC is computed from hard class labels here, not from scores/probabilities.
y_pred = grid.predict(X_test)
[
    round(metric(y_test, y_pred), 3)
    for metric in (accuracy_score, cohen_kappa_score, roc_auc_score)
]


# Implementing an end-to-end project using Pipeline

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import  MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import cohen_kappa_score
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier

In [None]:
def GetBasedModel():
    """Build the baseline model zoo.

    Returns:
        list[tuple[str, estimator]]: ``(short_name, estimator)`` pairs in a fixed
        order. Every estimator is newly constructed with default arguments,
        except the SVC, which is created with ``probability=True``.
    """
    return [
        ('LR', LogisticRegression()),
        ('LDA', LinearDiscriminantAnalysis()),
        ('KNN', KNeighborsClassifier()),
        ('CART', DecisionTreeClassifier()),
        ('NB', GaussianNB()),
        ('SVM', SVC(probability=True)),
        ('AB', AdaBoostClassifier()),
        ('GBM', GradientBoostingClassifier()),
        ('RF', RandomForestClassifier()),
        ('NN', MLPClassifier()),
    ]


In [None]:
def BasedLine2(X_train, y_train, models, num_folds=10, scoring='accuracy'):
    """Cross-validate each model and print a one-line summary per model.

    Args:
        X_train: Training features passed unchanged to ``cross_val_score``.
        y_train: Training labels passed unchanged to ``cross_val_score``.
        models: Iterable of ``(name, estimator)`` pairs to evaluate.
        num_folds: Number of stratified folds (default 10, the previous fixed value).
        scoring: Scoring name handed to ``cross_val_score``
            (default ``'accuracy'``, the previous fixed value).

    Returns:
        tuple[list, list]: ``(names, results)`` where ``results[i]`` holds the
        per-fold scores of the model called ``names[i]``.
    """
    # The splitter is created without shuffling, so it is identical for every
    # model; build it once instead of once per loop iteration.
    kfold = StratifiedKFold(n_splits=num_folds)

    results = []
    names = []
    for name, model in models:
        cv_results = cross_val_score(model, X_train, y_train, cv=kfold, scoring=scoring)
        results.append(cv_results)
        names.append(name)
        # Printed as "<name>: <mean score> (<std of fold scores>)".
        print("%s: %f (%f)" % (name, cv_results.mean(), cv_results.std()))

    return names, results

In [None]:
# Baseline run: cross-validate every unscaled model on the training split.
# `names` / `results` hold the model labels and their per-fold scores.
models = GetBasedModel()
names,results = BasedLine2(X_train, y_train,models)

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler


def GetScaledModel(nameOfScaler):
    """Build scaler + classifier pipelines for every baseline model.

    Args:
        nameOfScaler: ``'standard'`` for ``StandardScaler`` or ``'minmax'``
            for ``MinMaxScaler``.

    Returns:
        list[tuple[str, Pipeline]]: ``(nameOfScaler + model_suffix, pipeline)``
        pairs. Each pipeline has a ``'Scaler'`` step followed by one classifier
        step, and owns its own scaler instance.

    Raises:
        ValueError: If ``nameOfScaler`` is not one of the supported names.
    """
    if nameOfScaler == 'standard':
        make_scaler = StandardScaler
    elif nameOfScaler == 'minmax':
        make_scaler = MinMaxScaler
    else:
        # Without this branch an unknown name left the scaler unbound and failed
        # later with an unrelated-looking UnboundLocalError.
        raise ValueError(
            "Unknown scaler %r; expected 'standard' or 'minmax'" % (nameOfScaler,)
        )

    # (pipeline-name suffix, classifier step name, classifier class)
    # NOTE(review): the step name 'GMB' (vs. suffix 'GBM') looks like a typo; it is
    # kept unchanged so any existing 'GMB__<param>' references keep working.
    specs = [
        ('LR',   'LR',   LogisticRegression),
        ('LDA',  'LDA',  LinearDiscriminantAnalysis),
        ('KNN',  'KNN',  KNeighborsClassifier),
        ('CART', 'CART', DecisionTreeClassifier),
        ('NB',   'NB',   GaussianNB),
        ('SVM',  'SVM',  SVC),
        ('AB',   'AB',   AdaBoostClassifier),
        ('GBM',  'GMB',  GradientBoostingClassifier),
        ('RF',   'RF',   RandomForestClassifier),
        ('NN',   'NN',   MLPClassifier),
    ]

    # A fresh scaler per pipeline: fitting one pipeline directly must not
    # overwrite the scaling statistics used by the others.
    pipelines = [
        (nameOfScaler + suffix, Pipeline([('Scaler', make_scaler()), (step, classifier())]))
        for suffix, step, classifier in specs
    ]

    return pipelines

In [None]:
# Repeat the cross-validated comparison with min-max scaling applied inside each pipeline.
# Note: this rebinds `models`, `names` and `results` from the unscaled baseline run.
models = GetScaledModel('minmax')
names,results = BasedLine2(X_train, y_train,models)
