### Importing Required Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
from warnings import filterwarnings
filterwarnings('ignore')

### Importing Data Set

In [None]:
df = pd.read_excel("Sports Data.xlsx",sheet_name = "Sports data for DSBA")
df.head()

In [None]:
df.tail()

In [None]:
df2 = df.copy()
df2.head()

### Finding the shape, info, dimensions and description of the data set

In [None]:
# df.shape is (rows, columns); unpack it once and report both counts.
n_rows, n_cols = df.shape
print("No. of rows in the dataset", n_rows)
print("No. of columns in the dataset", n_cols)

In [None]:
df.info()

#### Descriptive stats

In [None]:
df.describe().T

In [None]:
df[df['Extra_bowls_bowled']==40.0]['Result']

In [None]:
df.describe(include='all').T

### Checking for null values in the data set

In [None]:
df.isnull().sum().sort_values(ascending=False)

In [None]:
df.isnull().sum().sum()

In [None]:
df.size

In [None]:
# Share of all cells (df.size = rows * columns) that are null.
null_cell_count = df.isnull().sum().sum()
tot_missing_values = null_cell_count / df.size
print("Percentage of missing values in the dataset is:", (tot_missing_values*100))

### Dropping the Game_number variable

In [None]:
# Remove the Game_number column. Rebinding `df` (instead of dropping in place)
# and ignoring an already-missing column keeps this cell safe to run twice;
# the in-place version raised KeyError on a second run.
df = df.drop(columns='Game_number', errors='ignore')
df.head()

In [None]:
df['Result'].unique()

Match_format has two spellings for T20: "T20" and "20-20"

First_selection has two spellings for batting: "Batting" and "Bat"

Players_scored_zero has two spellings for three players: 3 and "Three"

player_highest_wicket has two spellings for three: 3 and "Three"

#### Standardizing inconsistent category values

In [None]:
# (column, alternate spelling, canonical value) for every duplicated category.
value_fixes = [
    ('Match_format', '20-20', 'T20'),
    ('First_selection', 'Bat', 'Batting'),
    ('Players_scored_zero', 'Three', 3),
    ('player_highest_wicket', 'Three', 3),
]
for column, alias, canonical in value_fixes:
    # Rows equal to the alias get the canonical value; all other rows are kept.
    df[column] = np.where(df[column] == alias, canonical, df[column])

In [None]:
for col in df.columns:
    print(col.upper())
    print(df[col].unique())
    print("\n")

### Univariate Analysis

In [None]:
%matplotlib inline

In [None]:
df['Avg_team_Age'] = df['Avg_team_Age'].fillna(df['Avg_team_Age'].median())

In [None]:
def remove_outliers(col, whisker=1.5):
    """Return the IQR-based outlier fences for a numeric column.

    Despite its name, this function does not remove or modify anything: it
    only computes the lower and upper bounds beyond which a value is treated
    as an outlier. The caller decides whether to cap or drop such values.

    Parameters
    ----------
    col : array-like of numbers
        Values to analyse (list, NumPy array or pandas Series).
    whisker : float, default 1.5
        Multiple of the inter-quartile range placed beyond each quartile.

    Returns
    -------
    (lower_range, upper_range) : tuple of float
        ``Q1 - whisker * IQR`` and ``Q3 + whisker * IQR``.
    """
    # np.percentile does its own ordering, so no pre-sort is needed (the
    # original called sorted(col) and threw the result away).
    Q1, Q3 = np.percentile(col, [25, 75])
    IQR = Q3 - Q1
    lower_range = Q1 - (whisker * IQR)
    upper_range = Q3 + (whisker * IQR)
    return lower_range, upper_range

In [None]:
# Cap Avg_team_Age at the IQR fences: values below the lower fence become the
# lower fence, values above the upper fence become the upper fence.
lr, ur = remove_outliers(df['Avg_team_Age'])
print(lr,ur)
df['Avg_team_Age'] = df['Avg_team_Age'].clip(lower=lr, upper=ur)

In [None]:
sns.boxplot(data=df['Avg_team_Age'])

In [None]:
df.isnull().sum()

### Imputing Null values

In [None]:
# No longer used by this cell; kept so any other use of `statistics` still resolves.
import statistics

# Fill gaps in every object-typed column with that column's most frequent value.
# Series.mode() ignores missing entries, so NaN itself can never be chosen as the
# fill value, and fillna avoids the np.NaN alias that NumPy 2.0 removed.
for col in df.columns:
    if df[col].dtypes == 'object':
        modes = df[col].mode(dropna=True)
        # An all-missing column has no mode; leave it untouched rather than fail.
        if not modes.empty:
            df[col] = df[col].fillna(modes.iloc[0])


In [None]:
# Every non-object column gets its missing entries replaced by the column median.
numeric_columns = [name for name in df.columns if df[name].dtypes != 'object']
for name in numeric_columns:
    column_median = df[name].median()
    df[name] = df[name].fillna(column_median)

In [None]:
df.isnull().sum()

### data set back up

In [None]:
df_new = df.copy()
df_new.head()

In [None]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [None]:
from sklearn.preprocessing import LabelEncoder

### Dropping the Wicket_keeper_in_team and Audience_number variables

In [None]:
# Remove the two columns excluded from modelling. Rebinding `df` and ignoring
# already-missing columns keeps the cell re-runnable (the in-place drop raised
# KeyError on a second run).
df = df.drop(columns=['Wicket_keeper_in_team', 'Audience_number'], errors='ignore')
df.head()

### Using LabelEncoder to encode the variables for the Decision Tree, Random Forest and ANN models

In [None]:
# Replace every object-typed column with integer codes.
# The single encoder is re-fitted for each column, so after the loop LE only
# holds the fit from the last encoded column.
LE = LabelEncoder()
object_columns = [name for name in df.columns if df[name].dtype == 'object']
for name in object_columns:
    df[name] = LE.fit_transform(df[name])
df.head()

In [None]:
df.info()

### Splitting the data into three dataframes according to the match format
### After label encoding: ODI = 0, T20 = 1 and Test = 2

In [None]:
df_ODI = df[df['Match_format']==0]
df_ODI.head()

In [None]:
df_T20 = df[df['Match_format']==1]
df_T20.head()

In [None]:
df_Test = df[df['Match_format']==2]
df_Test.head()

### We are creating an individual model for each format,
### so the Match_format variable carries no information within each data set. Therefore we drop it from each of them.

In [None]:
# Match_format is constant within each per-format frame, so remove it.
# Each frame is rebound to a new object instead of being mutated in place:
# the originals are boolean-filtered slices of `df`, where an in-place drop
# triggers pandas' SettingWithCopy warning (hidden here by the global warning
# filter) and fails with KeyError when the cell is run a second time.
df_ODI = df_ODI.drop(columns='Match_format', errors='ignore')
df_T20 = df_T20.drop(columns='Match_format', errors='ignore')
df_Test = df_Test.drop(columns='Match_format', errors='ignore')

temp_list = [df_ODI, df_T20, df_Test]
for item in temp_list:
    print(item.head())
    print('\n')

In [None]:
for item in temp_list:
    print(item['Result'].value_counts(normalize=True))
    print('\n')    

#### Only the ODI data set is noticeably imbalanced, with roughly 86% wins and 13% losses

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
X_ODI = df_ODI.drop('Result', axis=1, inplace=False)
Y_ODI = df_ODI['Result']
print(X_ODI.head())
Y_ODI.head()

In [None]:
X_T20 = df_T20.drop('Result', axis=1, inplace=False)
Y_T20 = df_T20['Result']
print(X_T20.head())
Y_T20.head()

In [None]:
X_Test = df_Test.drop('Result', axis=1, inplace=False)
Y_Test = df_Test['Result']
print(X_Test.head())
Y_Test.head()

#### Creating model for ODI format and splitting the data accordingly

In [None]:
X = df.drop('Result',axis=1,inplace=False)
Y = df['Result']
print(X.head())
Y.head()

### Building model for ODI Format

In [None]:
X_train, X_test, train_labels, test_labels = train_test_split(X_ODI, Y_ODI, test_size=.30, random_state=5)

In [None]:
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score,roc_curve,classification_report,confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV

In [None]:
#Building a Decision Tree Classifier
param_grid_dtcl = {
    'criterion': ['gini'],
    'max_depth': [10,20,30,50],
    'min_samples_leaf': [50,100,150], 
    'min_samples_split': [150,300,450],
}
dtcl = DecisionTreeClassifier(random_state=1)

grid_search_dtcl = GridSearchCV(estimator = dtcl, param_grid = param_grid_dtcl, cv = 10)

In [None]:
grid_search_dtcl.fit(X_train, train_labels)
print(grid_search_dtcl.best_params_)
best_grid_dtcl = grid_search_dtcl.best_estimator_
best_grid_dtcl

In [None]:
# Narrower follow-up search for the decision tree.
# max_depth counts tree levels and must be an integer: fractional candidates
# (3.5, 4.5, ...) are rejected by scikit-learn's parameter validation, and the
# resulting failed fits were hidden by the global warning filter.
param_grid_dtcl = {
    'criterion': ['gini'],
    'max_depth': [3, 4, 5, 6],
    'min_samples_leaf': [40, 42, 44, 46, 48, 50, 52, 54],
    'min_samples_split': [250, 270, 280, 290, 300, 310],
}

dtcl = DecisionTreeClassifier(random_state=1)

# 10-fold cross-validated search over every combination in the grid.
grid_search_dtcl = GridSearchCV(estimator = dtcl, param_grid = param_grid_dtcl, cv = 10)

In [None]:
grid_search_dtcl.fit(X_train, train_labels)
print(grid_search_dtcl.best_params_)
best_grid_dtcl = grid_search_dtcl.best_estimator_
best_grid_dtcl

In [None]:
# Final refinement of the decision-tree search.
# Depth is an integer number of levels, so values such as 4.85 or 5.15 cannot
# be used; search the integer depths around 5 instead.
param_grid_dtcl = {
    'criterion': ['gini'],
    'max_depth': [4, 5, 6],
    'min_samples_leaf': [40, 41, 42, 43, 44],
    'min_samples_split': [150, 175, 200, 210, 220, 230, 240, 250, 260, 270],
}

dtcl = DecisionTreeClassifier(random_state=1)

# 10-fold cross-validated search over every combination in the grid.
grid_search_dtcl = GridSearchCV(estimator = dtcl, param_grid = param_grid_dtcl, cv = 10)

In [None]:
grid_search_dtcl.fit(X_train, train_labels)
print(grid_search_dtcl.best_params_)
best_grid_dtcl = grid_search_dtcl.best_estimator_
best_grid_dtcl

In [None]:
# Write the tuned tree to a Graphviz .dot file.
train_char_label = ['no', 'yes']
# The context manager closes the file even if the export raises.
with open('tree_regularized.dot', 'w') as tree_regularized:
    dot_data = tree.export_graphviz(best_grid_dtcl, out_file=tree_regularized,
                                    feature_names=list(X_train),
                                    class_names=list(train_char_label))
# export_graphviz writes to out_file and returns None when one is supplied,
# so dot_data has nothing to display here.

In [None]:
print (pd.DataFrame(best_grid_dtcl.feature_importances_, columns = ["Imp"], 
                    index = X_train.columns).sort_values('Imp',ascending=False))

In [None]:
#Predicting on Training and Test dataset
ytrain_predict_dtcl = best_grid_dtcl.predict(X_train)
ytest_predict_dtcl = best_grid_dtcl.predict(X_test)

In [None]:
# Class probabilities of the tuned decision tree on the test split
# (one column per class); preview the first rows.
ytest_predict_prob_dtcl = best_grid_dtcl.predict_proba(X_test)
pd.DataFrame(ytest_predict_prob_dtcl).head()

### Building Random Forest Classifier

In [None]:
param_grid_rfcl = {
    'max_depth': [4,5],
    'max_features': [2,3],
    'min_samples_leaf': [8,9],
    'min_samples_split': [46,50], 
    'n_estimators': [290]
}

rfcl = RandomForestClassifier(random_state=1)

grid_search_rfcl = GridSearchCV(estimator = rfcl, param_grid = param_grid_rfcl)

In [None]:
grid_search_rfcl.fit(X_train, train_labels)
print(grid_search_rfcl.best_params_)
best_grid_rfcl = grid_search_rfcl.best_estimator_
best_grid_rfcl

In [None]:
#Predicting the Training and Testing data
ytrain_predict_rfcl = best_grid_rfcl.predict(X_train)
ytest_predict_rfcl = best_grid_rfcl.predict(X_test)

In [None]:
# Class probabilities of the tuned random forest on the test split
# (one column per class); preview the first rows.
ytest_predict_prob_rfcl = best_grid_rfcl.predict_proba(X_test)
pd.DataFrame(ytest_predict_prob_rfcl).head()

In [None]:
#Feature Importance via RF
print (pd.DataFrame(best_grid_rfcl.feature_importances_, 
                    columns = ["Imp"], 
                    index = X_train.columns).sort_values('Imp',ascending=False))

### Building a Neural Network Classifier

In [None]:

# Hyper-parameter grid for the neural-network classifier: three candidate
# hidden-layer widths and three iteration caps, with the solver and tolerance fixed.
# NOTE(review): StandardScaler is imported in this notebook but never applied, and
# the search is later fitted on X_train as-is. MLPs are sensitive to feature
# scale -- confirm whether the inputs should be standardized first.
param_grid_nncl = {
    'hidden_layer_sizes': [50,100,200],
    'max_iter': [2500,3000,4000],
    'solver': ['adam'],
    'tol': [0.01], 
}

# Fixed seed so the weight initialisation is repeatable.
nncl = MLPClassifier(random_state=1)

# 10-fold cross-validated search over the grid above.
grid_search_nncl = GridSearchCV(estimator = nncl, param_grid = param_grid_nncl, cv = 10)

In [None]:
grid_search_nncl.fit(X_train, train_labels)
grid_search_nncl.best_params_
best_grid_nncl = grid_search_nncl.best_estimator_
best_grid_nncl

In [None]:
#Predicting the Training and Testing data
ytrain_predict_nncl = best_grid_nncl.predict(X_train)
ytest_predict_nncl = best_grid_nncl.predict(X_test)

In [None]:
# Class probabilities of the tuned neural network on the test split
# (one column per class); preview the first rows.
ytest_predict_prob_nncl = best_grid_nncl.predict_proba(X_test)
pd.DataFrame(ytest_predict_prob_nncl).head()

### CART - AUC and ROC for the training data

In [None]:
probs_cart = best_grid_dtcl.predict_proba(X_train)
# keep probabilities for the positive outcome only
probs_cart = probs_cart[:, 1]
# calculate AUC
cart_train_auc = roc_auc_score(train_labels, probs_cart)
print('AUC: %.3f' % cart_train_auc)
# calculate roc curve
cart_train_fpr, cart_train_tpr, cart_train_thresholds = roc_curve(train_labels, probs_cart)
plt.plot([0, 1], [0, 1], linestyle='--')
# plot the roc curve for the model
plt.plot(cart_train_fpr, cart_train_tpr)

In [None]:
probs_cart = best_grid_dtcl.predict_proba(X_test)
# keep probabilities for the positive outcome only
probs_cart = probs_cart[:, 1]
# calculate AUC
cart_test_auc = roc_auc_score(test_labels, probs_cart)
print('AUC: %.3f' % cart_test_auc)
# calculate roc curve
cart_test_fpr, cart_test_tpr, cart_testthresholds = roc_curve(test_labels, probs_cart)
plt.plot([0, 1], [0, 1], linestyle='--')
# plot the roc curve for the model
plt.plot(cart_test_fpr, cart_test_tpr)

In [None]:
confusion_matrix(train_labels, ytrain_predict_dtcl)

In [None]:
#Train Data Accuracy
cart_train_acc=best_grid_dtcl.score(X_train,train_labels) 
cart_train_acc

In [None]:
print(classification_report(train_labels, ytrain_predict_dtcl))

In [None]:
# Precision / recall / F1 of class "1" for the decision tree on the training split.
cart_metrics = classification_report(train_labels, ytrain_predict_dtcl, output_dict=True)
# Use a dedicated name: the original stored this table in `df`, which silently
# replaced the match dataset for the rest of the session.
cart_train_report = pd.DataFrame(cart_metrics).transpose()
# Select metrics by column label; integer positions on a label-indexed Series
# are deprecated in pandas 2.x.
cart_train_f1 = round(cart_train_report.loc["1", "f1-score"], 2)
cart_train_recall = round(cart_train_report.loc["1", "recall"], 2)
cart_train_precision = round(cart_train_report.loc["1", "precision"], 2)
print ('cart_train_precision ',cart_train_precision)
print ('cart_train_recall ',cart_train_recall)
print ('cart_train_f1 ',cart_train_f1)

### CART Confusion Matrix and Classification Report for the testing data

In [None]:
confusion_matrix(test_labels, ytest_predict_dtcl)

In [None]:
#Test Data Accuracy
cart_test_acc=best_grid_dtcl.score(X_test,test_labels)
cart_test_acc

In [None]:
print(classification_report(test_labels, ytest_predict_dtcl))

In [None]:
# Precision / recall / F1 of class "1" for the decision tree on the test split.
cart_metrics = classification_report(test_labels, ytest_predict_dtcl, output_dict=True)
# Use a dedicated name: the original stored this table in `df`, which silently
# replaced the match dataset for the rest of the session.
cart_test_report = pd.DataFrame(cart_metrics).transpose()
# Select metrics by column label; integer positions on a label-indexed Series
# are deprecated in pandas 2.x.
cart_test_precision = round(cart_test_report.loc["1", "precision"], 2)
cart_test_recall = round(cart_test_report.loc["1", "recall"], 2)
cart_test_f1 = round(cart_test_report.loc["1", "f1-score"], 2)
print ('cart_test_precision ',cart_test_precision)
print ('cart_test_recall ',cart_test_recall)
print ('cart_test_f1 ',cart_test_f1)

### RF Model Performance Evaluation on Training data

In [None]:
confusion_matrix(train_labels,ytrain_predict_rfcl)

In [None]:
rf_train_acc=best_grid_rfcl.score(X_train,train_labels) 
rf_train_acc

In [None]:
print(classification_report(train_labels,ytrain_predict_rfcl))

In [None]:
# Precision / recall / F1 of class "1" for the random forest on the training split.
rf_metrics = classification_report(train_labels, ytrain_predict_rfcl, output_dict=True)
# Use a dedicated name: the original stored this table in `df`, which silently
# replaced the match dataset for the rest of the session.
rf_train_report = pd.DataFrame(rf_metrics).transpose()
# Select metrics by column label; integer positions on a label-indexed Series
# are deprecated in pandas 2.x.
rf_train_precision = round(rf_train_report.loc["1", "precision"], 2)
rf_train_recall = round(rf_train_report.loc["1", "recall"], 2)
rf_train_f1 = round(rf_train_report.loc["1", "f1-score"], 2)
print ('rf_train_precision ',rf_train_precision)
print ('rf_train_recall ',rf_train_recall)
print ('rf_train_f1 ',rf_train_f1)

In [None]:
# Score the training rows once and reuse the result for the curve and the AUC.
rf_train_scores = best_grid_rfcl.predict_proba(X_train)[:, 1]
rf_train_fpr, rf_train_tpr, _ = roc_curve(train_labels, rf_train_scores)
rf_train_auc = roc_auc_score(train_labels, rf_train_scores)

# Model curve in green, chance diagonal dashed.
plt.plot(rf_train_fpr, rf_train_tpr, color='green')
plt.plot([0, 1], [0, 1], linestyle='--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC')
print('Area under Curve is', rf_train_auc)

#### RF Model Performance Evaluation on Test data

In [None]:
confusion_matrix(test_labels,ytest_predict_rfcl)

In [None]:
rf_test_acc=best_grid_rfcl.score(X_test,test_labels)
rf_test_acc

In [None]:
print(classification_report(test_labels,ytest_predict_rfcl))

In [None]:
# Precision / recall / F1 of class "1" for the random forest on the test split.
rf_metrics = classification_report(test_labels, ytest_predict_rfcl, output_dict=True)
# Use a dedicated name: the original stored this table in `df`, which silently
# replaced the match dataset for the rest of the session.
rf_test_report = pd.DataFrame(rf_metrics).transpose()
# Select metrics by column label; integer positions on a label-indexed Series
# are deprecated in pandas 2.x.
rf_test_precision = round(rf_test_report.loc["1", "precision"], 2)
rf_test_recall = round(rf_test_report.loc["1", "recall"], 2)
rf_test_f1 = round(rf_test_report.loc["1", "f1-score"], 2)
print ('rf_test_precision ',rf_test_precision)
print ('rf_test_recall ',rf_test_recall)
print ('rf_test_f1 ',rf_test_f1)

In [None]:
# Score the test rows once and reuse the result for the curve and the AUC.
rf_test_scores = best_grid_rfcl.predict_proba(X_test)[:, 1]
rf_test_fpr, rf_test_tpr, _ = roc_curve(test_labels, rf_test_scores)
rf_test_auc = roc_auc_score(test_labels, rf_test_scores)

# Model curve in green, chance diagonal dashed.
plt.plot(rf_test_fpr, rf_test_tpr, color='green')
plt.plot([0, 1], [0, 1], linestyle='--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC')
print('Area under Curve is', rf_test_auc)

### NN Model Performance Evaluation on Training data

In [None]:
confusion_matrix(train_labels,ytrain_predict_nncl)

In [None]:
nn_train_acc=best_grid_nncl.score(X_train,train_labels) 
nn_train_acc

In [None]:
print(classification_report(train_labels,ytrain_predict_nncl))

In [None]:
# Precision / recall / F1 of class "1" for the neural network on the training split.
nn_metrics = classification_report(train_labels, ytrain_predict_nncl, output_dict=True)
# Use a dedicated name: the original stored this table in `df`, which silently
# replaced the match dataset for the rest of the session.
nn_train_report = pd.DataFrame(nn_metrics).transpose()
# Select metrics by column label; integer positions on a label-indexed Series
# are deprecated in pandas 2.x.
nn_train_precision = round(nn_train_report.loc["1", "precision"], 2)
nn_train_recall = round(nn_train_report.loc["1", "recall"], 2)
nn_train_f1 = round(nn_train_report.loc["1", "f1-score"], 2)
print ('nn_train_precision ',nn_train_precision)
print ('nn_train_recall ',nn_train_recall)
print ('nn_train_f1 ',nn_train_f1)

In [None]:
# Score the training rows once and reuse the result for the curve and the AUC.
nn_train_scores = best_grid_nncl.predict_proba(X_train)[:, 1]
nn_train_fpr, nn_train_tpr, _ = roc_curve(train_labels, nn_train_scores)
nn_train_auc = roc_auc_score(train_labels, nn_train_scores)

# Model curve in black, chance diagonal dashed.
plt.plot(nn_train_fpr, nn_train_tpr, color='black')
plt.plot([0, 1], [0, 1], linestyle='--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC')
print('Area under Curve is', nn_train_auc)

### NN Model Performance Evaluation on Test data

In [None]:
confusion_matrix(test_labels,ytest_predict_nncl)

In [None]:
nn_test_acc=best_grid_nncl.score(X_test,test_labels)
nn_test_acc

In [None]:
print(classification_report(test_labels,ytest_predict_nncl))

In [None]:
# Precision / recall / F1 of class "1" for the neural network on the test split.
nn_metrics = classification_report(test_labels, ytest_predict_nncl, output_dict=True)
# Use a dedicated name: the original stored this table in `df`, which silently
# replaced the match dataset for the rest of the session.
nn_test_report = pd.DataFrame(nn_metrics).transpose()
# Select metrics by column label; integer positions on a label-indexed Series
# are deprecated in pandas 2.x.
nn_test_precision = round(nn_test_report.loc["1", "precision"], 2)
nn_test_recall = round(nn_test_report.loc["1", "recall"], 2)
nn_test_f1 = round(nn_test_report.loc["1", "f1-score"], 2)
print ('nn_test_precision ',nn_test_precision)
print ('nn_test_recall ',nn_test_recall)
print ('nn_test_f1 ',nn_test_f1)

In [None]:
# Score the test rows once and reuse the result for the curve and the AUC.
nn_test_scores = best_grid_nncl.predict_proba(X_test)[:, 1]
nn_test_fpr, nn_test_tpr, _ = roc_curve(test_labels, nn_test_scores)
nn_test_auc = roc_auc_score(test_labels, nn_test_scores)

# Model curve in black, chance diagonal dashed.
plt.plot(nn_test_fpr, nn_test_tpr, color='black')
plt.plot([0, 1], [0, 1], linestyle='--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC')
print('Area under Curve is', nn_test_auc)

#### Logistic Regression using sklearn library

In [None]:
# Search space for logistic regression, split into sub-grids so that only
# valid solver/penalty pairs are tried. In the single combined grid every
# 'elasticnet' candidate failed (newton-cg does not support it, and saga
# needs an l1_ratio), and those failures were hidden by the warning filter.
grid = [
    # Ridge-penalised and unpenalised fits work with both solvers.
    {'penalty': ['l2', 'none'],
     'solver': ['newton-cg', 'saga'],
     'tol': [0.001, 0.00001]},
    # Elastic net is saga-only and requires an explicit L1/L2 mix;
    # 0.5 is a neutral starting point -- widen the list to tune it.
    {'penalty': ['elasticnet'],
     'solver': ['saga'],
     'l1_ratio': [0.5],
     'tol': [0.001, 0.00001]},
]

In [None]:
from sklearn.linear_model import LogisticRegression
log_model = LogisticRegression(max_iter=100000,n_jobs=1)

In [None]:
from sklearn.model_selection import GridSearchCV
grid_search = GridSearchCV(estimator = log_model, param_grid = grid, cv = 3,n_jobs=1,scoring='f1')

In [None]:
grid_search.fit(X_train, train_labels)

In [None]:
print(grid_search.best_params_,'\n')
print(grid_search.best_estimator_)

In [None]:
best_model = grid_search.best_estimator_

In [None]:
ytrain_predict = best_model.predict(X_train)
ytest_predict = best_model.predict(X_test)

In [None]:
ytest_predict_prob=best_model.predict_proba(X_test)

In [None]:
from sklearn.metrics import roc_auc_score, roc_curve, classification_report, confusion_matrix
try:
    from sklearn.metrics import plot_confusion_matrix
except ImportError:
    # plot_confusion_matrix was removed in scikit-learn 1.2. Its replacement,
    # ConfusionMatrixDisplay.from_estimator, takes the same (estimator, X, y)
    # arguments, so expose it under the old name for the cells that follow.
    from sklearn.metrics import ConfusionMatrixDisplay
    plot_confusion_matrix = ConfusionMatrixDisplay.from_estimator

# Confusion-matrix plot and per-class report for the training split.
plot_confusion_matrix(best_model,X_train,train_labels)
print(classification_report(train_labels, ytrain_predict),'\n');

In [None]:
plot_confusion_matrix(best_model,X_test,test_labels)
print(classification_report(test_labels, ytest_predict),'\n');

In [None]:
acc_score = best_model.score(X_train, train_labels)
acc_score

In [None]:
probs = best_model.predict_proba(X_train)
probs = probs[:, 1]

auc_train = roc_auc_score(train_labels, probs)
print('AUC:', auc_train)

train_fpr, train_tpr, train_thresholds = roc_curve(train_labels, probs)
plt.plot([0, 1], [0, 1], linestyle='solid')

plt.plot(train_fpr, train_tpr);

In [None]:
acc_score_test = best_model.score(X_test, test_labels)
acc_score_test

In [None]:
probs = best_model.predict_proba(X_test)
probs = probs[:, 1]

auc_test = roc_auc_score(test_labels, probs)
print('AUC:', auc_test)

test_fpr, test_tpr, test_thresholds = roc_curve(test_labels, probs)
plt.plot([0, 1], [0, 1], linestyle='solid')

plt.plot(test_fpr, test_tpr);

In [None]:
best_model.intercept_

### Building the logistic model using statsmodels

In [None]:
df_new.info()

### One hot encoding for the data set

In [None]:
df1 = df_new.copy()
df1.head()

### Extracting the ODI match format for logit regression

In [None]:
# Keep only the ODI matches. The explicit copy makes df1_model an independent
# frame, so the in-place column drop that follows does not act on a filtered
# slice of df1 (pandas' SettingWithCopy pattern).
df1_model = df1[df1['Match_format']=='ODI'].copy()
df1_model.head()

### Since we have separated out one match format, the 'Match_format' variable has no significance

In [None]:
df1_model.drop('Match_format',axis=1, inplace=True)

In [None]:
df1_model.info()

In [None]:
# One-hot encode the categorical columns, dropping the first level of each.
# The dtype is pinned to integers: pandas 2.0 changed the default dummy dtype
# to bool, and a frame mixing bool and float columns is rejected by
# statsmodels' Logit as object-typed data.
df1_model = pd.get_dummies(df1_model, columns=None, drop_first=True, dtype=np.uint8)
df1_model.head()

In [None]:
df1_model.info()

In [None]:
X=df1_model.drop('Result_Win', axis=1)
Y = df1_model['Result_Win']

In [None]:
# `sm` must be bound before add_constant is called; the notebook imported
# statsmodels.api only in the following cell, so a fresh Run All failed here
# with NameError.
import statsmodels.api as sm
from statsmodels.sandbox.regression.predstd import wls_prediction_std

# Prepend the intercept column expected by statsmodels' Logit.
X = sm.add_constant(X)

In [None]:
import statsmodels.api as sm
logit_model = sm.Logit(Y,X)
result=logit_model.fit()
print(result.summary2())
print(result.params)

### One hot encoding + Sklearn

In [None]:
# Search space for logistic regression, split into sub-grids so that only
# valid solver/penalty pairs are tried. In the single combined grid every
# 'elasticnet' candidate failed (newton-cg does not support it, and saga
# needs an l1_ratio), and those failures were hidden by the warning filter.
grid = [
    # Ridge-penalised and unpenalised fits work with both solvers.
    {'penalty': ['l2', 'none'],
     'solver': ['newton-cg', 'saga'],
     'tol': [0.001, 0.00001]},
    # Elastic net is saga-only and requires an explicit L1/L2 mix;
    # 0.5 is a neutral starting point -- widen the list to tune it.
    {'penalty': ['elasticnet'],
     'solver': ['saga'],
     'l1_ratio': [0.5],
     'tol': [0.001, 0.00001]},
]

In [None]:
from sklearn.linear_model import LogisticRegression
log_model = LogisticRegression(max_iter=100000,n_jobs=1)

In [None]:
from sklearn.model_selection import GridSearchCV
grid_search = GridSearchCV(estimator = log_model, param_grid = grid, cv = 3,n_jobs=1,scoring='f1')

In [None]:
X_ODI =df1_model.drop('Result_Win', axis=1)
Y_ODI = df1_model['Result_Win']

In [None]:
X_train, X_test, train_labels, test_labels = train_test_split(X_ODI, Y_ODI, test_size=.30, random_state=5)

In [None]:
grid_search.fit(X_train, train_labels)

In [None]:
print(grid_search.best_params_,'\n')
print(grid_search.best_estimator_)

In [None]:
best_model = grid_search.best_estimator_

In [None]:
ytrain_predict = best_model.predict(X_train)
ytest_predict = best_model.predict(X_test)

In [None]:
ytest_predict_prob=best_model.predict_proba(X_test)

In [None]:
from sklearn.metrics import roc_auc_score, roc_curve, classification_report, confusion_matrix
try:
    from sklearn.metrics import plot_confusion_matrix
except ImportError:
    # plot_confusion_matrix was removed in scikit-learn 1.2. Its replacement,
    # ConfusionMatrixDisplay.from_estimator, takes the same (estimator, X, y)
    # arguments, so expose it under the old name for the cells that follow.
    from sklearn.metrics import ConfusionMatrixDisplay
    plot_confusion_matrix = ConfusionMatrixDisplay.from_estimator

# Confusion-matrix plot and per-class report for the training split.
plot_confusion_matrix(best_model,X_train,train_labels)
print(classification_report(train_labels, ytrain_predict),'\n');

In [None]:
plot_confusion_matrix(best_model,X_test,test_labels)
print(classification_report(test_labels, ytest_predict),'\n');

In [None]:
acc_score = best_model.score(X_train, train_labels)
acc_score

In [None]:
probs = best_model.predict_proba(X_train)
probs = probs[:, 1]

auc_train = roc_auc_score(train_labels, probs)
print('AUC:', auc_train)

train_fpr, train_tpr, train_thresholds = roc_curve(train_labels, probs)
plt.plot([0, 1], [0, 1], linestyle='solid')

plt.plot(train_fpr, train_tpr);

In [None]:
acc_score_test = best_model.score(X_test, test_labels)
acc_score_test

In [None]:
probs = best_model.predict_proba(X_test)
probs = probs[:, 1]

auc_test = roc_auc_score(test_labels, probs)
print('AUC:', auc_test)

test_fpr, test_tpr, test_thresholds = roc_curve(test_labels, probs)
plt.plot([0, 1], [0, 1], linestyle='solid')

plt.plot(test_fpr, test_tpr);

In [None]:
#for idx, col_name in enumerate(X_train_rfe.columns):
#    print("The coefficient for {} is {}".format(col_name, lm_model.coef_[idx]))