In [None]:
from numpy import mean
from numpy import std
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import matthews_corrcoef
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import roc_auc_score
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV

In [None]:
#pip install 'lightgbm[pandas]'

In [None]:
# Load the dataset from a CSV file located in the working directory.
df = pd.read_csv("Complete-data.csv")
# Bare expression: the notebook renders the frame as this cell's output.
df

In [None]:
# Preview the frame without the 'Landslide' column. drop() returns a new
# frame and the result is not assigned, so `df` itself is left unchanged.
df.drop(columns=['Landslide'])

# **EDA**

In [None]:
# Show the first rows of the dataset.
df.head()

In [None]:
# Print the column names, dtypes and non-null counts.
df.info()

In [None]:
# Summary statistics of the columns, at full precision.
df.describe()

In [None]:
# Rotate the column order by one position: the first column moves to the end
# and every other column shifts one place to the left.
# NOTE(review): presumably the first column is the 'Landslide' target, so this
# makes it the last column (later cells take df.iloc[:, :-1] as the features)
# -- confirm against the CSV. Re-running this cell rotates the columns again.
column_order = list(df.columns)
rotated_order = column_order[1:] + column_order[:1]
df = df[rotated_order]

In [None]:
# Same summary statistics as above, rounded to two decimals for readability.
df.describe().round(2)

In [None]:
# Report how many distinct values each column holds.
for col_name in df.columns:
    n_distinct = len(df[col_name].unique())
    print(col_name, ': ', n_distinct, ' labels')

In [None]:
# Class balance of the target: one bar per 'Landslide' value.
sns.set(style="darkgrid")
# Per-class counts, kept for inspection; countplot computes its own counts.
landslide_count = df['Landslide'].value_counts()
landslide_ax = sns.countplot(data=df, x='Landslide')
landslide_ax.set_title('Frequency Distribution of Landslides')
landslide_ax.set_ylabel('Number of Occurrences', fontsize=12)
landslide_ax.set_xlabel('Landslide', fontsize=12)


In [None]:
# Frequency of each distinct 'Precipitation' value.
perc_count = df['Precipitation'].value_counts()
sns.set(style="darkgrid")
# Plot the distinct values on x and their counts on y. Passing only the index
# (as before) plotted the values themselves instead of how often they occur.
sns.barplot(x=perc_count.index, y=perc_count.values)
plt.title('Frequency Distribution of Precipitation')
plt.ylabel('Number of Occurrences', fontsize=12)
plt.xlabel('Precipitation', fontsize=12)

# Save before show(): once the figure has been shown, savefig() would write
# a new, empty figure.
plt.savefig('bar2.eps', format='eps')
plt.show()

In [None]:
# Frequency of each distinct 'Lithology' value.
lith_count = df['Lithology'].value_counts()
sns.set(style="darkgrid")
# Plot the distinct values on x and their counts on y. Passing only the index
# (as before) plotted the values themselves instead of how often they occur.
sns.barplot(x=lith_count.index, y=lith_count.values)
plt.title('Frequency Distribution of Lithology')
plt.ylabel('Number of Occurrences', fontsize=12)
plt.xlabel('Lithology', fontsize=12)

# Save before show(): once the figure has been shown, savefig() would write
# a new, empty figure.
plt.savefig('bar3.eps', format='eps')
plt.show()

In [None]:
# Share of each 'Plan' category, drawn as a pie chart.
labels = df['Plan'].astype('category').cat.categories.tolist()
counts = df['Plan'].value_counts()
# Order the slice sizes to match the category order used for the labels.
sizes = [counts[var_cat] for var_cat in labels]
fig1, ax1 = plt.subplots()
# autopct prints each slice's percentage on the chart.
ax1.pie(sizes, labels=labels, autopct='%1.1f%%', shadow=True)
ax1.axis('equal')  # equal aspect ratio so the pie is drawn as a circle
plt.title('Pieplot of Plan')

# Save before show(): once the figure has been shown, savefig() would write
# a new, empty figure.
plt.savefig('pie1.eps', format='eps')
plt.show()

In [None]:
# Share of each 'Flow' category, drawn as a pie chart.
labels = df['Flow'].astype('category').cat.categories.tolist()
counts = df['Flow'].value_counts()
# Order the slice sizes to match the category order used for the labels.
sizes = [counts[var_cat] for var_cat in labels]
fig1, ax1 = plt.subplots()
# autopct prints each slice's percentage on the chart.
ax1.pie(sizes, labels=labels, autopct='%1.1f%%', shadow=True)
ax1.axis('equal')  # equal aspect ratio so the pie is drawn as a circle
plt.title('Pieplot of Flow')

# Save before show(): once the figure has been shown, savefig() would write
# a new, empty figure.
plt.savefig('pie2.eps', format='eps')
plt.show()

In [None]:
# Share of each 'Curvature' category, drawn as a pie chart.
labels = df['Curvature'].astype('category').cat.categories.tolist()
counts = df['Curvature'].value_counts()
# Order the slice sizes to match the category order used for the labels.
sizes = [counts[var_cat] for var_cat in labels]
fig1, ax1 = plt.subplots()
# autopct prints each slice's percentage on the chart.
ax1.pie(sizes, labels=labels, autopct='%1.1f%%', shadow=True)
ax1.axis('equal')  # equal aspect ratio so the pie is drawn as a circle
plt.title('Pieplot of Curvature')

# Save before show(): once the figure has been shown, savefig() would write
# a new, empty figure.
plt.savefig('pie3.eps', format='eps')
plt.show()

In [None]:
# Annotated heatmap of the pairwise correlations between all columns.
plt.figure(figsize=(12, 12))
sns.heatmap(df.corr(), annot=True)

# Save before show(): once the figure has been shown, savefig() would write
# a new, empty figure.
plt.savefig('corr1.eps', format='eps')
plt.show()

In [None]:
# Mean of every column over the whole dataset.
df.mean()

In [None]:
# Per-class feature means: one row per distinct 'Landslide' value.
df.groupby(['Landslide']).mean()

In [None]:
# Pairwise correlation matrix as a table (the same values the heatmap draws).
df.corr()

# Important Features using Mutual Information Classification

In [None]:
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.feature_selection import mutual_info_classif
from sklearn.model_selection import GridSearchCV

In [None]:
# Split the full frame (features and target together) 80/20 with a fixed seed.
train_ratio = 0.8
# Not passed to train_test_split below; the validation share is implied by
# train_size.
val_ratio = 0.2
# NOTE(review): train_df / val_df are not referenced by any later cell shown
# in this notebook; the models use a separate split made further down.
train_df, val_df = train_test_split(df, train_size = train_ratio, random_state=1)

In [None]:
# All column names, then the same list without the last column.
# NOTE(review): this treats the last column as the target -- confirm that
# 'Landslide' really is last at this point.
features = df.columns.tolist()
df_feat = df.iloc[:, :-1]
features_wo = df_feat.columns.tolist()

In [None]:
# Print the dtypes and non-null counts of the feature-only frame.
df_feat.info()

In [None]:
# Mutual information between every feature column and the 'Landslide' target.
# random_state is fixed so the estimate is repeatable.
MI = mutual_info_classif(df[features_wo], df["Landslide"], n_neighbors=20, random_state=42)

# Horizontal bar chart: one bar per feature, bar length = mutual information.
plt.figure(figsize=(5.4, 6))
plt.barh(width=MI, y=features_wo, color="#990303")
plt.title("Mutual Information w.r.t. Landslide")
# The x axis carries the mutual-information values, not the target itself.
plt.xlabel("Mutual information")
plt.gca().xaxis.grid(True, linestyle=':')
plt.tight_layout()

plt.savefig('mi.eps', format='eps')


In [None]:
# Nested feature subsets of growing size; each larger set places its new
# features in front of the previous, smaller set.
best3 = ['Precipitation', 'Earthquake', 'Aspect']
best5 = ['Lithology', 'Flow'] + best3
best8 = ['Profile', 'NDWI', 'NDVI'] + best5
all_feat = features_wo

# Candidate feature sets, from smallest to largest.
feat = [best3, best5, best8, all_feat]

In [None]:
# Print the first two rows of the data for each candidate feature set.
for feature_subset in feat:
    subset_preview = df[feature_subset].head(2)
    print(subset_preview)

# Feature Engineering!

In [None]:
# Show the first rows of the dataset again before the feature-selection steps.
df.head()

Feature **Selection**   

In [None]:
from matplotlib.colors import ListedColormap
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_moons, make_circles, make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.utils._testing import ignore_warnings
from sklearn.exceptions import ConvergenceWarning
from sklearn import ensemble
from sklearn.feature_selection import mutual_info_classif
from sklearn.model_selection import GridSearchCV

In [None]:
# `pca` is a separate third-party package, imported here rather than with the
# other imports.
from pca import pca

# Every column except the target.
X_pca = df.loc[:, df.columns != 'Landslide']

model = pca()
# The return value of fit_transform() is overwritten by the transform() call
# on the next line; only transform()'s result is printed.
out = model.fit_transform(X_pca)
out = model.transform(X_pca)
# NOTE(review): the nine hard-coded column names assume `out` holds exactly
# nine components -- confirm against the pca() defaults.
print(pd.DataFrame(out, columns=['PC1', 'PC2', 'PC3','PC4','PC5','PC6','PC7','PC8','PC9']))
print(out)

In [None]:
# Draw the fitted pca model's built-in plot, then write the current
# matplotlib figure to an EPS file.
# NOTE(review): assumes model.plot() leaves its figure as the current pyplot
# figure when savefig() runs -- confirm the saved file is not blank.
model.plot()

plt.savefig('pca1.eps', format='eps')

# Baseline ***Model***

In [None]:
# Separate the target from the predictors.
y = df.Landslide
df1 = df.loc[:, df.columns != 'Landslide']

# Hold out 20% of the rows for testing.
# random_state fixes the split so every downstream score is reproducible on a
# fresh kernel; stratify=y keeps the class proportions equal in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    df1, y, test_size=0.2, random_state=1, stratify=y
)


In [None]:
# Untuned gradient-boosting baseline.
gbm_baseline_params = dict(
    learning_rate=0.1,
    n_estimators=100,
    max_depth=3,
    min_samples_split=2,
    min_samples_leaf=1,
    subsample=1,
    max_features='sqrt',
    random_state=10,
)
baseline_gbm = GradientBoostingClassifier(**gbm_baseline_params)
baseline_gbm.fit(X_train, y_train)
# Names of the training columns.
predictors_gbm = list(X_train)

# Test-set accuracy, then the per-class precision / recall / F1 report.
gbm_test_accuracy = baseline_gbm.score(X_test, y_test)
print(f'Accuracy of the GBM on test set: {gbm_test_accuracy:.3f}')
pred_gbm = baseline_gbm.predict(X_test)
print(classification_report(y_test, pred_gbm))


In [None]:
from lightgbm import LGBMClassifier

# Untuned LightGBM baseline, configured to mirror the GBM baseline.
# NOTE(review): min_samples_leaf is not a named LGBMClassifier argument; it is
# presumably forwarded as a LightGBM alias of min_child_samples -- confirm
# which of the two values LightGBM actually applies.
lgbm_baseline_params = dict(
    learning_rate=0.1,
    n_estimators=100,
    max_depth=3,
    min_samples_leaf=1,
    subsample=1,
    random_state=10,
)
baseline_lgbm = LGBMClassifier(**lgbm_baseline_params)
baseline_lgbm.fit(X_train, y_train)
# Names of the training columns.
predictors_lgbm = list(X_train)

# Test-set accuracy, then the per-class precision / recall / F1 report.
lgbm_test_accuracy = baseline_lgbm.score(X_test, y_test)
print(f'Accuracy of the LGBM on test set: {lgbm_test_accuracy:.3f}')
pred_lgbm = baseline_lgbm.predict(X_test)
print(classification_report(y_test, pred_lgbm))

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

In [None]:
# Create a Random Forest classifier: 100 shallow trees (depth <= 3) with a
# fixed seed. It is fitted in a later cell.
random_forest = RandomForestClassifier(n_estimators=100, max_depth=3, random_state=42)

In [None]:
!pip install tabulate

In [None]:
from tabulate import tabulate

# Train the forest on the training split.
random_forest.fit(X_train, y_train)

# Predict the held-out rows and list each prediction with a 1-based id.
pred_rf = random_forest.predict(X_test)
table = [[row_id, label] for row_id, label in enumerate(pred_rf, start=1)]
print(tabulate(table, headers=['Id', 'Prediction'], tablefmt='grid'))

# Test-set accuracy.
accuracy_rf = random_forest.score(X_test, y_test)
print(f'Accuracy of the Random Forest on test set: {accuracy_rf:.3f}')

# Per-class precision / recall / F1.
print(classification_report(y_test, pred_rf))

In [None]:
!pip install xgboost

In [None]:
import xgboost as xgb
from sklearn.metrics import classification_report
from tabulate import tabulate

# XGBoost configured to mirror the GBM baseline (binary logistic objective).
xgb_params = dict(
    learning_rate=0.1,
    n_estimators=100,
    max_depth=3,
    min_child_weight=1,
    gamma=0,
    subsample=1,
    colsample_bytree=1,
    objective='binary:logistic',
    random_state=10,
)
xgb_classifier = xgb.XGBClassifier(**xgb_params)

# Train on the training split.
xgb_classifier.fit(X_train, y_train)

# Predict the held-out rows. The second predict() call repeats the first and
# is stored under a separate name, `y_pred`.
pred_xgb = xgb_classifier.predict(X_test)
y_pred = xgb_classifier.predict(X_test)

# Tabulate each prediction with a 1-based index.
headers = ['Index', 'Predicted Label']
table = [[position, label] for position, label in enumerate(y_pred, start=1)]
print(tabulate(table, headers=headers, tablefmt='grid'))

# Test-set accuracy.
accuracy_xgb = xgb_classifier.score(X_test, y_test)
print(f'Accuracy of XGBoost on test set: {accuracy_xgb:.3f}')

# Per-class precision / recall / F1.
print(classification_report(y_test, pred_xgb))

In [None]:
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.metrics import classification_report
from tabulate import tabulate

# Define the individual models, using the same settings as the baselines.
gbm_model = GradientBoostingClassifier(
    learning_rate=0.1,
    n_estimators=100,
    max_depth=3,
    min_samples_split=2,
    min_samples_leaf=1,
    subsample=1,
    max_features='sqrt',
    random_state=10
)

rf_model = RandomForestClassifier(
    n_estimators=100,
    max_depth=3,
    random_state=10
)

# Soft voting combines the members' predicted class probabilities.
ensemble_model = VotingClassifier(
    estimators=[
        ('gbm', gbm_model),
        ('rf', rf_model),
        # Add more models here if desired
    ],
    voting='soft'
)

# Fit the ensemble on the training data.
ensemble_model.fit(X_train, y_train)

# Predict the held-out rows and tabulate them with a 1-based index.
# The table is built from this ensemble's own predictions; it previously
# iterated over `y_pred`, which still held another model's output.
pred_ensemble = ensemble_model.predict(X_test)
headers = ['Index', 'Landslide Occurence']
table = [[i, label] for i, label in enumerate(pred_ensemble, start=1)]
print(tabulate(table, headers=headers, tablefmt='grid'))
print(pred_ensemble)

# Test-set accuracy.
accuracy_ensemble = ensemble_model.score(X_test, y_test)
print('Accuracy of the Ensemble Model on test set: {:.3f}'.format(accuracy_ensemble))

# Per-class precision / recall / F1.
print(classification_report(y_test, pred_ensemble))

In [None]:
from sklearn.svm import SVC
from sklearn.metrics import classification_report

# RBF-kernel support vector classifier with C = 1.0.
svm_settings = dict(kernel='rbf', C=1.0, random_state=10)
svm_classifier = SVC(**svm_settings)

# Train on the training split.
svm_classifier.fit(X_train, y_train)

# Predict the held-out rows.
pred_svm = svm_classifier.predict(X_test)

# Test-set accuracy.
accuracy_svm = svm_classifier.score(X_test, y_test)
print(f'Accuracy of the SVM on test set: {accuracy_svm:.3f}')

# Per-class precision / recall / F1.
print(classification_report(y_test, pred_svm))

In [None]:
from tabulate import tabulate

# Tabulate the SVM's test-set predictions with a 1-based index.
# This rebinds the notebook-level name `y_pred`.
y_pred = svm_classifier.predict(X_test)
headers = ['Index', 'Predicted Label']
table = [[position, label] for position, label in enumerate(y_pred, start=1)]
print(tabulate(table, headers=headers, tablefmt='grid'))

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import classification_report

# GBM member: same settings as the baseline GBM.
gbm_model = GradientBoostingClassifier(
    learning_rate=0.1,
    n_estimators=100,
    max_depth=3,
    min_samples_split=2,
    min_samples_leaf=1,
    subsample=1,
    max_features='sqrt',
    random_state=10,
)

# SVM member: probability=True provides the class probabilities that soft
# voting needs.
svm_model = SVC(probability=True, random_state=10)

# Third member: a LightGBM classifier with library defaults (no seed set).
voting_members = [
    ('gbm', gbm_model),
    ('svm', svm_model),
    ('lgbm', LGBMClassifier()),
]

# Soft voting combines the members' predicted class probabilities.
# This rebinds `ensemble_model`, replacing the GBM + RF ensemble.
ensemble_model = VotingClassifier(estimators=voting_members, voting='soft')

# Fit on the training split and predict the held-out rows.
ensemble_model.fit(X_train, y_train)
pred_ensemble = ensemble_model.predict(X_test)

# Test-set accuracy.
accuracy_ensemble = ensemble_model.score(X_test, y_test)
print(f'Accuracy of the Ensemble Model on test set: {accuracy_ensemble:.3f}')

# Per-class precision / recall / F1.
print(classification_report(y_test, pred_ensemble))


# **Tuning of Models**



n_estimators and Learning Rate

In [None]:
# Joint grid over learning rate and number of boosting stages for the GBM.
p_test3 = {'learning_rate':[0.15,0.1,0.05,0.01,0.005,0.001], 'n_estimators':[50,100,250,500,750,1000,1250,1500,1750]}

tuning = GridSearchCV(
    estimator=GradientBoostingClassifier(max_depth=4, min_samples_split=2, min_samples_leaf=1,
                                         subsample=1, max_features='sqrt', random_state=10),
    param_grid=p_test3, scoring='accuracy', n_jobs=4, cv=5)
tuning.fit(X_train,y_train)
# Print explicitly: a bare expression in the middle of a cell is never shown.
print(tuning.best_params_, tuning.best_score_)
# Keep the winning values for the later tuning stages.
learning_rate_gbm = tuning.best_params_.get("learning_rate")
n_estimators_gbm = tuning.best_params_.get("n_estimators")

In [None]:
# Joint grid over learning rate and number of boosting stages for LightGBM.
p_test3a = {'learning_rate':[0.15,0.1,0.05,0.01,0.005,0.001], 'n_estimators':[50,100,250,500,750,1000,1250,1500,1750]}

tuning = GridSearchCV(
    estimator=LGBMClassifier(max_depth=4, min_samples_leaf=1,
                             subsample=1, random_state=10),
    param_grid=p_test3a, scoring='accuracy', n_jobs=4, cv=5)
tuning.fit(X_train,y_train)
# Print explicitly: a bare expression in the middle of a cell is never shown.
print(tuning.best_params_, tuning.best_score_)
# Keep the winning values for the later tuning stages.
learning_rate_lgbm = tuning.best_params_.get("learning_rate")
n_estimators_lgbm = tuning.best_params_.get("n_estimators")

Max depth

In [None]:
# Grid over tree depth for the GBM, with learning rate and number of stages
# fixed at the values found in the previous stage.
p_test2 = {'max_depth':[2,3,4,5,6,7] }
tuning = GridSearchCV(
    estimator=GradientBoostingClassifier(learning_rate=learning_rate_gbm, n_estimators=n_estimators_gbm,
                                         min_samples_split=2,
                                         min_samples_leaf=1, subsample=1, max_features='sqrt',
                                         random_state=10),
    param_grid=p_test2, scoring='accuracy', n_jobs=4, cv=5)
tuning.fit(X_train,y_train)
# Print explicitly: a bare expression in the middle of a cell is never shown.
print(tuning.best_params_, tuning.best_score_)
max_depth_gbm = tuning.best_params_.get("max_depth")

In [None]:
# Grid over tree depth for LightGBM, with learning rate and number of stages
# fixed at the values found in the previous stage.
p_test2a = {'max_depth':[2,3,4,5,6,7] }
tuning = GridSearchCV(
    estimator=LGBMClassifier(learning_rate=learning_rate_lgbm, n_estimators=n_estimators_lgbm,
                             min_samples_leaf=1, subsample=1, random_state=10),
    # Use this cell's own grid; it previously borrowed the GBM cell's p_test2.
    param_grid=p_test2a, scoring='accuracy', n_jobs=4, cv=5)
tuning.fit(X_train,y_train)
# Print explicitly: a bare expression in the middle of a cell is never shown.
print(tuning.best_params_, tuning.best_score_)
max_depth_lgbm = tuning.best_params_.get("max_depth")

First Evaluation of New Model on Test Set

In [None]:
# Intermediate GBM: tuned learning rate, stage count and depth; every other
# setting stays at its baseline value.
model1_params = dict(
    learning_rate=learning_rate_gbm,
    n_estimators=n_estimators_gbm,
    max_depth=max_depth_gbm,
    min_samples_split=2,
    min_samples_leaf=1,
    subsample=1,
    max_features='sqrt',
    random_state=10,
)
model1 = GradientBoostingClassifier(**model1_params)
model1.fit(X_train, y_train)
# Names of the training columns.
predictors = list(X_train)
model1_accuracy = model1.score(X_test, y_test)
print(f'Accuracy of the GBM on test set: {model1_accuracy:.3f}')
pred = model1.predict(X_test)
print(classification_report(y_test, pred))

In [None]:
# Intermediate LightGBM: tuned learning rate, stage count and depth; every
# other setting stays at its baseline value.
model1_lgbm_params = dict(
    learning_rate=learning_rate_lgbm,
    n_estimators=n_estimators_lgbm,
    max_depth=max_depth_lgbm,
    min_samples_leaf=1,
    subsample=1,
    random_state=10,
)
model1_lgbm = LGBMClassifier(**model1_lgbm_params)
model1_lgbm.fit(X_train, y_train)
# Names of the training columns.
predictors = list(X_train)
model1_lgbm_accuracy = model1_lgbm.score(X_test, y_test)
print(f'Accuracy of the LGBM on test set: {model1_lgbm_accuracy:.3f}')
pred = model1_lgbm.predict(X_test)
print(classification_report(y_test, pred))

## **Min Sample Split and Min Samples Leaf**

In [None]:
# Joint grid over the minimum samples needed to split a node and to form a leaf.
p_test4 = {'min_samples_split':[2,4,6,8,10,20,40,60,100], 'min_samples_leaf':[1,3,5,7,9]}

tuning = GridSearchCV(
    estimator=GradientBoostingClassifier(learning_rate=learning_rate_gbm, n_estimators=n_estimators_gbm,
                                         max_depth=max_depth_gbm,
                                         subsample=1, max_features='sqrt', random_state=10),
    param_grid=p_test4, scoring='accuracy', n_jobs=4, cv=5)
tuning.fit(X_train,y_train)
# Print explicitly: a bare expression in the middle of a cell is never shown.
print(tuning.best_params_, tuning.best_score_)
# Keep BOTH tuned values. Only min_samples_leaf was stored before, so the
# searched min_samples_split was lost.
min_samples_split_gbm = tuning.best_params_.get("min_samples_split")
min_samples_leaf_gbm = tuning.best_params_.get("min_samples_leaf")

In [None]:
# Grid over the minimum number of samples per leaf for LightGBM.
# NOTE(review): min_samples_leaf is not a named LGBMClassifier argument; it is
# presumably forwarded as a LightGBM alias of min_child_samples -- confirm the
# grid values actually take effect.
p_test4a = {'min_samples_leaf':[1,3,5,7,9]}

tuning = GridSearchCV(
    estimator=LGBMClassifier(learning_rate=learning_rate_lgbm, n_estimators=n_estimators_lgbm,
                             max_depth=max_depth_lgbm,
                             subsample=1, random_state=10),
    param_grid=p_test4a, scoring='accuracy', n_jobs=4, cv=5)
tuning.fit(X_train,y_train)
# Print explicitly: a bare expression in the middle of a cell is never shown.
print(tuning.best_params_, tuning.best_score_)
min_samples_leaf_lgbm = tuning.best_params_.get("min_samples_leaf")

Max Features

In [None]:
# Grid over the number of features considered at each split.
p_test5 = {'max_features':[2,3,4,5,6,7]}
# NOTE(review): min_samples_split is hard-coded to 20 here rather than taken
# from the min_samples_split grid search above -- confirm this is intended.
tuning = GridSearchCV(
    estimator=GradientBoostingClassifier(learning_rate=learning_rate_gbm, n_estimators=n_estimators_gbm,
                                         max_depth=max_depth_gbm,
                                         min_samples_split=20, min_samples_leaf=min_samples_leaf_gbm,
                                         subsample=1,
                                         random_state=10),
    param_grid=p_test5, scoring='accuracy', n_jobs=4, cv=5)
tuning.fit(X_train,y_train)
# Print explicitly: a bare expression in the middle of a cell is never shown.
print(tuning.best_params_, tuning.best_score_)
max_features_gbm = tuning.best_params_.get("max_features")

Subsample

In [None]:
# Grid over the fraction of training rows sampled for each boosting stage.
p_test6= {'subsample':[0.7,0.75,0.8,0.85,0.9,0.95,1]}

# NOTE(review): min_samples_split is hard-coded to 20 here rather than taken
# from the min_samples_split grid search above -- confirm this is intended.
tuning = GridSearchCV(
    estimator=GradientBoostingClassifier(learning_rate=learning_rate_gbm, n_estimators=n_estimators_gbm,
                                         max_depth=max_depth_gbm,
                                         min_samples_split=20, min_samples_leaf=min_samples_leaf_gbm,
                                         max_features=max_features_gbm,
                                         random_state=10),
    param_grid=p_test6, scoring='accuracy', n_jobs=4, cv=5)
tuning.fit(X_train,y_train)
# Print explicitly: a bare expression in the middle of a cell is never shown.
print(tuning.best_params_, tuning.best_score_)
subsample_gbm = tuning.best_params_.get("subsample")

In [None]:
# Grid over the row-subsampling fraction for LightGBM.
# NOTE(review): LightGBM presumably applies `subsample` only when
# subsample_freq is non-zero, which is not set here -- confirm this search has
# any effect on the fitted models.
p_test6a= {'subsample':[0.7,0.75,0.8,0.85,0.9,0.95,1]}

tuning = GridSearchCV(
    estimator=LGBMClassifier(learning_rate=learning_rate_lgbm, n_estimators=n_estimators_lgbm,
                             max_depth=max_depth_lgbm,
                             min_samples_leaf=min_samples_leaf_lgbm,
                             random_state=10),
    # Use this cell's own grid; it previously borrowed the GBM cell's p_test6.
    param_grid=p_test6a, scoring='accuracy', n_jobs=4, cv=5)
tuning.fit(X_train,y_train)
# Print explicitly: a bare expression in the middle of a cell is never shown.
print(tuning.best_params_, tuning.best_score_)
subsample_lgbm = tuning.best_params_.get("subsample")

random_state

In [None]:
# Grid over the even seeds 0..100 for the GBM.
# NOTE(review): choosing the seed by cross-validation score fits to noise, not
# signal; this search also uses cv=6 while the earlier stages use cv=5.
p_test7= {'random_state':list(range(0,101,2))}

# NOTE(review): min_samples_split is hard-coded to 20 here rather than taken
# from the min_samples_split grid search above -- confirm this is intended.
tuning = GridSearchCV(
    estimator=GradientBoostingClassifier(learning_rate=learning_rate_gbm, n_estimators=n_estimators_gbm,
                                         max_depth=max_depth_gbm,
                                         min_samples_split=20, min_samples_leaf=min_samples_leaf_gbm,
                                         max_features=max_features_gbm,
                                         subsample=subsample_gbm),
    param_grid=p_test7, scoring='accuracy', n_jobs=4, cv=6)
tuning.fit(X_train,y_train)
# Print explicitly: a bare expression in the middle of a cell is never shown.
print(tuning.best_params_, tuning.best_score_)
random_state_gbm = tuning.best_params_.get("random_state")

In [None]:
# Grid over the even seeds 0..100 for LightGBM.
# NOTE(review): choosing the seed by cross-validation score fits to noise, not
# signal; this search also uses cv=8 while the earlier stages use cv=5.
p_test7a= {'random_state':list(range(0,101,2))}

tuning = GridSearchCV(
    estimator=LGBMClassifier(learning_rate=learning_rate_lgbm, n_estimators=n_estimators_lgbm,
                             max_depth=max_depth_lgbm, min_samples_leaf=min_samples_leaf_lgbm,
                             subsample=subsample_lgbm),
    param_grid=p_test7a, scoring='accuracy', n_jobs=4, cv=8)
tuning.fit(X_train,y_train)
# Print explicitly: a bare expression in the middle of a cell is never shown.
print(tuning.best_params_, tuning.best_score_)
random_state_lgbm = tuning.best_params_.get("random_state")

Optimal Parameters

In [None]:
# Eight independent, empty lists: one per column of the optimal-parameter
# table that the following cells fill in.
a1, a2, a3, a4, a5, a6, a7, a8 = ([] for _ in range(8))

In [None]:
# Append the GBM row: each value goes onto the list for its column.
# Re-running this cell appends the row again.
gbm_row = ('GBM', learning_rate_gbm, n_estimators_gbm, max_depth_gbm,
           min_samples_leaf_gbm, max_features_gbm, subsample_gbm, random_state_gbm)
for column_values, cell_value in zip((a1, a2, a3, a4, a5, a6, a7, a8), gbm_row):
    column_values.append(cell_value)

# Build the table from the column lists.
opt_par = pd.DataFrame({
    'Name': a1,
    'learning_rate': a2,
    'n_estimators': a3,
    'max_depth': a4,
    'min_samples_leaf': a5,
    'max_features': a6,
    'subsample_gbm': a7,
    'random_state': a8,
})

In [None]:
# Append the LightGBM row to the per-column lists.
# Re-running this cell appends the row again.
a1.append('LGBM')
a2.append(learning_rate_lgbm)
a3.append(n_estimators_lgbm)
a4.append(max_depth_lgbm)
a5.append(min_samples_leaf_lgbm)
# max_features was tuned for the GBM only; record "not applicable" for
# LightGBM rather than repeating the GBM's value.
a6.append(None)
a7.append(subsample_lgbm)
a8.append(random_state_lgbm)

# Rebuild the table so it now holds one row per model.
opt_par = pd.DataFrame({'Name': a1, 'learning_rate': a2, 'n_estimators': a3,
                         'max_depth': a4, 'min_samples_leaf': a5, 'max_features': a6,
                         'subsample_gbm': a7, 'random_state': a8})

In [None]:
# Display the table of selected hyper-parameters.
opt_par

# Evaluation of Final Model on Test Set

In [None]:
# Final GBM built from the tuned hyper-parameters.
# NOTE(review): min_samples_split is hard-coded to 20 rather than taken from
# the min_samples_split grid search -- confirm this is intended.
final_gbm_params = dict(
    learning_rate=learning_rate_gbm,
    n_estimators=n_estimators_gbm,
    max_depth=max_depth_gbm,
    min_samples_split=20,
    min_samples_leaf=min_samples_leaf_gbm,
    max_features=max_features_gbm,
    subsample=subsample_gbm,
    random_state=random_state_gbm,
)
new = GradientBoostingClassifier(**final_gbm_params)
new.fit(X_train, y_train)
# Names of the training columns.
predictors = list(X_train)
final_gbm_accuracy = new.score(X_test, y_test)
print(f'Accuracy of the GBM on test set: {final_gbm_accuracy:.3f}')
pred = new.predict(X_test)

print(classification_report(y_test, pred))

In [None]:
# Final LightGBM built from the tuned hyper-parameters.
final_lgbm_params = dict(
    learning_rate=learning_rate_lgbm,
    n_estimators=n_estimators_lgbm,
    max_depth=max_depth_lgbm,
    min_samples_leaf=min_samples_leaf_lgbm,
    subsample=subsample_lgbm,
    random_state=random_state_lgbm,
)
new_lgbm = LGBMClassifier(**final_lgbm_params)
new_lgbm.fit(X_train, y_train)
# Names of the training columns.
predictors = list(X_train)
final_lgbm_accuracy = new_lgbm.score(X_test, y_test)
print(f'Accuracy of the LGBM on test set: {final_lgbm_accuracy:.3f}')
pred = new_lgbm.predict(X_test)
print(classification_report(y_test, pred))

# Comparison of ROC AUC

In [None]:
from sklearn import metrics
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve

# Refit the three GBM variants so this cell does not depend on earlier fits.
baseline = GradientBoostingClassifier(learning_rate=0.1, n_estimators=100,max_depth=3, min_samples_split=2,
                                      min_samples_leaf=1, subsample=1,max_features='sqrt', random_state=10)
baseline.fit(X_train,y_train)

model1 = GradientBoostingClassifier(learning_rate=learning_rate_gbm, n_estimators=n_estimators_gbm,max_depth=max_depth_gbm,
                                    min_samples_split=2,
                                    min_samples_leaf=1, subsample=1,max_features='sqrt', random_state=10)
model1.fit(X_train,y_train)

# NOTE(review): min_samples_split is hard-coded to 20 rather than taken from
# the min_samples_split grid search -- confirm this is intended.
new=GradientBoostingClassifier(learning_rate=learning_rate_gbm, n_estimators=n_estimators_gbm,
                               max_depth=max_depth_gbm, min_samples_split=20, min_samples_leaf=min_samples_leaf_gbm,
                               max_features=max_features_gbm, subsample=subsample_gbm, random_state=random_state_gbm)
new.fit(X_train,y_train)

# Probability of the class in column 1 for every test row. The same scores
# feed both the ROC curve and the AUC, so the legend's area matches the
# plotted curve. Computing the AUC from hard predict() labels, as before,
# scores a single operating point rather than the curve.
baseline_proba = baseline.predict_proba(X_test)[:, 1]
new_proba = new.predict_proba(X_test)[:, 1]

# Baseline model.
baseline_roc_auc = roc_auc_score(y_test, baseline_proba)
fprB, tprB, thresholdsB = roc_curve(y_test, baseline_proba)
# Final tuned model.
new_roc_auc = roc_auc_score(y_test, new_proba)
fprnew, tprnew, thresholds_new = roc_curve(y_test, new_proba)

plt.figure()
plt.plot(fprB, tprB, label='GBM Baseline (area = %0.2f)' % baseline_roc_auc)
plt.plot(fprnew, tprnew, label='GBM Final Model (area = %0.2f)' % new_roc_auc)

# Diagonal reference line (TPR = FPR).
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic of GBM')
plt.legend(loc="lower right")
# Save before show(): once the figure has been shown, savefig() would write
# a new, empty figure.
plt.savefig('Log_ROC')
plt.savefig('roc1.eps', format='eps')
plt.show()


In [None]:
# Side-by-side test accuracy of the baseline and the final tuned GBM.
baseline_gbm_accuracy = baseline.score(X_test, y_test)
tuned_gbm_accuracy = new.score(X_test, y_test)
print(f'Accuracy of the GBM on test set for Baseline Model: {baseline_gbm_accuracy:.3f}')
print(f'Accuracy of the GBM on test set for New Model: {tuned_gbm_accuracy:.3f}')

In [None]:
# Refit the three LightGBM variants so this cell does not depend on earlier fits.
baseline_lgbm = LGBMClassifier(learning_rate=0.1, n_estimators=100,max_depth=3,
                               min_samples_leaf=1, subsample=1, random_state=10)
baseline_lgbm.fit(X_train,y_train)

model1_lgbm = LGBMClassifier(learning_rate=learning_rate_lgbm, n_estimators=n_estimators_lgbm,max_depth=max_depth_lgbm,
                             min_samples_leaf=1, subsample=1, random_state=10)
model1_lgbm.fit(X_train,y_train)

new_lgbm=LGBMClassifier(learning_rate=learning_rate_lgbm, n_estimators=n_estimators_lgbm,max_depth=max_depth_lgbm,
                        min_samples_leaf=min_samples_leaf_lgbm, subsample=subsample_lgbm, random_state=random_state_lgbm)
new_lgbm.fit(X_train,y_train)

# Probability of the class in column 1 for every test row. The same scores
# feed both the ROC curve and the AUC, so the legend's area matches the
# plotted curve. Computing the AUC from hard predict() labels, as before,
# scores a single operating point rather than the curve.
baseline_lgbm_proba = baseline_lgbm.predict_proba(X_test)[:, 1]
new_lgbm_proba = new_lgbm.predict_proba(X_test)[:, 1]

# Baseline model.
baseline_lgbm_roc_auc = roc_auc_score(y_test, baseline_lgbm_proba)
fprB, tprB, thresholdsB = roc_curve(y_test, baseline_lgbm_proba)
# Final tuned model.
new_lgbm_roc_auc = roc_auc_score(y_test, new_lgbm_proba)
fprnew, tprnew, thresholds_new = roc_curve(y_test, new_lgbm_proba)

plt.figure()
# The legend reports the LightGBM AUCs; it previously printed the GBM's
# baseline_roc_auc / new_roc_auc values.
plt.plot(fprB, tprB, label='LGBM Baseline (area = %0.2f)' % baseline_lgbm_roc_auc)
plt.plot(fprnew, tprnew, label='LGBM Final Model (area = %0.2f)' % new_lgbm_roc_auc)

# Diagonal reference line (TPR = FPR).
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic of LGBM')
plt.legend(loc="lower right")
# Save before show(): once the figure has been shown, savefig() would write
# a new, empty figure.
# NOTE(review): 'Log_ROC' is the same file name the GBM ROC cell writes, so
# this overwrites that image -- confirm whether a distinct name is wanted.
plt.savefig('Log_ROC')
plt.savefig('roc2.eps', format='eps')
plt.show()

In [None]:
# Side-by-side test accuracy of the three LightGBM variants.
baseline_lgbm_accuracy = baseline_lgbm.score(X_test, y_test)
model1_lgbm_accuracy = model1_lgbm.score(X_test, y_test)
tuned_lgbm_accuracy = new_lgbm.score(X_test, y_test)
print(f'Accuracy of the LGBM on test set for Baseline Model: {baseline_lgbm_accuracy:.3f}')
print(f'Accuracy of the LGBM on test set for Model1: {model1_lgbm_accuracy:.3f}')
print(f'Accuracy of the LGBM on test set for New Model: {tuned_lgbm_accuracy:.3f}')