In [None]:
roadmap:

1.import
2.label encoding
3.get dummies
4.missing value analysis (dropna or ycimpute)
5.neighbor outliers (LocalOutlierFactor)
6.singular outliers (quantile)
7.automation
8.model selection
9.tune with GridSearchCV
10.early stopping and visualization
11.retune with early stopping
12.predict
13.getting results

### import

In [None]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import scale, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.metrics import confusion_matrix, accuracy_score, mean_squared_error, r2_score, roc_auc_score, roc_curve, classification_report
from sklearn.metrics import f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

### missing value analysis with ycimpute

In [None]:
# Fill missing values with ycimpute's KNN imputer (k=5), round the completed
# matrix, and rebuild `df` under the original column names.
from ycimpute.imputer import knnimput

feat_names = list(df)
ndf = np.array(df)

s1 = knnimput.KNN(k=5).complete(ndf)
s1 = s1.round()

df = pd.DataFrame(s1, columns=feat_names)

# Per-column count of values that are still null after imputation.
df.isnull().sum()

### LocalOutlierFactor

In [None]:
# Score every row of `df` with a default Local Outlier Factor model.
from sklearn.neighbors import LocalOutlierFactor

lof = LocalOutlierFactor()

# Called for its fitting side effect; the returned labels are not kept.
lof.fit_predict(df)

# Read the per-row scores off the fitted estimator and display them.
scores = lof.negative_outlier_factor_
scores

### singular outliers

In [None]:
# Outlier limits for the `price` column: 1.5 * IQR beyond each quartile.
q1 = df["price"].quantile(0.25)
q3 = df["price"].quantile(0.75)
iqr = q3 - q1

down_limit = q1 - 1.5 * iqr
up_limit = q3 + 1.5 * iqr

# Show both limits as the cell output.
(down_limit, up_limit)

### automation

In [None]:
# Hold out 20% of the rows for testing. The split is seeded so that a
# "Restart & Run All" benchmarks and tunes every model on the same partition;
# without a seed the reported scores change from run to run.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.20, random_state=42
)

# Classifier classes (not instances) to benchmark; `fitter` instantiates
# each one itself.
models = [
    LogisticRegression,
    KNeighborsClassifier,
    SVC,
    MLPClassifier,
    DecisionTreeClassifier,
    RandomForestClassifier,
    GradientBoostingClassifier,
    LGBMClassifier,
    XGBClassifier,
    CatBoostClassifier,
]

import time

def fitter(x_train, x_test, y_train, y_test, model):
    """Benchmark one classifier class and print a short report.

    The class is instantiated with default settings, fitted on the training
    split and used to predict the test split; that fit + predict round trip is
    timed. Accuracy and F1 are then estimated with 10-fold cross-validation on
    the training split only.

    Parameters
    ----------
    x_train, y_train : training features and labels.
    x_test : test features; only used for the timed ``predict`` call.
    y_test : accepted for interface symmetry; not used by this function.
    model : an estimator *class* (e.g. ``LogisticRegression``), not an instance.

    Returns
    -------
    dict
        ``{"model", "accuracy", "f1", "fit_predict_time"}`` with the same
        values that are printed, so callers can collect the results.
    """
    # CatBoost logs every boosting iteration by default. Silencing it through
    # the constructor also covers the clones fitted during cross-validation.
    init_kwargs = {"verbose": False} if model is CatBoostClassifier else {}

    # perf_counter is monotonic, so the elapsed time cannot be skewed by
    # system clock adjustments.
    startt = time.perf_counter()
    model(**init_kwargs).fit(x_train, y_train).predict(x_test)
    timer = time.perf_counter() - startt

    # One multi-metric run fits each fold once instead of once per metric.
    cv_scores = cross_validate(
        model(**init_kwargs),
        x_train,
        y_train,
        cv=10,
        scoring=("accuracy", "f1"),
        n_jobs=-1,
    )
    ascore = cv_scores["test_accuracy"].mean()
    f1score = cv_scores["test_f1"].mean()

    print("""
            Model:{}
            accuracy_score:{}
            f1_score:{}
            fit&predict time:{}
            """.format(model.__name__,ascore,f1score,timer))
    print("-"*60)

    return {
        "model": model.__name__,
        "accuracy": ascore,
        "f1": f1score,
        "fit_predict_time": timer,
    }


# Run the benchmark for every candidate classifier on the same split.
for model_cls in models:
    fitter(x_train, x_test, y_train, y_test, model=model_cls)

### GBM parameter tuning

In [None]:
# GradientBoostingClassifier defaults, for reference:
#   learning_rate=0.1, n_estimators=100, min_samples_split=2,
#   min_samples_leaf=1, max_depth=3

# Candidate values for the exhaustive search below.
gbm_params = dict(
    learning_rate=[0.01, 0.1, 0.3],
    n_estimators=[100, 500, 2000],
    min_samples_split=[2, 4, 6, 7],
    min_samples_leaf=[1, 2, 3],
    max_depth=[3, 5, 7],
)

# 5-fold grid search on the training split, using all available cores.
gbm_cv_model = GridSearchCV(
    estimator=GradientBoostingClassifier(),
    param_grid=gbm_params,
    cv=5,
    verbose=2,
    n_jobs=-1,
).fit(x_train, y_train)

### lightGBM parameter tuning

In [None]:
# Candidate values for the LightGBM grid search.
#
# `num_iterations` and `n_estimators` are aliases for the same LightGBM
# setting (number of boosting rounds). When both are supplied LightGBM uses
# `num_iterations` and ignores `n_estimators`, so searching over both only
# multiplied the grid by four with duplicate candidates (1296 -> 324
# combinations). The values kept here are the ones that were actually in
# effect before.
lgbm_params = {
    "learning_rate": [0.01, 0.1, 0.3],
    "num_leaves": [25, 31, 40, 50],
    "n_estimators": [100, 500, 1000],
    "min_child_samples": [10, 20, 30],
    "max_bin": [255, 305, 400],
}

# 5-fold grid search on the training split, using all available cores.
lgbm_cv_model = GridSearchCV(
    LGBMClassifier(),
    lgbm_params,
    cv=5,
    verbose=2,
    n_jobs=-1,
).fit(x_train, y_train)

### lightGBM early stopping

In [None]:
# Refit LightGBM with the chosen hyper-parameters while recording the
# classification error on the evaluation set after every boosting round.
#
# `n_estimators` and `num_iterations` are aliases; the original call passed
# both (1000 and 500) and LightGBM used `num_iterations`. Only the effective
# value is kept, under the scikit-learn style name.
#
# NOTE(review): no stopping criterion is passed to `fit`, so all 500 rounds
# are trained and the eval set is only monitored. If real early stopping is
# wanted, add one (e.g. LightGBM's early-stopping callback) and monitor a
# validation split rather than the test split - TODO confirm intent.
lgbm_tuned = LGBMClassifier(
    learning_rate=0.01,
    max_bin=255,
    min_child_samples=10,
    n_estimators=500,
    num_leaves=31,
).fit(
    x_train,
    y_train,
    eval_set=[(x_test, y_test)],
    eval_metric="error",
)

### feature importance

In [None]:
# Feature importances of `rf_tuned` (fitted in another cell), as percentages,
# one row per training column.
importance = pd.DataFrame(
    rf_tuned.feature_importances_ * 100,
    index=x_train.columns,
    columns=["importance"],
)

# Horizontal bars sorted ascending. The legend is switched off through the
# plotting API instead of overwriting the Axes' `legend_` attribute, and the
# label is set on the returned Axes rather than via pyplot's current-axes state.
ax = importance.sort_values(by="importance", axis=0, ascending=True).plot(
    kind="barh", color="r", legend=False
)
ax.set_xlabel("Variable Importance");

### scaler

In [None]:
from sklearn import preprocessing as pp

# Keep handles on the unscaled frames; `x_train` / `x_test` are rebound below.
xtr_backup = x_train
xte_backup = x_test

# Learn mean and standard deviation from the training split only and apply
# the same transformation to both splits. Scaling the test split with its own
# statistics (as `pp.scale(xte_backup)` did) puts train and test features on
# different scales and lets test-set information shape the preprocessing.
scaler = pp.StandardScaler().fit(xtr_backup)

sed = scaler.transform(xtr_backup)
sed_xtr = pd.DataFrame(sed, columns=x_train.columns, index=x_train.index)
sed = scaler.transform(xte_backup)
sed_xte = pd.DataFrame(sed, columns=x_test.columns, index=x_test.index)

# From here on the notebook works with the standardised features.
x_train = sed_xtr
x_test = sed_xte

### classification results

In [None]:
# Predict test-split labels with `svc_tuned`, which is defined outside this cell.
y_pred = svc_tuned.predict(x_test)

import matplotlib.pyplot as plt
def plot_confusion_matrix(true_y, pred_y, title='Confusion Matrix', figsize=(8,6)):
    """Plot a labelled confusion-matrix heatmap for predicted results.

    Parameters
    ----------
    true_y : array-like of ground-truth labels (rows, "Actual").
    pred_y : array-like of predicted labels (columns, "Predicted").
    title : figure title.
    figsize : ``(width, height)`` passed to ``plt.figure``.

    Returns
    -------
    None. The figure is shown with ``plt.show()``.
    """
    # Use every class seen in either vector. Labelling with the classes of
    # `true_y` alone fails when the model predicts a class that is absent from
    # `true_y`: the matrix then has more rows/columns than there are labels.
    labels = np.union1d(true_y, pred_y)
    conf_matrix = confusion_matrix(true_y, pred_y, labels=labels)

    conf_df = pd.DataFrame(conf_matrix, columns=labels, index=labels)
    conf_df.index.name = 'Actual'
    conf_df.columns.name = 'Predicted'

    plt.figure(figsize = figsize)
    plt.title(title)
    # NOTE: sns.set updates global plotting defaults, so the larger font also
    # applies to figures drawn after this call.
    sns.set(font_scale=1.4)
    # fmt='g' prints the counts as plain numbers instead of scientific notation.
    sns.heatmap(conf_df, cmap="Blues", annot=True,
                annot_kws={"size": 16}, fmt='g')
    plt.show()
# Visual summary of the predictions on the test split.
plot_confusion_matrix(y_test, y_pred)

# Headline metrics. `f1_score` must be imported from sklearn.metrics in the
# notebook's import cell; without it this line raises NameError.
ascore = accuracy_score(y_test, y_pred)
f1score = f1_score(y_test, y_pred)

print("SCORES")
print("""accuracy_score:{}
f1_score:{}
""".format(ascore,f1score))

# Per-class precision / recall / F1 breakdown.
print(classification_report(y_test, y_pred))