roadmap:

1. import
2. label encoding
3. get dummies
4. neighbor outliers (LocalOutlierFactor)
5. singular outliers (quantile)
6. automation
7. model selection
8. tune with GridSearchCV
9. early stopping and visualization
10. retune with early stopping
11. predict
12. getting results
    

### imports

In [1]:
import numpy as np
import pandas as pd 
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
from sklearn.preprocessing import scale 
from sklearn.preprocessing import StandardScaler
from sklearn import model_selection
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn import neighbors
from sklearn.svm import SVR
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor

### LocalOutlierFactor

In [None]:
from sklearn.neighbors import LocalOutlierFactor

# Fit a Local Outlier Factor model on every column of ``df``.
# The value returned by fit_predict is not kept; only the fitted
# per-row scores are read from the estimator afterwards.
lof = LocalOutlierFactor()
lof.fit_predict(df)
scores = lof.negative_outlier_factor_

### quantiles 

In [None]:
# Outlier fences for ``df.price``: 1.5 interquartile ranges below the first
# quartile and above the third quartile.
q1, q3 = df.price.quantile(0.25), df.price.quantile(0.75)
iqr = q3 - q1
down_limit, up_limit = q1 - 1.5 * iqr, q3 + 1.5 * iqr
(down_limit, up_limit)

### automation

In [None]:
# Candidate regressor classes (not instances); each one is instantiated with
# its default arguments when it is evaluated.
models = [
    LGBMRegressor,
    XGBRegressor,
    GradientBoostingRegressor,
    RandomForestRegressor,
    DecisionTreeRegressor,
    MLPRegressor,
    KNeighborsRegressor,
    SVR,
]

def otoml(x_train, y_train, x_test, y_test, alg):
    """Fit ``alg`` with default arguments and report its RMSE on the test split.

    Args:
        x_train: Training features passed to ``fit``.
        y_train: Training target passed to ``fit``.
        x_test: Test features passed to ``predict``.
        y_test: Test target the predictions are compared against.
        alg: Estimator class (not an instance); it is called with no
            arguments, and its ``fit`` must return the fitted estimator.

    Returns:
        The test-set RMSE. The same value is also printed, so existing
        callers that ignore the return value behave as before.
    """
    model = alg().fit(x_train, y_train)
    rmse = np.sqrt(mean_squared_error(y_test, model.predict(x_test)))
    print(alg.__name__, "algoritmasının RMSE test değeri:", rmse)
    # Returning the score lets callers collect and rank the candidates.
    return rmse

# Fit and score every candidate on the same train/test split.
for alg in models:
    otoml(x_train, y_train, x_test, y_test, alg)

### GridSearchCV -lightgbm

In [None]:
# Search grid for LightGBM: 3 * 3 * 4 * 3 = 108 parameter combinations,
# each one evaluated with 5-fold cross-validation.
lgbm_params = {
    "learning_rate": [0.01, 0.1, 0.2],
    "num_leaves": [25, 31, 40],
    "n_estimators": [50, 100, 500, 2000],
    "min_child_samples": [10, 20, 30],
}

# n_jobs=-1 lets the search use all available cores; verbose=2 prints progress.
lgbm_cv_model = GridSearchCV(
    estimator=LGBMRegressor(),
    param_grid=lgbm_params,
    cv=5,
    verbose=2,
    n_jobs=-1,
).fit(x_train, y_train)

### early stopping -lightgbm

In [None]:
from lightgbm import early_stopping

# Train LightGBM with fixed hyper-parameters while monitoring RMSE on the
# test split.
#
# Early stopping is requested through the ``early_stopping`` callback rather
# than the ``early_stopping_rounds`` keyword of ``fit``: that keyword was
# removed in LightGBM 4.0, while the callback form is accepted by older and
# newer releases alike.
#
# NOTE(review): the patience (2000) equals n_estimators (2000), so training is
# not expected to be cut short; presumably the run is only used to find the
# best iteration on the validation data -- confirm.
lgbm_tuned = LGBMRegressor(
    learning_rate=0.01,
    min_child_samples=20,
    n_estimators=2000,
    num_leaves=40,
).fit(
    x_train,
    y_train,
    eval_metric="rmse",
    eval_set=[(x_test, y_test)],
    callbacks=[early_stopping(stopping_rounds=2000)],
)

### early stopping and visualization - xgb

In [None]:
from matplotlib import pyplot

# Train XGBoost while recording RMSE and MAE on both splits after every
# boosting round. The first eval_set entry (train) is reported as
# "validation_0" and the second (test) as "validation_1".
eval_set = [(x_train, y_train), (x_test, y_test)]
model = XGBRegressor(n_estimators=5000, max_depth=8)
# NOTE(review): newer xgboost releases expect eval_metric and
# early_stopping_rounds on the estimator constructor instead of fit();
# check the installed version before changing this call.
model.fit(
    x_train,
    y_train,
    eval_set=eval_set,
    eval_metric=["rmse", "mae"],
    verbose=True,
    early_stopping_rounds=50,
)

# Per-round metric history, fetched once and reused for both figures.
results = model.evals_result()
epochs = len(results['validation_0']['rmse'])
x_axis = range(0, epochs)

# One figure per metric: train vs. test learning curves (MAE first, then RMSE).
for metric, label in (("mae", "MAE"), ("rmse", "RMSE")):
    fig, ax = pyplot.subplots()
    ax.plot(x_axis, results['validation_0'][metric], label='Train')
    ax.plot(x_axis, results['validation_1'][metric], label='Test')
    ax.legend()
    pyplot.ylabel(label)
    pyplot.title('XGBoost ' + label)
    pyplot.show()

### results -lightgbm

In [None]:
# Refit LightGBM on the training split with the final hyper-parameters.
# NOTE(review): n_estimators=1998 is presumably the best iteration found by
# the early-stopping run -- confirm.
lgbm_tuned = LGBMRegressor(
    learning_rate=0.01,
    min_child_samples=20,
    n_estimators=1998,
    num_leaves=40,
).fit(x_train, y_train)

# Predict each split once and reuse the predictions for both metrics.
y_pred = lgbm_tuned.predict(x_test)
train_pred = lgbm_tuned.predict(x_train)

print("Train setinde rmse hatası:", np.sqrt(mean_squared_error(y_train, train_pred)))
print("Test setinde rmse hatası:", np.sqrt(mean_squared_error(y_test, y_pred)))
print("-" * 30)
print("Train setinde r2_score:", r2_score(y_train, train_pred))
print("Test setinde r2_score", r2_score(y_test, y_pred))

### results with dataframe

In [None]:
def _build_report(y_true, y_hat):
    """Return a frame with ``true``, ``pred`` and ``error`` columns.

    Args:
        y_true: Target values as a pandas Series; its index becomes the index
            of the report.
        y_hat: One-dimensional predictions in the same row order as ``y_true``.

    Returns:
        DataFrame indexed like ``y_true`` where ``error`` is the absolute
        difference between ``true`` and ``pred``.
    """
    report = pd.DataFrame(
        {"true": y_true.to_numpy(), "pred": y_hat},
        index=y_true.index,
    )
    report["error"] = (report["true"] - report["pred"]).abs()
    return report


test_predict = y_pred
train_predict = lgbm_tuned.predict(x_train)

# Per-row comparison of targets and predictions for each split.
train_report = _build_report(y_train, train_predict)
test_report = _build_report(y_test, test_predict)

### feature importance

In [None]:
# Horizontal bar chart of feature importances, scaled by 100 and sorted so the
# largest bar ends up at the top.
# NOTE(review): ``rf_tuned`` is not defined in the visible part of this file
# (the tuned model above is ``lgbm_tuned``) -- confirm which fitted model is
# meant before running this cell.
importance = pd.DataFrame(
    rf_tuned.feature_importances_ * 100,
    index=x_train.columns,
    columns=["importance"],
)
# legend=False hides the legend through the plotting API instead of
# overwriting the axes' ``legend_`` attribute after the fact.
ax = importance.sort_values(by="importance", axis=0, ascending=True).plot(
    kind="barh",
    color="r",
    legend=False,
)
ax.set_xlabel("Variable Importance")

### scaler

In [None]:
from sklearn import preprocessing as pp

# Keep references to the unscaled splits *before* anything reads them; the
# backups stay available after x_train / x_test are replaced below.
xtr_backup = x_train
xte_backup = x_test

# Learn the per-column mean and standard deviation from the training split
# only, then apply that same transformation to both splits. Standardising the
# test split with its own statistics would put the two splits on different
# scales and let test-set information shape the preprocessing.
scaler = pp.StandardScaler().fit(xtr_backup)

sed = scaler.transform(xtr_backup)
sed_xtr = pd.DataFrame(sed, columns=x_train.columns, index=x_train.index)
sed = scaler.transform(xte_backup)
sed_xte = pd.DataFrame(sed, columns=x_test.columns, index=x_test.index)

# From here on the model cells see the standardised features.
x_train = sed_xtr
x_test = sed_xte