<a href="https://colab.research.google.com/github/royaldevops/Machine-Learning-Notebooks/blob/main/getaroom_catboost_v4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## $$1.\ Data\ Preparation$$
### 1.1 CatBoost installation
If you have not already installed CatBoost, you can do so by running the `!pip install catboost` command.  
  
You should also install the `ipywidgets` package and enable its notebook extension before launching Jupyter Notebook so that the interactive plots can be drawn.

In [None]:
# !conda install --yes catboost
!pip install catboost
!pip install ipywidgets
!jupyter nbextension enable --py widgetsnbextension
!pip3 install tensorflow

### 1.2 Import necessary libraries 

In [None]:
from google.colab import files
import numpy as np
import pandas as pd
from catboost import CatBoostRegressor, Pool, metrics, cv, MetricVisualizer
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error 
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
from math import sqrt
from sklearn.model_selection import GridSearchCV
import sklearn
from sklearn import metrics
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, StratifiedKFold
import tensorflow as tf
%matplotlib inline

### 1.3 Data Loading

In [None]:
# Upload the data files from the local machine; the next cell reads
# 'train.csv' and 'test.csv' from the working directory.
uploaded = files.upload()

In [None]:
# Load the labelled training rows and the unlabelled test rows.
train_df, test_df = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

In [None]:
# Preview the first rows of the training frame.
train_df.head()

### 1.4 Feature Preparation
First, we drop the identifier column and list the categorical features; then we check how many missing values we have:

In [None]:
# Property_ID is removed from the training features, and the test IDs are kept
# aside because the submission file needs them.
# Both steps are guarded so the cell can be re-run without raising KeyError
# (the original in-place drop / pop failed on a second execution).
train_df = train_df.drop(columns='Property_ID', errors='ignore')
if 'Property_ID' in test_df.columns:
    Property_IDs = test_df.pop("Property_ID")

In [None]:
# Names of the object/category-typed columns; later cells pass this list to
# CatBoost as `cat_features`.
cat_features = train_df.select_dtypes(include=['object', 'category']).columns.tolist()
cat_features

In [None]:
# Count missing entries per column and show only the columns that have any.
null_value_stats = train_df.isna().sum()
null_value_stats.loc[null_value_stats.ne(0)]

In [None]:
# The categorical columns found to contain nulls are filled with the literal
# string 'NaN' so that missing values become their own category.
# One loop handles both frames, so train and test always get identical
# treatment (the original repeated the same statement six times).
for frame in (train_df, test_df):
    for column in ('Furnishing', 'Crime_Rate', 'Dust_and_Noise'):
        frame[column] = frame[column].fillna('NaN')

In [None]:
# Recount missing entries per column now that the categorical nulls are filled.
null_value_stats = train_df.isna().sum()
null_value_stats.loc[null_value_stats.ne(0)]

In [None]:
# Summary statistics of the numeric columns, transposed so each feature is a row.
train_df.describe().T

Now let's separate features and label variable:

In [None]:
# Target column on one side, every remaining column as features on the other.
y = train_df['Habitability_score']
X = train_df.drop(columns='Habitability_score')

### 1.5 Data Splitting
Let's split the train data into training and validation sets.

In [None]:
# Hold out 20% of the labelled rows for validation; the seed is fixed.
X_train, X_validation, y_train, y_validation = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=42,
)
# X_test is just another name for test_df (same object, not a copy).
X_test = test_df

### 1.6 Common Functions
Define common functions for metrics

In [None]:
def rmlse(y_true, y_pred):
    """Root mean squared logarithmic error, computed with TensorFlow ops.

    Evaluates sqrt(mean((log(y_pred + 1) - log(y_true + 1)) ** 2)) over all
    elements and returns the result as a TensorFlow tensor (callers use
    ``.numpy()`` to obtain a plain number).

    Args:
        y_true: Ground-truth target values.
        y_pred: Predicted target values.
    """
    log_pred = tf.math.log(y_pred + 1)
    log_true = tf.math.log(y_true + 1)
    squared_gap = tf.square(log_pred - log_true)
    return tf.sqrt(tf.reduce_mean(squared_gap))

In [None]:
def evaluate(model, x_val, y_val):
    """Print a panel of regression metrics for ``model`` on a validation set.

    All metrics are computed first and then printed one per line, in the
    order R2, MSE, MAE, MSLE, MAPE, RMSE, RMLSE. Nothing is returned.

    Args:
        model: Fitted estimator exposing ``predict``.
        x_val: Validation features passed to ``model.predict``.
        y_val: Ground-truth targets for ``x_val``.
    """
    y_pred = model.predict(x_val)

    mse = metrics.mean_squared_error(y_val, y_pred)
    report = [
        ("R2 Score:", metrics.r2_score(y_val, y_pred)),
        ("MSE:", mse),
        ("MAE:", metrics.mean_absolute_error(y_val, y_pred)),
        ("MSLE:", metrics.mean_squared_log_error(y_val, y_pred)),
        ("MAPE", np.mean(tf.keras.metrics.mean_absolute_percentage_error(y_val, y_pred).numpy())),
        # RMSE is derived from the MSE computed above rather than recomputed.
        ("RMSE:", np.sqrt(mse)),
        ("RMLSE", rmlse(y_val, y_pred).numpy()),
    ]

    for label, value in report:
        print(label, value)

In [None]:
def submit(model, X, ids, file_path):
    """Predict on ``X`` and write a two-column submission CSV.

    Args:
        model: Fitted estimator exposing ``predict``.
        X: Feature rows to score.
        ids: Property identifiers, one per row of ``X``.
        file_path: Destination path of the CSV file.

    The file has the columns ``Property_ID`` and ``Habitability_score`` and
    is written without the DataFrame index.
    """
    predictions = model.predict(X)
    columns = {
        "Property_ID": ids,
        # Flatten so predictions shaped (n, 1) also form a single column.
        "Habitability_score": predictions.reshape(-1),
    }
    pd.DataFrame(columns).to_csv(file_path, index=False)

## $$2.\ CatBoost\ Model$$

### 2.1 Model Training 

In [None]:
# Baseline: a CatBoost regressor with no hyperparameters set; its training
# logs go to a dedicated train_dir.
default_param_model = CatBoostRegressor(train_dir='default_param_model')
default_param_model.fit(
    X_train,
    y_train,
    eval_set=(X_validation, y_validation),
    cat_features=cat_features,
    plot=True,
    verbose=100,
)
# The validation score is the cell's displayed output.
default_param_model.score(X_validation, y_validation)

In [None]:
# Hand-tuned variant of the baseline.
# NOTE(review): train_dir is named 'l2_leaf_reg_4_5' although l2_leaf_reg is 3;
# the comparison cell refers to this directory name, so it is kept as-is.
model = CatBoostRegressor(
    iterations=292,
    learning_rate=0.09,
    depth=12,
    l2_leaf_reg=3,
    train_dir='l2_leaf_reg_4_5',
)
model.fit(
    X_train,
    y_train,
    eval_set=(X_validation, y_validation),
    cat_features=cat_features,
    plot=True,
    verbose=100,
)
# The validation score is the cell's displayed output.
model.score(X_validation, y_validation)

### 2.2 Model Comparison

In [None]:
# Compare the two runs using the train_dir folders written by the two training cells above.
MetricVisualizer(['default_param_model','l2_leaf_reg_4_5']).start()

### 2.3 Model Cross-Validation

In [None]:
def model_cv(model):
    """Cross-validate ``model``'s parameters on the full labelled data and print the best RMSE.

    The parameters are taken from ``model.get_params()`` (with ``od_type``
    removed) and passed to CatBoost's ``cv`` together with a ``Pool`` built
    from the notebook-level ``X``, ``y`` and ``cat_features``.

    Prints the lowest mean test RMSE, its standard deviation and the step at
    which it occurs. Nothing is returned.

    Args:
        model: CatBoost model whose parameters are cross-validated.
    """
    cv_params = model.get_params()
    # od_type is not forwarded to cv(); pop() tolerates its absence.
    cv_params.pop('od_type', None)

    cv_data = cv(
        Pool(X, y, cat_features=cat_features),
        cv_params,
        plot=True,
        verbose=100,
        shuffle=True,
    )

    test_rmse_mean = np.asarray(cv_data['test-RMSE-mean'])
    test_rmse_std = np.asarray(cv_data['test-RMSE-std'])

    # RMSE is an error measure, so the best step is the one with the MINIMUM
    # mean value (the previous max/argmax reported the worst step instead).
    best_step = int(np.argmin(test_rmse_mean))
    best_rmse = test_rmse_mean[best_step]

    print('Best validation RMSE score: {:.2f}±{:.2f} on step {}'.format(
        best_rmse, test_rmse_std[best_step], best_step))
    print('Precise validation RMSE score: {}'.format(best_rmse))

In [None]:
# Cross-validate the parameters of the hand-tuned model.
model_cv(model)

Now we have the value of the loss function at each boosting step, averaged over the 3 cross-validation folds, which should give us a more accurate estimate of the model's performance.

### 2.4 Feature Importance

In [None]:
def feature_importance(model):
    """Plot ``model``'s feature importances as a bar chart.

    The importances come from ``model.get_feature_importance(prettified=True)``
    and are drawn on a new 12x6 figure with the feature name on the y axis.

    Args:
        model: Fitted CatBoost model.
    """
    importance_table = pd.DataFrame(model.get_feature_importance(prettified=True))

    plt.figure(figsize=(12, 6))
    sns.barplot(
        data=importance_table,
        x="Importances",
        y="Feature Id",
        palette="cool",
    )
    plt.title('features importance')

In [None]:
# Plot the feature importances of the hand-tuned model.
feature_importance(model)

From the above plot we can see that :

Furnishing, Neighborhood Review and Power Backup have the largest impact on the Habitability score, followed by
Property Area, Crime Rate, Dust and Noise, Water Supply, Number of Windows, Property Type and Traffic Density Score.
Air Quality Index, Frequency of Powercuts and Number of Doors are not very significant for predicting the Habitability score.

### 2.5 Applying the Model

In [None]:
# Score the test rows and print the smallest predicted value.
predictions = model.predict(X_test)
print(predictions.min())

## $$3.\ CatBoost\ Features$$
Let's define some parameters and create a `Pool` for convenience. It stores all the information about the dataset (features, labels, categorical feature indices, weights and much more).

In [None]:
# Wrap each split in a Pool so features, labels and categorical columns travel together.
train_pool = Pool(data=X_train, label=y_train, cat_features=cat_features)
validate_pool = Pool(data=X_validation, label=y_validation, cat_features=cat_features)

In [None]:
# Ask CatBoost to keep 10 of the features at indices 0-12 of the training pool.
select_features_model = CatBoostRegressor()
selected_features = select_features_model.select_features(
    train_pool,
    features_for_select='0-12',
    num_features_to_select=10,
    verbose=1000,
    plot=True,
)

In [None]:
# Display the result returned by select_features().
selected_features

In [None]:
# NOTE(review): this relies on select_features() having left
# select_features_model in a fitted state — confirm against the CatBoost docs.
evaluate(select_features_model, X_validation, y_validation)

### 3.1 Using the best model
If you have a validation set, it is always better to use the `use_best_model` parameter during training. By default, this parameter is enabled. When it is enabled, the resulting tree ensemble is shrunk to the best iteration.

In [None]:
# Train the same model twice; the two runs differ only in use_best_model.
params = {'use_best_model': False}
simple_model = CatBoostRegressor(**params)
simple_model.fit(train_pool, eval_set=validate_pool, verbose=500)

best_model_params = {**params, 'use_best_model': True}
best_model = CatBoostRegressor(**best_model_params)
best_model.fit(train_pool, eval_set=validate_pool, verbose=500)

# Compare both variants on the held-out validation rows.
simple_r2 = metrics.r2_score(y_validation, simple_model.predict(X_validation))
print('Simple model validation R2 Score: {:.4}'.format(simple_r2))
print('')

best_r2 = metrics.r2_score(y_validation, best_model.predict(X_validation))
print('Best model validation R2 Score: {:.4}'.format(best_r2))

## $$4.\ Parameter\ Tuning$$
While you can always select the optimal number of iterations (boosting steps) using cross-validation and learning-curve plots, it is also important to experiment with some of the model parameters. Here we pay special attention to `l2_leaf_reg` and `learning_rate`.

In this section, we'll select these parameters using CatBoost's **`grid_search`** method.

In [None]:
# grid_model = CatBoostRegressor()
# grid = {'learning_rate': [0.08],'iterations': [1500],'depth': [8,10,12,14,16],
#         'l2_leaf_reg': [4,4.5,5]}

# grid_search_result = grid_model.grid_search(grid, train_pool, plot=True, verbose=False)

In [None]:
# grid_search_result['params']

In [None]:
# evaluate(grid_model, X_validation, y_validation)

Next, we'll select these parameters using **Optuna**.

In [None]:
# !pip3 install optuna

In [None]:
# import optuna

In [None]:
# Shared settings for the tuning and final-training cells below.
SAMPLE_RATE = 0.4  # NOTE(review): not referenced by any visible cell — confirm it is still needed
RANDOM_SEED = 1  # passed as random_state to the CatBoost regressors
EARLY_STOPPING_ROUND = 100  # passed as early_stopping_rounds to fit()

In [None]:
def objective(trial):
    """Optuna objective: fit one CatBoost regressor and return its validation MSE.

    Samples ``learning_rate``, ``depth``, ``l2_leaf_reg`` and
    ``min_child_samples`` from ``trial``, trains on the notebook-level
    ``X_train``/``y_train`` with ``X_validation``/``y_validation`` as the
    evaluation set, and scores the fitted model on the validation rows.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        Mean squared error of the fitted model on the validation set.
    """
    param = {
        # suggest_float(..., step=...) replaces the deprecated
        # suggest_discrete_uniform(name, low, high, q) with the same grid.
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.02, step=0.001),
        'depth': trial.suggest_int('depth', 9, 15),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1.0, 5.5, step=0.5),
        'min_child_samples': trial.suggest_categorical('min_child_samples', [1, 4, 8, 16, 32]),
        'grow_policy': 'Depthwise',
        'iterations': 10000,
        'use_best_model': True,
        'eval_metric': 'RMSE',
        # NOTE(review): fit() below also receives early_stopping_rounds, which
        # presumably takes precedence over od_wait — confirm.
        'od_type': 'iter',
        'od_wait': 20,
        'random_state': RANDOM_SEED,
        'logging_level': 'Silent',
    }

    regressor = CatBoostRegressor(**param)

    # Copies are handed to CatBoost so the shared frames are never passed directly.
    regressor.fit(
        X_train.copy(),
        y_train.copy(),
        eval_set=[(X_validation.copy(), y_validation.copy())],
        early_stopping_rounds=EARLY_STOPPING_ROUND,
        cat_features=cat_features,
    )
    return mean_squared_error(y_validation, regressor.predict(X_validation.copy()))


In [None]:
# %%time
# study = optuna.create_study(study_name=f'catboost-seed{RANDOM_SEED}')
# study.optimize(objective, n_trials=10000, n_jobs=-1, timeout=24000)

In [None]:
# study.best_value

In [None]:
# study.best_params

In [None]:
# import json
# with open('best_params_v1.json', 'w') as f:
#   json.dump(study.best_params, f)

In [None]:
# optuna.visualization.plot_optimization_history(study)

In [None]:
# optuna.visualization.plot_slice(study)

In [None]:
# optuna.visualization.plot_contour(study, params=['learning_rate',
#                                                  'min_child_samples',
#                                                  'depth',
#                                                  'l2_leaf_reg'])

In [None]:
# optuna.visualization.plot_param_importances(study)

In [None]:
# optuna.visualization.plot_edf(study)

In [None]:
%%time
optimized_regressor = CatBoostRegressor(learning_rate=0.015, 
                                        depth=15, 
                                        l2_leaf_reg=4.0, 
                                        min_child_samples=1,
                                        grow_policy='Depthwise',
                                        use_best_model=True,
                                        eval_metric='RMSE',
                                        od_type='iter',
                                        od_wait=20,
                                        random_state=RANDOM_SEED,)
optimized_regressor.fit(X_train.copy(), y_train.copy(),
                        eval_set=[(X_validation.copy(), y_validation.copy())],
                        early_stopping_rounds=EARLY_STOPPING_ROUND,cat_features=cat_features,
                                        verbose=200,
                                        plot=True)
pred_train = optimized_regressor.predict(X_train.copy())

In [None]:
# R2 of the final model on the held-out validation rows.
optimized_r2 = metrics.r2_score(y_validation, optimized_regressor.predict(X_validation))
print('optimized model validation R2 Score: {:.4}'.format(optimized_r2))

In [None]:
# Cross-validate the parameters of the final model.
model_cv(optimized_regressor)

In [None]:
# Plot the feature importances of the final model.
feature_importance(optimized_regressor)

In [None]:
# Score the test rows with the final model and write the submission CSV,
# pairing each prediction with the Property_ID set aside earlier.
submit(optimized_regressor, X_test, Property_IDs, 'submitioncatboost.csv')