In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file under the Kaggle input directory so the
# dataset location can be read off the cell output.
# The unused sub-directory list is bound to `_dirs` instead of `_`: assigning
# `_` at notebook top level stops IPython from updating its `_` last-output
# shortcut.
for dirname, _dirs, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from scipy.stats import norm, skew 

In [3]:
# Load the preprocessed CSV from the read-only Kaggle input directory into a DataFrame.
df = pd.read_csv("/kaggle/input/phone-numbers/preprocessed_with_features_rond1000.csv")

In [4]:
# Preview the first rows to sanity-check the load.
df.head()

In [5]:
# Remove the raw phone number and the 'Unnamed: 0' column (presumably a saved
# index - verify against the CSV) before modelling.
# Reassigning with errors='ignore' instead of using inplace=True keeps this
# cell idempotent: re-running it after the columns are already gone no longer
# raises KeyError.
df = df.drop(columns=['phone_number', 'Unnamed: 0'], errors='ignore')

In [6]:
# Summarise the remaining columns: dtypes, non-null counts and memory usage.
df.info()

# Target variable

## Distribution

In [7]:
def plot_distribution_info(data, title='Price distribution'):
    """Plot the distribution of ``data`` against a fitted normal, then a Q-Q plot.

    Two figures are produced:

    1. A density histogram with a KDE curve and the maximum-likelihood normal
       density overlaid (the normal curve carries the legend entry).
    2. A normal probability (Q-Q) plot of the same values.

    The fitted ``mu`` and ``sigma`` are also printed.

    Parameters
    ----------
    data : array-like
        One-dimensional numeric sample (e.g. a DataFrame column).
    title : str, optional
        Title of the histogram figure. Defaults to ``'Price distribution'``,
        the value that used to be hard-coded.

    Returns
    -------
    None
    """
    # Fit once up front; the same parameters feed the printout and the overlay.
    mu, sigma = norm.fit(data)
    print( '\n mu = {:.2f} and sigma = {:.2f}\n'.format(mu, sigma))

    # sns.distplot is deprecated; histplot(kde=True, stat='density') draws the
    # same histogram + KDE, and the normal fit is drawn explicitly below.
    fig_hist, ax_hist = plt.subplots()
    sns.histplot(data, kde=True, stat='density', ax=ax_hist)

    # Evaluate the fitted normal over the visible x-range, then restore the
    # limits so the extra line does not rescale the axis.
    x_lo, x_hi = ax_hist.get_xlim()
    grid = np.linspace(x_lo, x_hi, 200)
    # Raw string: '\m' and '\s' are invalid escape sequences in a normal literal.
    normal_label = r'Normal dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f} )'.format(mu, sigma)
    ax_hist.plot(grid, norm.pdf(grid, mu, sigma), color='black', label=normal_label)
    ax_hist.set_xlim(x_lo, x_hi)

    # Only the normal curve is labelled, so the legend entry is attached to
    # the right artist instead of whichever one happens to come first.
    ax_hist.legend(loc='best')
    ax_hist.set_ylabel('Frequency')
    ax_hist.set_title(title)

    # Q-Q plot on its own figure.
    fig_qq, ax_qq = plt.subplots()
    stats.probplot(data, plot=ax_qq)
    plt.show()

In [8]:
# Inspect the price column as loaded: fitted-normal overlay plus Q-Q plot.
plot_distribution_info(df['price'])

## Log transformation

In [9]:
# Replace the price column with log(1 + price).
# NOTE: the column is overwritten, so this cell is not idempotent - running it
# a second time applies log1p to values that are already transformed.
df["price"] = np.log1p(df["price"])

In [10]:
# Re-plot the target after the log1p transform to check the new distribution.
plot_distribution_info(df['price'])

# Correlation map

In [11]:
# Pairwise correlation matrix of every column (price included), drawn as a
# square-celled heatmap with the colour scale capped at 0.9.
corrmat = df.corr()
fig, ax = plt.subplots(figsize=(12, 9))
sns.heatmap(corrmat, vmax=0.9, square=True, ax=ax)

# Modeling

## Import libraries

In [12]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import ElasticNet, Lasso,  BayesianRidge, LassoLarsIC
from sklearn.ensemble import RandomForestRegressor,  GradientBoostingRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.tree import DecisionTreeRegressor
import xgboost as xgb
import lightgbm as lgb

## Split data

In [13]:
# Feature matrix = every column except the target; target = the price column.
X = df.drop(columns='price')
y = df['price']

In [14]:
# Hold out 20% of the rows as a test set; random_state fixes the shuffle so the
# split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Cross validation strategy

In [15]:
n_folds = 5
def rmse_cv(model):
    """Return the per-fold RMSE of ``model`` under shuffled K-fold CV.

    The model is cross-validated on the module-level training split
    (``X_train`` / ``y_train``) using ``n_folds`` shuffled folds with a fixed
    seed, scored with negative MSE and converted to RMSE.

    Parameters
    ----------
    model : estimator
        Any scikit-learn compatible regressor; ``cross_val_score`` clones and
        fits it once per fold.

    Returns
    -------
    numpy.ndarray
        One RMSE value per fold (length ``n_folds``).
    """
    # Hand the KFold splitter itself to cross_val_score. Calling
    # .get_n_splits() on it returns the plain integer 5, which makes
    # cross_val_score build its own unshuffled folds and silently discards
    # shuffle=True / random_state=42.
    kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)
    neg_mse = cross_val_score(model, X_train.values, y_train,
                              scoring="neg_mean_squared_error", cv=kf)
    return np.sqrt(-neg_mse)

## Base models

### Lasso

In [16]:
# Lasso baseline: fit through rmse_cv and report mean (std) of the fold RMSEs.
lasso = Lasso(alpha=0.0005, random_state=1)
score = rmse_cv(lasso)
lasso_report = "\nLasso score: {:.4f} ({:.4f})\n".format(np.mean(score), np.std(score))
print(lasso_report)

### Elastic Net

In [17]:
# Elastic Net baseline (90% L1 / 10% L2 mix), scored the same way as the other models.
ENet = ElasticNet(alpha=0.0005, l1_ratio=.9, random_state=3)
score = rmse_cv(ENet)
enet_report = "ElasticNet score: {:.4f} ({:.4f})\n".format(np.mean(score), np.std(score))
print(enet_report)

### Random Forest

In [18]:
# Create a base random-forest model.
# bootstrap=False means every tree sees all rows, so the only randomness is the
# per-split feature subsampling (max_features=5); random_state pins it so the
# reported score is reproducible across runs.
rf = RandomForestRegressor(bootstrap=False , max_depth=100 , max_features = 5 , min_samples_leaf=5 ,
                           min_samples_split = 10 , n_estimators = 500 , n_jobs = -1 ,
                           random_state = 42)

score = rmse_cv(rf)
print("Random forest score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))


### Decision tree

In [19]:
# Single decision tree with the same depth/leaf limits as the forest above.
# max_features=5 makes each split consider a random subset of features, so a
# fixed random_state is needed for the score to be reproducible.
decision_tree = DecisionTreeRegressor(max_depth=100 , max_features = 5 , min_samples_leaf=5 ,
                           min_samples_split = 10 , random_state = 42)
score = rmse_cv(decision_tree)
print("Decision tree score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))

In [20]:
from sklearn.model_selection import GridSearchCV
# Exhaustive search over tree-size limits for a single decision tree.
# NOTE(review): with min_samples_leaf >= 20 a node needs at least 40 samples to
# be split, so the min_samples_split values (7/10/15) never bind and only
# multiply the number of fits - consider dropping that axis.
param_grid = {
              "min_samples_split": [7,10, 15],
              "max_depth": [8 , 10 , 12],
              "min_samples_leaf": [20, 40, 100],
              "max_leaf_nodes": [6, 9, 12],
              }


# scoring is set explicitly to negative MSE so candidates are ranked by the
# same metric used by the other model evaluations in this notebook (the
# GridSearchCV default for regressors is R^2); best_score_ is therefore a
# negative MSE. random_state makes tie-breaking inside the tree deterministic,
# and n_jobs=-1 runs the candidate fits in parallel.
grid_cv_dtm = GridSearchCV(DecisionTreeRegressor(random_state=42), param_grid,
                           scoring="neg_mean_squared_error", cv=5, n_jobs=-1)

grid_cv_dtm.fit(X_train,y_train)

In [21]:
# Report the best mean cross-validated score found by the search (in whatever
# metric the search was configured with) and the parameter set that achieved it.
print("Best score::{}".format(grid_cv_dtm.best_score_))
print("Best hyperparameters::\n{}".format(grid_cv_dtm.best_params_))

## AdaBoost with decision tree

In [None]:
from sklearn.ensemble import AdaBoostRegressor
# AdaBoost over a size-limited decision tree (500 rounds, learning rate 0.03).
# AdaBoost resamples the training set at every boosting round, so both the
# ensemble and its base tree are seeded to make the CV score reproducible.
adaptive_reg = AdaBoostRegressor(
    DecisionTreeRegressor(max_depth= 10, max_leaf_nodes= 12, min_samples_leaf= 20,
                          min_samples_split=10, random_state=42),
    n_estimators=500 ,learning_rate=0.03, random_state=42
)
score = rmse_cv(adaptive_reg)
print("Adaptive Boost score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))

In [None]:
# Grid search over AdaBoost's ensemble size and learning rate (seed fixed via
# the grid), ranked by negative MSE with 5-fold CV and all cores.
search_grid = {
    'n_estimators': [300, 600],
    'learning_rate': [.001, 0.003, 0.01, 0.03],
    'random_state': [42],
}
search = GridSearchCV(
    estimator=adaptive_reg,
    param_grid=search_grid,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
    cv=5,
)
search.fit(X_train, y_train)
for report_line in (
    "Best score::{}".format(search.best_score_),
    "Best hyperparameters::\n{}".format(search.best_params_),
):
    print(report_line)

## Gradient Boost methods

In [None]:
# Gradient boosting with Huber loss; evaluated with the shared rmse_cv helper.
GBoost = GradientBoostingRegressor(
    n_estimators=3000,
    learning_rate=0.05,
    max_depth=10,
    max_features='sqrt',
    min_samples_leaf=20,
    min_samples_split=10,
    loss='huber',
    random_state=5,
)
score = rmse_cv(GBoost)
gboost_report = "Gradient Boosting score: {:.4f} ({:.4f})\n".format(np.mean(score), np.std(score))
print(gboost_report)

In [22]:
# LightGBM regressor with small trees (num_leaves=5), many rounds, and row /
# feature subsampling seeded through bagging_seed / feature_fraction_seed.
# NOTE(review): device="gpu" requests LightGBM's GPU backend - presumably this
# needs a GPU-enabled LightGBM build and an accelerator attached to the
# session; confirm before running on CPU-only hardware.
model_lgb = lgb.LGBMRegressor(objective='regression',num_leaves=5,
                              learning_rate=0.06, n_estimators=4000,
                              max_bin = 55, bagging_fraction = 0.8,
                              bagging_freq = 5, feature_fraction = 0.2319,
                              feature_fraction_seed=9, bagging_seed=9,
                              min_data_in_leaf =6, min_sum_hessian_in_leaf = 11 , device="gpu")
# Cross-validate with the shared helper and report mean (std) of the fold RMSEs.
score = rmse_cv(model_lgb)
print("LGBM score: {:.4f} ({:.4f})\n" .format(score.mean(), score.std()))

In [23]:
# XGBoost regressor trained on the GPU histogram algorithm.
# `silent` was removed from XGBoost and `nthread` is a deprecated alias, so the
# supported equivalents are used instead: verbosity=0 (no log output) and
# n_jobs=-1 (all cores).
# NOTE(review): tree_method='gpu_hist' needs a CUDA-enabled XGBoost build;
# newer releases spell it tree_method='hist' with device='cuda' - confirm
# against the installed version before switching.
model_xgb = xgb.XGBRegressor(colsample_bytree=0.4603, gamma=0.0468,
                             learning_rate=0.05, max_depth=10,
                             min_child_weight=1.7817, n_estimators=3200,
                             reg_alpha=0.4640, reg_lambda=0.8571,
                             subsample=0.5213, verbosity=0,
                             random_state =7, n_jobs = -1 , tree_method='gpu_hist')
score = rmse_cv(model_xgb)
print("Xgboost score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))

In [25]:
from sklearn.metrics import mean_squared_error
from math import sqrt
# Fit XGBoost on the full training split and score it on the held-out test set.
model_xgb.fit(X_train , y_train)
predictions = model_xgb.predict(X_test)
# RMSE is measured on the target as stored in y_test, i.e. on log1p(price)
# given the earlier transform of df["price"] - not in original price units.
rmse = sqrt(mean_squared_error(y_test,predictions))
# Bare last expression so the notebook displays the value.
rmse

## Ensemble methods

### Weighted average of base models

In [None]:
from sklearn.ensemble import VotingRegressor
# Weighted average of four already-configured base models; the weights follow
# the order of the estimator list (GB=4, XGB=3, LGB=2, ENET=1).
base_estimators = [
    ("GB", GBoost),
    ("XGB", model_xgb),
    ("LGB", model_lgb),
    ("ENET", ENet),
]
averaged_models = VotingRegressor(
    base_estimators,
    weights=[4, 3, 2, 1],
    n_jobs=-1,
)

score = rmse_cv(averaged_models)
averaged_report = " Averaged base models score: {:.4f} ({:.4f})\n".format(score.mean(), score.std())
print(averaged_report)

### Stacking models

In [None]:
from sklearn.ensemble import StackingRegressor
# Meta-learner: gradient boosting (Huber loss) fitted on the base models'
# cross-validated predictions.
final_estimator = GradientBoostingRegressor(n_estimators=3000, learning_rate=0.05,
                                   max_depth=4, max_features='sqrt',
                                   min_samples_leaf=15, min_samples_split=10,
                                   loss='huber', random_state =5)
# Base learners: shallower / smaller variants of the XGBoost and LightGBM
# configurations, plus Elastic Net as a linear model.
# For XGBoost, the removed `silent` and deprecated `nthread` keywords are
# replaced by their supported equivalents (verbosity=0, n_jobs=-1).
# NOTE(review): both boosted models request GPU training (gpu_hist /
# device="gpu") - confirm the session has a GPU-enabled build of each library.
estimators = [
    ("XGB" , xgb.XGBRegressor(colsample_bytree=0.4603, gamma=0.0468,
                             learning_rate=0.05, max_depth=3,
                             min_child_weight=1.7817, n_estimators=2200,
                             reg_alpha=0.4640, reg_lambda=0.8571,
                             subsample=0.5213, verbosity=0,
                             random_state =7, n_jobs = -1 , tree_method='gpu_hist')),
    ("LGB" , lgb.LGBMRegressor(objective='regression',num_leaves=5,
                              learning_rate=0.05, n_estimators=720,
                              max_bin = 55, bagging_fraction = 0.8,
                              bagging_freq = 5, feature_fraction = 0.2319,
                              feature_fraction_seed=9, bagging_seed=9,
                              min_data_in_leaf =6, min_sum_hessian_in_leaf = 11 , device="gpu")),
    ("Enet" , ElasticNet(alpha=0.0005, l1_ratio=.9, random_state=3)),
]
reg = StackingRegressor(
     estimators=estimators,
     final_estimator=final_estimator , n_jobs=-1)

In [None]:
# Cross-validate the stacked ensemble with the shared helper and report the
# mean (std) of the fold RMSEs.
score = rmse_cv(reg)
print(" Stacked base models score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))