###Importing necessary libraries

In [4]:
#installing optuna
!pip install --quiet optuna
!pip install catboost

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [26]:
import warnings
warnings.filterwarnings("ignore")
#data manipulation libraries
import numpy as np 
import pandas as pd 
from tqdm import tqdm
#visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns
#model building libraries
from sklearn.model_selection import cross_val_score, KFold, StratifiedKFold, train_test_split, cross_val_predict
import lightgbm as lgb
from xgboost import XGBClassifier
import xgboost as xgb
from catboost import CatBoostRegressor
from sklearn.preprocessing import StandardScaler


QT = True # True if wish to implement quartile transformation to data
BASIC = True # True if wish to use basic hyperparameters for models
RANDOM = 42
TUNE = False # Whether to tune hyperparameters or not

###Importing the dataset

In [6]:
#mounting the drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [7]:
%cd /content/drive/MyDrive/hackathon/cat boost and lightgbm/catboost

/content/drive/MyDrive/hackathon/cat boost and lightgbm/catboost


In [8]:
# # Authenticate and create the PyDrive client.
# auth.authenticate_user()
# gauth = GoogleAuth()
# gauth.credentials = GoogleCredentials.get_application_default()
# drive = GoogleDrive(gauth)

######Train data

In [9]:
#reading the train datset
train = pd.read_csv('train.csv')

In [10]:
#confirming the imported dataset
train.head()

Unnamed: 0,ID,target,48df886f9,0deb4b6a8,34b15f335,a8cb14b00,2f0771a37,30347e683,d08d1fbe3,6ee66e115,...,3ecc09859,9281abeea,8675bec0b,3a13ed79a,f677d4d13,71b203550,137efaa80,fb36b89d9,7e293fbaf,9fc776466
0,000d6aaf2,38000000.0,0.0,0,0.0,0,0,0,0,0,...,0.0,0.0,0.0,0,0,0,0,0,0,0
1,000fbd867,600000.0,0.0,0,0.0,0,0,0,0,0,...,0.0,0.0,0.0,0,0,0,0,0,0,0
2,0027d6b71,10000000.0,0.0,0,0.0,0,0,0,0,0,...,0.0,0.0,0.0,0,0,0,0,0,0,0
3,0028cbf45,2000000.0,0.0,0,0.0,0,0,0,0,0,...,0.0,0.0,0.0,0,0,0,0,0,0,0
4,002a68644,14400000.0,0.0,0,0.0,0,0,0,0,0,...,0.0,0.0,0.0,0,0,0,0,0,0,0


- The  train dataset has been imported

######Test data

###Data observations and sanity checks

**Shape of the datasets**

In [11]:
#shape of train dataset
train.shape

(4459, 4993)

- The dataset has 4459 rows and 4993 columns

**Datatypes of the columns**

In [12]:
#checking datatypes of train columns
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4459 entries, 0 to 4458
Columns: 4993 entries, ID to 9fc776466
dtypes: float64(1845), int64(3147), object(1)
memory usage: 169.9+ MB


- There are 1845 columns with float data types, 3147 columns with int datatypes and 1 object datatype

**Checking missing values**

In [13]:
#checking missing values of train columns
train.isnull().sum()

ID           0
target       0
48df886f9    0
0deb4b6a8    0
34b15f335    0
            ..
71b203550    0
137efaa80    0
fb36b89d9    0
7e293fbaf    0
9fc776466    0
Length: 4993, dtype: int64

- There are no missing values in the train dataset

**Duplicated values**

In [14]:
#checking duplicated values on the train dataset
train.duplicated().sum()

0

**Checking statistical summery**

In [15]:
#checking statistical summery
train.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
target,4459.0,5.944923e+06,8.234312e+06,30000.0,600000.0,2260000.0,8000000.0,40000000.0
48df886f9,4459.0,1.465493e+04,3.893298e+05,0.0,0.0,0.0,0.0,20000000.0
0deb4b6a8,4459.0,1.390895e+03,6.428302e+04,0.0,0.0,0.0,0.0,4000000.0
34b15f335,4459.0,2.672245e+04,5.699652e+05,0.0,0.0,0.0,0.0,20000000.0
a8cb14b00,4459.0,4.530164e+03,2.359124e+05,0.0,0.0,0.0,0.0,14800000.0
...,...,...,...,...,...,...,...,...
71b203550,4459.0,1.213809e+05,4.720709e+06,0.0,0.0,0.0,0.0,301312000.0
137efaa80,4459.0,3.573451e+04,1.614622e+06,0.0,0.0,0.0,0.0,106420000.0
fb36b89d9,4459.0,3.123741e+05,4.318501e+06,0.0,0.0,0.0,0.0,140000000.0
7e293fbaf,4459.0,9.219960e+04,1.635993e+06,0.0,0.0,0.0,0.0,61768000.0


- Apart from the target column, other columns do not have the 25, 50 and 75 percent value.

###EDA

###Feature engineerning

In [16]:
#copying the datasets
train11 = train.copy()
train11.head()

Unnamed: 0,ID,target,48df886f9,0deb4b6a8,34b15f335,a8cb14b00,2f0771a37,30347e683,d08d1fbe3,6ee66e115,...,3ecc09859,9281abeea,8675bec0b,3a13ed79a,f677d4d13,71b203550,137efaa80,fb36b89d9,7e293fbaf,9fc776466
0,000d6aaf2,38000000.0,0.0,0,0.0,0,0,0,0,0,...,0.0,0.0,0.0,0,0,0,0,0,0,0
1,000fbd867,600000.0,0.0,0,0.0,0,0,0,0,0,...,0.0,0.0,0.0,0,0,0,0,0,0,0
2,0027d6b71,10000000.0,0.0,0,0.0,0,0,0,0,0,...,0.0,0.0,0.0,0,0,0,0,0,0,0
3,0028cbf45,2000000.0,0.0,0,0.0,0,0,0,0,0,...,0.0,0.0,0.0,0,0,0,0,0,0,0
4,002a68644,14400000.0,0.0,0,0.0,0,0,0,0,0,...,0.0,0.0,0.0,0,0,0,0,0,0,0


- The dataset has been copied.

###Defining the dependent and independent variable

In [17]:
#dropping ID COLUMN
train11 = train11.drop(['ID'], axis = 1)
train11.head()

Unnamed: 0,target,48df886f9,0deb4b6a8,34b15f335,a8cb14b00,2f0771a37,30347e683,d08d1fbe3,6ee66e115,20aa07010,...,3ecc09859,9281abeea,8675bec0b,3a13ed79a,f677d4d13,71b203550,137efaa80,fb36b89d9,7e293fbaf,9fc776466
0,38000000.0,0.0,0,0.0,0,0,0,0,0,0.0,...,0.0,0.0,0.0,0,0,0,0,0,0,0
1,600000.0,0.0,0,0.0,0,0,0,0,0,2200000.0,...,0.0,0.0,0.0,0,0,0,0,0,0,0
2,10000000.0,0.0,0,0.0,0,0,0,0,0,0.0,...,0.0,0.0,0.0,0,0,0,0,0,0,0
3,2000000.0,0.0,0,0.0,0,0,0,0,0,0.0,...,0.0,0.0,0.0,0,0,0,0,0,0,0
4,14400000.0,0.0,0,0.0,0,0,0,0,0,2000000.0,...,0.0,0.0,0.0,0,0,0,0,0,0,0


In [18]:
#scale numerical features 
x = train11.drop(columns=['target'], axis = 1)
y = train11.pop('target')

# define standard scaler
scaler = StandardScaler()

# transform data
x = scaler.fit_transform(x)


###splitting the dataset

In [19]:
#splitting the dataset
x_train,x_val,y_train, y_val = train_test_split( x, y, 
                                                test_size = 0.3, random_state = 1 )

In [20]:
#getting the rows on the train and validation set
print('The shape of the train dataset is: ', x_train.shape)
print('The shape of the validation dataset is: ', x_val.shape)

The shape of the train dataset is:  (3121, 4991)
The shape of the validation dataset is:  (1338, 4991)


- The train set has 3121 rows while the validation set has 1338 rows. The number of columns is constant

###XGB Model

In [28]:
def run_xgb(x_train, y_train, x_val, y_val):
    params = {'objective': 'reg:linear', 
          'eval_metric': 'rmse',
          'eta': 0.001,
          'max_depth': 10, 
          'subsample': 0.6, 
          'colsample_bytree': 0.6,
          'alpha':0.001,
          'random_state': 42, 
          'silent': True}
    
    train_data = xgb.DMatrix(x_train, y_train)
    val_data = xgb.DMatrix(x_val, y_val)
    
    watchlist = [(train_data, 'train'), (val_data, 'valid')]
    
    model_xgb = xgb.train(params, train_data, 2000, watchlist, maximize=False, early_stopping_rounds = 100, verbose_eval=100)
    
    
    return  model_xgb

In [29]:
model_xgb = run_xgb(x_train,y_train,x_val,y_val)


Parameters: { "silent" } are not used.

[0]	train-rmse:10324993.51228	valid-rmse:9729679.98162
[100]	train-rmse:9800298.09377	valid-rmse:9302640.60444
[200]	train-rmse:9330067.79510	valid-rmse:8929436.90588
[300]	train-rmse:8909255.91868	valid-rmse:8606101.75284
[400]	train-rmse:8535620.81087	valid-rmse:8329421.73888
[500]	train-rmse:8203558.86792	valid-rmse:8095433.41056
[600]	train-rmse:7906639.35756	valid-rmse:7894469.87821
[700]	train-rmse:7640941.48040	valid-rmse:7722057.26339
[800]	train-rmse:7404703.76280	valid-rmse:7575915.91059
[900]	train-rmse:7189443.46491	valid-rmse:7451007.43190
[1000]	train-rmse:6997503.87358	valid-rmse:7345998.18850
[1100]	train-rmse:6819301.85294	valid-rmse:7258458.33849
[1200]	train-rmse:6664068.36210	valid-rmse:7184980.09808
[1300]	train-rmse:6520170.55412	valid-rmse:7123565.33107
[1400]	train-rmse:6387793.91093	valid-rmse:7072354.52594
[1500]	train-rmse:6270430.68305	valid-rmse:7029311.35616
[1600]	train-rmse:6161878.14053	valid-rmse:6993472.16803
[1

- The rsme of the train set is 5801024.92812 while the rsme of the validation set is 6902240.76964

###Light GradeintBoost

In [37]:
def run_lgb(x_train, y_train, x_val, y_val):
    params = {
        "objective" : "regression",
        "metric" : "rmse",
        "num_leaves" : 40,
        "learning_rate" : 0.004,
        "bagging_fraction" : 0.6,
        "feature_fraction" : 0.6,
        "bagging_frequency" : 6,
        "bagging_seed" : 42,
        "verbosity" : -1,
        "seed": 42
    }
    
    lgtrain = lgb.Dataset(x_train, label=y_train)
    lgval = lgb.Dataset(x_val, label=y_val)
    evals_result = {}
   
    

In [36]:
evals_result = run_lgb(x_train, y_train, x_val, y_val)

Training until validation scores don't improve for 100 rounds
[150]	training's rmse: 7.14686e+06	valid_1's rmse: 7.19519e+06
[300]	training's rmse: 6.39325e+06	valid_1's rmse: 6.90302e+06
[450]	training's rmse: 5.87498e+06	valid_1's rmse: 6.7646e+06
[600]	training's rmse: 5.47762e+06	valid_1's rmse: 6.71839e+06
[750]	training's rmse: 5.15324e+06	valid_1's rmse: 6.70751e+06
Early stopping, best iteration is:
[753]	training's rmse: 5.14725e+06	valid_1's rmse: 6.70731e+06


- The rsme of the train set is 5.14725e+06 while the rsme of the validation set is  6.70731e+06
- This is an improvement compared to XG boost model