In [None]:
import xgboost as xg
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd

# Data Visualization Libraries
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.metrics import mean_squared_error as MSE


In [3]:
# Load the data
dataset = pd.read_csv("data_train_mod.csv")
train_X, train_y = dataset.iloc[:, 1:-1], dataset.iloc[:, -1]

test = pd.read_csv("data_test_mod.csv")
test_X = test.iloc[:,1:]


In [57]:
# Instantiation
xgb_r = xg.XGBRegressor(objective='reg:squarederror',
                        n_estimators=5, seed=100)

# Fitting the model
xgb_r.fit(train_X, train_y)


XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
             gamma=0, gpu_id=-1, importance_type=None,
             interaction_constraints='', learning_rate=0.300000012,
             max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
             monotone_constraints='()', n_estimators=5, n_jobs=8,
             num_parallel_tree=1, predictor='auto', random_state=100,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=100,
             subsample=1, tree_method='exact', validate_parameters=1,
             verbosity=None)

In [58]:

# Predict the model
pred = xgb_r.predict(test_X)


In [59]:
pred

array([0.12367173, 0.10616723, 0.13469036, ..., 0.12832355, 0.12208302,
       0.13932623], dtype=float32)

In [60]:
ans = pd.DataFrame({'index': test.iloc[:, 0], 'y': pred})

In [61]:
ans.describe()

Unnamed: 0,index,y
count,22605.0,22605.0
mean,33908.0,0.181038
std,6525.645753,0.114384
min,22606.0,0.070505
25%,28257.0,0.123672
50%,33908.0,0.144076
75%,39559.0,0.175733
max,45210.0,0.915011


In [62]:
ans.to_csv("submission.csv", index= False)

In [9]:
# evaluate an xgboost regression model on the housing dataset
from numpy import absolute
from pandas import read_csv
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from xgboost import XGBRegressor

In [44]:
dataset = pd.read_csv("data_train_mod.csv")
X, y = dataset.iloc[:, 1:-1], dataset.iloc[:, -1]

test = pd.read_csv("data_test_mod.csv")
test_X = test.iloc[:,1:]

In [45]:
model = XGBRegressor()
# define model evaluation method
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
# evaluate model
scores = cross_val_score(
    model, X, y, scoring='neg_mean_absolute_error', cv=cv, n_jobs=-1)
# force scores to be positive
scores = absolute(scores)
print('Mean MAE: %.3f (%.3f)' % (scores.mean(), scores.std()))


Mean MAE: 0.175 (0.005)


In [46]:
model = XGBRegressor()
# fit model
model.fit(X, y)
ans = model.predict(test_X)


In [47]:
ans

array([0.07007038, 0.00324776, 0.1698162 , ..., 0.09992715, 0.03206109,
       0.04611201], dtype=float32)

In [19]:
ans.min()

-0.4380621

In [1]:
from sklearn import linear_model
model = linear_model.LinearRegression()


In [4]:
model.fit(train_X, train_y)


LinearRegression()

In [5]:
pred_y = model.predict(test_X)

In [6]:
ans = pd.DataFrame({'index': test.iloc[:, 0], 'y': pred_y})
ans.to_csv("submission_linear_regression.csv", index= False)

In [42]:
from sklearn import ensemble

In [10]:
params = {
    "n_estimators": 500,
    "max_depth": 4,
    "min_samples_split": 5,
    "learning_rate": 0.01,
    "loss": "squared_error"
}


In [11]:
reg = ensemble.GradientBoostingRegressor(**params)
reg.fit(train_X, train_y)

pred_y_gbr = reg.predict(test_X)    


In [40]:
grid = {
    "n_estimators": [500, 1000, 1500],
    "max_depth": [4, 5, 6, 9],
    "min_samples_split": [5, 10, 15],
    "learning_rate": [0.01, 0.05, 0.1],
    "loss": ["huber", "squared_error"]
}

In [45]:
gbr = ensemble.GradientBoostingRegressor()

In [47]:
from sklearn.model_selection import GridSearchCV
cv = GridSearchCV(gbr,grid,cv=5)
cv.fit(train_X,train_y.ravel())

In [12]:
ans_gbr = pd.DataFrame({'index': test.iloc[:, 0], 'y': pred_y_gbr})
ans_gbr.to_csv("submission_gbr.csv", index= False) 

In [25]:

import catboost as cb
from sklearn.metrics import classification_report


In [15]:
X_train, X_test, y_train, y_test = train_test_split(train_X,
                                                    train_y,
                                                    test_size=0.2,
                                                    random_state=101,
                                                    stratify=train_y)


In [19]:
categorical_indicies = ['job', 'marital', 'education', 'default', 'housing','loan', 'contact', 'poutcome']

In [20]:
train_dataset = cb.Pool(X_train, y_train,
                        cat_features=categorical_indicies)
test_dataset = cb.Pool(X_test, y_test,
                       cat_features=categorical_indicies)


In [35]:
# model = cb.CatBoostClassifier(loss_function='Logloss',
#                               eval_metric='Accuracy')
model = cb.CatBoostRegressor(loss_function='RMSE')


In [36]:
grid = {'learning_rate': [0.03, 0.1],
        'depth': [4, 6, 10],
        'l2_leaf_reg': [1, 3, 5, ],
        'iterations': [50, 100, 150]}


In [37]:
model.grid_search(grid,train_dataset)

0:	learn: 0.3369151	test: 0.3414676	best: 0.3414676 (0)	total: 47.6ms	remaining: 2.33s
1:	learn: 0.3349349	test: 0.3394910	best: 0.3394910 (1)	total: 96.3ms	remaining: 2.31s
2:	learn: 0.3329093	test: 0.3375139	best: 0.3375139 (2)	total: 132ms	remaining: 2.07s
3:	learn: 0.3310499	test: 0.3356755	best: 0.3356755 (3)	total: 166ms	remaining: 1.91s
4:	learn: 0.3291758	test: 0.3338558	best: 0.3338558 (4)	total: 195ms	remaining: 1.75s
5:	learn: 0.3273555	test: 0.3321069	best: 0.3321069 (5)	total: 216ms	remaining: 1.58s
6:	learn: 0.3256897	test: 0.3305019	best: 0.3305019 (6)	total: 235ms	remaining: 1.44s
7:	learn: 0.3242215	test: 0.3290688	best: 0.3290688 (7)	total: 267ms	remaining: 1.4s
8:	learn: 0.3227355	test: 0.3275993	best: 0.3275993 (8)	total: 364ms	remaining: 1.66s
9:	learn: 0.3213422	test: 0.3262669	best: 0.3262669 (9)	total: 449ms	remaining: 1.8s
10:	learn: 0.3200133	test: 0.3249779	best: 0.3249779 (10)	total: 524ms	remaining: 1.86s
11:	learn: 0.3187349	test: 0.3237334	best: 0.3237334

{'params': {'depth': 10,
  'l2_leaf_reg': 5,
  'iterations': 150,
  'learning_rate': 0.1},
 'cv_results': defaultdict(list,
             {'iterations': [0,
               1,
               2,
               3,
               4,
               5,
               6,
               7,
               8,
               9,
               10,
               11,
               12,
               13,
               14,
               15,
               16,
               17,
               18,
               19,
               20,
               21,
               22,
               23,
               24,
               25,
               26,
               27,
               28,
               29,
               30,
               31,
               32,
               33,
               34,
               35,
               36,
               37,
               38,
               39,
               40,
               41,
               42,
               43,
               44,
               45

In [38]:
pred_cb = model.predict(test_X)


In [39]:
ans_cb = pd.DataFrame({'index': test.iloc[:, 0], 'y': pred_cb})
ans_cb.to_csv("submission_cb.csv", index= False)