In [1]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [2]:
!pip install catboost

Collecting catboost
[?25l  Downloading https://files.pythonhosted.org/packages/52/39/128fff65072c8327371e3c594f3c826d29c85b21cb6485980353b168e0e4/catboost-0.24.2-cp36-none-manylinux1_x86_64.whl (66.1MB)
[K     |████████████████████████████████| 66.2MB 53kB/s 
Installing collected packages: catboost
Successfully installed catboost-0.24.2


In [3]:
import pandas as pd

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelBinarizer, StandardScaler
from sklearn.linear_model import LinearRegression, HuberRegressor, Lasso, Ridge, ElasticNet
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn_pandas import DataFrameMapper
from sklearn.metrics import mean_squared_error

from catboost import CatBoostRegressor

In [5]:
df = pd.read_csv('gdrive/My Drive/Colab Notebooks/data/train.csv')

In [6]:
df.head(1)

Unnamed: 0,id,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response
0,1,Male,44,1,28.0,0,> 2 Years,Yes,40454.0,26.0,217,1


In [9]:
df = df.drop(columns="id")

In [10]:
target = "Response"
X = df.drop(target, axis=1)
y = df[target]

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [14]:
mapper = DataFrameMapper([
    ("Gender", LabelBinarizer()),
    (["Age"], StandardScaler()),
    (["Driving_License"], StandardScaler()),
    (["Region_Code"], StandardScaler()),
    (["Previously_Insured"], StandardScaler()),
    ("Vehicle_Age", LabelBinarizer()),
    ("Vehicle_Damage", LabelBinarizer()),
    (["Annual_Premium"], StandardScaler()),
    (["Policy_Sales_Channel"], StandardScaler()),
    (["Vintage"], StandardScaler())],df_out=True)

In [15]:
Z_train = mapper.fit_transform(X_train)

In [16]:
Z_test = mapper.transform(X_test)

In [18]:
model = LinearRegression().fit(Z_train,y_train)
print("LinearRegression train score is " + str(model.score(Z_train,y_train)))
print("LinearRegression test score is " + str(model.score(Z_test,y_test)))

print("Mean squared error is " + str(mean_squared_error(y_test, model.predict(Z_test))**(1/2)))

LinearRegression train score is 0.1465940226198289
LinearRegression test score is 0.1504408390232953
Mean squared error is 0.3032335076743192


In [19]:
model = HuberRegressor().fit(Z_train,y_train)
print("HuberRegressor train score is " + str(model.score(Z_train,y_train)))
print("HuberRegressor test score is " + str(model.score(Z_test,y_test)))

print("Mean squared error is " + str(mean_squared_error(y_test, model.predict(Z_test))**(1/2)))

HuberRegressor train score is -0.13508884235532403
HuberRegressor test score is -0.13672428795065272
Mean squared error is 0.3507582912963718


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


In [20]:
model = Lasso().fit(Z_train,y_train)
print("Lasso train score is " + str(model.score(Z_train,y_train)))
print("Lasso test score is " + str(model.score(Z_test,y_test)))

print("Mean squared error is " + str(mean_squared_error(y_test, model.predict(Z_test))**(1/2)))

Lasso train score is 0.0
Lasso test score is -1.382410017258806e-05
Mean squared error is 0.3289905155361102


In [21]:
model = Ridge().fit(Z_train,y_train)
print("Ridge train score is " + str(model.score(Z_train,y_train)))
print("Ridge test score is " + str(model.score(Z_test,y_test)))

print("Mean squared error is " + str(mean_squared_error(y_test, model.predict(Z_test))**(1/2)))

Ridge train score is 0.14659402257305032
Ridge test score is 0.15044074636863514
Mean squared error is 0.30323352420995214


In [22]:
model = ElasticNet().fit(Z_train,y_train)
print("ElasticNet train score is " + str(model.score(Z_train,y_train)))
print("ElasticNet test score is " + str(model.score(Z_test,y_test)))

print("Mean squared error is " + str(mean_squared_error(y_test, model.predict(Z_test))**(1/2)))

ElasticNet train score is 0.0
ElasticNet test score is -1.382410017258806e-05
Mean squared error is 0.3289905155361102


In [23]:
model = RandomForestRegressor().fit(Z_train,y_train)
print("RandomForestRegressor train score is " + str(model.score(Z_train,y_train)))
print("RandomForestRegressor test score is " + str(model.score(Z_test,y_test)))

print("Mean squared error is " + str(mean_squared_error(y_test, model.predict(Z_test))**(1/2)))

RandomForestRegressor train score is 0.8767594355659685
RandomForestRegressor test score is 0.12989906648825744
Mean squared error is 0.3068776030398658


In [24]:
model = DecisionTreeRegressor().fit(Z_train,y_train)
print("DecisionTreeRegressor train score is " + str(model.score(Z_train,y_train)))
print("DecisionTreeRegressor test score is " + str(model.score(Z_test,y_test)))

print("Mean squared error is " + str(mean_squared_error(y_test, model.predict(Z_test))**(1/2)))

DecisionTreeRegressor train score is 0.9995109659311313
DecisionTreeRegressor test score is -0.6374944264449736
Mean squared error is 0.4209885569205423


In [25]:
model = AdaBoostRegressor().fit(Z_train,y_train)
print("AdaBoostRegressor train score is " + str(model.score(Z_train,y_train)))
print("AdaBoostRegressor test score is " + str(model.score(Z_test,y_test)))

print("Mean squared error is " + str(mean_squared_error(y_test, model.predict(Z_test))**(1/2)))

AdaBoostRegressor train score is -0.015650227308333875
AdaBoostRegressor test score is -0.005017630748027857
Mean squared error is 0.32981257955702753


In [26]:
model = CatBoostRegressor().fit(Z_train,y_train)
print("CatBoostRegressor train score is " + str(model.score(Z_train,y_train)))
print("CatBoostRegressor test score is " + str(model.score(Z_test,y_test)))

print("Mean squared error is " + str(mean_squared_error(y_test, model.predict(Z_test))**(1/2)))

Learning rate set to 0.106804
0:	learn: 0.3218531	total: 109ms	remaining: 1m 48s
1:	learn: 0.3172275	total: 150ms	remaining: 1m 15s
2:	learn: 0.3135141	total: 199ms	remaining: 1m 6s
3:	learn: 0.3103225	total: 246ms	remaining: 1m 1s
4:	learn: 0.3077504	total: 294ms	remaining: 58.5s
5:	learn: 0.3056792	total: 339ms	remaining: 56.1s
6:	learn: 0.3040465	total: 384ms	remaining: 54.5s
7:	learn: 0.3027018	total: 433ms	remaining: 53.7s
8:	learn: 0.3015721	total: 477ms	remaining: 52.6s
9:	learn: 0.3007063	total: 518ms	remaining: 51.3s
10:	learn: 0.3000043	total: 564ms	remaining: 50.8s
11:	learn: 0.2994047	total: 611ms	remaining: 50.3s
12:	learn: 0.2989016	total: 657ms	remaining: 49.9s
13:	learn: 0.2984619	total: 703ms	remaining: 49.5s
14:	learn: 0.2980605	total: 745ms	remaining: 48.9s
15:	learn: 0.2977532	total: 787ms	remaining: 48.4s
16:	learn: 0.2974388	total: 830ms	remaining: 48s
17:	learn: 0.2971988	total: 880ms	remaining: 48s
18:	learn: 0.2970329	total: 920ms	remaining: 47.5s
19:	learn: 0.

In [27]:
import tensorflow as tf
from tensorflow import keras

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Input, Dropout
from tensorflow.keras.utils import to_categorical

In [41]:
model = Sequential([
    Input(shape=(Z_train.shape[1],)),
    Dense(32, activation='elu'),
    Dropout(.05),
    Dense(16, activation='elu'),
    Dropout(.05),
    Dense(8, activation='elu'),
    Dropout(.05),
    Dense(1, activation='elu')
])

model.compile(loss='mae', optimizer='adam', metrics=[tf.keras.metrics.BinaryAccuracy()])

In [42]:
history = model.fit(Z_train, y_train,
                    validation_data=(Z_test, y_test),
                    epochs=10, batch_size=32,
                    verbose=2)

model.summary()

Epoch 1/10
8933/8933 - 11s - loss: 0.1248 - binary_accuracy: 0.8777 - val_loss: 0.1235 - val_binary_accuracy: 0.8765
Epoch 2/10
8933/8933 - 11s - loss: 0.1223 - binary_accuracy: 0.8777 - val_loss: 0.1235 - val_binary_accuracy: 0.8765
Epoch 3/10
8933/8933 - 11s - loss: 0.1223 - binary_accuracy: 0.8777 - val_loss: 0.1236 - val_binary_accuracy: 0.8765
Epoch 4/10
8933/8933 - 11s - loss: 0.1223 - binary_accuracy: 0.8777 - val_loss: 0.1235 - val_binary_accuracy: 0.8765
Epoch 5/10
8933/8933 - 11s - loss: 0.1223 - binary_accuracy: 0.8777 - val_loss: 0.1235 - val_binary_accuracy: 0.8765
Epoch 6/10
8933/8933 - 11s - loss: 0.1223 - binary_accuracy: 0.8777 - val_loss: 0.1236 - val_binary_accuracy: 0.8765
Epoch 7/10
8933/8933 - 11s - loss: 0.1223 - binary_accuracy: 0.8777 - val_loss: 0.1236 - val_binary_accuracy: 0.8765
Epoch 8/10
8933/8933 - 11s - loss: 0.1223 - binary_accuracy: 0.8777 - val_loss: 0.1235 - val_binary_accuracy: 0.8765
Epoch 9/10
8933/8933 - 11s - loss: 0.1223 - binary_accuracy: 0.8