In [125]:
import numpy as np
import pandas as pd

# pipeline utilities
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# preprocessing
from sklearn.preprocessing import PowerTransformer, MinMaxScaler, OneHotEncoder # tuning: PolynomialFeatures
from sklearn.metrics import make_scorer, mean_squared_error

# validation
from sklearn.model_selection import KFold, cross_val_score

# models
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression, ElasticNet, ElasticNetCV, SGDRegressor, BayesianRidge, ARDRegression, RANSACRegressor
from sklearn.svm import SVR # todo define kernel
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor 

In [110]:
# Load data
df = pd.read_csv("../input/30-days-of-ml/train.csv", index_col=0)
df

Unnamed: 0_level_0,cat0,cat1,cat2,cat3,cat4,cat5,cat6,cat7,cat8,cat9,...,cont5,cont6,cont7,cont8,cont9,cont10,cont11,cont12,cont13,target
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,B,B,B,C,B,B,A,E,C,N,...,0.400361,0.160266,0.310921,0.389470,0.267559,0.237281,0.377873,0.322401,0.869850,8.113634
2,B,B,A,A,B,D,A,F,A,O,...,0.533087,0.558922,0.516294,0.594928,0.341439,0.906013,0.921701,0.261975,0.465083,8.481233
3,A,A,A,C,B,D,A,D,A,F,...,0.650609,0.375348,0.902567,0.555205,0.843531,0.748809,0.620126,0.541474,0.763846,8.364351
4,B,B,A,C,B,D,A,E,C,K,...,0.668980,0.239061,0.732948,0.679618,0.574844,0.346010,0.714610,0.540150,0.280682,8.049253
6,A,A,A,C,B,D,A,E,A,N,...,0.686964,0.420667,0.648182,0.684501,0.956692,1.000773,0.776742,0.625849,0.250823,7.972260
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
499993,B,B,A,A,B,D,A,E,A,I,...,0.769792,0.450538,0.934360,1.005077,0.853726,0.422541,1.063463,0.697685,0.506404,7.945605
499996,A,B,A,C,B,B,A,E,E,F,...,0.528056,0.508502,0.358247,0.257825,0.433525,0.301015,0.268447,0.577055,0.823611,7.326118
499997,B,B,A,C,B,C,A,E,G,F,...,0.688747,0.372425,0.364936,0.383224,0.551825,0.661007,0.629606,0.714139,0.245732,8.706755
499998,A,B,A,C,B,B,A,E,E,I,...,0.344404,0.424243,0.382028,0.468819,0.351036,0.288768,0.611169,0.380254,0.332030,7.229569


In [111]:
# work on a random sample to speed-up process
df_sample = df.sample(5000, random_state=123)
X = df_sample.drop(['target'], axis=1)
y = df_sample['target']

# X = df.drop(['target'], axis=1)
# y = df['target']
print(X.shape)
print(y.shape)

(5000, 24)
(5000,)


In [112]:
# Select numerical features
numFeatures = X.select_dtypes(exclude=['object']).columns
# Select Categorical features
catFeatures = X.select_dtypes(include=['object']).columns
print(numFeatures)
print(catFeatures)

Index(['cont0', 'cont1', 'cont2', 'cont3', 'cont4', 'cont5', 'cont6', 'cont7',
       'cont8', 'cont9', 'cont10', 'cont11', 'cont12', 'cont13'],
      dtype='object')
Index(['cat0', 'cat1', 'cat2', 'cat3', 'cat4', 'cat5', 'cat6', 'cat7', 'cat8',
       'cat9'],
      dtype='object')


In [126]:
# Pipeline for transforming categorical variables
catTransformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Pipeline for scaling numerical variables
numTransformer = Pipeline(steps=[
    ('scaler', MinMaxScaler()),
    ('power_transformer', PowerTransformer()),
])

# Create the preprocessing engine
preprocessor = ColumnTransformer(
    transformers=[
        ('numeric', numTransformer, numFeatures),
        ('categoric', catTransformer, catFeatures),
    ]
)

# Create a list of the models, with default parameters
models = [
    DummyRegressor(strategy='mean'),
    LinearRegression(), 
    ElasticNetCV(l1_ratio=[.1, .5, .7, .9, .95, .99, 1], random_state=123), 
    SGDRegressor(random_state=123), 
    BayesianRidge(), 
    ARDRegression(), 
    RANSACRegressor(base_estimator=ElasticNet(alpha=0.005, l1_ratio=1), random_state=123),
    SVR(kernel='linear'),
    SVR(kernel='poly'),
    SVR(kernel='rbf'),
    SVR(kernel='sigmoid'),
    KNeighborsRegressor(),
    DecisionTreeRegressor(random_state=123),
    RandomForestRegressor(random_state=123), 
    AdaBoostRegressor(base_estimator=ElasticNet(alpha=0.005, l1_ratio=1), random_state=123), 
    GradientBoostingRegressor(random_state=123), 
    XGBRegressor(random_state=123),
]

In [127]:
# perform cross-validation on each model

scores = {}

scorer = make_scorer(score_func=mean_squared_error, greater_is_better=False)

for model in models:    
    estimator = Pipeline(
        steps=[
            ('preprocessor', preprocessor),
            ('model', model)
        ]
    )
    kf = KFold(n_splits=5)
    results = cross_val_score(estimator, X, y, cv=kf, scoring=scorer)

    print(model.__class__.__name__)
    print(f"model score: {abs(results.mean()):.4f}")
    
    scores[model.__class__.__name__] = abs(results.mean())

DummyRegressor
model score: 0.5640
LinearRegression
model score: 105988386921871687680.0000
ElasticNetCV
model score: 0.5565
SGDRegressor
model score: 0.5704
BayesianRidge
model score: 0.5569
ARDRegression
model score: 0.5570
RANSACRegressor
model score: 0.6486
SVR
model score: 0.5661
SVR
model score: 0.6335
SVR
model score: 0.5907
SVR
model score: 458.6612
KNeighborsRegressor
model score: 0.6492
DecisionTreeRegressor
model score: 1.1378
RandomForestRegressor
model score: 0.5615
AdaBoostRegressor
model score: 0.7421
GradientBoostingRegressor
model score: 0.5562
XGBRegressor
model score: 0.6100


In [129]:
# print scores
sorted_scores = {m: s for m, s in sorted(scores.items(), key=lambda item: item[1])}
for model, score in sorted_scores.items():
    print(f'{model} --> {score}')

GradientBoostingRegressor --> 0.5562108178241163
ElasticNetCV --> 0.5564595600329934
BayesianRidge --> 0.5568741317933983
ARDRegression --> 0.5570491582991994
RandomForestRegressor --> 0.561482477221702
DummyRegressor --> 0.5640294081883237
SGDRegressor --> 0.5704233518544323
XGBRegressor --> 0.610037561685146
RANSACRegressor --> 0.6485560496753584
KNeighborsRegressor --> 0.6492486073878471
AdaBoostRegressor --> 0.7420632107228471
DecisionTreeRegressor --> 1.1378460623056512
SVR --> 458.66122050520255
LinearRegression --> 1.0598838692187169e+20


Linear Regression does not find usable coefficients without regularization so we discard.

We keep models that had better scores than the dummy model plus we keep XGBoost for further tuning.

Model list for next round:
- GradientBoostingRegressor 
- ElasticNetCV
- BayesianRidge
- ARDRegression
- RandomForestRegressor
- SVR with linear kernel
- XGBRegressor

## Make submission using winner of this round: GradientBoostingRegressor

In [131]:
# load test data
test_df = pd.read_csv("../input/30-days-of-ml/test.csv", index_col=0)
# split dataset
X_train = df.drop(['target'], axis=1)
y_train = df['target']
X_test = test_df
# train model
model = GradientBoostingRegressor(random_state=123)
estimator = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('model', model)
    ]
)
estimator.fit(X_train, y_train)
# predict
y_pred = estimator.predict(X_test)
# save the predictions to a CSV file
output = pd.DataFrame({'Id': X_test.index,
                       'target': y_pred})
output.to_csv('submission.csv', index=False)