# Kaggle

## First Look
* 400 samples
* Drop first five features for collinearity

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# preprocessing
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, train_test_split

# metrics
from sklearn.metrics import r2_score, mean_squared_error
import statsmodels.api as sm
from scipy import stats

# models
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import LinearSVR
from sklearn.neural_network import MLPRegressor

In [2]:
# Load the training set and use the `id` column as the row index.
data = pd.read_csv('train.csv').set_index('id')

In [55]:
# Separate the label from the predictors.
# `y` is kept as a one-column DataFrame (double brackets), not a Series.
y = data[['target']]
X = data.drop(columns=['target'])

## Changes

## Ridge model with f1, f3, f4, f5

In [56]:
# Keep only the four predictors used by this model.
# Selecting from `data` (rather than re-slicing `X`) means the result does not
# depend on what `X` happened to hold when the cell is run.
X = data[['f1', 'f3', 'f4', 'f5']]

In [89]:
# Hold out 15% of the rows for evaluation.
# A fixed seed makes the split -- and every score computed from it --
# reproducible across kernel restarts; without it each run yields a different split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.15, random_state=42)

In [90]:
# Model: standardise the inputs, expand them into degree-3 polynomial terms,
# then fit a ridge regression on the expanded features.
pipe = Pipeline(steps=[
    ('scaling', StandardScaler()),
    ('poly_features', PolynomialFeatures(degree=3)),
    ('reg', Ridge()),
])

# Only the ridge penalty is tuned here; candidates are scored by negated MSE
# under 5-fold cross-validation on the training split.
params = {
    'reg__alpha': [0.001, 0.01, 0.05, 0.1, 0.5] + list(range(1, 11)),
}
grid = GridSearchCV(
    estimator=pipe,
    param_grid=params,
    scoring='neg_mean_squared_error',
    cv=5,
)
grid.fit(X_train, y_train)

In [91]:
# Keep the pipeline that the grid search refit with the winning parameters;
# later cells use it for prediction.
best_ridge = grid.best_estimator_
# Display the parameter combination that scored best in cross-validation.
grid.best_params_

{'reg__alpha': 2}

In [92]:
# RMSE of the tuned pipeline on the held-out split.
y_test_pred = best_ridge.predict(X_test)
test_mse = mean_squared_error(y_test, y_test_pred)
np.sqrt(test_mse)

1.228849767910492

In [93]:
# RMSE on the training split, for comparison with the held-out RMSE.
y_train_pred = best_ridge.predict(X_train)
train_mse = mean_squared_error(y_train, y_train_pred)
np.sqrt(train_mse)

1.1541055922539745

In [94]:
# Read the competition test set, keep the same four predictors the model was
# fitted on, and attach the model's predictions as a `target` column.
testing = (
    pd.read_csv('test.csv', index_col='id')
    [['f1', 'f3', 'f4', 'f5']]
    .assign(target=lambda features: best_ridge.predict(features))
)
testing

Unnamed: 0_level_0,f1,f3,f4,f5,target
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
401,0.963814,-0.041354,-1.565599,1.737945,-34.736121
402,1.118852,0.311665,-1.129158,1.737983,-7.211435
403,1.063174,0.329134,0.857807,1.737917,-0.724986
404,1.176543,0.325012,-0.708241,1.737978,-4.438960
405,0.741057,0.503625,1.111256,1.737958,2.717496
...,...,...,...,...,...
1196,0.882169,0.421486,0.737417,1.737958,1.237348
1197,0.842735,0.391464,1.233258,1.737914,1.817717
1198,1.270291,0.778258,2.607416,1.737807,24.133697
1199,0.511655,0.397267,0.398529,1.737963,-3.018166


In [95]:
# Write the submission file: the `id` index plus the predicted `target` only.
testing.to_csv('submission.csv', columns=['target'])

## Ridge regression with f3, f4, f5

In [32]:
# Keep only the three predictors used by this model.
# Selecting from `data` (rather than re-slicing `X`) means the cell does not
# rely on `X` still containing these columns from an earlier section.
X = data[['f3', 'f4', 'f5']]

In [37]:
# Split with scikit-learn's default hold-out size.
# A fixed seed makes the split -- and every score computed from it --
# reproducible across kernel restarts; without it each run yields a different split.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [38]:
# Model: standardise the inputs, expand them into polynomial terms
# (degree chosen by the search below), then fit a ridge regression.
pipe = Pipeline(steps=[
    ('scaling', StandardScaler()),
    ('poly_features', PolynomialFeatures()),
    ('reg', Ridge()),
])

# Tune the polynomial degree and the ridge penalty jointly; candidates are
# scored by negated MSE under 10-fold cross-validation on the training split.
params = {
    'poly_features__degree': list(range(1, 6)),
    'reg__alpha': [0.001, 0.01, 0.05, 0.1, 0.5] + list(range(1, 11)),
}
grid = GridSearchCV(
    estimator=pipe,
    param_grid=params,
    scoring='neg_mean_squared_error',
    cv=10,
)
grid.fit(X_train, y_train)

# Keep the refit winner for the evaluation cells and show its parameters.
best_ridge = grid.best_estimator_
grid.best_params_

{'poly_features__degree': 3, 'reg__alpha': 2}

In [39]:
# RMSE of the tuned pipeline on the held-out split.
y_test_pred = best_ridge.predict(X_test)
test_mse = mean_squared_error(y_test, y_test_pred)
np.sqrt(test_mse)

2.9528424936523647

In [40]:
# RMSE on the training split, for comparison with the held-out RMSE.
y_train_pred = best_ridge.predict(X_train)
train_mse = mean_squared_error(y_train, y_train_pred)
np.sqrt(train_mse)

1.9321143299121353

In [41]:
# Inspect the fitted ridge coefficients, one per polynomial feature.
best_ridge.named_steps['reg'].coef_

array([[ 0.        ,  3.17874629,  1.15729085, -0.77440378, -1.70427584,
        -0.13818864,  0.01593255, -0.03481982,  0.09780445, -0.12029859,
         0.52247503,  0.04722708, -0.12344303,  0.04348462,  0.04817923,
        -0.03827845,  1.10709289, -0.03836948, -0.06286722, -1.81392225]])