# Imports

In [3]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
%matplotlib inline

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV

In [5]:
from sklearn.linear_model import HuberRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, Ridge, LassoLars
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import BaggingRegressor

In [11]:
rs = 101 # random state

# Data Load

In [7]:
train = pd.read_csv('train (1).csv')
test = pd.read_csv('test (1).csv')

In [9]:
X = train.drop(['radius','id'],axis=1)
y = train.radius

# Trying different models

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.25,
                                                   random_state=rs)

## Linear Regression

In [13]:
lr = LinearRegression(n_jobs=-1)

In [14]:
lr.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=-1, normalize=False)

In [15]:
y_pred_lr = lr.predict(X_test)
print(np.sqrt(mean_squared_error(y_test, y_pred_lr)))

0.216593376451


## Ridge

In [16]:
rg = Ridge()

In [17]:
rg.fit(X_train, y_train)

Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)

In [18]:
y_pred_rg = rg.predict(X_test)
print(np.sqrt(mean_squared_error(y_test, y_pred_rg)))

0.216593376452


### Parametrization for Ridge

In [31]:
rg1 = Ridge(alpha=0.05, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=101, solver='auto', tol=0.001)

In [32]:
rg.fit(X_train, y_train)

Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)

In [33]:
y_pred_rg = rg.predict(X_test)
print(np.sqrt(mean_squared_error(y_test, y_pred_rg)))

0.216593376452


## LassoLars

In [37]:
ll = LassoLars(alpha=0.5)

In [38]:
ll.fit(X_train, y_train)

LassoLars(alpha=0.5, copy_X=True, eps=2.2204460492503131e-16,
     fit_intercept=True, fit_path=True, max_iter=500, normalize=True,
     positive=False, precompute='auto', verbose=False)

In [39]:
y_pred_ll = ll.predict(X_test)
print(np.sqrt(mean_squared_error(y_test, y_pred_ll)))

0.217256986367


## RandomForestRegressor

In [46]:
rf = RandomForestRegressor(random_state=rs, n_jobs=-1)

In [47]:
rf.fit(X_train,y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=-1,
           oob_score=False, random_state=101, verbose=0, warm_start=False)

In [48]:
y_pred_rf = rf.predict(X_test)
print(np.sqrt(mean_squared_error(y_test, y_pred_rf)))

0.139710745281


### GridSearchCV for RFR

In [49]:
params = {'n_estimators': [10,100,250,500,1000],
          'min_samples_split': [2,3,4],
          'max_features': [0.5,0.6,0.7,0.8,0.9,1],
          'bootstrap': [True,False]
}

In [50]:
clf = GridSearchCV(estimator=rf, param_grid=params, 
                   n_jobs=-1, cv=3, scoring="mean_squared_error", 
                   verbose=1)

In [51]:
clf.fit(X_train, y_train)

Fitting 3 folds for each of 180 candidates, totalling 540 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   29.8s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  5.5min
[Parallel(n_jobs=-1)]: Done 540 out of 540 | elapsed:  7.0min finished


GridSearchCV(cv=3, error_score='raise',
       estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=-1,
           oob_score=False, random_state=101, verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'n_estimators': [10, 100, 250, 500, 1000], 'min_samples_split': [2, 3, 4], 'max_features': [0.5, 0.6, 0.7, 0.8, 0.9, 1], 'bootstrap': [True, False]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='mean_squared_error', verbose=1)

In [52]:
y_pred_gsrf = clf.predict(X_test)
print(np.sqrt(mean_squared_error(y_test, y_pred_gsrf)))

0.131715610499


{'bootstrap': True,
 'max_features': 0.7,
 'min_samples_split': 4,
 'n_estimators': 500}

## GradientBoostingRegressor

In [59]:
gbr = GradientBoostingRegressor(random_state=rs,criterion='mse')

In [60]:
gbr.fit(X_train, y_train)

GradientBoostingRegressor(alpha=0.9, criterion='mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=3, max_features=None,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=100, presort='auto', random_state=101,
             subsample=1.0, verbose=0, warm_start=False)

In [61]:
y_pred_gbr = gbr.predict(X_test)
print(np.sqrt(mean_squared_error(y_test, y_pred_gbr)))

0.160707780698


### GridSearch for GBR

In [72]:
params = {'loss':['ls','huber'],
          'n_estimators': [100,500],
          'min_samples_split': [2,4,6],
          'max_features': [0.7,0.9,1],
          'max_depth': [3,5,7],}

In [73]:
gsgbr = GridSearchCV(estimator=gbr, param_grid=params, 
                   n_jobs=-1, cv=3, scoring="mean_squared_error", 
                   verbose=1)

In [74]:
gsgbr.fit(X_train, y_train)

Fitting 3 folds for each of 108 candidates, totalling 324 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    7.0s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   29.3s
[Parallel(n_jobs=-1)]: Done 324 out of 324 | elapsed:  2.2min finished


GridSearchCV(cv=3, error_score='raise',
       estimator=GradientBoostingRegressor(alpha=0.9, criterion='mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=3, max_features=None,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=100, presort='auto', random_state=101,
             subsample=1.0, verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'loss': ['ls', 'huber'], 'n_estimators': [100, 500], 'min_samples_split': [2, 4, 6], 'max_features': [0.7, 0.9, 1], 'max_depth': [3, 5, 7]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='mean_squared_error', verbose=1)

In [75]:
y_pred_gsgbr = gsgbr.predict(X_test)
print(np.sqrt(mean_squared_error(y_test, y_pred_gsgbr)))

0.136469031938


## BaggingRegressor

In [82]:
br = BaggingRegressor(random_state=rs, n_jobs=-1)

In [83]:
br.fit(X_train, y_train)

BaggingRegressor(base_estimator=None, bootstrap=True,
         bootstrap_features=False, max_features=1.0, max_samples=1.0,
         n_estimators=10, n_jobs=-1, oob_score=False, random_state=101,
         verbose=0, warm_start=False)

In [84]:
y_pred_br = br.predict(X_test)
print(np.sqrt(mean_squared_error(y_test, y_pred_br)))

0.140381789061


### GridSearch for BR

In [87]:
params = {'max_samples': [0.7,0.9,1],
          'n_estimators': [100,200,500],
          'max_features': [0.7,0.9,1],
          'bootstrap_features': [True, False],
          'bootstrap':[True, False]}

In [88]:
gsbr = GridSearchCV(estimator=br, param_grid=params, 
                   n_jobs=-1, cv=3, scoring="mean_squared_error", 
                   verbose=1)

In [89]:
gsbr.fit(X_train, y_train)

Fitting 3 folds for each of 108 candidates, totalling 324 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   15.1s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   53.6s
[Parallel(n_jobs=-1)]: Done 324 out of 324 | elapsed:  1.5min finished


GridSearchCV(cv=3, error_score='raise',
       estimator=BaggingRegressor(base_estimator=None, bootstrap=True,
         bootstrap_features=False, max_features=1.0, max_samples=1.0,
         n_estimators=10, n_jobs=-1, oob_score=False, random_state=101,
         verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'max_samples': [0.7, 0.9, 1], 'n_estimators': [100, 200, 500], 'max_features': [0.7, 0.9, 1], 'bootstrap_features': [True, False], 'bootstrap': [True, False]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='mean_squared_error', verbose=1)

In [90]:
y_pred_gsbr = gsbr.predict(X_test)
print(np.sqrt(mean_squared_error(y_test, y_pred_gsbr)))

0.195791888632


# Uploading to Kaggle

In [92]:
test.drop('id',axis=1,inplace=True)

In [93]:
preds = clf.predict(test)

In [94]:
df = pd.DataFrame()

In [95]:
t = pd.read_csv('test (1).csv')

In [96]:
df['id'] = t.id

In [97]:
df['radius'] = preds

In [98]:
df.head()

Unnamed: 0,id,radius
0,4365,0.272747
1,1129,0.680009
2,2602,0.278276
3,1747,0.69041
4,3286,0.291265


In [100]:
df.to_csv('SecondTry.csv', index=False)