In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Read the training table from train.csv; the bare name on the last line
# makes the notebook render the frame.
df = pd.read_csv(filepath_or_buffer='train.csv')
df

Unnamed: 0,Id,Subject Focus,Eyes,Face,Near,Action,Accessory,Group,Collage,Human,Occlusion,Info,Blur,Pawpularity
0,0007de18844b0dbbb5e1f607da0606e0,0,1,1,1,0,0,1,0,0,0,0,0,63
1,0009c66b9439883ba2750fb825e1d7db,0,1,1,0,0,0,0,0,0,0,0,0,42
2,0013fd999caf9a3efe1352ca1b0d937e,0,1,1,1,0,0,0,0,1,1,0,0,28
3,0018df346ac9c1d8413cfcc888ca8246,0,1,1,1,0,0,0,0,0,0,0,0,15
4,001dc955e10590d3ca4673f034feeef2,0,0,0,1,0,0,1,0,0,0,0,0,72
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9907,ffbfa0383c34dc513c95560d6e1fdb57,0,0,0,1,0,0,0,0,0,0,0,1,15
9908,ffcc8532d76436fc79e50eb2e5238e45,0,1,1,1,0,0,0,0,0,0,0,0,70
9909,ffdf2e8673a1da6fb80342fa3b119a20,0,1,1,1,0,0,0,0,1,1,0,0,20
9910,fff19e2ce11718548fa1c5d039a5192a,0,1,1,1,0,0,0,0,1,0,0,0,20


# Cleaning data

In [3]:
from sklearn.preprocessing import MinMaxScaler

# Square-root transform of the raw target.
df['sqrt_tranformed_Pawpularity'] = np.sqrt(df['Pawpularity'])

# Min-max scale the transformed target; the scaler expects a 2-D input,
# hence the double-bracket (single-column frame) selection.
scaler = MinMaxScaler()
sqrt_target = df[['sqrt_tranformed_Pawpularity']]
df['scaled_tranformed_Pawpularity'] = scaler.fit_transform(sqrt_target)

In [4]:
# True if at least one cell anywhere in the frame is missing.
has_missing = df.isna().to_numpy().any()
print(has_missing)
df.head()

False


Unnamed: 0,Id,Subject Focus,Eyes,Face,Near,Action,Accessory,Group,Collage,Human,Occlusion,Info,Blur,Pawpularity,sqrt_tranformed_Pawpularity,scaled_tranformed_Pawpularity
0,0007de18844b0dbbb5e1f607da0606e0,0,1,1,1,0,0,1,0,0,0,0,0,63,7.937254,0.770806
1,0009c66b9439883ba2750fb825e1d7db,0,1,1,0,0,0,0,0,0,0,0,0,42,6.480741,0.608971
2,0013fd999caf9a3efe1352ca1b0d937e,0,1,1,1,0,0,0,0,1,1,0,0,28,5.291503,0.476834
3,0018df346ac9c1d8413cfcc888ca8246,0,1,1,1,0,0,0,0,0,0,0,0,15,3.872983,0.31922
4,001dc955e10590d3ca4673f034feeef2,0,0,0,1,0,0,1,0,0,0,0,0,72,8.485281,0.831698


In [5]:
# Keep only rows whose Pawpularity lies strictly between 4 and 100.
pawpularity = df['Pawpularity']
in_range = (pawpularity > 4) & (pawpularity < 100)
df = df.loc[in_range]
print(df.shape)
df.head()

(9417, 16)


Unnamed: 0,Id,Subject Focus,Eyes,Face,Near,Action,Accessory,Group,Collage,Human,Occlusion,Info,Blur,Pawpularity,sqrt_tranformed_Pawpularity,scaled_tranformed_Pawpularity
0,0007de18844b0dbbb5e1f607da0606e0,0,1,1,1,0,0,1,0,0,0,0,0,63,7.937254,0.770806
1,0009c66b9439883ba2750fb825e1d7db,0,1,1,0,0,0,0,0,0,0,0,0,42,6.480741,0.608971
2,0013fd999caf9a3efe1352ca1b0d937e,0,1,1,1,0,0,0,0,1,1,0,0,28,5.291503,0.476834
3,0018df346ac9c1d8413cfcc888ca8246,0,1,1,1,0,0,0,0,0,0,0,0,15,3.872983,0.31922
4,001dc955e10590d3ca4673f034feeef2,0,0,0,1,0,0,1,0,0,0,0,0,72,8.485281,0.831698


In [6]:
# Re-check for missing values after the row filter.
df.isna().to_numpy().any()

False

# Splitting data

In [7]:
from sklearn.model_selection import train_test_split

# Both derived target columns travel together in y; each model later picks
# the one column it is evaluated on.
target_cols = ['sqrt_tranformed_Pawpularity', 'scaled_tranformed_Pawpularity']

# Features are everything except the identifier, the raw target and the
# derived targets.
x = df.drop(columns=['Id', 'Pawpularity'] + target_cols)
y = df[target_cols]

x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=.2,
    random_state=1000,
    shuffle=True,
    stratify=y,
)


# Building Models

In [8]:
from sklearn.model_selection import cross_val_score

from sklearn.ensemble import AdaBoostRegressor, RandomForestRegressor
from sklearn.linear_model import ElasticNet, LinearRegression
from sklearn.neighbors import KNeighborsRegressor, RadiusNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.tree import DecisionTreeRegressor

from sklearn import metrics

# Models that require no scaling (evaluated on the sqrt-transformed target).
model_list = [
    LinearRegression(),
    AdaBoostRegressor(),
    ElasticNet(),
    DecisionTreeRegressor(),
    RandomForestRegressor(),
]

# Models that require scaled data (evaluated on the min-max scaled target).
scaled_model = [
    RadiusNeighborsRegressor(),
    KNeighborsRegressor(),
    MLPRegressor(),
]

# K-fold cross-validating the models that don't require scaled data

In [9]:

from sklearn.model_selection import cross_validate

# Output column -> sklearn scorer name that fills it.
metric_columns = {
    'r2_score': 'r2',
    'explained_variance': 'explained_variance',
    'max_error': 'max_error',
    'neg_mean_absolute_error': 'neg_mean_absolute_error',
    'neg_mean_absolute_percentage_error': 'neg_mean_absolute_percentage_error',
}

data = {'model': [str(i) for i in model_list]}
data.update({column: [] for column in metric_columns})

# Cross-validate each estimator ONCE and read every metric from that single
# run. Calling cross_val_score separately per metric refits the model five
# times over, and for unseeded stochastic estimators (trees, forests,
# boosting) each column would then describe a different fit.
for estimator in model_list:
    cv_results = cross_validate(
        estimator,
        x,
        y['sqrt_tranformed_Pawpularity'],
        cv=5,
        scoring=list(metric_columns.values()),
    )
    for column, scorer in metric_columns.items():
        # cross_validate stores per-fold test scores under 'test_<scorer>'.
        data[column].append(cv_results['test_' + scorer].mean())

scores = pd.DataFrame.from_dict(data)
scores

Unnamed: 0,model,r2_score,explained_variance,max_error,neg_mean_absolute_error,neg_mean_absolute_percentage_error
0,LinearRegression(),0.0008,0.001057219,-4.056809,-1.079128,-0.198917
1,AdaBoostRegressor(),-0.007501,0.0004747775,-3.969836,-1.099789,-0.206868
2,ElasticNet(),-0.000224,6.661338000000001e-17,-4.003141,-1.079636,-0.199078
3,DecisionTreeRegressor(),-0.026581,-0.02654527,-4.828194,-1.091923,-0.200907
4,RandomForestRegressor(),-0.016039,-0.01637499,-4.531598,-1.087419,-0.200325


# K-fold cross-validating the models that require scaled data


In [10]:
from sklearn.model_selection import cross_validate

# Output column -> sklearn scorer name that fills it.
metric_columns2 = {
    'r2_score': 'r2',
    'explained_variance': 'explained_variance',
    'max_error': 'max_error',
    'neg_mean_absolute_error': 'neg_mean_absolute_error',
    'neg_mean_absolute_percentage_error': 'neg_mean_absolute_percentage_error',
}

data2 = {'model': [str(i) for i in scaled_model]}
data2.update({column: [] for column in metric_columns2})

# Cross-validate each estimator ONCE and read every metric from that single
# run. Calling cross_val_score separately per metric refits the model five
# times over, and for unseeded stochastic estimators (e.g. MLPRegressor)
# each column would then describe a different fit.
for estimator in scaled_model:
    cv_results2 = cross_validate(
        estimator,
        x,
        y['scaled_tranformed_Pawpularity'],
        cv=5,
        scoring=list(metric_columns2.values()),
    )
    for column, scorer in metric_columns2.items():
        # cross_validate stores per-fold test scores under 'test_<scorer>'.
        data2[column].append(cv_results2['test_' + scorer].mean())

scores2 = pd.DataFrame.from_dict(data2)
scores2

Unnamed: 0,model,r2_score,explained_variance,max_error,neg_mean_absolute_error,neg_mean_absolute_percentage_error
0,RadiusNeighborsRegressor(),0.000495,0.000713,-0.448812,-0.119828,-0.255414
1,KNeighborsRegressor(),-0.230691,-0.209183,-0.549352,-0.133376,-0.273401
2,MLPRegressor(),-0.006908,-0.002062,-0.497907,-0.119934,-0.251508


In [11]:
# Stack scores and scores2 row-wise into one frame with a fresh index.
scores_df = pd.concat([scores, scores2], ignore_index=True)

# Rank the models from highest to lowest r2_score.
scores_df.sort_values(by='r2_score', ascending=False)

Unnamed: 0,model,r2_score,explained_variance,max_error,neg_mean_absolute_error,neg_mean_absolute_percentage_error
0,LinearRegression(),0.0008,0.001057219,-4.056809,-1.079128,-0.198917
5,RadiusNeighborsRegressor(),0.000495,0.000712718,-0.448812,-0.119828,-0.255414
2,ElasticNet(),-0.000224,6.661338000000001e-17,-4.003141,-1.079636,-0.199078
7,MLPRegressor(),-0.006908,-0.002061581,-0.497907,-0.119934,-0.251508
1,AdaBoostRegressor(),-0.007501,0.0004747775,-3.969836,-1.099789,-0.206868
4,RandomForestRegressor(),-0.016039,-0.01637499,-4.531598,-1.087419,-0.200325
3,DecisionTreeRegressor(),-0.026581,-0.02654527,-4.828194,-1.091923,-0.200907
6,KNeighborsRegressor(),-0.230691,-0.2091832,-0.549352,-0.133376,-0.273401


In [12]:
from sklearn.model_selection import KFold

mod = RadiusNeighborsRegressor()
kfolds = list(range(5, 13))
rndm_state = list(range(100, 1001, 10))

# Mean cross-validated r2 on the scaled target for each candidate fold count.
fold_r2 = []
for n_folds in kfolds:
    fold_scores = cross_val_score(
        mod, x, y['scaled_tranformed_Pawpularity'], cv=n_folds, scoring='r2'
    )
    fold_r2.append(fold_scores.mean())

d1 = {'Kfolds': kfolds, 'r2_score': fold_r2}

In [13]:
# Fold counts ranked by mean r2, best first (top 10 rows).
fold_ranking = pd.DataFrame(d1).sort_values(by='r2_score', ascending=False)
fold_ranking.head(10)

Unnamed: 0,Kfolds,r2_score
4,9,0.001201
3,8,0.00108
1,6,0.000983
6,11,0.000627
0,5,0.000495
2,7,0.000374
5,10,0.000247
7,12,7.8e-05


In [None]:
from sklearn.model_selection import GridSearchCV

# 9 shuffled folds, seeded so the search is repeatable.
cv=KFold(n_splits=9, shuffle=True, random_state=1000)

# Warning: this grid search takes a very long time; trim the parameter lists
# for a less time-consuming run.
params={
        'radius':[i for i in np.arange(1,5,.25)],
        'weights':['uniform','distance'],
        'algorithm':['auto', 'ball_tree', 'kd_tree', 'brute'],
        'leaf_size':[15,30,45,50,100,150,300],
        'n_jobs':[-1]
       }

search = GridSearchCV(mod, params, scoring='r2', cv=cv, refit=True)

# Fit on the scaled target only. y_train carries BOTH derived target columns,
# and passing the whole frame would train a two-output regressor instead of
# the single-target model that the earlier cross-validation cells evaluated.
result = search.fit(x_train, y_train['scaled_tranformed_Pawpularity'])

# get the best performing model fit on the whole training set
best_model = result.best_estimator_
print(best_model)
print(result.best_score_)

In [None]:
# Mean cross-validated r2 on the sqrt-transformed target for each fold count.
# NOTE(review): `mod` is still the RadiusNeighborsRegressor from the earlier
# cell, while the next cell tunes ElasticNet -- confirm which estimator this
# sweep is meant to evaluate.
kfolds2=[i for i in range(5,11)]
d2={
    'Kfolds':kfolds2,
    'r2_score':[cross_val_score(mod, x, y['sqrt_tranformed_Pawpularity'], cv=i, scoring ='r2').mean() for i in kfolds2]
   }

# Show the table that was just computed (d2); the original displayed d1 again.
pd.DataFrame.from_dict(d2).sort_values('r2_score',ascending=False).head(10)

In [None]:
# 9 shuffled folds, seeded so the search is repeatable.
cv2=KFold(n_splits=9, shuffle=True, random_state=1000)

mod2=ElasticNet()

# NOTE(review): the `normalize` parameter was deprecated in scikit-learn 1.0
# and removed in 1.2; drop that entry when running on a newer version.
params2={
         'alpha':[i for i in np.arange(.25,1.6,.25)],
         'l1_ratio':[i for i in np.arange(0,1,.25) ],
         'max_iter':[10000,100000,1000000],
         'normalize':[False,True],
         'warm_start':[False,True],
         'random_state':[10,20,30,40,50,100,1000],
         'selection':['random']
        }

search2 = GridSearchCV(mod2, params2, scoring='r2', cv=cv2, refit=True)

# Fit on the sqrt-transformed target only, the target ElasticNet was
# cross-validated on earlier. y_train carries BOTH derived target columns;
# ElasticNet's penalty is scale-dependent, so fitting the whole frame would
# blend two differently scaled problems into one score.
result2 = search2.fit(x_train, y_train['sqrt_tranformed_Pawpularity'])

# get the best performing model fit on the whole training set
# (read from result2 -- the original re-read `result`, i.e. the
# RadiusNeighbors search, and so never reported the ElasticNet outcome)
best_model = result2.best_estimator_
print(best_model)
print(result2.best_score_)