In [2]:
from pathlib import Path

import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler

# Loading Data

In [3]:
# Root folder containing the extracted train/ and test/ directories.
# Machine-specific: point DATA_DIR at your own copy of the data to re-run.
DATA_DIR = Path(r"C:\Users\Asus\Downloads")

df_train = pd.read_csv(DATA_DIR / "train" / "train.csv")
df_test = pd.read_csv(DATA_DIR / "test" / "test.csv")

In [4]:
df_train.head()

Unnamed: 0,ID,y,X0,X1,X2,X3,X4,X5,X6,X8,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,0,130.81,k,v,at,a,d,u,j,o,...,0,0,1,0,0,0,0,0,0,0
1,6,88.53,k,t,av,e,d,y,l,o,...,1,0,0,0,0,0,0,0,0,0
2,7,76.26,az,w,n,c,d,x,j,x,...,0,0,0,0,0,0,1,0,0,0
3,9,80.62,az,t,n,f,d,x,l,e,...,0,0,0,0,0,0,0,0,0,0
4,13,78.02,az,v,n,f,d,h,d,n,...,0,0,0,0,0,0,0,0,0,0


In [5]:
df_test.head()

Unnamed: 0,ID,X0,X1,X2,X3,X4,X5,X6,X8,X10,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,1,az,v,n,f,d,t,a,w,0,...,0,0,0,1,0,0,0,0,0,0
1,2,t,b,ai,a,d,b,g,y,0,...,0,0,1,0,0,0,0,0,0,0
2,3,az,v,as,f,d,a,j,j,0,...,0,0,0,1,0,0,0,0,0,0
3,4,az,l,n,f,d,z,l,n,0,...,0,0,0,1,0,0,0,0,0,0
4,5,w,s,as,c,d,y,i,m,0,...,1,0,0,0,0,0,0,0,0,0


In [6]:
df_train.shape, df_test.shape

((4209, 378), (4209, 377))

In [7]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4209 entries, 0 to 4208
Columns: 378 entries, ID to X385
dtypes: float64(1), int64(369), object(8)
memory usage: 12.1+ MB


# Checking the number of unique values per column

In [8]:
df_train.nunique()

ID      4209
y       2545
X0        47
X1        27
X2        44
        ... 
X380       2
X382       2
X383       2
X384       2
X385       2
Length: 378, dtype: int64

In [9]:
df_test.nunique()

ID      4209
X0        49
X1        27
X2        45
X3         7
        ... 
X380       2
X382       2
X383       2
X384       2
X385       2
Length: 377, dtype: int64

# Dropping constant columns (zero variance) and columns where every value is unique (e.g. ID)

In [10]:
def drop_zero_and_max_variance(df):
    """Drop uninformative columns from ``df`` in place and return it.

    A column is removed when it holds a single distinct value (constant, so
    zero variance) or when its number of distinct values equals the number of
    rows (every row differs, as with an identifier column).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to prune. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, minus the dropped columns.
    """
    # Count distinct values once for the whole frame instead of twice per column.
    distinct = df.nunique()
    n_rows = df.shape[0]
    to_drop = distinct[(distinct == 1) | (distinct == n_rows)].index
    # One drop call for all offending columns rather than one call per column.
    df.drop(columns=to_drop, inplace=True)
    return df

In [11]:
df_train.drop('y', axis = 1).columns

Index(['ID', 'X0', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X8', 'X10',
       ...
       'X375', 'X376', 'X377', 'X378', 'X379', 'X380', 'X382', 'X383', 'X384',
       'X385'],
      dtype='object', length=377)

In [12]:
# Prune the training frame, then keep exactly the surviving feature columns
# (everything except the target 'y') in the test frame so both stay aligned.
df_train = drop_zero_and_max_variance(df_train)
# .copy() gives df_test its own data, so the later column assignments do not
# write into a selection of the original frame (avoids SettingWithCopyWarning).
df_test = df_test[df_train.columns.drop('y')].copy()

In [13]:
df_train.shape, df_test.shape

((4209, 365), (4209, 364))

# Checking for null values

In [14]:
def check_null(df):
    """Return the names of the columns of ``df`` that contain missing values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    list
        Column labels, in frame order, that have at least one null entry;
        an empty list when the frame has no missing values.
    """
    # Inspect the frame passed in as ``df`` (not a notebook-level frame), so
    # the result describes whichever dataset the caller supplied.
    has_null = df.isnull().any()
    return has_null[has_null].index.tolist()
# Run the null check on each dataset and print what it returns.
for split_name, split_df in (("training", df_train), ("testing", df_test)):
    print(f'columns having null in {split_name} set:', check_null(split_df))

columns having null in training set: None
columns having null in testing set: None


# Applying ordinal encoding to categorical columns

In [15]:
# Sub-frame of the object-dtype columns of the training set; its column list
# is what the encoding step below operates on.
df_cat = df_train.select_dtypes('object')

In [16]:
df_cat.head()

Unnamed: 0,X0,X1,X2,X3,X4,X5,X6,X8
0,k,v,at,a,d,u,j,o
1,k,t,av,e,d,y,l,o
2,az,w,n,c,d,x,j,x
3,az,t,n,f,d,x,l,e
4,az,v,n,f,d,h,d,n


In [17]:
# Ordinal encoder configured so that categories not seen during fit are
# encoded as -1 instead of raising an error at transform time.
enc = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)

In [18]:
# Fit the encoder on the training categories only, then apply the same mapping
# to the test set; test categories unseen during fit are encoded as -1
# (per the encoder's handle_unknown / unknown_value settings).
df_train[df_cat.columns] = enc.fit_transform(df_train[df_cat.columns])
df_test[df_cat.columns] = enc.transform(df_test[df_cat.columns])

In [19]:
df_train.head()

Unnamed: 0,y,X0,X1,X2,X3,X4,X5,X6,X8,X10,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,130.81,32.0,23.0,17.0,0.0,3.0,24.0,9.0,14.0,0,...,0,0,1,0,0,0,0,0,0,0
1,88.53,32.0,21.0,19.0,4.0,3.0,28.0,11.0,14.0,0,...,1,0,0,0,0,0,0,0,0,0
2,76.26,20.0,24.0,34.0,2.0,3.0,27.0,9.0,23.0,0,...,0,0,0,0,0,0,1,0,0,0
3,80.62,20.0,21.0,34.0,5.0,3.0,27.0,11.0,4.0,0,...,0,0,0,0,0,0,0,0,0,0
4,78.02,20.0,23.0,34.0,5.0,3.0,12.0,3.0,13.0,0,...,0,0,0,0,0,0,0,0,0,0


# Performing PCA (Dimensionality reduction)

In [20]:
# Separate the target from the features: pop() removes the 'y' column from
# df_train in place and hands it back as a Series.
y = df_train.pop('y')

In [21]:
df_train.shape, y.shape

((4209, 364), (4209,))

In [22]:
# Learn the scaling statistics from the training features only, then apply the
# same fitted scaler to both datasets.
scaler = StandardScaler().fit(df_train)
df_train = pd.DataFrame(scaler.transform(df_train))
df_test = pd.DataFrame(scaler.transform(df_test))

In [23]:
df_train.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,354,355,356,357,358,359,360,361,362,363
0,0.163012,1.393488,-0.028122,-1.67827,0.028938,1.292117,0.751787,0.339445,-0.116122,-0.284906,...,-0.684167,-0.246447,1.475332,-0.14528,-0.097952,-0.090243,-0.087527,-0.040815,-0.021804,-0.037783
1,0.163012,1.159021,0.155388,0.620969,0.028938,1.776974,1.437511,0.339445,-0.116122,-0.284906,...,1.46163,-0.246447,-0.677814,-0.14528,-0.097952,-0.090243,-0.087527,-0.040815,-0.021804,-0.037783
2,-0.71056,1.510721,1.531709,-0.52865,0.028938,1.65576,0.751787,1.618389,-0.116122,-0.284906,...,-0.684167,-0.246447,-0.677814,-0.14528,-0.097952,-0.090243,11.425027,-0.040815,-0.021804,-0.037783
3,-0.71056,1.159021,1.531709,1.195779,0.028938,1.65576,1.437511,-1.081605,-0.116122,-0.284906,...,-0.684167,-0.246447,-0.677814,-0.14528,-0.097952,-0.090243,-0.087527,-0.040815,-0.021804,-0.037783
4,-0.71056,1.393488,1.531709,1.195779,0.028938,-0.162454,-1.305384,0.19734,-0.116122,-0.284906,...,-0.684167,-0.246447,-0.677814,-0.14528,-0.097952,-0.090243,-0.087527,-0.040815,-0.021804,-0.037783


In [24]:
# A float n_components asks PCA to keep as many leading components as are
# needed to explain that fraction (here 90%) of the training variance.
pca = PCA(0.9)
# Fit the projection on the training features only, then reuse it for the test
# set. From here on df_train / df_test hold the arrays returned by PCA.
df_train = pca.fit_transform(df_train)
df_test = pca.transform(df_test)

In [25]:
pd.DataFrame(df_test)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,110,111,112,113,114,115,116,117,118,119
0,8.496261,19.650691,-6.174970,0.865484,1.126666,0.051558,0.202011,-0.988078,-3.537800,2.862318,...,-0.216632,2.788812,1.490081,3.680130,-1.144081,0.381362,1.986756,-1.239526,-0.300547,2.366231
1,1.840957,-5.704667,-9.458756,-1.684833,-2.115692,5.167815,3.841442,4.340022,-3.305837,1.571754,...,0.138988,2.383757,0.368137,3.263292,-2.078364,-0.501572,0.943965,-0.607577,-0.215723,2.434835
2,6.067102,16.560022,-3.767407,-1.620917,1.714536,-0.619394,1.226138,3.486122,-4.583004,2.198279,...,-1.140384,3.257870,-0.940981,3.392255,-2.471306,-1.599321,0.105851,-1.245616,1.158712,1.154714
3,6.612298,21.381589,-6.844751,0.954463,1.979569,0.194040,-0.811764,0.119403,0.755474,0.815460,...,1.468236,-1.653820,-1.381650,1.902573,1.742217,-0.858690,-1.258202,-0.249810,-3.259272,1.581916
4,-1.903052,0.477852,5.655123,-3.179762,-0.772387,-1.118376,0.729745,2.482106,0.355310,-1.472467,...,0.590241,0.164825,-0.333449,-0.515322,0.735497,0.398243,-0.173327,0.678535,0.388064,-0.125787
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4204,-1.607417,1.319789,5.766881,-0.473945,-0.466380,-1.031149,-3.518693,0.522520,0.033799,-1.517927,...,0.059894,0.184336,0.461749,0.834154,-0.242279,-0.082272,0.266422,-0.248410,0.055012,0.379730
4205,0.165584,-1.998499,-3.967878,0.625854,-2.872814,1.003097,3.113393,3.559782,-2.526358,-0.965975,...,0.120995,-0.803020,0.233542,1.399913,0.685795,-0.920788,0.742613,-0.328754,-0.266528,0.353025
4206,-2.692341,0.655082,3.161703,2.635253,-0.060109,-1.232795,-4.114878,1.115526,-0.264871,-0.255883,...,0.910510,-1.129656,0.696121,-0.102987,-0.962370,-1.164933,-1.012755,-0.440122,1.055608,0.371454
4207,-1.318860,0.250077,4.058947,-4.396865,0.329428,-0.818273,1.554524,0.666475,0.719681,-1.116148,...,-0.294009,-0.150697,0.051093,-1.229090,-0.692193,0.371539,0.383275,-0.162154,-0.186675,0.454601


In [26]:
pd.DataFrame(df_train)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,110,111,112,113,114,115,116,117,118,119
0,12.254467,-2.944210,-0.957850,2.016656,-1.530604,-3.608552,9.025599,-3.758838,-15.990324,8.525470,...,-0.119810,1.267473,0.380238,1.385268,-0.470509,-0.089729,0.394740,-0.238452,0.015715,0.592542
1,-0.102546,0.366062,0.997224,1.159062,-2.586970,-0.042211,1.004566,-0.610896,-0.430923,0.192279,...,-0.874539,0.881529,-1.121329,-0.662926,1.083840,1.210264,-0.676214,2.020618,-0.870472,-0.805464
2,10.278318,21.104928,-5.013299,-4.470132,0.889347,2.558879,1.150612,3.615687,-0.877439,1.341727,...,2.955559,0.727614,-4.179833,0.838185,3.628045,0.132504,-0.732693,1.227602,-7.231710,1.510685
3,7.361874,21.368231,-5.999757,0.233264,2.000923,0.686137,0.426410,-0.069478,0.298879,2.938231,...,-0.649586,-2.257947,1.990759,2.596414,-1.580493,-0.419420,-0.562334,-0.293221,2.266469,0.179956
4,6.576140,21.442041,-6.547401,0.935237,2.002006,-0.030472,-0.983385,-0.261462,1.024795,0.842633,...,0.580078,-1.626224,-2.708540,2.548958,0.373760,-0.204093,-1.079394,0.524784,-3.208139,0.734364
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4204,-0.238587,0.187915,4.188999,-3.992993,-1.009858,0.221024,1.697285,2.890916,0.595330,-0.381392,...,0.373441,-0.807837,-0.864035,-0.053650,0.173101,-0.517071,-0.389554,-0.795420,-0.535712,-0.200691
4205,11.944442,-2.809733,-0.853236,12.175034,5.907609,-17.945413,11.907167,5.095052,20.573600,-4.321201,...,0.742315,1.136693,-0.612837,0.329206,-0.411473,-0.663939,-0.237351,0.348584,0.563340,0.720589
4206,0.436968,0.897529,3.445222,-4.669025,-0.619090,-0.291540,4.283213,-3.413901,0.160703,2.410645,...,-0.154711,0.276372,-0.167615,-0.205973,-0.482873,-0.200563,-0.451584,0.285431,0.380994,0.228184
4207,-1.341672,0.589253,-0.109655,5.666094,-0.268606,-0.375517,-3.991283,-0.375432,-0.855629,1.492658,...,0.212473,-0.069969,-0.347650,-0.052454,-0.440977,0.002881,0.156327,0.695514,0.521780,-0.173412


# Training and Evaluation

In [27]:
# Baseline XGBoost regressor: no hyperparameters are passed, so the library
# defaults apply.
model = xgb.XGBRegressor()

In [28]:
# Fit the baseline model on the PCA-reduced training features and the target.
model = model.fit(df_train, y.values)

In [29]:
# Hyperparameter grid: 3 * 3 * 3 * 2 = 54 candidate combinations.
params = { 'max_depth': [3,6,10],
           'learning_rate': [0.01, 0.05, 0.1],
           'n_estimators': [100, 500, 1000],
           'colsample_bytree': [0.3, 0.7]}
# Scoring is *negated* MSE (higher is better), so best_score_ has to be
# sign-flipped before it can be read as an error.
clf = GridSearchCV(estimator=model, 
                   param_grid=params,
                   scoring='neg_mean_squared_error', 
                   verbose=1)

In [30]:
# Run the exhaustive search, then report the winning combination.
clf.fit(df_train, y.values)
# best_score_ is negated MSE: flip the sign and take the square root.
best_rmse = (-clf.best_score_) ** 0.5
print("Best parameters:", clf.best_params_)
print("Lowest RMSE: ", best_rmse)

Fitting 5 folds for each of 54 candidates, totalling 270 fits
Best parameters: {'colsample_bytree': 0.3, 'learning_rate': 0.05, 'max_depth': 3, 'n_estimators': 500}
Lowest RMSE:  8.996034521715396


In [31]:
# Build the tuned regressor directly from the grid-search result, so the
# hyperparameters always match what the search selected and do not need to be
# copied by hand whenever the grid or the data changes.
tuned_model = xgb.XGBRegressor(**clf.best_params_)

In [32]:
# 5-fold cross-validation of the tuned model. KFold is created without
# shuffling, so the folds follow the row order of the training data.
cv = KFold(n_splits=5)
# Scores are negated MSE per fold; negate them to read them as errors.
scores = cross_val_score(tuned_model, df_train, y, scoring='neg_mean_squared_error', cv=cv, n_jobs=-1)

In [34]:
-scores

array([ 71.68606015, 122.78415325,  71.6598939 ,  83.35001003,
        55.16306824])

In [35]:
# Mean cross-validated MSE: the scores are negated MSE, so the sign is flipped.
print(-scores.mean())

80.92863711589516


In [36]:
# Refit the tuned model on the full (PCA-reduced) training set, then predict
# the target for the PCA-reduced test set.
tuned_model = tuned_model.fit(df_train, y.values)
pred = tuned_model.predict(df_test)

In [37]:
# Wrap the PCA-reduced test features in a DataFrame and append the model's
# predictions as a 'pred' column.
df_test = pd.DataFrame(df_test).assign(pred=pred)

In [38]:
df_test.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,111,112,113,114,115,116,117,118,119,pred
0,8.496261,19.650691,-6.17497,0.865484,1.126666,0.051558,0.202011,-0.988078,-3.5378,2.862318,...,2.788812,1.490081,3.68013,-1.144081,0.381362,1.986756,-1.239526,-0.300547,2.366231,99.127724
1,1.840957,-5.704667,-9.458756,-1.684833,-2.115692,5.167815,3.841442,4.340022,-3.305837,1.571754,...,2.383757,0.368137,3.263292,-2.078364,-0.501572,0.943965,-0.607577,-0.215723,2.434835,116.05265
2,6.067102,16.560022,-3.767407,-1.620917,1.714536,-0.619394,1.226138,3.486122,-4.583004,2.198279,...,3.25787,-0.940981,3.392255,-2.471306,-1.599321,0.105851,-1.245616,1.158712,1.154714,102.02523
3,6.612298,21.381589,-6.844751,0.954463,1.979569,0.19404,-0.811764,0.119403,0.755474,0.81546,...,-1.65382,-1.38165,1.902573,1.742217,-0.85869,-1.258202,-0.24981,-3.259272,1.581916,83.478874
4,-1.903052,0.477852,5.655123,-3.179762,-0.772387,-1.118376,0.729745,2.482106,0.35531,-1.472467,...,0.164825,-0.333449,-0.515322,0.735497,0.398243,-0.173327,0.678535,0.388064,-0.125787,104.267067
