## Importing required libraries

In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
import warnings
warnings.filterwarnings('ignore')

## Loading the test and Train Sets

In [2]:
d_train= pd.read_csv('train.csv')
d_test= pd.read_csv('test.csv')

In [3]:
d_train.head()

Unnamed: 0,ID,y,X0,X1,X2,X3,X4,X5,X6,X8,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,0,130.81,k,v,at,a,d,u,j,o,...,0,0,1,0,0,0,0,0,0,0
1,6,88.53,k,t,av,e,d,y,l,o,...,1,0,0,0,0,0,0,0,0,0
2,7,76.26,az,w,n,c,d,x,j,x,...,0,0,0,0,0,0,1,0,0,0
3,9,80.62,az,t,n,f,d,x,l,e,...,0,0,0,0,0,0,0,0,0,0
4,13,78.02,az,v,n,f,d,h,d,n,...,0,0,0,0,0,0,0,0,0,0


In [4]:
d_test.head()

Unnamed: 0,ID,X0,X1,X2,X3,X4,X5,X6,X8,X10,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,1,az,v,n,f,d,t,a,w,0,...,0,0,0,1,0,0,0,0,0,0
1,2,t,b,ai,a,d,b,g,y,0,...,0,0,1,0,0,0,0,0,0,0
2,3,az,v,as,f,d,a,j,j,0,...,0,0,0,1,0,0,0,0,0,0
3,4,az,l,n,f,d,z,l,n,0,...,0,0,0,1,0,0,0,0,0,0
4,5,w,s,as,c,d,y,i,m,0,...,1,0,0,0,0,0,0,0,0,0


In [5]:
print(d_train.shape)
print(d_test.shape)

(4209, 378)
(4209, 377)


In [6]:
print(d_train.columns)
print(d_test.columns)

Index(['ID', 'y', 'X0', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X8',
       ...
       'X375', 'X376', 'X377', 'X378', 'X379', 'X380', 'X382', 'X383', 'X384',
       'X385'],
      dtype='object', length=378)
Index(['ID', 'X0', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X8', 'X10',
       ...
       'X375', 'X376', 'X377', 'X378', 'X379', 'X380', 'X382', 'X383', 'X384',
       'X385'],
      dtype='object', length=377)


In [7]:
print(d_train.info())
print(d_test.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4209 entries, 0 to 4208
Columns: 378 entries, ID to X385
dtypes: float64(1), int64(369), object(8)
memory usage: 12.1+ MB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4209 entries, 0 to 4208
Columns: 377 entries, ID to X385
dtypes: int64(369), object(8)
memory usage: 12.1+ MB
None


## Deleting the variables with Variance = 0

In [8]:
def var_zero(df):
    var_data= pd.DataFrame(df.var(),columns=['Variance'])
    return(list(var_data[var_data.Variance==0].index))

In [9]:
varzero= var_zero(d_train)
varzero_test= var_zero(d_test)

In [10]:
var_data= pd.DataFrame(d_train.var(),columns=['Variance'])
var_data_test= pd.DataFrame(d_test.var(),columns=['Variance'])

In [11]:
d_train.drop(columns=varzero, inplace=True)
d_train.shape

(4209, 366)

In [12]:
d_test.drop(columns=varzero_test, inplace=True)
d_test.shape

(4209, 372)

In [13]:
print(d_train.shape)
print(d_test.shape)

(4209, 366)
(4209, 372)


## Check for null Values for Test and Train set and Deleting them

In [14]:
null_check_df= pd.DataFrame(d_train.isnull().any(), columns=['IsNull'])
null_check_df[null_check_df.IsNull==True]

Unnamed: 0,IsNull


In [15]:
null_check_df_test= pd.DataFrame(d_test.isnull().any(), columns=['IsNull'])
null_check_df_test[null_check_df_test.IsNull==True]

Unnamed: 0,IsNull


In [16]:
unique_count= pd.DataFrame(d_train.nunique(), columns=['Unique'])
unique_count[unique_count.Unique>2]

Unnamed: 0,Unique
ID,4209
y,2545
X0,47
X1,27
X2,44
X3,7
X4,4
X5,29
X6,12
X8,25


In [17]:
unique_count_test= pd.DataFrame(d_test.nunique(), columns=['Unique'])
unique_count_test[unique_count_test.Unique>2]

Unnamed: 0,Unique
ID,4209
X0,49
X1,27
X2,45
X3,7
X4,4
X5,32
X6,12
X8,25


In [18]:
print(unique_count[unique_count.Unique<=2])
print(unique_count_test[unique_count_test.Unique<=2])

      Unique
X10        2
X12        2
X13        2
X14        2
X15        2
...      ...
X380       2
X382       2
X383       2
X384       2
X385       2

[356 rows x 1 columns]
      Unique
X10        2
X11        2
X12        2
X13        2
X14        2
...      ...
X380       2
X382       2
X383       2
X384       2
X385       2

[363 rows x 1 columns]


# Applying Label Encoder to categorical values

In [19]:
x= d_train.drop(columns='y')
y=d_train.y

In [20]:
x_test= d_test

In [21]:
x.head()

Unnamed: 0,ID,X0,X1,X2,X3,X4,X5,X6,X8,X10,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,0,k,v,at,a,d,u,j,o,0,...,0,0,1,0,0,0,0,0,0,0
1,6,k,t,av,e,d,y,l,o,0,...,1,0,0,0,0,0,0,0,0,0
2,7,az,w,n,c,d,x,j,x,0,...,0,0,0,0,0,0,1,0,0,0
3,9,az,t,n,f,d,x,l,e,0,...,0,0,0,0,0,0,0,0,0,0
4,13,az,v,n,f,d,h,d,n,0,...,0,0,0,0,0,0,0,0,0,0


In [22]:
x_test.head()

Unnamed: 0,ID,X0,X1,X2,X3,X4,X5,X6,X8,X10,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,1,az,v,n,f,d,t,a,w,0,...,0,0,0,1,0,0,0,0,0,0
1,2,t,b,ai,a,d,b,g,y,0,...,0,0,1,0,0,0,0,0,0,0
2,3,az,v,as,f,d,a,j,j,0,...,0,0,0,1,0,0,0,0,0,0
3,4,az,l,n,f,d,z,l,n,0,...,0,0,0,1,0,0,0,0,0,0
4,5,w,s,as,c,d,y,i,m,0,...,1,0,0,0,0,0,0,0,0,0


In [23]:
cat_cols= d_train.select_dtypes(include='object').columns
cat_cols

Index(['X0', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X8'], dtype='object')

In [24]:
def transform_labels(df,x):
    cat_cols= df.select_dtypes(include='object').columns
    le= LabelEncoder()
    for i in cat_cols:
        le.fit(x[i])
        x[i]=le.fit_transform(x[i])
        
transform_labels(d_train,x)
transform_labels(d_test,x_test)

In [25]:
x.head()

Unnamed: 0,ID,X0,X1,X2,X3,X4,X5,X6,X8,X10,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,0,32,23,17,0,3,24,9,14,0,...,0,0,1,0,0,0,0,0,0,0
1,6,32,21,19,4,3,28,11,14,0,...,1,0,0,0,0,0,0,0,0,0
2,7,20,24,34,2,3,27,9,23,0,...,0,0,0,0,0,0,1,0,0,0
3,9,20,21,34,5,3,27,11,4,0,...,0,0,0,0,0,0,0,0,0,0
4,13,20,23,34,5,3,12,3,13,0,...,0,0,0,0,0,0,0,0,0,0


In [26]:
x_test.head()

Unnamed: 0,ID,X0,X1,X2,X3,X4,X5,X6,X8,X10,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,1,21,23,34,5,3,26,0,22,0,...,0,0,0,1,0,0,0,0,0,0
1,2,42,3,8,0,3,9,6,24,0,...,0,0,1,0,0,0,0,0,0,0
2,3,21,23,17,5,3,0,9,9,0,...,0,0,0,1,0,0,0,0,0,0
3,4,21,13,34,5,3,31,11,13,0,...,0,0,0,1,0,0,0,0,0,0
4,5,45,20,17,2,3,30,8,12,0,...,1,0,0,0,0,0,0,0,0,0


# Dimensionality Reduction

In [27]:
pcamodel= PCA()

In [28]:
pcamodel.fit(x)

PCA()

In [29]:
pcamodel.explained_variance_ratio_

array([9.99915408e-01, 3.43250074e-05, 1.91351988e-05, 1.09402163e-05,
       8.24910539e-06, 6.58920898e-06, 1.42452645e-06, 6.65804820e-07,
       3.88293669e-07, 2.63001372e-07, 2.16183012e-07, 2.11583571e-07,
       1.82010530e-07, 1.49422616e-07, 1.32555912e-07, 1.15098844e-07,
       9.26583708e-08, 8.57310762e-08, 8.00377113e-08, 7.03657987e-08,
       6.35342210e-08, 5.76627256e-08, 5.56826130e-08, 5.23078897e-08,
       4.58894501e-08, 4.32402600e-08, 3.87030746e-08, 3.70665275e-08,
       3.38043076e-08, 3.29543449e-08, 3.18590240e-08, 2.94860866e-08,
       2.81651246e-08, 2.68359999e-08, 2.52547391e-08, 2.39617111e-08,
       2.22318617e-08, 1.98219061e-08, 1.91972961e-08, 1.85152433e-08,
       1.75969872e-08, 1.66781887e-08, 1.59642978e-08, 1.57449519e-08,
       1.56219632e-08, 1.51186599e-08, 1.36807905e-08, 1.33843808e-08,
       1.28717525e-08, 1.26621251e-08, 1.23723086e-08, 1.19512796e-08,
       1.15990811e-08, 1.12811399e-08, 1.06850368e-08, 1.02643238e-08,
      

In [30]:
pcamodel.explained_variance_ratio_.cumsum()*100

array([ 99.99154082,  99.99497332,  99.99688684,  99.99798086,
        99.99880577,  99.9994647 ,  99.99960715,  99.99967373,
        99.99971256,  99.99973886,  99.99976048,  99.99978163,
        99.99979984,  99.99981478,  99.99982803,  99.99983954,
        99.99984881,  99.99985738,  99.99986539,  99.99987242,
        99.99987878,  99.99988454,  99.99989011,  99.99989534,
        99.99989993,  99.99990425,  99.99990812,  99.99991183,
        99.99991521,  99.99991851,  99.99992169,  99.99992464,
        99.99992746,  99.99993014,  99.99993267,  99.99993506,
        99.99993729,  99.99993927,  99.99994119,  99.99994304,
        99.9999448 ,  99.99994647,  99.99994806,  99.99994964,
        99.9999512 ,  99.99995271,  99.99995408,  99.99995542,
        99.99995671,  99.99995797,  99.99995921,  99.99996041,
        99.99996157,  99.99996269,  99.99996376,  99.99996479,
        99.99996575,  99.99996671,  99.99996763,  99.99996854,
        99.99996942,  99.99997026,  99.99997107,  99.99

In [31]:
pcamodel.components_

array([[ 9.99997515e-01, -7.34609541e-05,  2.99568235e-04, ...,
        -1.56006132e-07, -1.37295095e-07,  4.49823896e-07],
       [-8.25418421e-05, -9.31138475e-01,  2.44312134e-01, ...,
         3.33638024e-06, -9.30807937e-06, -3.01954607e-05],
       [ 1.10694145e-04,  2.65106027e-01, -1.34229841e-02, ...,
        -9.46573002e-05, -7.35049264e-07, -7.50666635e-06],
       ...,
       [ 0.00000000e+00, -1.69619713e-18, -6.64893456e-18, ...,
        -1.31025833e-16,  7.21536546e-17, -5.14426512e-03],
       [ 0.00000000e+00, -1.37158063e-18,  2.50906476e-18, ...,
         1.14383329e-16, -2.39608680e-17, -1.11259595e-02],
       [ 0.00000000e+00,  7.90823841e-16, -7.38647939e-16, ...,
         1.00236659e-13,  4.73672750e-14,  1.35029550e-02]])

ID column alone explains 99% of features

In [32]:
pcamodel= PCA(n_components=120)

In [33]:
x.drop(columns=['ID'], inplace=True)

In [34]:
pcamodel.fit(x)

PCA(n_components=120)

In [35]:
x_pca= pd.DataFrame(pcamodel.transform(x))

In [36]:
pcamodel.explained_variance_ratio_

array([3.83347821e-01, 2.13880326e-01, 1.32618659e-01, 1.18266425e-01,
       9.20600842e-02, 1.59060433e-02, 7.44539716e-03, 4.33701278e-03,
       2.94020637e-03, 2.41796178e-03, 2.36488035e-03, 2.03228597e-03,
       1.67203776e-03, 1.48111202e-03, 1.28702378e-03, 1.03575260e-03,
       9.57334243e-04, 8.96963711e-04, 7.85687496e-04, 7.09719878e-04,
       6.44381974e-04, 6.23812984e-04, 5.84619689e-04, 5.12442941e-04,
       4.84778034e-04, 4.33105519e-04, 4.13937269e-04, 3.77641392e-04,
       3.68612369e-04, 3.56074501e-04, 3.29531653e-04, 3.14483888e-04,
       2.99702523e-04, 2.82667164e-04, 2.68166783e-04, 2.48543866e-04,
       2.21385193e-04, 2.14536509e-04, 2.07246666e-04, 1.96497966e-04,
       1.87129242e-04, 1.78741747e-04, 1.75868001e-04, 1.74483476e-04,
       1.70721494e-04, 1.53056235e-04, 1.49449450e-04, 1.43767040e-04,
       1.41453153e-04, 1.38275598e-04, 1.33457885e-04, 1.29756311e-04,
       1.25976813e-04, 1.19332745e-04, 1.14673666e-04, 1.08117852e-04,
      

In [37]:
pcamodel.explained_variance_ratio_.cumsum()*100

array([38.33478209, 59.72281468, 72.98468058, 84.81132306, 94.01733148,
       95.60793581, 96.35247553, 96.7861768 , 97.08019744, 97.32199362,
       97.55848165, 97.76171025, 97.92891403, 98.07702523, 98.20572761,
       98.30930287, 98.40503629, 98.49473266, 98.57330141, 98.6442734 ,
       98.7087116 , 98.7710929 , 98.82955487, 98.88079916, 98.92927696,
       98.97258751, 99.01398124, 99.05174538, 99.08860662, 99.12421407,
       99.15716723, 99.18861562, 99.21858587, 99.24685259, 99.27366927,
       99.29852366, 99.32066217, 99.34211583, 99.36284049, 99.38249029,
       99.40120321, 99.41907739, 99.43666419, 99.45411254, 99.47118468,
       99.48649031, 99.50143525, 99.51581196, 99.52995727, 99.54378483,
       99.55713062, 99.57010625, 99.58270393, 99.59463721, 99.60610457,
       99.61691636, 99.62761471, 99.63790823, 99.64804337, 99.65791369,
       99.66721827, 99.67629323, 99.68498626, 99.69360147, 99.70155283,
       99.70935019, 99.71695557, 99.72445086, 99.73152755, 99.73

In [38]:
pcamodel.components_

array([[-9.31138471e-01,  2.45494127e-01,  2.57713643e-01, ...,
         2.84628944e-06, -9.56497803e-06, -2.86842355e-05],
       [ 2.63046285e-01, -1.92186472e-02,  9.59720661e-01, ...,
        -9.23858305e-05, -5.16075133e-07, -1.32382564e-05],
       [ 1.40346165e-01,  5.51248020e-01,  2.15461586e-03, ...,
        -1.13659802e-04,  4.64816456e-05,  1.61546453e-04],
       ...,
       [-2.75072943e-04, -3.89022773e-03,  1.75279966e-03, ...,
        -8.84142323e-03,  4.18955286e-03,  2.88787206e-02],
       [ 9.44157787e-05, -1.81351653e-03, -1.53609317e-03, ...,
        -9.93346813e-03, -2.08840312e-03,  7.89581097e-03],
       [-1.72317482e-04,  4.80670572e-03,  1.76962263e-03, ...,
         1.05355772e-03, -1.58024758e-02,  2.22179508e-02]])

In [39]:
y

0       130.81
1        88.53
2        76.26
3        80.62
4        78.02
         ...  
4204    107.39
4205    108.77
4206    109.22
4207     87.48
4208    110.85
Name: y, Length: 4209, dtype: float64

In [40]:
x_pca.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,110,111,112,113,114,115,116,117,118,119
0,0.614765,-0.133009,15.62446,3.687564,1.359574,-2.691417,3.816741,1.537687,-0.647686,1.737523,...,0.325595,-0.267177,-0.352285,-0.275256,-0.696345,0.469774,0.013925,0.280376,0.224846,0.204569
1,0.565407,1.560333,17.909581,-0.092902,1.536648,-4.442877,-0.79654,0.812571,-0.789268,-0.136565,...,0.119064,-0.236431,0.153502,0.365,0.094831,-0.02737,-0.038581,0.169749,-0.015532,-0.042006
2,16.201713,12.292846,17.63354,0.186308,11.85082,-2.155389,0.82819,1.580676,-0.608199,-0.485257,...,0.055862,0.612943,-0.22703,-0.042635,-0.055516,-0.114175,-0.386433,-0.095076,0.32681,-0.047126
3,16.149998,13.535419,14.898695,-3.140917,-6.832193,-4.290014,-2.225924,2.099091,0.040956,-0.979216,...,0.233451,0.001811,-0.099238,-0.115424,0.083557,-0.079815,-0.557618,-0.653386,0.535109,-0.254688
4,16.459103,13.175004,4.403096,7.671151,2.139916,3.76386,-1.954082,2.103654,1.178386,-1.021905,...,-0.040962,-0.128235,0.103794,-0.053379,-0.207269,-0.152531,-0.040158,-0.118188,0.143286,0.086752


## Building Model using XGBRegressor

In [41]:
xtrain, xtest, ytrain, ytest= train_test_split(x_pca, y, test_size= 0.25, random_state= 10)

In [42]:
xgbModel=XGBRegressor(max_depth=3, min_child_weight=4, reg_lambda=1000, gamma = 100)

In [43]:
xgbModel.fit(xtrain,ytrain)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=100, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.300000012, max_delta_step=0, max_depth=3,
             min_child_weight=4, missing=nan, monotone_constraints='()',
             n_estimators=100, n_jobs=8, num_parallel_tree=1, random_state=0,
             reg_alpha=0, reg_lambda=1000, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [44]:
xgbModel.score(xtrain,ytrain)

0.6302583184254844

In [45]:
xgbModel.score(xtest, ytest)

0.5256026378017694

In [46]:
xgb_feature_imp=(pd.DataFrame(xgbModel.feature_importances_,index=x_pca.columns))

In [47]:
xgb_feature_imp.sort_values(by=0)#,ascending=False)

Unnamed: 0,0
119,0.000000
50,0.000000
65,0.000000
69,0.000000
26,0.000000
...,...
9,0.028323
7,0.036336
0,0.042882
16,0.055635


In [48]:
xgb_feature_imp[xgb_feature_imp[0]==0].index.values.size

21

## Important observation here is, Feature importance is 0 for 21 features.

In [49]:
x_test

Unnamed: 0,ID,X0,X1,X2,X3,X4,X5,X6,X8,X10,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,1,21,23,34,5,3,26,0,22,0,...,0,0,0,1,0,0,0,0,0,0
1,2,42,3,8,0,3,9,6,24,0,...,0,0,1,0,0,0,0,0,0,0
2,3,21,23,17,5,3,0,9,9,0,...,0,0,0,1,0,0,0,0,0,0
3,4,21,13,34,5,3,31,11,13,0,...,0,0,0,1,0,0,0,0,0,0
4,5,45,20,17,2,3,30,8,12,0,...,1,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4204,8410,6,9,17,5,3,1,9,4,0,...,0,0,0,0,0,0,0,0,0,0
4205,8411,42,1,8,3,3,1,9,24,0,...,0,1,0,0,0,0,0,0,0,0
4206,8413,47,23,17,5,3,1,3,22,0,...,0,0,0,0,0,0,0,0,0,0
4207,8414,7,23,17,0,3,1,2,16,0,...,0,0,1,0,0,0,0,0,0,0


In [50]:
x_test.shape

(4209, 372)

In [51]:
x =x_test.drop(columns=['ID'])

In [52]:
x

Unnamed: 0,X0,X1,X2,X3,X4,X5,X6,X8,X10,X11,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,21,23,34,5,3,26,0,22,0,0,...,0,0,0,1,0,0,0,0,0,0
1,42,3,8,0,3,9,6,24,0,0,...,0,0,1,0,0,0,0,0,0,0
2,21,23,17,5,3,0,9,9,0,0,...,0,0,0,1,0,0,0,0,0,0
3,21,13,34,5,3,31,11,13,0,0,...,0,0,0,1,0,0,0,0,0,0
4,45,20,17,2,3,30,8,12,0,0,...,1,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4204,6,9,17,5,3,1,9,4,0,0,...,0,0,0,0,0,0,0,0,0,0
4205,42,1,8,3,3,1,9,24,0,0,...,0,1,0,0,0,0,0,0,0,0
4206,47,23,17,5,3,1,3,22,0,0,...,0,0,0,0,0,0,0,0,0,0
4207,7,23,17,0,3,1,2,16,0,0,...,0,0,1,0,0,0,0,0,0,0


In [53]:
transform_labels(x_test, x)

In [54]:
x.head()

Unnamed: 0,X0,X1,X2,X3,X4,X5,X6,X8,X10,X11,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,21,23,34,5,3,26,0,22,0,0,...,0,0,0,1,0,0,0,0,0,0
1,42,3,8,0,3,9,6,24,0,0,...,0,0,1,0,0,0,0,0,0,0
2,21,23,17,5,3,0,9,9,0,0,...,0,0,0,1,0,0,0,0,0,0
3,21,13,34,5,3,31,11,13,0,0,...,0,0,0,1,0,0,0,0,0,0
4,45,20,17,2,3,30,8,12,0,0,...,1,0,0,0,0,0,0,0,0,0


Dimensionality Reduction in Test Data

In [55]:
#x_pca_test= perform_pca(x_test,x)
pcamodel.fit(x_test)
x_pca_test= pd.DataFrame(pcamodel.transform(x_test))

In [56]:
x_pca_test.shape

(4209, 120)

In [57]:
xgb_feature_imp=(pd.DataFrame(xgbModel.feature_importances_,index=x_pca.columns))
xgb_feature_imp.sort_values(by=0)#,ascending=False)

Unnamed: 0,0
119,0.000000
50,0.000000
65,0.000000
69,0.000000
26,0.000000
...,...
9,0.028323
7,0.036336
0,0.042882
16,0.055635


In [58]:
x=x_test.drop(columns=['ID'])

transform_labels(x_test,x)

In [59]:
x.head()

Unnamed: 0,X0,X1,X2,X3,X4,X5,X6,X8,X10,X11,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,21,23,34,5,3,26,0,22,0,0,...,0,0,0,1,0,0,0,0,0,0
1,42,3,8,0,3,9,6,24,0,0,...,0,0,1,0,0,0,0,0,0,0
2,21,23,17,5,3,0,9,9,0,0,...,0,0,0,1,0,0,0,0,0,0
3,21,13,34,5,3,31,11,13,0,0,...,0,0,0,1,0,0,0,0,0,0
4,45,20,17,2,3,30,8,12,0,0,...,1,0,0,0,0,0,0,0,0,0


In [61]:
def perform_pca(df,x):
    pcaModel=PCA(n_components=7)
    pcaModel.fit(x.iloc[:,0:9])
    x_pca_c=pd.DataFrame(pcaModel.transform(x.iloc[:,0:9]),columns=['C1','C2','C3','C4','C5','C6','C7'])
    x_pca_c=pd.concat([x_pca_c,df.ID],axis=1)
    x_non_pca=pd.concat([x.iloc[:,9:],df.ID],axis=1)
    x_pca=pd.merge(left=x_pca_c,right=x_non_pca,on='ID',how='inner')
    x_pca.drop(columns=['ID'],inplace=True)
    return x_pca
x_pca_test =perform_pca(x_test,x)

In [62]:
x_pca_test.shape

(4209, 369)

In [64]:
pcamodel.fit(x_pca_test)
x_pca= pd.DataFrame(pcamodel.transform(x_pca_test))

## Predicting Y for test Data

In [66]:
test_df=pd.DataFrame(xgbModel.predict(x_pca),columns=['y'])

In [67]:
test_df['ID']=x_test.ID

In [68]:
test_df.tail(10)

Unnamed: 0,y,ID
4199,103.717232,8401
4200,108.118233,8404
4201,110.575272,8407
4202,108.72094,8408
4203,113.925919,8409
4204,103.225029,8410
4205,106.751686,8411
4206,110.101929,8413
4207,105.502182,8414
4208,92.239494,8416


In [69]:
test_df.to_csv('Mercedes_TestData_Predictions.csv',index=False)