# Mercedes-Benz Greener Manufacturing 

### Importing Libraries

In [1]:
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA

import xgboost as xgb
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings('ignore')

### Reading Data from train.csv and Understanding the data set.

In [2]:
df_train = pd.read_csv('train.csv')

In [3]:
df_train.shape

(4209, 378)

In [4]:
df_train.head()

Unnamed: 0,ID,y,X0,X1,X2,X3,X4,X5,X6,X8,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,0,130.81,k,v,at,a,d,u,j,o,...,0,0,1,0,0,0,0,0,0,0
1,6,88.53,k,t,av,e,d,y,l,o,...,1,0,0,0,0,0,0,0,0,0
2,7,76.26,az,w,n,c,d,x,j,x,...,0,0,0,0,0,0,1,0,0,0
3,9,80.62,az,t,n,f,d,x,l,e,...,0,0,0,0,0,0,0,0,0,0
4,13,78.02,az,v,n,f,d,h,d,n,...,0,0,0,0,0,0,0,0,0,0


In [5]:
df_train.describe()

Unnamed: 0,ID,y,X10,X11,X12,X13,X14,X15,X16,X17,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
count,4209.0,4209.0,4209.0,4209.0,4209.0,4209.0,4209.0,4209.0,4209.0,4209.0,...,4209.0,4209.0,4209.0,4209.0,4209.0,4209.0,4209.0,4209.0,4209.0,4209.0
mean,4205.960798,100.669318,0.013305,0.0,0.075077,0.057971,0.42813,0.000475,0.002613,0.007603,...,0.318841,0.057258,0.314802,0.02067,0.009503,0.008078,0.007603,0.001663,0.000475,0.001426
std,2437.608688,12.679381,0.11459,0.0,0.263547,0.233716,0.494867,0.021796,0.051061,0.086872,...,0.466082,0.232363,0.464492,0.142294,0.097033,0.089524,0.086872,0.040752,0.021796,0.037734
min,0.0,72.11,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2095.0,90.82,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,4220.0,99.15,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,6314.0,109.01,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,8417.0,265.32,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [6]:
df_train.columns

Index(['ID', 'y', 'X0', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X8',
       ...
       'X375', 'X376', 'X377', 'X378', 'X379', 'X380', 'X382', 'X383', 'X384',
       'X385'],
      dtype='object', length=378)

### Reading data from test.csv

In [7]:
df_test = pd.read_csv('test.csv')

### Function to check for variance of the columns, and drop those columns from the data

In [8]:
def col_with_zero_var(df):
    var_df=pd.DataFrame(df.var(),columns=['Variance'])
    return(list(var_df[var_df.Variance==0].index))

df_train.drop(columns= col_with_zero_var(df_train))
df_test.drop(columns= col_with_zero_var(df_test))

Unnamed: 0,ID,X0,X1,X2,X3,X4,X5,X6,X8,X10,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,1,az,v,n,f,d,t,a,w,0,...,0,0,0,1,0,0,0,0,0,0
1,2,t,b,ai,a,d,b,g,y,0,...,0,0,1,0,0,0,0,0,0,0
2,3,az,v,as,f,d,a,j,j,0,...,0,0,0,1,0,0,0,0,0,0
3,4,az,l,n,f,d,z,l,n,0,...,0,0,0,1,0,0,0,0,0,0
4,5,w,s,as,c,d,y,i,m,0,...,1,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4204,8410,aj,h,as,f,d,aa,j,e,0,...,0,0,0,0,0,0,0,0,0,0
4205,8411,t,aa,ai,d,d,aa,j,y,0,...,0,1,0,0,0,0,0,0,0,0
4206,8413,y,v,as,f,d,aa,d,w,0,...,0,0,0,0,0,0,0,0,0,0
4207,8414,ak,v,as,a,d,aa,c,q,0,...,0,0,1,0,0,0,0,0,0,0


### Check for null and unique values of the data.

In [9]:
df_train_null = pd.DataFrame(df_train.isna().any(), columns=['IsNull'])
df_train_null[df_train_null.IsNull==True]

Unnamed: 0,IsNull


In [10]:
df_test_null = pd.DataFrame(df_test.isna().any(), columns=['IsNull'])
df_test_null[df_test_null.IsNull==True]

Unnamed: 0,IsNull


We do not have any null value in train and test data set

In [11]:
df_train_ucnt = pd.DataFrame(df_train.nunique(), columns=['uniqueCount'])
df_train_ucnt[df_train_ucnt.uniqueCount > 2]

Unnamed: 0,uniqueCount
ID,4209
y,2545
X0,47
X1,27
X2,44
X3,7
X4,4
X5,29
X6,12
X8,25


In [12]:
df_train_ucnt = pd.DataFrame(df_train.nunique(), columns=['uniqueCount'])
df_train_ucnt[df_train_ucnt.uniqueCount <= 2]

Unnamed: 0,uniqueCount
X10,2
X11,1
X12,2
X13,2
X14,2
...,...
X380,2
X382,2
X383,2
X384,2


This shows that x0-x8 is a categorical data.

In [13]:
df_test_ucnt = pd.DataFrame(df_test.nunique(), columns=['uniqueCount'])
df_test_ucnt[df_test_ucnt.uniqueCount > 2]

Unnamed: 0,uniqueCount
ID,4209
X0,49
X1,27
X2,45
X3,7
X4,4
X5,32
X6,12
X8,25


In [14]:
df_test_ucnt = pd.DataFrame(df_test.nunique(), columns=['uniqueCount'])
df_test_ucnt[df_test_ucnt.uniqueCount <= 2]

Unnamed: 0,uniqueCount
X10,2
X11,2
X12,2
X13,2
X14,2
...,...
X380,2
X382,2
X383,2
X384,2


### Converting categorical data into numerical by using Label Encoder

In [15]:
from sklearn.preprocessing import LabelEncoder

labelEncoder = LabelEncoder()

df_train_num = df_train.apply(LabelEncoder().fit_transform)

In [16]:
df_train_num

Unnamed: 0,ID,y,X0,X1,X2,X3,X4,X5,X6,X8,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,0,2466,32,23,17,0,3,24,9,14,...,0,0,1,0,0,0,0,0,0,0
1,1,366,32,21,19,4,3,28,11,14,...,1,0,0,0,0,0,0,0,0,0
2,2,69,20,24,34,2,3,27,9,23,...,0,0,0,0,0,0,1,0,0,0
3,3,133,20,21,34,5,3,27,11,4,...,0,0,0,0,0,0,0,0,0,0
4,4,106,20,23,34,5,3,12,3,13,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4204,4204,1657,8,20,16,2,3,0,3,16,...,1,0,0,0,0,0,0,0,0,0
4205,4205,1766,31,16,40,3,3,0,7,7,...,0,1,0,0,0,0,0,0,0,0
4206,4206,1801,8,23,38,0,3,0,6,4,...,0,0,1,0,0,0,0,0,0,0
4207,4207,280,9,19,25,5,3,0,11,20,...,0,0,0,0,0,0,0,0,0,0


In [17]:
df_test_num = df_test.apply(LabelEncoder().fit_transform)
df_test_num

Unnamed: 0,ID,X0,X1,X2,X3,X4,X5,X6,X8,X10,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,0,21,23,34,5,3,26,0,22,0,...,0,0,0,1,0,0,0,0,0,0
1,1,42,3,8,0,3,9,6,24,0,...,0,0,1,0,0,0,0,0,0,0
2,2,21,23,17,5,3,0,9,9,0,...,0,0,0,1,0,0,0,0,0,0
3,3,21,13,34,5,3,31,11,13,0,...,0,0,0,1,0,0,0,0,0,0
4,4,45,20,17,2,3,30,8,12,0,...,1,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4204,4204,6,9,17,5,3,1,9,4,0,...,0,0,0,0,0,0,0,0,0,0
4205,4205,42,1,8,3,3,1,9,24,0,...,0,1,0,0,0,0,0,0,0,0
4206,4206,47,23,17,5,3,1,3,22,0,...,0,0,0,0,0,0,0,0,0,0
4207,4207,7,23,17,0,3,1,2,16,0,...,0,0,1,0,0,0,0,0,0,0


### Removing column column y from train data set.

In [18]:
x_train = df_train_num.drop(columns=['y', 'ID'])

In [19]:
y_train = df_train_num['y']
ID_test = df_train_num['ID']

In [20]:
x_train.head()

Unnamed: 0,X0,X1,X2,X3,X4,X5,X6,X8,X10,X11,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,32,23,17,0,3,24,9,14,0,0,...,0,0,1,0,0,0,0,0,0,0
1,32,21,19,4,3,28,11,14,0,0,...,1,0,0,0,0,0,0,0,0,0
2,20,24,34,2,3,27,9,23,0,0,...,0,0,0,0,0,0,1,0,0,0
3,20,21,34,5,3,27,11,4,0,0,...,0,0,0,0,0,0,0,0,0,0
4,20,23,34,5,3,12,3,13,0,0,...,0,0,0,0,0,0,0,0,0,0


### Dimentionality Reduction

In [21]:
pcamodel = PCA(n_components=12, random_state=420)

pca_train = pcamodel.fit_transform(x_train)
pca_test = pcamodel.fit_transform(df_test_num)

In [22]:
pca_train

array([[  0.6147646 ,  -0.13300945,  15.62446002, ...,   1.73755723,
          0.28952525,   0.35795542],
       [  0.56540665,   1.56033294,  17.9095812 , ...,  -0.13663602,
          0.76267149,  -0.36542499],
       [ 16.20171258,  12.29284626,  17.6335395 , ...,  -0.48523065,
         -1.03728306,   3.90826442],
       ...,
       [ 29.00466039,  14.86090532,  -7.75333217, ...,  -1.09563443,
          1.40196336,  -0.35867854],
       [ 22.97242171,   1.68482437,  -9.03124768, ...,   0.25497014,
          1.27432413,  -1.10542359],
       [-17.28304831,  -9.95198181,  -3.71935977, ...,   0.28689192,
          0.43212397,  -0.71584563]])

### Using XGBoost

In [23]:
X_train, X_valid, y_train, y_valid = train_test_split(pca_train, y_train, test_size = 0.2, random_state = 10)

In [24]:
d_train = xgb.DMatrix(X_train, label= y_train)
d_valid = xgb.DMatrix(X_valid, label= y_valid)

d_test = xgb.DMatrix(pca_test)

In [25]:
params = {}
params['objective'] = 'reg:linear'
params['eta'] = 0.02
params['max_depth'] = 4

def xgb_r2_score(preds, dtrain):
    labels = dtrain.get_label()
    return 'r2', r2_score(labels, preds)

watchlist = [(d_train, 'train'), (d_valid, 'valid')]

clf = xgb.train(params, d_train, 
                1000, watchlist, early_stopping_rounds=50, 
                feval=xgb_r2_score, maximize=True, verbose_eval=10)

[0]	train-rmse:1365.08850	train-r2:-2.85716	valid-rmse:1354.98889	valid-r2:-2.85815
[10]	train-rmse:1158.37231	train-r2:-1.77743	valid-rmse:1150.99939	valid-r2:-1.78393
[20]	train-rmse:995.52527	train-r2:-1.05140	valid-rmse:991.01495	valid-r2:-1.06380
[30]	train-rmse:868.46863	train-r2:-0.56119	valid-rmse:866.21051	valid-r2:-0.57672
[40]	train-rmse:770.51562	train-r2:-0.22888	valid-rmse:770.26709	valid-r2:-0.24678
[50]	train-rmse:695.12244	train-r2:-0.00016	valid-rmse:697.55078	valid-r2:-0.02249
[60]	train-rmse:637.62372	train-r2:0.15846	valid-rmse:643.37897	valid-r2:0.13016
[70]	train-rmse:595.02747	train-r2:0.26714	valid-rmse:603.21924	valid-r2:0.23536
[80]	train-rmse:563.73926	train-r2:0.34219	valid-rmse:573.80145	valid-r2:0.30812
[90]	train-rmse:539.98920	train-r2:0.39645	valid-rmse:551.88910	valid-r2:0.35995
[100]	train-rmse:521.93286	train-r2:0.43613	valid-rmse:535.73376	valid-r2:0.39688
[110]	train-rmse:508.09109	train-r2:0.46565	valid-rmse:523.75671	valid-r2:0.42354
[120]	train

### Predicting Values

In [26]:
p_test = clf.predict(d_test)

sub = pd.DataFrame()
sub['ID'] = ID_test
sub['y'] = p_test
sub.to_csv('test_output.csv', index=False)

sub.head()

Unnamed: 0,ID,y
0,0,1515.096802
1,1,1008.5354
2,2,1197.741089
3,3,1344.310425
4,4,1986.85022


In [27]:
sub

Unnamed: 0,ID,y
0,0,1515.096802
1,1,1008.535400
2,2,1197.741089
3,3,1344.310425
4,4,1986.850220
...,...,...
4204,4204,1838.143433
4205,4205,1777.755737
4206,4206,981.696594
4207,4207,462.775238
