# Mercedes-Benz Greener Manufacturing

In [1]:
# Importing the required libraries
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA

In [2]:
# Importing the data

train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [3]:
train.head()

Unnamed: 0,ID,y,X0,X1,X2,X3,X4,X5,X6,X8,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,0,130.81,k,v,at,a,d,u,j,o,...,0,0,1,0,0,0,0,0,0,0
1,6,88.53,k,t,av,e,d,y,l,o,...,1,0,0,0,0,0,0,0,0,0
2,7,76.26,az,w,n,c,d,x,j,x,...,0,0,0,0,0,0,1,0,0,0
3,9,80.62,az,t,n,f,d,x,l,e,...,0,0,0,0,0,0,0,0,0,0
4,13,78.02,az,v,n,f,d,h,d,n,...,0,0,0,0,0,0,0,0,0,0


In [4]:
test.head()

Unnamed: 0,ID,X0,X1,X2,X3,X4,X5,X6,X8,X10,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,1,az,v,n,f,d,t,a,w,0,...,0,0,0,1,0,0,0,0,0,0
1,2,t,b,ai,a,d,b,g,y,0,...,0,0,1,0,0,0,0,0,0,0
2,3,az,v,as,f,d,a,j,j,0,...,0,0,0,1,0,0,0,0,0,0
3,4,az,l,n,f,d,z,l,n,0,...,0,0,0,1,0,0,0,0,0,0
4,5,w,s,as,c,d,y,i,m,0,...,1,0,0,0,0,0,0,0,0,0


In [5]:
print('Size of training set: {} rows and {} columns'.format(*train.shape))
print('Size of testing set: {} rows and {} columns'.format(*test.shape))

Size of training set: 4209 rows and 378 columns
Size of testing set: 4209 rows and 377 columns


In [6]:
# Collect the Y values into an array
y_train = train['y'].values

In [7]:
y_train

array([130.81,  88.53,  76.26, ..., 109.22,  87.48, 110.85])

In [8]:
# Understand the data types 
cols = [c for c in train.columns if 'X' in c]
print('Number of features: {}'.format(len(cols)))
print('Feature types:')
train[cols].dtypes.value_counts()

Number of features: 376
Feature types:


int64     368
object      8
Name: count, dtype: int64

In [9]:
# Count the data in each of the columns

counts = [[], [], []]
for c in cols:
    typ = train[c].dtype
    uniq = len(np.unique(train[c]))
    if uniq == 1:
        counts[0].append(c)
    elif uniq == 2 and typ == np.int64:
        counts[1].append(c)
    else:
        counts[2].append(c)
print('Constant features: {} Binary features: {} Categorical features: {}\n'
 .format(*[len(c) for c in counts]))
print('Constant features:', counts[0])
print('Categorical features:', counts[2])

Constant features: 12 Binary features: 356 Categorical features: 8

Constant features: ['X11', 'X93', 'X107', 'X233', 'X235', 'X268', 'X289', 'X290', 'X293', 'X297', 'X330', 'X347']
Categorical features: ['X0', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X8']


In [10]:
# Splitting the data
usable_columns = list(set(train.columns) - set(['ID', 'y']))
y_train = train['y'].values
id_test = test['ID'].values
x_train = train[usable_columns]
x_test = test[usable_columns]

#### Check for null values

In [11]:
def check_missing_values(df):
    if df.isnull().any().any():
        print('There are missing values in the dataframe')
    else:
        print('There are no missing values in the dataframe')

In [12]:
check_missing_values(x_train)
check_missing_values(x_test)

There are no missing values in the dataframe
There are no missing values in the dataframe
There are no missing values in the dataframe


#### Label Encoding

In [13]:
for column in usable_columns:
    cardinality = len(np.unique(x_train[column]))
    if cardinality == 1:
        x_train.drop(column, axis=1) # Column with only one 
        # value is useless so we drop it
        x_test.drop(column, axis=1)
    if cardinality > 2: # Column is categorical
        mapper = lambda x: sum([ord(digit) for digit in x])
        x_train[column] = x_train[column].apply(mapper)
        x_test[column] = x_test[column].apply(mapper)
x_train.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x_train[column] = x_train[column].apply(mapper)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x_test[column] = x_test[column].apply(mapper)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x_train[column] = x_train[column].apply(mapper)
A value is trying to be set on a copy of a slice from a DataFra

Unnamed: 0,X213,X93,X197,X108,X328,X215,X216,X302,X336,X112,...,X8,X59,X64,X96,X183,X29,X161,X297,X106,X84
0,0,0,0,0,0,0,0,0,0,0,...,111,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,1,0,...,111,0,0,1,0,0,0,0,0,0
2,0,0,0,0,1,0,0,0,0,0,...,120,0,0,1,0,1,0,0,0,1
3,0,0,0,1,1,0,0,0,0,0,...,101,0,0,1,0,1,0,0,0,1
4,0,0,0,1,1,0,0,0,0,0,...,110,0,0,1,0,1,0,0,0,0


In [14]:
# Make sure the data is changed into numerical values

print('Feature types:')
x_train[cols].dtypes.value_counts()

Feature types:


int64    376
Name: count, dtype: int64

### Use PCA

In [15]:
n_comp = 12
pca = PCA(n_components = n_comp,random_state = 420)
pca2_results_train = pca.fit_transform(x_train)
pca2_results_test = pca.transform(x_test)

### XGBoost (Train and Predict)

In [16]:
# Training using XGBoost
import xgboost as xgb
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split

In [17]:
x_train,x_val,y_train,y_val = train_test_split(pca2_results_train, y_train, test_size=0.2, random_state=4242)

In [18]:
d_train = xgb.DMatrix(x_train,label = y_train)
d_val = xgb.DMatrix(x_val,label = y_val)

# dtest = xgb.DMatrix(x_test)

d_test = xgb.DMatrix(pca2_results_test)

In [19]:
params = {}
params['objective'] = 'reg:linear'
params['eta'] = 0.02
params['max_depth'] = 4

def xgb_r2_score(preds, dtrain):
    labels = dtrain.get_label()
    return 'r2', r2_score(labels, preds)
watchlist = [(d_train, 'train'), (d_val, 'valid')]
clf = xgb.train(params, d_train,  1000, watchlist, early_stopping_rounds=50, 
 feval=xgb_r2_score, maximize=True, verbose_eval=10)

[0]	train-rmse:12.78419	train-r2:0.01323	valid-rmse:11.78153	valid-r2:0.01330


[10]	train-rmse:12.06994	train-r2:0.12041	valid-rmse:10.91819	valid-r2:0.15261
[20]	train-rmse:11.54918	train-r2:0.19467	valid-rmse:10.30539	valid-r2:0.24506
[30]	train-rmse:11.15051	train-r2:0.24931	valid-rmse:9.85652	valid-r2:0.30940




[40]	train-rmse:10.85067	train-r2:0.28914	valid-rmse:9.54281	valid-r2:0.35266
[50]	train-rmse:10.61444	train-r2:0.31976	valid-rmse:9.31125	valid-r2:0.38369
[60]	train-rmse:10.42326	train-r2:0.34404	valid-rmse:9.12620	valid-r2:0.40795
[70]	train-rmse:10.17545	train-r2:0.37486	valid-rmse:8.94663	valid-r2:0.43101
[80]	train-rmse:9.98947	train-r2:0.39750	valid-rmse:8.80772	valid-r2:0.44855
[90]	train-rmse:9.81691	train-r2:0.41814	valid-rmse:8.69924	valid-r2:0.46205
[100]	train-rmse:9.67192	train-r2:0.43520	valid-rmse:8.61812	valid-r2:0.47203
[110]	train-rmse:9.55429	train-r2:0.44885	valid-rmse:8.55682	valid-r2:0.47952
[120]	train-rmse:9.45304	train-r2:0.46047	valid-rmse:8.50657	valid-r2:0.48561
[130]	train-rmse:9.35199	train-r2:0.47195	valid-rmse:8.47482	valid-r2:0.48945
[140]	train-rmse:9.26891	train-r2:0.48129	valid-rmse:8.44559	valid-r2:0.49296
[150]	train-rmse:9.19067	train-r2:0.49001	valid-rmse:8.42276	valid-r2:0.49570
[160]	train-rmse:9.09980	train-r2:0.50004	valid-rmse:8.41240	valid

#### Predict using XGBoost

In [20]:
p_test = clf.predict(d_test)

In [21]:
sub = pd.DataFrame()
sub['ID'] = id_test
sub['y'] = p_test
sub.to_csv('test_df.csv', index = False)
sub.head()

Unnamed: 0,ID,y
0,1,82.535706
1,2,94.85804
2,3,81.812897
3,4,76.758972
4,5,113.576324
