# Problem statement : 
You are required to reduce the time that cars spend on the test bench. Others will work with a dataset representing different permutations of features in a Mercedes-Benz car to predict the time it takes to pass testing. Optimal algorithms will contribute to faster testing, resulting in lower carbon dioxide emissions without reducing Mercedes-Benz’s standards.
Following actions should be performed:
•	If for any column(s), the variance is equal to zero, then you need to remove those variable(s).
•	Check for null and unique values for test and train sets.
•	Apply label encoder.
•	Perform dimensionality reduction.
•	Predict your test_df values using XGBoost.


In [60]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import PCA
from sklearn.metrics import r2_score
import xgboost
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error

In [55]:
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

In [56]:
print(train_df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4209 entries, 0 to 4208
Columns: 378 entries, ID to X385
dtypes: float64(1), int64(369), object(8)
memory usage: 12.1+ MB
None


In [57]:
print(test_df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4209 entries, 0 to 4208
Columns: 377 entries, ID to X385
dtypes: int64(369), object(8)
memory usage: 12.1+ MB
None


### Take variance of numeric columns only

In [5]:
train_df_numeric = train_df.var(numeric_only=True)

In [7]:
train_df_numeric.head()

ID     5.941936e+06
y      1.607667e+02
X10    1.313092e-02
X11    0.000000e+00
X12    6.945713e-02
dtype: float64

In [9]:
cols_to_drop = []
for items in train_df_numeric.iteritems():
    if items[1]==0:
        print(items[1])
        cols_to_drop.append(items[0])

0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0


In [10]:
cols_to_drop

['X11',
 'X93',
 'X107',
 'X233',
 'X235',
 'X268',
 'X289',
 'X290',
 'X293',
 'X297',
 'X330',
 'X347']

### Remove zero variance columns from the test and train dataset

In [13]:

for column in cols_to_drop:
    train_df.drop(column,axis=1, inplace=True)
    test_df.drop(column,axis=1, inplace=True)

In [14]:
print(train_df.shape, test_df.shape)

(4209, 366) (4209, 365)


In [15]:
train_df.head()

Unnamed: 0,ID,y,X0,X1,X2,X3,X4,X5,X6,X8,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,0,130.81,k,v,at,a,d,u,j,o,...,0,0,1,0,0,0,0,0,0,0
1,6,88.53,k,t,av,e,d,y,l,o,...,1,0,0,0,0,0,0,0,0,0
2,7,76.26,az,w,n,c,d,x,j,x,...,0,0,0,0,0,0,1,0,0,0
3,9,80.62,az,t,n,f,d,x,l,e,...,0,0,0,0,0,0,0,0,0,0
4,13,78.02,az,v,n,f,d,h,d,n,...,0,0,0,0,0,0,0,0,0,0


In [18]:
# checking for null and unique values
# ID and y are not needed.
usable_cols = list(set(train_df.columns)- set(['ID','y']))

In [19]:
usable_cols

['X360',
 'X336',
 'X111',
 'X270',
 'X350',
 'X219',
 'X84',
 'X299',
 'X160',
 'X61',
 'X334',
 'X198',
 'X239',
 'X136',
 'X262',
 'X287',
 'X349',
 'X373',
 'X245',
 'X70',
 'X22',
 'X73',
 'X135',
 'X80',
 'X3',
 'X354',
 'X159',
 'X126',
 'X313',
 'X94',
 'X110',
 'X133',
 'X101',
 'X122',
 'X152',
 'X310',
 'X98',
 'X30',
 'X99',
 'X255',
 'X19',
 'X338',
 'X32',
 'X376',
 'X112',
 'X328',
 'X33',
 'X128',
 'X195',
 'X4',
 'X26',
 'X90',
 'X43',
 'X222',
 'X357',
 'X298',
 'X62',
 'X291',
 'X81',
 'X113',
 'X8',
 'X14',
 'X251',
 'X345',
 'X226',
 'X369',
 'X44',
 'X147',
 'X323',
 'X234',
 'X302',
 'X177',
 'X46',
 'X15',
 'X295',
 'X331',
 'X181',
 'X143',
 'X326',
 'X55',
 'X318',
 'X208',
 'X92',
 'X209',
 'X321',
 'X320',
 'X56',
 'X175',
 'X41',
 'X163',
 'X359',
 'X382',
 'X97',
 'X167',
 'X252',
 'X109',
 'X83',
 'X240',
 'X48',
 'X60',
 'X16',
 'X223',
 'X274',
 'X118',
 'X79',
 'X292',
 'X58',
 'X139',
 'X218',
 'X42',
 'X186',
 'X178',
 'X244',
 'X362',
 'X189',
 'X11

In [20]:
for column in usable_cols:
    cardinality = len(np.unique(train_df[column]))
    if cardinality == 1:
        train_df.drop(column, axis=1, inplace=True)
        test_df.drop(column, axis=1, inplace=True)


In [21]:
print(train_df.shape, test_df.shape)

(4209, 366) (4209, 365)


In [22]:
train_df.columns

Index(['ID', 'y', 'X0', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X8',
       ...
       'X375', 'X376', 'X377', 'X378', 'X379', 'X380', 'X382', 'X383', 'X384',
       'X385'],
      dtype='object', length=366)

In [23]:
train_df.isnull().sum()

ID      0
y       0
X0      0
X1      0
X2      0
       ..
X380    0
X382    0
X383    0
X384    0
X385    0
Length: 366, dtype: int64

In [24]:
test_df.isnull().sum()

ID      0
X0      0
X1      0
X2      0
X3      0
       ..
X380    0
X382    0
X383    0
X384    0
X385    0
Length: 365, dtype: int64

In [25]:
def missing_values(df):
    cols_nan_count = {}
    for col in df.columns:
        nan_count = df[col].isnull().sum()
        if nan_count > 0:
            cols_nan_count[col] = nan_count
    if len(cols_nan_count.items()) == 0:
        print("No missing values")
    else:
        print(cols_nan_count)
    return cols_nan_count

In [26]:
missing_values(train_df)

No missing values


{}

In [27]:
missing_values(test_df)

No missing values


{}

## There are no missing values in train and test dataset.

In [29]:
# Label encoding
categorical_cols = []
for c in train_df.columns:
    if train_df[c].dtype == 'object':
        categorical_cols.append(c)
print(categorical_cols)
    

['X0', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X8']


In [32]:
train_df.dtypes

ID        int64
y       float64
X0       object
X1       object
X2       object
         ...   
X380      int64
X382      int64
X383      int64
X384      int64
X385      int64
Length: 366, dtype: object

In [33]:
for c in train_df.columns:
    if train_df[c].dtype == 'object':
        lbl = LabelEncoder()
        lbl.fit(list(train_df[c].values)+ list(test_df[c].values))
        train_df[c] = lbl.transform(list(train_df[c].values))
        test_df[c] = lbl.transform(list(test_df[c].values))


In [34]:
train_df.head()

Unnamed: 0,ID,y,X0,X1,X2,X3,X4,X5,X6,X8,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,0,130.81,37,23,20,0,3,27,9,14,...,0,0,1,0,0,0,0,0,0,0
1,6,88.53,37,21,22,4,3,31,11,14,...,1,0,0,0,0,0,0,0,0,0
2,7,76.26,24,24,38,2,3,30,9,23,...,0,0,0,0,0,0,1,0,0,0
3,9,80.62,24,21,38,5,3,30,11,4,...,0,0,0,0,0,0,0,0,0,0
4,13,78.02,24,23,38,5,3,14,3,13,...,0,0,0,0,0,0,0,0,0,0


In [36]:
test_df.head()

Unnamed: 0,ID,X0,X1,X2,X3,X4,X5,X6,X8,X10,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,1,24,23,38,5,3,26,0,22,0,...,0,0,0,1,0,0,0,0,0,0
1,2,46,3,9,0,3,9,6,24,0,...,0,0,1,0,0,0,0,0,0,0
2,3,24,23,19,5,3,0,9,9,0,...,0,0,0,1,0,0,0,0,0,0
3,4,24,13,38,5,3,32,11,13,0,...,0,0,0,1,0,0,0,0,0,0
4,5,49,20,19,2,3,31,8,12,0,...,1,0,0,0,0,0,0,0,0,0


### Dimensionality Reduction using Pricipal Component Analysis

In [38]:
n_comp = 50
pca = PCA(n_components = n_comp, random_state=230)
pca_train_df = pca.fit_transform(train_df.drop(['y'],axis =1))
pca_test_df = pca.transform(test_df)

for i in range(1, n_comp+1):
    train_df['pca_' + str(i)] = pca_train_df[:,i-1]
    test_df['pca_' + str(i)] = pca_test_df[:,i-1]

In [42]:
print(pca_train_df.shape, pca_test_df.shape)

(4209, 50) (4209, 50)


### Predict values using XGBoost

In [46]:
from xgboost import XGBRegressor

y = train_df['y'].values
y
seed =9
test_size = 0.3
X_train,X_test,y_train,y_test = train_test_split(pca_train_df,y, test_size=test_size,random_state=seed)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(2946, 50) (1263, 50) (2946,) (1263,)


In [47]:
model = XGBRegressor(max_depth = 6, learning_rate = 0.05, n_estimators =500)
model.fit(X_train,y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.05, max_delta_step=0, max_depth=6,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=500, n_jobs=8, num_parallel_tree=1, random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [49]:
y_preds = model.predict(X_test)

In [58]:
np.sqrt(mean_squared_error(y_test,y_preds))

9.083829225213378

In [51]:
mean_absolute_error(y_test,y_preds)  

6.05821543593116

In [61]:
r2_score(y_test,y_preds)

0.4574320257075055

In [52]:
y_preds_test_df = model.predict(pca_test_df)

In [53]:
y_preds_test_df

array([ 79.47638 ,  94.204254,  80.3167  , ...,  94.2734  , 110.32784 ,
        97.04008 ], dtype=float32)

# Insights :
### 1. There are no missing values in train and test dataset.
### 2. PCA reduced the dimension to 50 from 365
### 3. Using XGBoost the RMSE is 9.084 which is small.
### 4. Lower the RMSE value, better is the model.
