### Read the dataset `vehicle.csv` and drop NaNs if any

In [91]:
# To enable plotting graphs in Jupyter notebook
%matplotlib inline 
import warnings
warnings.filterwarnings('ignore')

In [92]:
# Numerical libraries
import numpy as np   

# Import Linear Regression machine learning library
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
# to handle data in form of rows and columns 
import pandas as pd    

# importing ploting libraries
import matplotlib.pyplot as plt   
import matplotlib.style
plt.style.use('classic')

#importing seaborn for statistical plots
import seaborn as sns

In [93]:
# reading the CSV file into pandas dataframe
veh_df = pd.read_csv("vehicle.csv").dropna()

In [94]:
veh_df.head()

Unnamed: 0,compactness,circularity,distance_circularity,radius_ratio,pr.axis_aspect_ratio,max.length_aspect_ratio,scatter_ratio,elongatedness,pr.axis_rectangularity,max.length_rectangularity,scaled_variance,scaled_variance.1,scaled_radius_of_gyration,scaled_radius_of_gyration.1,skewness_about,skewness_about.1,skewness_about.2,hollows_ratio,class
0,95,48.0,83.0,178.0,72.0,10,162.0,42.0,20.0,159,176.0,379.0,184.0,70.0,6.0,16.0,187.0,197,van
1,91,41.0,84.0,141.0,57.0,9,149.0,45.0,19.0,143,170.0,330.0,158.0,72.0,9.0,14.0,189.0,199,van
2,104,50.0,106.0,209.0,66.0,10,207.0,32.0,23.0,158,223.0,635.0,220.0,73.0,14.0,9.0,188.0,196,car
3,93,41.0,82.0,159.0,63.0,9,144.0,46.0,19.0,143,160.0,309.0,127.0,63.0,6.0,10.0,199.0,207,van
4,85,44.0,70.0,205.0,103.0,52,149.0,45.0,19.0,144,241.0,325.0,188.0,127.0,9.0,11.0,180.0,183,bus


In [95]:
veh_df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
compactness,813.0,93.656827,8.233751,73.0,87.0,93.0,100.0,119.0
circularity,813.0,44.803198,6.146659,33.0,40.0,44.0,49.0,59.0
distance_circularity,813.0,82.04305,15.78307,40.0,70.0,79.0,98.0,112.0
radius_ratio,813.0,169.098401,33.615402,104.0,141.0,167.0,195.0,333.0
pr.axis_aspect_ratio,813.0,61.774908,7.973,47.0,57.0,61.0,65.0,138.0
max.length_aspect_ratio,813.0,8.599016,4.677174,2.0,7.0,8.0,10.0,55.0
scatter_ratio,813.0,168.563346,33.082186,112.0,146.0,157.0,198.0,265.0
elongatedness,813.0,40.98893,7.80338,26.0,33.0,43.0,46.0,61.0
pr.axis_rectangularity,813.0,20.558426,2.573184,17.0,19.0,20.0,23.0,29.0
max.length_rectangularity,813.0,147.891759,14.504648,118.0,137.0,146.0,159.0,188.0


### LabelEncode the `class` column

In [96]:
from sklearn.preprocessing import LabelEncoder
labelencoder = LabelEncoder()
veh_df['classid'] = labelencoder.fit_transform(veh_df['class'])
veh_df['classid'].head(5)

0    2
1    2
2    1
3    2
4    0
Name: classid, dtype: int64

In [97]:
veh_df = veh_df.drop('class', axis=1)

In [98]:
veh_df.head()

Unnamed: 0,compactness,circularity,distance_circularity,radius_ratio,pr.axis_aspect_ratio,max.length_aspect_ratio,scatter_ratio,elongatedness,pr.axis_rectangularity,max.length_rectangularity,scaled_variance,scaled_variance.1,scaled_radius_of_gyration,scaled_radius_of_gyration.1,skewness_about,skewness_about.1,skewness_about.2,hollows_ratio,classid
0,95,48.0,83.0,178.0,72.0,10,162.0,42.0,20.0,159,176.0,379.0,184.0,70.0,6.0,16.0,187.0,197,2
1,91,41.0,84.0,141.0,57.0,9,149.0,45.0,19.0,143,170.0,330.0,158.0,72.0,9.0,14.0,189.0,199,2
2,104,50.0,106.0,209.0,66.0,10,207.0,32.0,23.0,158,223.0,635.0,220.0,73.0,14.0,9.0,188.0,196,1
3,93,41.0,82.0,159.0,63.0,9,144.0,46.0,19.0,143,160.0,309.0,127.0,63.0,6.0,10.0,199.0,207,2
4,85,44.0,70.0,205.0,103.0,52,149.0,45.0,19.0,144,241.0,325.0,188.0,127.0,9.0,11.0,180.0,183,0


### Divide the feature set and target set

In [99]:
# Copy all the predictor variables into X dataframe. Since 'mpg' is dependent variable drop it
X = veh_df.drop(['classid'], axis=1)

# Copy the 'mpg' column alone into the y dataframe. This is the dependent variable
Y = veh_df[['classid']]

In [100]:
X.head()

Unnamed: 0,compactness,circularity,distance_circularity,radius_ratio,pr.axis_aspect_ratio,max.length_aspect_ratio,scatter_ratio,elongatedness,pr.axis_rectangularity,max.length_rectangularity,scaled_variance,scaled_variance.1,scaled_radius_of_gyration,scaled_radius_of_gyration.1,skewness_about,skewness_about.1,skewness_about.2,hollows_ratio
0,95,48.0,83.0,178.0,72.0,10,162.0,42.0,20.0,159,176.0,379.0,184.0,70.0,6.0,16.0,187.0,197
1,91,41.0,84.0,141.0,57.0,9,149.0,45.0,19.0,143,170.0,330.0,158.0,72.0,9.0,14.0,189.0,199
2,104,50.0,106.0,209.0,66.0,10,207.0,32.0,23.0,158,223.0,635.0,220.0,73.0,14.0,9.0,188.0,196
3,93,41.0,82.0,159.0,63.0,9,144.0,46.0,19.0,143,160.0,309.0,127.0,63.0,6.0,10.0,199.0,207
4,85,44.0,70.0,205.0,103.0,52,149.0,45.0,19.0,144,241.0,325.0,188.0,127.0,9.0,11.0,180.0,183


In [101]:
Y.head()

Unnamed: 0,classid
0,2
1,2
2,1
3,2
4,0


### Split into train and test data  70:30 

In [102]:
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.30, random_state=1)

### Get the coefficients for each feature using Linear, Ridge and Lasso regression

In [103]:
regression_model = LinearRegression()
regression_model.fit(X_train, Y_train)
for idx, col_name in enumerate(X_train.columns):
    print("The coefficient for {} is {}".format(col_name, regression_model.coef_[0][idx]))

The coefficient for compactness is 0.02015486101899201
The coefficient for circularity is -0.136107554303993
The coefficient for distance_circularity is 0.035024709916920124
The coefficient for radius_ratio is 0.018002820582637677
The coefficient for pr.axis_aspect_ratio is -0.057633158320484545
The coefficient for max.length_aspect_ratio is 0.014749171924620155
The coefficient for scatter_ratio is -0.06177967989389903
The coefficient for elongatedness is 0.08391282100510365
The coefficient for pr.axis_rectangularity is -0.005045368763027413
The coefficient for max.length_rectangularity is 0.06389203229929888
The coefficient for scaled_variance is 0.003353301629332047
The coefficient for scaled_variance.1 is 0.007401936727248855
The coefficient for scaled_radius_of_gyration is -0.0004010938444042226
The coefficient for scaled_radius_of_gyration.1 is 0.01052215013908726
The coefficient for skewness_about is 0.006912615376116697
The coefficient for skewness_about.1 is -0.0102878366367285

In [104]:
# To scale the dimensions we need scale function which is part of sckikit preprocessing libraries
from sklearn import preprocessing
# scale all the columns of the mpg_df. This will produce a numpy array
veh_df_scaled = preprocessing.scale(veh_df)

In [105]:
veh_df_scaled = pd.DataFrame(veh_df_scaled, columns=veh_df.columns)

In [106]:
# Copy all the predictor variables into X dataframe. Since 'mpg' is dependent variable drop it
X_scaled = veh_df_scaled.drop(['classid'], axis=1)

# Copy the 'mpg' column alone into the y dataframe. This is the dependent variable
Y_scaled = veh_df['classid']

In [107]:
X_scaled_train, X_scaled_test, Y_scaled_train, Y_scaled_test = train_test_split(X_scaled, Y_scaled, test_size=0.30, random_state=1)

In [108]:
ridge = Ridge(alpha=.5)
ridge.fit(X_scaled_train, Y_scaled_train) #Always use scaled data for ridge
for idx, col_name in enumerate(X_scaled_train.columns):
    print("The coefficient for {} is {}".format(col_name, regression_model.coef_[0][idx]))

The coefficient for compactness is 0.02015486101899201
The coefficient for circularity is -0.136107554303993
The coefficient for distance_circularity is 0.035024709916920124
The coefficient for radius_ratio is 0.018002820582637677
The coefficient for pr.axis_aspect_ratio is -0.057633158320484545
The coefficient for max.length_aspect_ratio is 0.014749171924620155
The coefficient for scatter_ratio is -0.06177967989389903
The coefficient for elongatedness is 0.08391282100510365
The coefficient for pr.axis_rectangularity is -0.005045368763027413
The coefficient for max.length_rectangularity is 0.06389203229929888
The coefficient for scaled_variance is 0.003353301629332047
The coefficient for scaled_variance.1 is 0.007401936727248855
The coefficient for scaled_radius_of_gyration is -0.0004010938444042226
The coefficient for scaled_radius_of_gyration.1 is 0.01052215013908726
The coefficient for skewness_about is 0.006912615376116697
The coefficient for skewness_about.1 is -0.0102878366367285

In [109]:
lasso = Lasso(alpha=.01)
lasso.fit(X_scaled_train, Y_scaled_train)
for idx, col_name in enumerate(X_scaled_train.columns):
    print("The coefficient for {} is {}".format(col_name, regression_model.coef_[0][idx]))

The coefficient for compactness is 0.02015486101899201
The coefficient for circularity is -0.136107554303993
The coefficient for distance_circularity is 0.035024709916920124
The coefficient for radius_ratio is 0.018002820582637677
The coefficient for pr.axis_aspect_ratio is -0.057633158320484545
The coefficient for max.length_aspect_ratio is 0.014749171924620155
The coefficient for scatter_ratio is -0.06177967989389903
The coefficient for elongatedness is 0.08391282100510365
The coefficient for pr.axis_rectangularity is -0.005045368763027413
The coefficient for max.length_rectangularity is 0.06389203229929888
The coefficient for scaled_variance is 0.003353301629332047
The coefficient for scaled_variance.1 is 0.007401936727248855
The coefficient for scaled_radius_of_gyration is -0.0004010938444042226
The coefficient for scaled_radius_of_gyration.1 is 0.01052215013908726
The coefficient for skewness_about is 0.006912615376116697
The coefficient for skewness_about.1 is -0.0102878366367285

### Get the accuracy scores for train and test data for the above 3 methods

In [110]:
print("Regression train scores:", regression_model.score(X_train, Y_train))
print("Regression test scores:", regression_model.score(X_test, Y_test))

Regression train scores: 0.6978276964890503
Regression test scores: 0.6242685601012292


In [111]:
print("Ridge train scores:", ridge.score(X_scaled_train, Y_scaled_train))
print("Ridge test scores:", ridge.score(X_scaled_test, Y_scaled_test))

Ridge train scores: 0.6957505082124492
Ridge test scores: 0.6334123366535915


In [112]:
print("Lasso train scores:", lasso.score(X_scaled_train, Y_scaled_train))
print("Lasso test scores:", lasso.score(X_scaled_test, Y_scaled_test))

Lasso train scores: 0.651060390694004
Lasso test scores: 0.6198817945476454


### Ensemble methods

### Now, get train and test scores using `BaggingClassifier`, `GradientBoostingClassifier` and `RandomForestClassifier` and mention the highest accuracy among all

In [113]:
from sklearn.ensemble import BaggingClassifier
bgcl = BaggingClassifier(n_estimators=100, max_samples=.50 , oob_score=True)
bgcl = bgcl.fit(X_train, Y_train)
print("Bagging Classifier Training Accuracy:", bgcl.oob_score_)

Bagging Classifier Training Accuracy: 0.9226713532513181


In [114]:
bgcl = bgcl.fit(X_test, Y_test)
print("Bagging Classifier Test Accuracy:", bgcl.oob_score_)

Bagging Classifier Test Accuracy: 0.9180327868852459


In [115]:
from sklearn.ensemble import GradientBoostingClassifier
gbcl = GradientBoostingClassifier(n_estimators = 50, learning_rate = 0.09, max_depth=5)
gbcl = gbcl.fit(X_train, Y_train)

print("Gradient Boosting Training Accuracy:", gbcl.score(X_train , Y_train))
print("Gradient Boosting Test Accuracy:", gbcl.score(X_test , Y_test))

Gradient Boosting Training Accuracy: 1.0
Gradient Boosting Test Accuracy: 0.9631147540983607


In [116]:
from sklearn.ensemble import RandomForestClassifier
rfcl = RandomForestClassifier(n_estimators = 6)
rfcl = rfcl.fit(X_train, Y_train)
print("Random Forest Training Accuracy:", rfcl.score(X_train , Y_train))
print("Random Forest Test Accuracy:", rfcl.score(X_test , Y_test))

Random Forest Training Accuracy: 0.9876977152899824
Random Forest Test Accuracy: 0.9016393442622951


# Both Random Forest and Gradient Boosting give the same accuracy which is highest 

In [117]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score

In [118]:
rfcl = RandomForestClassifier(random_state=1)
bgcl = BaggingClassifier( n_estimators=10)
gbcl = GradientBoostingClassifier(n_estimators = 50, learning_rate = 0.09, max_depth=5)

In [119]:
stack = VotingClassifier(estimators = [('Bagging',bgcl),('RandomForest',rfcl), ('gbcl', gbcl)], voting = 'hard')

In [120]:
for clf, label in zip([rfcl, bgcl, gbcl], ['RandomForest', 'BaggingClassifier', 'GradientBoosting']):
    scores = cross_val_score(clf, X_train, Y_train, cv=5, scoring='accuracy')
    print("Train Accuracy: %0.02f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), label ))

Train Accuracy: 0.91 (+/- 0.02) [RandomForest]
Train Accuracy: 0.91 (+/- 0.03) [BaggingClassifier]
Train Accuracy: 0.92 (+/- 0.02) [GradientBoosting]


In [121]:
for clf, label in zip([rfcl, bgcl, gbcl], ['RandomForest', 'BaggingClassifier', 'GradientBoosting']):
    scores = cross_val_score(clf, X_test, Y_test, cv=5, scoring='accuracy')
    print("Test Accuracy: %0.02f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), label ))


Test Accuracy: 0.90 (+/- 0.04) [RandomForest]
Test Accuracy: 0.89 (+/- 0.04) [BaggingClassifier]
Test Accuracy: 0.89 (+/- 0.01) [GradientBoosting]
