# Decision Tree & Ensemble Modelling.

In [1]:
import pandas as pd
import numpy as np

### Creating The New Data Frame

In [2]:
cars_data = pd.read_csv(r"D:\data science\Python - Anaconda\Decision Tree & Ensamble modeling\cars.csv",
                       header = None)

cars_data.head()

Unnamed: 0,0,1,2,3,4,5,6
0,vhigh,vhigh,2,2,small,low,unacc
1,vhigh,vhigh,2,2,small,med,unacc
2,vhigh,vhigh,2,2,small,high,unacc
3,vhigh,vhigh,2,2,med,low,unacc
4,vhigh,vhigh,2,2,med,med,unacc


In [3]:
cars_data.shape
cars_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1728 entries, 0 to 1727
Data columns (total 7 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   0       1728 non-null   object
 1   1       1728 non-null   object
 2   2       1728 non-null   object
 3   3       1728 non-null   object
 4   4       1728 non-null   object
 5   5       1728 non-null   object
 6   6       1728 non-null   object
dtypes: object(7)
memory usage: 94.6+ KB


In [4]:
cars_data.columns = ["buying","maint","doors","persons","lug_boot","safety","classes"]

In [5]:
cars_data.head()

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,classes
0,vhigh,vhigh,2,2,small,low,unacc
1,vhigh,vhigh,2,2,small,med,unacc
2,vhigh,vhigh,2,2,small,high,unacc
3,vhigh,vhigh,2,2,med,low,unacc
4,vhigh,vhigh,2,2,med,med,unacc


### Feature Selection.

In [6]:
# All the variables in the data are important for the classification.
# Thus we have not removed any of the varibles.

### Handling the Missing Values.

In [7]:
cars_data.isnull().sum()

buying      0
maint       0
doors       0
persons     0
lug_boot    0
safety      0
classes     0
dtype: int64

In [8]:
# Thus the data frame does not have any missing values.

### Handling the outliers.

In [9]:
# Decision Tree is robust to the outliers.
# Thus in this algorithm we do not have to handle the outliers. 

### Converting categorical variable into numerical variable.

#### Making the second copy of the data type.

In [10]:
cars_df = pd.DataFrame.copy(cars_data)

In [11]:
colname = cars_df.columns
colname

Index(['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety', 'classes'], dtype='object')

In [12]:
#from sklearn.preprocessing import LabelEncoder

#le = LabelEncoder()

#for i in colname:
#    cars_df[i] = le.fit_transform(cars_df[i])

In [13]:
#cars_df.head()

#### Now to understand what kind of encoding is done for what class we will use following code

In [14]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

for i in colname:
    cars_df[i] = le.fit_transform(cars_df[i])
    le_name_mapping = list(zip(le.classes_, le.transform(le.classes_)))
    print("Feature", i)
    print("Mapping", le_name_mapping)

Feature buying
Mapping [('high', 0), ('low', 1), ('med', 2), ('vhigh', 3)]
Feature maint
Mapping [('high', 0), ('low', 1), ('med', 2), ('vhigh', 3)]
Feature doors
Mapping [('2', 0), ('3', 1), ('4', 2), ('5more', 3)]
Feature persons
Mapping [('2', 0), ('4', 1), ('more', 2)]
Feature lug_boot
Mapping [('big', 0), ('med', 1), ('small', 2)]
Feature safety
Mapping [('high', 0), ('low', 1), ('med', 2)]
Feature classes
Mapping [('acc', 0), ('good', 1), ('unacc', 2), ('vgood', 3)]


In [15]:
cars_df

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,classes
0,3,3,0,0,2,1,2
1,3,3,0,0,2,2,2
2,3,3,0,0,2,0,2
3,3,3,0,0,1,1,2
4,3,3,0,0,1,2,2
...,...,...,...,...,...,...,...
1723,1,1,3,2,1,2,1
1724,1,1,3,2,1,0,3
1725,1,1,3,2,0,1,2
1726,1,1,3,2,0,2,1


In [16]:
cars_df.classes.value_counts()

2    1210
0     384
1      69
3      65
Name: classes, dtype: int64

In [17]:
# For Dependent variable we need to know the encoding.

# Thus for Classes
# acc --> "Accurate" --> 0       --> 384
# good --> 1                     --> 69
# unacc --> unaccurate --> 2     --> 1210
# vgood --> Very Good --> 3      --> 65

### Create X & Y

In [18]:
X = cars_df.values[:,:-1]
Y = cars_df.values[:,-1]
Y = Y.astype(int)

In [19]:
# Now to check if X and Y have been created properly or not.

In [20]:
print(X)

[[3 3 0 0 2 1]
 [3 3 0 0 2 2]
 [3 3 0 0 2 0]
 ...
 [1 1 3 2 0 1]
 [1 1 3 2 0 2]
 [1 1 3 2 0 0]]


In [21]:
print(Y)

[2 2 2 ... 2 1 3]


### Scaling the data (X)

In [22]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

scaler.fit(X)

X = scaler.transform(X)

In [23]:
# To check if data has been Scaled or not.
print(X)

[[ 1.34164079  1.34164079 -1.34164079 -1.22474487  1.22474487  0.        ]
 [ 1.34164079  1.34164079 -1.34164079 -1.22474487  1.22474487  1.22474487]
 [ 1.34164079  1.34164079 -1.34164079 -1.22474487  1.22474487 -1.22474487]
 ...
 [-0.4472136  -0.4472136   1.34164079  1.22474487 -1.22474487  0.        ]
 [-0.4472136  -0.4472136   1.34164079  1.22474487 -1.22474487  1.22474487]
 [-0.4472136  -0.4472136   1.34164079  1.22474487 -1.22474487 -1.22474487]]


### Spliting the data

In [24]:
from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.3, random_state = 10)

In [25]:
# now to check if the split is successful or not?

print(X_train.shape, X_test.shape, Y_train.shape, Y_test.shape)

(1209, 6) (519, 6) (1209,) (519,)


In [26]:
# From above shapes we can say that train and test split is successful.

### Building the Model

In [27]:
from sklearn.tree import DecisionTreeClassifier

# Create the model
model_DecisionTree = DecisionTreeClassifier(criterion = "gini", random_state = 10)

# Train the model
model_DecisionTree.fit(X_train, Y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=10, splitter='best')

In [28]:
# predict using model
Y_pred = model_DecisionTree.predict(X_test)
print(list(zip(Y_test,Y_pred)))

[(2, 2), (2, 2), (2, 2), (2, 2), (1, 1), (2, 2), (0, 0), (0, 0), (2, 2), (0, 0), (2, 2), (2, 2), (2, 2), (2, 2), (0, 0), (0, 0), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (0, 0), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (0, 0), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (0, 0), (2, 2), (2, 2), (2, 2), (3, 3), (2, 2), (0, 0), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (0, 0), (1, 1), (3, 3), (1, 1), (2, 2), (0, 0), (2, 2), (0, 0), (2, 2), (2, 2), (2, 2), (2, 2), (3, 3), (2, 2), (2, 2), (0, 0), (0, 0), (2, 2), (2, 2), (3, 3), (2, 2), (2, 2), (2, 2), (1, 1), (2, 2), (0, 0), (3, 0), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (0, 0), (2, 2), (0, 0), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (0, 0), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (0, 0), (2, 2), (2, 2), (1, 1), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (0, 0),

### Evaluating the model

In [29]:
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

print("Confusion Matrix = ")
print(confusion_matrix(Y_test, Y_pred), "\n")
print("Accuracy score = ", accuracy_score(Y_test, Y_pred), "\n")
print("Classification Report = ")
print(classification_report(Y_test, Y_pred))

Confusion Matrix = 
[[101   0   1   0]
 [  2  19   0   0]
 [  0   0 371   0]
 [  1   0   0  24]] 

Accuracy score =  0.9922928709055877 

Classification Report = 
              precision    recall  f1-score   support

           0       0.97      0.99      0.98       102
           1       1.00      0.90      0.95        21
           2       1.00      1.00      1.00       371
           3       1.00      0.96      0.98        25

    accuracy                           0.99       519
   macro avg       0.99      0.96      0.98       519
weighted avg       0.99      0.99      0.99       519



In [30]:
# Thus from above we can see that the model is giving 99.22% accuracy
# Thus the model build is quit good.

### Plotting the Decision Tree Model.

In [31]:
from sklearn.tree import export_graphviz

with open(r"D:\data science\Python - Anaconda\Decision Tree & Ensamble modeling\model_DecisionTree.txt", "w") as f:
    f = export_graphviz(model_DecisionTree, feature_names = cars_df.columns[:-1], out_file = f)
    
# Generate the file and upload the code in the file on "webgraphviz.com" to plot the decision tree.

#### Lets find out the Feature Importance.

In [32]:
print(list(zip(cars_df.columns, model_DecisionTree.feature_importances_)))

[('buying', 0.1510848831946676), ('maint', 0.2506508516803624), ('doors', 0.060026331736828115), ('persons', 0.19355707150872045), ('lug_boot', 0.09892620952419463), ('safety', 0.2457546523552268)]


In [33]:
# Thus from above code we can see which feature or variable of the dataset is more important in predicting the classes.

#### Lets also find out the Basic Decision Tree model Depth.

In [34]:
model_DecisionTree.get_depth()

14

In [35]:
# Thus from above we can see that the total depth of the Decision Tree is equal to 14.

### Tunning the Model

In [36]:
# Lets check and see what kind of results we get after tunning the Decision Tree. 
# Conditions to be put on Tunned Model:
# 1) Max_depth of Decision Tree = 10
# 2) Min_Sample_leaf = 5

In [37]:
from sklearn.tree import DecisionTreeClassifier

# Build the model
model_tunn_DecisionTree = DecisionTreeClassifier(criterion = "gini", min_samples_leaf= 5, max_depth= 10, random_state = 10)

# Train the model
model_tunn_DecisionTree.fit(X_train, Y_train)

# Predict using the model
Y_pred = model_tunn_DecisionTree.predict(X_test)

In [38]:
print(list(zip(Y_test, Y_pred)))

[(2, 2), (2, 2), (2, 2), (2, 2), (1, 1), (2, 2), (0, 0), (0, 0), (2, 2), (0, 0), (2, 2), (2, 2), (2, 2), (2, 2), (0, 0), (0, 0), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (0, 0), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (0, 0), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (0, 0), (2, 2), (2, 2), (2, 2), (3, 1), (2, 2), (0, 0), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (0, 0), (1, 1), (3, 3), (1, 1), (2, 2), (0, 0), (2, 2), (0, 0), (2, 2), (2, 2), (2, 2), (2, 2), (3, 0), (2, 2), (2, 2), (0, 1), (0, 0), (2, 2), (2, 2), (3, 3), (2, 2), (2, 2), (2, 2), (1, 1), (2, 2), (0, 0), (3, 0), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (0, 0), (2, 2), (0, 0), (2, 2), (2, 2), (2, 2), (2, 0), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (0, 0), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (0, 1), (2, 2), (2, 2), (1, 1), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (0, 1),

### Evaluating the Tunned Decision Tree Model.

In [39]:
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

print("Confusion Matrix = ")
print(confusion_matrix(Y_test, Y_pred), "\n")
print("Accuracy Score = ", accuracy_score(Y_test, Y_pred), "\n")
print("Classification Report = ")
print(classification_report(Y_test, Y_pred))

Confusion Matrix = 
[[ 92   6   4   0]
 [  0  17   2   2]
 [  8   0 363   0]
 [  3   5   0  17]] 

Accuracy Score =  0.9421965317919075 

Classification Report = 
              precision    recall  f1-score   support

           0       0.89      0.90      0.90       102
           1       0.61      0.81      0.69        21
           2       0.98      0.98      0.98       371
           3       0.89      0.68      0.77        25

    accuracy                           0.94       519
   macro avg       0.84      0.84      0.84       519
weighted avg       0.95      0.94      0.94       519



In [40]:
# From Above results we can see that the accuracy of the model = 94.21% which is less that the basic model.
# Thus from this we can conclude that the basic model is better than the tunned model.

#### Lets find out the Tunned Decision Tree Model Depth.

In [41]:
print(model_tunn_DecisionTree.get_depth())

10


In [42]:
# Thus from above we can see that the tunned model was limited to the depth of 10 as per the parameter.

### Plotting the Tunned Model.

In [43]:
from sklearn.tree import export_graphviz

with open(r"D:\data science\Python - Anaconda\Decision Tree & Ensamble modeling\model_tunn_DecisionTree.txt", "w") as f:
    f = export_graphviz(model_tunn_DecisionTree, feature_names = cars_df.columns[:-1], out_file = f)
    
# Generate the file and upload the code in the file on "webgraphviz.com" to plot the decision tree.

### Building the SVM(Support Vector Machine).

In [44]:
from sklearn.svm import SVC

# Build the model
classifier = SVC(kernel = "rbf", gamma = 0.1, C= 1)

# Traing the model
classifier.fit(X_train, Y_train)

# Predict using the model
Y_pred = classifier.predict(X_test)

In [45]:
print(list(zip(Y_test, Y_pred)))

[(2, 2), (2, 2), (2, 2), (2, 2), (1, 0), (2, 2), (0, 0), (0, 0), (2, 2), (0, 0), (2, 2), (2, 2), (2, 2), (2, 2), (0, 0), (0, 0), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 0), (2, 2), (2, 2), (2, 2), (2, 2), (0, 0), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (0, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (0, 0), (2, 2), (2, 2), (2, 2), (3, 0), (2, 0), (0, 0), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (0, 0), (1, 0), (3, 3), (1, 0), (2, 2), (0, 2), (2, 2), (0, 0), (2, 2), (2, 2), (2, 2), (2, 2), (3, 0), (2, 2), (2, 2), (0, 0), (0, 0), (2, 2), (2, 2), (3, 0), (2, 2), (2, 2), (2, 2), (1, 0), (2, 2), (0, 0), (3, 0), (2, 2), (2, 2), (2, 0), (2, 2), (2, 0), (2, 2), (2, 2), (2, 2), (2, 2), (0, 0), (2, 2), (0, 0), (2, 2), (2, 2), (2, 2), (2, 0), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (0, 0), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (0, 2), (2, 2), (2, 2), (1, 0), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (0, 0),

### Evaluating The SVM model.

In [46]:
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

print("Confusion Matrix = ")
print(confusion_matrix(Y_test, Y_pred), "\n")
print("Accuracy Score = ", accuracy_score(Y_test, Y_pred), "\n")
print("Classification Report = ")
print(classification_report(Y_test, Y_pred))

Confusion Matrix = 
[[ 82   0  20   0]
 [ 21   0   0   0]
 [ 17   0 354   0]
 [ 17   0   0   8]] 

Accuracy Score =  0.8554913294797688 

Classification Report = 
              precision    recall  f1-score   support

           0       0.60      0.80      0.69       102
           1       0.00      0.00      0.00        21
           2       0.95      0.95      0.95       371
           3       1.00      0.32      0.48        25

    accuracy                           0.86       519
   macro avg       0.64      0.52      0.53       519
weighted avg       0.84      0.86      0.84       519



  _warn_prf(average, modifier, msg_start, len(result))


### Tunning the SVM model 

In [47]:
from sklearn.svm import SVC

# Building the model.
classifier = SVC(kernel = "rbf", gamma = 0.1, C=50)

# Train the model.
classifier.fit(X_train, Y_train)

# Predict using model.
Y_pred = classifier.predict(X_test)

In [48]:
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

print("Confusion Matrix = ")
print(confusion_matrix(Y_pred, Y_test))
print("Accuracy score = ", accuracy_score(Y_pred, Y_test), "\n")
print("Classification Report = ")
print(classification_report(Y_pred, Y_test))

Confusion Matrix = 
[[100   0   0   0]
 [  1  21   0   0]
 [  1   0 371   0]
 [  0   0   0  25]]
Accuracy score =  0.9961464354527938 

Classification Report = 
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       100
           1       1.00      0.95      0.98        22
           2       1.00      1.00      1.00       372
           3       1.00      1.00      1.00        25

    accuracy                           1.00       519
   macro avg       1.00      0.99      0.99       519
weighted avg       1.00      1.00      1.00       519



In [49]:
# Thus from above we can say tunned SVM model gives accuracy = 99.61% 
# thus tunned SVM model gives much better accuracy than basic SVM model.

#### Why did we put n_neighbors = ? 

In [50]:
int(np.sqrt(len(X_train)))

34

In [51]:

# Thus from above we can say that n_neighbors should be less than 34

In [52]:
import numpy as np
from sklearn.neighbors import KNeighborsClassifier

for i in range(1,35,1):
    model_KNN = KNeighborsClassifier(n_neighbors = i, metric = "euclidean")
    model_KNN.fit(X_train, Y_train)
    Y_pred = model_KNN.predict(X_test)
    print("For n_neighbors =", i , "the accuracy score =",accuracy_score(Y_pred, Y_test))

For n_neighbors = 1 the accuracy score = 0.9248554913294798
For n_neighbors = 2 the accuracy score = 0.8901734104046243
For n_neighbors = 3 the accuracy score = 0.9113680154142582
For n_neighbors = 4 the accuracy score = 0.8959537572254336
For n_neighbors = 5 the accuracy score = 0.9441233140655106
For n_neighbors = 6 the accuracy score = 0.9267822736030829
For n_neighbors = 7 the accuracy score = 0.9402697495183044
For n_neighbors = 8 the accuracy score = 0.9479768786127167
For n_neighbors = 9 the accuracy score = 0.9402697495183044
For n_neighbors = 10 the accuracy score = 0.9344894026974951
For n_neighbors = 11 the accuracy score = 0.9325626204238922
For n_neighbors = 12 the accuracy score = 0.9210019267822736
For n_neighbors = 13 the accuracy score = 0.9017341040462428
For n_neighbors = 14 the accuracy score = 0.9036608863198459
For n_neighbors = 15 the accuracy score = 0.8882466281310212
For n_neighbors = 16 the accuracy score = 0.8786127167630058
For n_neighbors = 17 the accuracy

In [53]:
# Thus form above we can see that we get highest accuracy score of 94.79 %  at n_neighbors = 8

### Building the KNN model.

In [54]:
import numpy as np
from sklearn.neighbors import KNeighborsClassifier

# Build the model
model_KNN = KNeighborsClassifier(n_neighbors = 8 , metric = "euclidean")

# Train the model
model_KNN.fit(X_train, Y_train)

# Predict using model
Y_pred = model_KNN.predict(X_test)

In [55]:
print(list(zip(Y_test, Y_pred)))

[(2, 2), (2, 2), (2, 2), (2, 2), (1, 0), (2, 2), (0, 0), (0, 0), (2, 2), (0, 0), (2, 2), (2, 2), (2, 2), (2, 2), (0, 0), (0, 0), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 0), (2, 2), (2, 2), (2, 2), (2, 2), (0, 0), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (0, 0), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (0, 0), (2, 2), (2, 2), (2, 2), (3, 3), (2, 2), (0, 0), (2, 2), (2, 0), (2, 2), (2, 2), (2, 2), (0, 0), (1, 1), (3, 3), (1, 1), (2, 2), (0, 0), (2, 2), (0, 0), (2, 2), (2, 2), (2, 2), (2, 2), (3, 3), (2, 2), (2, 2), (0, 0), (0, 0), (2, 2), (2, 2), (3, 3), (2, 2), (2, 2), (2, 2), (1, 1), (2, 2), (0, 0), (3, 0), (2, 2), (2, 2), (2, 2), (2, 2), (2, 0), (2, 2), (2, 2), (2, 2), (2, 2), (0, 0), (2, 2), (0, 0), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (0, 0), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (0, 0), (2, 2), (2, 2), (1, 1), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (0, 1),

### Evaluating the KNN model.

In [56]:
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

print("Confusion Matrix = ")
print(confusion_matrix(Y_pred, Y_test))
print("Accuracy score = ", accuracy_score(Y_pred, Y_test), "\n")
print("Classification Report = ")
print(classification_report(Y_pred, Y_test))

Confusion Matrix = 
[[ 97   8  10   3]
 [  1  12   0   0]
 [  4   0 361   0]
 [  0   1   0  22]]
Accuracy score =  0.9479768786127167 

Classification Report = 
              precision    recall  f1-score   support

           0       0.95      0.82      0.88       118
           1       0.57      0.92      0.71        13
           2       0.97      0.99      0.98       365
           3       0.88      0.96      0.92        23

    accuracy                           0.95       519
   macro avg       0.84      0.92      0.87       519
weighted avg       0.95      0.95      0.95       519



In [57]:
# Thus from above we get that accuracy score for KNN = 94.79 % 

### Build the Logistic Regression Model

In [58]:
from sklearn.linear_model import LogisticRegression

# Build the model 
classifier = LogisticRegression()

# Train the model
classifier.fit(X_train, Y_train)

# Predict using model
Y_pred = classifier.predict(X_test)

In [59]:
print(list(zip(Y_test, Y_pred)))

[(2, 2), (2, 2), (2, 2), (2, 2), (1, 2), (2, 2), (0, 2), (0, 2), (2, 0), (0, 2), (2, 0), (2, 2), (2, 0), (2, 2), (0, 2), (0, 2), (2, 2), (2, 2), (2, 2), (2, 0), (2, 2), (2, 2), (2, 3), (2, 2), (2, 2), (2, 2), (2, 0), (0, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (0, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (0, 0), (2, 2), (2, 2), (2, 2), (3, 2), (2, 2), (0, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (0, 0), (1, 2), (3, 2), (1, 2), (2, 2), (0, 2), (2, 2), (0, 2), (2, 2), (2, 2), (2, 2), (2, 2), (3, 0), (2, 2), (2, 2), (0, 2), (0, 0), (2, 2), (2, 2), (3, 0), (2, 2), (2, 2), (2, 2), (1, 2), (2, 2), (0, 2), (3, 2), (2, 2), (2, 2), (2, 0), (2, 0), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (0, 0), (2, 2), (0, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (0, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (0, 2), (2, 2), (2, 2), (1, 2), (2, 2), (2, 0), (2, 2), (2, 2), (2, 2), (2, 2), (0, 2),

### Evaluating The Logistic Regression Model.

In [60]:
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

print("Confusion Matrix = ")
print(confusion_matrix(Y_pred, Y_test))
print("Accuracy score = ", accuracy_score(Y_pred, Y_test), "\n")
print("Classification Report = ")
print(classification_report(Y_pred, Y_test))

Confusion Matrix = 
[[ 19   3  30   8]
 [  0   0   0   0]
 [ 78  18 339  14]
 [  5   0   2   3]]
Accuracy score =  0.6955684007707129 

Classification Report = 
              precision    recall  f1-score   support

           0       0.19      0.32      0.23        60
           1       0.00      0.00      0.00         0
           2       0.91      0.76      0.83       449
           3       0.12      0.30      0.17        10

    accuracy                           0.70       519
   macro avg       0.31      0.34      0.31       519
weighted avg       0.81      0.70      0.75       519



  _warn_prf(average, modifier, msg_start, len(result))


In [61]:
# Thus from above we get accuracy for Logistic Regression Model = 69.55 %

In [62]:
# List of Accuracy Score for all the models:
# 1) Basic Decision Tree            = 99.22 %
# 2) Tunned Decision Tree           = 94.21 %
# 3) Basic SVM                      = 85.54 %
# 4) Tunned SVM                     = 99.61 %
# 5) KNN                            = 94.79 %
# 6) Logistic Regression            = 69.55 %   

In [63]:
# Thus from above information we can say that 
# The best model according to accuracy score is Tunned SVM which gives higgest accuracy of about 99.61% .
# But it is a tunned model and thus has higger complexity.
# but if we look at Basic Decision Tree model it gives accuracy score of about 99.22 % 
# which is almost equal to tunned SVM.
# and Basic Decision tree is not a tunned model thus it is simpler than the tunned SVM.
# Thus we would choose Basic Decision Tree as the best algorithm for deloyment.

## Employing the Bagging Algorithms.

In [64]:
# Bagging Algorithms contains two sub types of it.
# 1) ExtraTreeClassifier.
# 2) RandomForestClassifier.

### 1) Building the ExtraTreeClassifier model.

In [68]:
from sklearn.ensemble import ExtraTreesClassifier

# Build the model
model = ExtraTreesClassifier(n_estimators = 10, random_state=10)

# Train the model
model.fit(X_train, Y_train)

# predict using model
Y_pred = model.predict(X_test)

In [69]:
print(list(zip(Y_test, Y_pred)))

[(2, 2), (2, 2), (2, 2), (2, 2), (1, 1), (2, 2), (0, 0), (0, 0), (2, 2), (0, 0), (2, 2), (2, 2), (2, 2), (2, 2), (0, 0), (0, 0), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (0, 0), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (0, 0), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (0, 0), (2, 2), (2, 2), (2, 2), (3, 3), (2, 0), (0, 0), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (0, 0), (1, 1), (3, 3), (1, 1), (2, 2), (0, 0), (2, 2), (0, 0), (2, 2), (2, 2), (2, 2), (2, 2), (3, 3), (2, 2), (2, 2), (0, 0), (0, 0), (2, 2), (2, 2), (3, 3), (2, 2), (2, 2), (2, 2), (1, 1), (2, 2), (0, 0), (3, 0), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (0, 0), (2, 2), (0, 2), (2, 2), (2, 2), (2, 2), (2, 0), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (0, 0), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (0, 1), (2, 2), (2, 2), (1, 1), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (0, 0),

### Evaluating the ExtraTreesClassifier model.

In [70]:
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

print("Confusion Matrix = ")
print(confusion_matrix(Y_test,Y_pred), "\n")
print("Accuracy Score = ", accuracy_score(Y_test, Y_pred), "\n")
print("Classification Report = ")
print(classification_report(Y_test, Y_pred))

Confusion Matrix = 
[[ 95   2   5   0]
 [  2  19   0   0]
 [ 17   0 354   0]
 [  2   1   0  22]] 

Accuracy Score =  0.9441233140655106 

Classification Report = 
              precision    recall  f1-score   support

           0       0.82      0.93      0.87       102
           1       0.86      0.90      0.88        21
           2       0.99      0.95      0.97       371
           3       1.00      0.88      0.94        25

    accuracy                           0.94       519
   macro avg       0.92      0.92      0.92       519
weighted avg       0.95      0.94      0.95       519



### Tunning the ExtraTreesClassifier.

In [71]:
# One of the easiest method to tunn the ExtraTreesClassifier is to change the number of n_estimators.

In [73]:
from sklearn.ensemble import ExtraTreesClassifier

# Build the model
model = ExtraTreesClassifier(n_estimators = 100, random_state = 10)

# Train the model
model.fit(X_train, Y_train)

# Predict using model.
Y_pred = model.predict(X_test)

In [74]:
print(list(zip(Y_test, Y_pred)))

[(2, 2), (2, 2), (2, 2), (2, 2), (1, 0), (2, 2), (0, 0), (0, 0), (2, 2), (0, 0), (2, 2), (2, 2), (2, 2), (2, 2), (0, 0), (0, 0), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (0, 0), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (0, 0), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (0, 0), (2, 2), (2, 2), (2, 2), (3, 3), (2, 2), (0, 0), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (0, 0), (1, 1), (3, 3), (1, 1), (2, 2), (0, 0), (2, 2), (0, 0), (2, 2), (2, 2), (2, 2), (2, 2), (3, 3), (2, 2), (2, 2), (0, 0), (0, 0), (2, 2), (2, 2), (3, 3), (2, 2), (2, 2), (2, 2), (1, 1), (2, 2), (0, 0), (3, 0), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (0, 0), (2, 2), (0, 0), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (0, 0), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (0, 1), (2, 2), (2, 2), (1, 1), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (0, 0),

### Evaluating Tunned ExtraTreesClassifier model.

In [75]:
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

print("Confusion Matrix = ")
print(confusion_matrix(Y_test, Y_pred), "\n")
print("Accuracy Score = ", accuracy_score(Y_test, Y_pred), "\n")
print("Classification Report = ")
print(classification_report(Y_test, Y_pred))

Confusion Matrix = 
[[101   1   0   0]
 [  3  18   0   0]
 [  1   0 370   0]
 [  1   2   0  22]] 

Accuracy Score =  0.9845857418111753 

Classification Report = 
              precision    recall  f1-score   support

           0       0.95      0.99      0.97       102
           1       0.86      0.86      0.86        21
           2       1.00      1.00      1.00       371
           3       1.00      0.88      0.94        25

    accuracy                           0.98       519
   macro avg       0.95      0.93      0.94       519
weighted avg       0.98      0.98      0.98       519



In [76]:
# Thus from above two classifier we can see that accuracy for both model is :
# 1) Basic ExtraTreesClassifier = 94.41 %
# 2) Tunned ExtraTreesClassifier = 98.45 %

### 2) Building the RandomForestClassifier.

In [77]:
from sklearn.ensemble import RandomForestClassifier

# Build the model
model_forest = RandomForestClassifier(n_estimators = 101, random_state = 10)

# Train the model
model_forest.fit(X_train, Y_train)

# Predict using model
Y_pred = model_forest.predict(X_test)

In [78]:
print(list(zip(Y_test, Y_pred)))

[(2, 2), (2, 2), (2, 2), (2, 2), (1, 1), (2, 2), (0, 0), (0, 0), (2, 2), (0, 0), (2, 2), (2, 2), (2, 2), (2, 2), (0, 0), (0, 0), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (0, 0), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (0, 0), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (0, 0), (2, 2), (2, 2), (2, 2), (3, 3), (2, 2), (0, 0), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (0, 0), (1, 1), (3, 3), (1, 1), (2, 2), (0, 0), (2, 2), (0, 0), (2, 2), (2, 2), (2, 2), (2, 2), (3, 3), (2, 2), (2, 2), (0, 0), (0, 0), (2, 2), (2, 2), (3, 3), (2, 2), (2, 2), (2, 2), (1, 1), (2, 2), (0, 0), (3, 0), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (0, 0), (2, 2), (0, 0), (2, 2), (2, 2), (2, 2), (2, 0), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (0, 0), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (0, 1), (2, 2), (2, 2), (1, 1), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (0, 0),

### Evaluating RandomForestClassifier model

In [79]:
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

print("Confusion Matrix = ")
print(confusion_matrix(Y_test, Y_pred))
print("Accuracy Score = ", accuracy_score(Y_test, Y_pred), "\n")
print("Classification Report = ")
print(classification_report(Y_test, Y_pred))

Confusion Matrix = 
[[ 99   3   0   0]
 [  1  19   0   1]
 [  2   0 369   0]
 [  2   0   0  23]]
Accuracy Score =  0.9826589595375722 

Classification Report = 
              precision    recall  f1-score   support

           0       0.95      0.97      0.96       102
           1       0.86      0.90      0.88        21
           2       1.00      0.99      1.00       371
           3       0.96      0.92      0.94        25

    accuracy                           0.98       519
   macro avg       0.94      0.95      0.95       519
weighted avg       0.98      0.98      0.98       519



In [80]:
# Thus from above three classifier we can see that accuracy for all these model is :
# 1) Basic ExtraTreesClassifier  = 94.41 %
# 2) Tunned ExtraTreesClassifier = 98.45 %
# 3) RandomForestClassifier      = 98.26 %

## Employing Boosting Algorithms.

In [81]:
# Boosting Algorithms contain two sub types of it :
# 1) AdaBoostClassifier
# 2) GradientBoostingClassifier

### 1) Building the AdaBoostClassifier model.

In [82]:
from sklearn.ensemble import AdaBoostClassifier

# Build the model
model_AdaBoost = AdaBoostClassifier(base_estimator=DecisionTreeClassifier(random_state=10),
                                   n_estimators = 10, random_state = 10)

# Train the model
model_AdaBoost.fit(X_train, Y_train)

# Predict using model
Y_pred = model_AdaBoost.predict(X_test)

In [83]:
print(list(zip(Y_test, Y_pred)))

[(2, 2), (2, 2), (2, 2), (2, 2), (1, 1), (2, 2), (0, 0), (0, 0), (2, 2), (0, 0), (2, 2), (2, 2), (2, 2), (2, 2), (0, 0), (0, 0), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (0, 0), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (0, 0), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (0, 0), (2, 2), (2, 2), (2, 2), (3, 3), (2, 2), (0, 0), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (0, 0), (1, 1), (3, 3), (1, 1), (2, 2), (0, 0), (2, 2), (0, 0), (2, 2), (2, 2), (2, 2), (2, 2), (3, 3), (2, 2), (2, 2), (0, 0), (0, 0), (2, 2), (2, 2), (3, 3), (2, 2), (2, 2), (2, 2), (1, 1), (2, 2), (0, 0), (3, 0), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (0, 0), (2, 2), (0, 0), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (0, 0), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (0, 0), (2, 2), (2, 2), (1, 1), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (0, 0),

### Evaluating the AdaBoostClassifier model.

In [84]:
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

print("Confusion_matrix = ")
print(confusion_matrix(Y_test, Y_pred), "\n")
print("Accuracy Score = ", accuracy_score(Y_test, Y_pred), "\n")
print("Classification Report = ")
print(classification_report(Y_test, Y_pred))

Confusion_matrix = 
[[ 99   2   1   0]
 [  4  17   0   0]
 [  0   0 371   0]
 [  1   0   0  24]] 

Accuracy Score =  0.9845857418111753 

Classification Report = 
              precision    recall  f1-score   support

           0       0.95      0.97      0.96       102
           1       0.89      0.81      0.85        21
           2       1.00      1.00      1.00       371
           3       1.00      0.96      0.98        25

    accuracy                           0.98       519
   macro avg       0.96      0.94      0.95       519
weighted avg       0.98      0.98      0.98       519



In [85]:
# Thus from above four classifier we can see that accuracy for all these model is :
# 1) Basic ExtraTreesClassifier  = 94.41 %
# 2) Tunned ExtraTreesClassifier = 98.45 %
# 3) RandomForestClassifier      = 98.26 %
# 4) AdaBoostClassifier          = 98.45 %

### 2) Building GradientBoosingClassifier model.

In [86]:
from sklearn.ensemble import GradientBoostingClassifier

# Build the model
model_gradient = GradientBoostingClassifier(n_estimators = 150, random_state = 10)

# train the model
model_gradient.fit(X_train, Y_train)

# Predict using the model
Y_pred = model_gradient.predict(X_test)

In [87]:
print(list(zip(Y_test, Y_pred)))

[(2, 2), (2, 2), (2, 2), (2, 2), (1, 1), (2, 2), (0, 0), (0, 0), (2, 2), (0, 0), (2, 2), (2, 2), (2, 2), (2, 2), (0, 0), (0, 0), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (0, 0), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (0, 0), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (0, 0), (2, 2), (2, 2), (2, 2), (3, 3), (2, 2), (0, 0), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (0, 0), (1, 1), (3, 3), (1, 1), (2, 2), (0, 0), (2, 2), (0, 0), (2, 2), (2, 2), (2, 2), (2, 2), (3, 3), (2, 2), (2, 2), (0, 0), (0, 0), (2, 2), (2, 2), (3, 3), (2, 2), (2, 2), (2, 2), (1, 1), (2, 2), (0, 0), (3, 3), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (0, 0), (2, 2), (0, 0), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (0, 0), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (0, 0), (2, 2), (2, 2), (1, 1), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (2, 2), (0, 0),

### Evaluating GradientBoostingClassifier

In [88]:
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

print("Confusion Matrix = ")
print(confusion_matrix(Y_test, Y_pred), "\n")
print("Accuracy Score = ", accuracy_score(Y_test, Y_pred), "\n")
print("Classification Report = ")
print(classification_report(Y_test, Y_pred))

Confusion Matrix = 
[[101   1   0   0]
 [  0  21   0   0]
 [  0   0 371   0]
 [  0   0   0  25]] 

Accuracy Score =  0.9980732177263969 

Classification Report = 
              precision    recall  f1-score   support

           0       1.00      0.99      1.00       102
           1       0.95      1.00      0.98        21
           2       1.00      1.00      1.00       371
           3       1.00      1.00      1.00        25

    accuracy                           1.00       519
   macro avg       0.99      1.00      0.99       519
weighted avg       1.00      1.00      1.00       519



In [90]:
# Thus from above five classifier we can see that accuracy for all these model is :
# 1) Basic ExtraTreesClassifier  = 94.41 %
# 2) Tunned ExtraTreesClassifier = 98.45 %
# 3) RandomForestClassifier      = 98.26 %
# 4) AdaBoostClassifier          = 98.45 %
# 5) GradientBoostingClassifier  = 99.80 %

# List of Accuracy Score for all the models:
# 1) Basic Decision Tree            = 99.22 %
# 2) Tunned Decision Tree           = 94.21 %
# 3) Basic SVM                      = 85.54 %
# 4) Tunned SVM                     = 99.61 %
# 5) KNN                            = 94.79 %
# 6) Logistic Regression            = 69.55 %   

## Ensemble Modelling.

### Building the Basic Ensemble Model.

In [91]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import VotingClassifier

# Build all the sub Models.
estimators = []
model_1 = LogisticRegression()
estimators.append(("Logistic", model_1))
model_2 = DecisionTreeClassifier(criterion = "gini", random_state = 10)
estimators.append(("DecisionTree", model_2))
model_3 = SVC(kernel = "rbf", gamma = 0.1, C = 50)
estimators.append(("SVM", model_3))
model_4 = KNeighborsClassifier(n_neighbors = 8 , metric = "euclidean")
estimators.append(("KNeighbors", model_4))

# Note:
# Here above we have considered the optimum model of a perticular type.
# i.e. for e.g in decision tree basic decision tree gives better accuracy than tunned decision tree
# thus we included only the basic decision tree model
# and in case of SVM the tunned model gives far better accuracy than the basic model
# thus we include only the tunned SVM model.

# Train all the models.
ensemble = VotingClassifier(estimators)
ensemble.fit(X_train, Y_train)

# Predict using the model
Y_pred = ensemble.predict(X_test)

### Evaluating the Basic Ensemble Model.

In [92]:
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

print("Confusion Matrix = ")
print(confusion_matrix(Y_test, Y_pred), "\n")
print("Accuracy Score = ", accuracy_score(Y_test, Y_pred), "\n")
print("Classification Report = ")
print(classification_report(Y_test, Y_pred))

Confusion Matrix = 
[[101   1   0   0]
 [  4  17   0   0]
 [  1   0 370   0]
 [  2   0   0  23]] 

Accuracy Score =  0.9845857418111753 

Classification Report = 
              precision    recall  f1-score   support

           0       0.94      0.99      0.96       102
           1       0.94      0.81      0.87        21
           2       1.00      1.00      1.00       371
           3       1.00      0.92      0.96        25

    accuracy                           0.98       519
   macro avg       0.97      0.93      0.95       519
weighted avg       0.99      0.98      0.98       519



In [93]:
# Thus from above we find that if we include all the 4 models into ensemble model it gives
# accuracy = 98.45 %

In [94]:
# Now if we repeat the same procedure but remove the worst model out of those 4.
# i.e. if we remove Logistic Regression which has accuracy = 69.55 % 
# then we will check if the accuracy of ensembled model increases or decreases?

### Ensembled model by removing LogisticRegression.

In [95]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import VotingClassifier

# Build all the sub Models.
estimators = []
#model_1 = LogisticRegression()               # i.e. we have removed the LogisticRegression.
#estimators.append(("Logistic", model_1))
model_2 = DecisionTreeClassifier(criterion = "gini", random_state = 10)
estimators.append(("DecisionTree", model_2))
model_3 = SVC(kernel = "rbf", gamma = 0.1, C = 50)
estimators.append(("SVM", model_3))
model_4 = KNeighborsClassifier(n_neighbors = 8 , metric = "euclidean")
estimators.append(("KNeighbors", model_4))

# Note:
# Here above we have considered the optimum model of a perticular type.
# i.e. for e.g in decision tree basic decision tree gives better accuracy than tunned decision tree
# thus we included only the basic decision tree model
# and in case of SVM the tunned model gives far better accuracy than the basic model
# thus we include only the tunned SVM model.

# Train all the models.
ensemble = VotingClassifier(estimators)
ensemble.fit(X_train, Y_train)

# Predict using the model
Y_pred = ensemble.predict(X_test)

In [96]:
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

print("Confusion Matrix = ")
print(confusion_matrix(Y_test, Y_pred), "\n")
print("Accuracy Score = ", accuracy_score(Y_test, Y_pred), "\n")
print("Classification Report = ")
print(classification_report(Y_test, Y_pred))

Confusion Matrix = 
[[101   1   0   0]
 [  2  19   0   0]
 [  0   0 371   0]
 [  1   0   0  24]] 

Accuracy Score =  0.9922928709055877 

Classification Report = 
              precision    recall  f1-score   support

           0       0.97      0.99      0.98       102
           1       0.95      0.90      0.93        21
           2       1.00      1.00      1.00       371
           3       1.00      0.96      0.98        25

    accuracy                           0.99       519
   macro avg       0.98      0.96      0.97       519
weighted avg       0.99      0.99      0.99       519



In [97]:
# Now from above we can say that by removing the logistic Regression 
# ensembled model's accuracy increased to 99.22 % from previous 98.45 %.

In [98]:
# now if we remove KNN which is the second worst model out of those 4 model.

### Ensemble model by removing LogisticRegression & KNN.

In [99]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import VotingClassifier

# Build all the sub Models.
estimators = []
#model_1 = LogisticRegression()               # i.e. we have removed the LogisticRegression.
#estimators.append(("Logistic", model_1))
model_2 = DecisionTreeClassifier(criterion = "gini", random_state = 10)
estimators.append(("DecisionTree", model_2))
model_3 = SVC(kernel = "rbf", gamma = 0.1, C = 50)
estimators.append(("SVM", model_3))
#model_4 = KNeighborsClassifier(n_neighbors = 8 , metric = "euclidean")  # i.e. we have removed KNN.
#estimators.append(("KNeighbors", model_4))

# Note:
# Here above we have considered the optimum model of a perticular type.
# i.e. for e.g in decision tree basic decision tree gives better accuracy than tunned decision tree
# thus we included only the basic decision tree model
# and in case of SVM the tunned model gives far better accuracy than the basic model
# thus we include only the tunned SVM model.

# Train all the models.
ensemble = VotingClassifier(estimators)
ensemble.fit(X_train, Y_train)

# Predict using the model
Y_pred = ensemble.predict(X_test)

In [100]:
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

print("Confusion Matrix = ")
print(confusion_matrix(Y_test, Y_pred), "\n")
print("Accuracy Score = ", accuracy_score(Y_test, Y_pred), "\n")
print("Classification Report = ")
print(classification_report(Y_test, Y_pred))

Confusion Matrix = 
[[102   0   0   0]
 [  2  19   0   0]
 [  0   0 371   0]
 [  1   0   0  24]] 

Accuracy Score =  0.9942196531791907 

Classification Report = 
              precision    recall  f1-score   support

           0       0.97      1.00      0.99       102
           1       1.00      0.90      0.95        21
           2       1.00      1.00      1.00       371
           3       1.00      0.96      0.98        25

    accuracy                           0.99       519
   macro avg       0.99      0.97      0.98       519
weighted avg       0.99      0.99      0.99       519



In [101]:
# Now from above we can say that by removing the logistic Regression along with KNN
# ensembled model's accuracy increased to 99.42 % from previous 99.22 %.

## List of Accuracy of every model created.

In [102]:
#  1) Basic Decision Tree            = 99.22 %
#  2) Tunned Decision Tree           = 94.21 %
#  3) Basic SVM                      = 85.54 %
#  4) Tunned SVM                     = 99.61 %
#  5) KNN                            = 94.79 %
#  6) Logistic Regression            = 69.55 %   
#  7) Basic ExtraTreesClassifier     = 94.41 %
#  8) Tunned ExtraTreesClassifier    = 98.45 %
#  9) RandomForestClassifier         = 98.26 %
# 10) AdaBoostClassifier             = 98.45 %
# 11) GradientBoostingClassifier     = 99.80 %
# 12) Basic Ensemble Model           = 98.45 %
# 13) Ensemble model by removing Logistic regression             = 99.22 %
# 14) Ensemble model by removing logistic regression and KNN     = 99.42 %