In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split,
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import PolynomialFeatures
import warnings
warnings.filterwarnings("ignore")
np.random.seed(42)

In [5]:
df=pd.read_csv('balance-scale.data')
#https://archive.ics.uci.edu/ml/machine-learning-databases/balance-scale/balance-scale.names

In [7]:
df.columns=['Class Name','Left-Weight','Left-Distance','Right-Weight','Right-Distance']

In [8]:
df.head()

Unnamed: 0,Class Name,Left-Weight,Left-Distance,Right-Weight,Right-Distance
0,R,1,1,1,2
1,R,1,1,1,3
2,R,1,1,1,4
3,R,1,1,1,5
4,R,1,1,2,1


In [None]:
4. Relevant Information: 
	This data set was generated to model psychological
	experimental results.  Each example is classified as having the
	balance scale tip to the right, tip to the left, or be
	balanced.  The attributes are the left weight, the left
	distance, the right weight, and the right distance.  The
	correct way to find the class is the greater of 
	(left-distance * left-weight) and (right-distance *
	right-weight).  If they are equal, it is balanced.

5. Number of Instances: 625 (49 balanced, 288 left, 288 right)

6. Number of Attributes: 4 (numeric) + class name = 5

7. Attribute Information:
	1. Class Name: 3 (L, B, R)
	2. Left-Weight: 5 (1, 2, 3, 4, 5)
	3. Left-Distance: 5 (1, 2, 3, 4, 5)
	4. Right-Weight: 5 (1, 2, 3, 4, 5)
	5. Right-Distance: 5 (1, 2, 3, 4, 5)

In [11]:
df['Class Name'].value_counts()

R    288
L    288
B     48
Name: Class Name, dtype: int64

In [12]:
df[df['Class Name']=='B']

Unnamed: 0,Class Name,Left-Weight,Left-Distance,Right-Weight,Right-Distance
25,B,1,2,1,2
29,B,1,2,2,1
51,B,1,3,1,3
59,B,1,3,3,1
77,B,1,4,1,4
80,B,1,4,2,2
89,B,1,4,4,1
103,B,1,5,1,5
119,B,1,5,5,1
125,B,2,1,1,2


In [17]:
X=df.drop(columns='Class Name')
y=df['Class Name']

In [18]:
train, test, y_train, y_test = train_test_split(X, y, test_size = 0.3)

In [19]:
lr = LogisticRegression()
lr.fit(train, y_train)
y_pred = lr.predict(test)
print('Accuracy score baseline:', accuracy_score(y_test, y_pred))

Accuracy score baseline: 0.8829787234042553


In [20]:
dt = DecisionTreeClassifier()
dt.fit(train, y_train)
y_pred = dt.predict(test)
print(accuracy_score(y_test, y_pred))

0.8138297872340425


In [21]:
def fit_predict(train, test, y_train, y_test, max_depth, 
                criterion = 'entropy', max_features = 1, min_samples_split = 4):
    dt = DecisionTreeClassifier(criterion = criterion, max_depth=max_depth, 
                                random_state=42, max_features=max_features,
                               min_samples_split=min_samples_split)
    dt.fit(train, y_train)
    y_pred = dt.predict(test)
    print(accuracy_score(y_test, y_pred))

In [22]:
dt = DecisionTreeClassifier(max_depth=22,max_features=0.2,min_samples_split=2)
dt.fit(train, y_train)
y_pred = dt.predict(test)
print(accuracy_score(y_test, y_pred))

0.8085106382978723


In [None]:
MAx Depth

In [25]:
for i in range(1,10):
    print('Accuracy score using max_depth =', i, end = ': ')
    fit_predict(train, test, y_train, y_test, i)

Accuracy score using max_depth = 1: 0.6117021276595744
Accuracy score using max_depth = 2: 0.648936170212766
Accuracy score using max_depth = 3: 0.675531914893617
Accuracy score using max_depth = 4: 0.7074468085106383
Accuracy score using max_depth = 5: 0.7978723404255319
Accuracy score using max_depth = 6: 0.7393617021276596
Accuracy score using max_depth = 7: 0.7446808510638298
Accuracy score using max_depth = 8: 0.7819148936170213
Accuracy score using max_depth = 9: 0.7446808510638298


In [None]:
MAXFetature  -- percentage of features 

In [26]:
for i in np.arange(0.1, 1.0, 0.1):
    print('Accuracy score using max features =', i, end = ': ')
    fit_predict(train, test, y_train, y_test, max_depth = 5, max_features=i)

Accuracy score using max features = 0.1: 0.7978723404255319
Accuracy score using max features = 0.2: 0.7978723404255319
Accuracy score using max features = 0.30000000000000004: 0.7978723404255319
Accuracy score using max features = 0.4: 0.7978723404255319
Accuracy score using max features = 0.5: 0.824468085106383
Accuracy score using max features = 0.6: 0.824468085106383
Accuracy score using max features = 0.7000000000000001: 0.824468085106383
Accuracy score using max features = 0.8: 0.7872340425531915
Accuracy score using max features = 0.9: 0.7872340425531915


In [None]:
#MinSampleSplit --  no of records to require to split the node Example if minmum of 2 sample then only split happen

In [28]:
for i in range(2, 10):
    print('Accuracy score using min samples split =', i, end = ': ')
    fit_predict(train, test, y_train, y_test, 5, max_features=0.5, min_samples_split=i)

Accuracy score using min samples split = 2: 0.824468085106383
Accuracy score using min samples split = 3: 0.824468085106383
Accuracy score using min samples split = 4: 0.824468085106383
Accuracy score using min samples split = 5: 0.824468085106383
Accuracy score using min samples split = 6: 0.824468085106383
Accuracy score using min samples split = 7: 0.824468085106383
Accuracy score using min samples split = 8: 0.824468085106383
Accuracy score using min samples split = 9: 0.824468085106383


In [None]:
#Critertion 

In [29]:
for i in ['gini', 'entropy']:
    print('Accuracy score using criterion =', i, end = ': ')
    fit_predict(train, test, y_train, y_test, 5, 
                max_features=0.5, min_samples_split=4, criterion = i)

Accuracy score using criterion = gini: 0.8404255319148937
Accuracy score using criterion = entropy: 0.824468085106383


In [30]:
dt = DecisionTreeClassifier(max_depth=5,criterion ='gini',max_features=0.5,min_samples_split=4)
dt.fit(train, y_train)
y_pred = dt.predict(test)
print(accuracy_score(y_test, y_pred))

0.8138297872340425
