# Import statements

In [2]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
import random


#takes care that our plots are rendered inline, inside this notebook
%matplotlib inline 

#takes care that the tree that we create is easy to read and understand
from pprint import pprint

# Load and Prepare Data

In [3]:
from sklearn import datasets

# Load scikit-learn's bundled iris data set and wrap its measurement matrix
# in a DataFrame. The four columns get short names up front
# (presumably sepal length/width and petal length/width - confirm against
# iris.feature_names).
iris = datasets.load_iris()
df = pd.DataFrame(iris.data, columns=["sl", "sw", "pl", "pw"])

In [4]:
# Preview the first rows to confirm the four feature columns loaded under their short names.
df.head()

Unnamed: 0,sl,sw,pl,pw
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [105]:
#Function to find label for a value
#if MIN_Value <=val < (MIN_Value + Mean_Value)/2 then it is assigned label a
#if (MIN_Value + Mean_Value)/2 <=val < Mean_Value then it is assigned label b
#if (Mean_Value) <=val < (Mean_Value + MAX_Value)/2 then it is assigned label c
#if (Mean_Value + MAX_Value)/2 <=val <= MAX_Value  then it is assigned label d

def label(val, *boundaries):
    """Map a numeric value to one of the bucket labels 'a', 'b', 'c' or 'd'.

    ``boundaries`` are the ascending cut points between buckets. The value
    gets the label of the first cut point it is strictly below:
    'a' below ``boundaries[0]``, 'b' below ``boundaries[1]``,
    'c' below ``boundaries[2]``, and 'd' otherwise.
    """
    # Cut points are indexed one at a time, only when needed, so a missing
    # boundary raises IndexError only if the comparison actually reaches it.
    for position, tag in enumerate("abc"):
        if val < boundaries[position]:
            return tag
    return 'd'

#Function to convert a continuous data into labelled data
#There are 4 labels  - a, b, c, d
def toLabel(df, old_feature_name):
    """Bucket one numeric column of ``df`` into the labels 'a'-'d'.

    The three cut points are derived from the column itself: the midpoint
    between its minimum and its mean, the mean, and the midpoint between
    its mean and its maximum. Each value is then passed to ``label`` with
    those cut points.

    Returns a new Series of labels; ``df`` itself is not modified.
    """
    column = df[old_feature_name]
    lowest, average, highest = column.min(), column.mean(), column.max()
    cut_points = ((lowest + average) / 2, average, (highest + average) / 2)
    return column.apply(label, args=cut_points)

In [106]:
# Add a "<name>_labeled" companion column for every numeric feature,
# each holding that feature's 'a'-'d' bucket as computed by toLabel.
for feature in ("sl", "sw", "pl", "pw"):
    df[feature + "_labeled"] = toLabel(df, feature)
df.head()

Unnamed: 0,sl,sw,pl,pw,sl_labeled,sw_labeled,pl_labeled,pw_labeled
0,5.1,3.5,1.4,0.2,b,c,a,a
1,4.9,3.0,1.4,0.2,a,b,a,a
2,4.7,3.2,1.3,0.2,a,c,a,a
3,4.6,3.1,1.5,0.2,a,c,a,a
4,5.0,3.6,1.4,0.2,a,c,a,a


In [1]:
# Keep only the categorical "*_labeled" columns from here on.
# `df` is rebound instead of dropped in place, and errors="ignore" skips
# columns that are already gone, so re-running this cell no longer raises
# KeyError.
# The NameError recorded under this cell comes from executing it before `df`
# existed (execution count 1); run the cells above first.
# NOTE(review): the later purity check filters on `train_df.pw`, which no
# longer exists once this cell has run - confirm which columns the
# downstream cells are meant to see.
df = df.drop(columns=["sl", "sw", "pl", "pw"], errors="ignore")
df.head()

NameError: name 'df' is not defined

# Train - test split

In [108]:
def train_test_split(df, test_size):
    """Randomly split ``df`` into a training frame and a test frame.

    ``test_size`` is either a float, read as the proportion of rows to hold
    out (rounded to a whole number of rows), or a plain row count.

    Rows are chosen with ``random.sample``, so seed the ``random`` module
    beforehand to get a repeatable split.

    Returns ``(train_df, test_df)``.
    """
    # A float is a proportion of the frame; anything else is used as the count.
    n_test = round(test_size * len(df)) if isinstance(test_size, float) else test_size

    # random.sample needs a sequence to draw from, hence the list of index labels.
    held_out = random.sample(population=df.index.tolist(), k=n_test)

    # The test frame is the sampled labels; the training frame is everything else.
    return df.drop(held_out), df.loc[held_out]

In [109]:
# Seed Python's `random` module so that the random.sample call inside
# train_test_split picks the same test rows on every run.
# test_size is a float here, so it is read as a proportion: 10% of the rows go to test_df.

random.seed(1)
train_df, test_df = train_test_split(df, test_size = 0.1)

In [110]:
# Preview the held-out rows; their index labels are the ones drawn at random by train_test_split.
test_df.head()

Unnamed: 0,sl,sw,pl,pw,sl_labeled,sw_labeled,pl_labeled,pw_labeled
34,4.9,3.1,1.5,0.1,a,c,a,a
145,6.7,3.0,5.2,2.3,c,b,c,d
16,5.4,3.9,1.3,0.4,b,d,a,a
65,6.7,3.1,4.4,1.4,c,c,c,c
30,4.8,3.1,1.6,0.2,a,c,a,a


# Helper Functions

In [111]:
# The helper functions below take a plain 2-D NumPy array rather than a DataFrame,
# so convert the training frame once and peek at its first five rows.
data = train_df.to_numpy()
data[:5]

array([[5.1, 3.5, 1.4, 0.2, 'b', 'c', 'a', 'a'],
       [4.9, 3.0, 1.4, 0.2, 'a', 'b', 'a', 'a'],
       [4.7, 3.2, 1.3, 0.2, 'a', 'c', 'a', 'a'],
       [4.6, 3.1, 1.5, 0.2, 'a', 'c', 'a', 'a'],
       [5.0, 3.6, 1.4, 0.2, 'a', 'c', 'a', 'a']], dtype=object)

#### Is data Pure or not

In [112]:
#giving it a numpy 2d array as input
def check_purity(data, label_index=0):
    """Report whether one column of ``data`` contains a single distinct value.

    Parameters
    ----------
    data : 2-D numpy array
        Rows to inspect.
    label_index : int, optional
        Position of the column to test. Defaults to 0, the only column the
        earlier loop-based version ever examined (it returned during its
        first iteration), so existing calls behave as before.
        NOTE(review): decision-tree code usually keeps the class label in
        the last column - pass ``label_index=-1`` if that is the intent here.

    Returns
    -------
    bool
        True when the chosen column holds exactly one distinct value,
        False otherwise (including when ``data`` has no rows).
    """
    # Only `data` is consulted; the function no longer reads the notebook-level
    # `df`, so it works on any array regardless of kernel state.
    label_column = data[:, label_index]
    unique_classes = np.unique(label_column)
    return len(unique_classes) == 1


In [116]:
# False is the expected answer here: check_purity inspects only the first column
# of the array it is given, and that column (`sl`) takes several different values
# among the rows with pw < 0.8.

check_purity(train_df[train_df.pw < 0.8].values)

False

#### Classify

In [65]:
def classify_data(data, label_index=0):
    """Return the most frequent value in one column of ``data``.

    The previous body returned the name ``classification`` without ever
    assigning it, so every call failed with NameError. For pure data this
    returns the single value present (the same as taking the first element);
    for mixed data it returns the majority value, with ties going to the
    value that sorts first.

    Parameters
    ----------
    data : 2-D numpy array
        Rows to classify.
    label_index : int, optional
        Position of the column holding the labels. Defaults to 0 to match
        the column check_purity inspects.
        NOTE(review): pass ``label_index=-1`` if the class label is meant
        to live in the last column - confirm.

    Raises
    ------
    ValueError
        If ``data`` has no rows, since there is nothing to classify.
    """
    label_column = data[:, label_index]
    if len(label_column) == 0:
        raise ValueError("cannot classify an empty data set")

    unique_classes, counts = np.unique(label_column, return_counts=True)
    classification = unique_classes[counts.argmax()]
    return classification

In [119]:
# Show, for every column of `data`, its distinct values and how often each occurs.
# The loop bound comes from the array itself rather than from `df.columns`, so it
# stays in step with `data` even if `df` gains or loses columns after `data` was built.
for i in range(data.shape[1]):
    label_column = data[:, i]
    print(np.unique(label_column, return_counts = True))



(array([4.3, 4.4, 4.5, 4.6, 4.7, 4.8, 4.9, 5.0, 5.1, 5.2, 5.3, 5.4, 5.5,
       5.6, 5.7, 5.8, 5.9, 6.0, 6.1, 6.2, 6.3, 6.4, 6.5, 6.6, 6.7, 6.8,
       6.9, 7.0, 7.1, 7.2, 7.3, 7.4, 7.6, 7.7, 7.9], dtype=object), array([1, 3, 1, 4, 2, 3, 5, 9, 9, 4, 1, 5, 6, 6, 7, 7, 3, 6, 6, 2, 9, 6, 4,
       2, 5, 3, 3, 1, 1, 3, 1, 1, 1, 4, 1], dtype=int64))
(array([2.0, 2.2, 2.3, 2.4, 2.5, 2.6, 2.7, 2.8, 2.9, 3.0, 3.1, 3.2, 3.3,
       3.4, 3.5, 3.6, 3.7, 3.8, 3.9, 4.0, 4.1, 4.2, 4.4], dtype=object), array([ 1,  3,  3,  3,  8,  5,  9, 12,  9, 25,  9, 10,  5, 10,  6,  3,  3,
        6,  1,  1,  1,  1,  1], dtype=int64))
(array([1.0, 1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.9, 3.0, 3.3, 3.5, 3.6,
       3.7, 3.8, 3.9, 4.0, 4.1, 4.2, 4.3, 4.4, 4.5, 4.6, 4.7, 4.8, 4.9,
       5.0, 5.1, 5.2, 5.3, 5.4, 5.5, 5.6, 5.7, 5.8, 5.9, 6.0, 6.1, 6.3,
       6.4, 6.6, 6.7, 6.9], dtype=object), array([ 1,  1,  2,  6, 12, 12,  6,  4,  1,  1,  2,  2,  1,  1,  1,  3,  4,
        2,  4,  1,  3,  8,  3,  5,  3,  5,  4,  7, 