## Prepare Data 
Import pandas and edit the dataframe

In [2]:
import pandas as pd

In [3]:
data = pd.read_csv('pima-indians-diabetes.csv') # read from directory
data = data.dropna(axis=0)

In [4]:
data.describe()

Unnamed: 0,6,148,72,35,0,33.6,0.627,50,1
count,767.0,767.0,767.0,767.0,767.0,767.0,767.0,767.0,767.0
mean,3.842243,120.859192,69.101695,20.517601,79.90352,31.990482,0.471674,33.219035,0.34811
std,3.370877,31.978468,19.368155,15.954059,115.283105,7.889091,0.331497,11.752296,0.476682
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.2435,24.0,0.0
50%,3.0,117.0,72.0,23.0,32.0,32.0,0.371,29.0,0.0
75%,6.0,140.0,80.0,32.0,127.5,36.6,0.625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [5]:
data.columns

Index(['6', '148', '72', '35', '0', '33.6', '0.627', '50', '1'], dtype='object')

In [6]:
y_column = ['1']
y = data[y_column] # could also do data.columnName but it's a number so not this time

In [7]:
x_columns = ['6', '148', '72', '35', '0', '33.6', '0.627', '50']
X = data[x_columns]

In [8]:
X.head()

Unnamed: 0,6,148,72,35,0,33.6,0.627,50
0,1,85,66,29,0,26.6,0.351,31
1,8,183,64,0,0,23.3,0.672,32
2,1,89,66,23,94,28.1,0.167,21
3,0,137,40,35,168,43.1,2.288,33
4,5,116,74,0,0,25.6,0.201,30


## Train test split

In [9]:
from sklearn.model_selection import train_test_split
train_X, test_X, train_y, test_y = train_test_split(X, y, random_state=256)

In [11]:
# Check for missing
missing_count = (train_X.isnull().sum())
print(missing_count[missing_count > 0])

Series([], dtype: int64)


In [14]:
(10000*100)-100

999900

## Simple Tree Model

In [12]:
from sklearn.tree import DecisionTreeRegressor
model = DecisionTreeRegressor(random_state=256)

In [13]:
model.fit(train_X, train_y) # fit model

DecisionTreeRegressor(random_state=256)

## Mean Absolute Error
How much we are off by for each prediction

In [14]:
from sklearn.metrics import mean_absolute_error
predictions = model.predict(X)
mae = mean_absolute_error(y, predictions)
print(mae)

0.09126466753585398


In [20]:
# Predict
print("Making predictions for the following 5 features:")
print(X.head())
print("\nThe predictions are")
print(model.predict(X.head()))

Making predictions for the following 5 features:
   6  148  72  35    0  33.6  0.627  50
0  1   85  66  29    0  26.6  0.351  31
1  8  183  64   0    0  23.3  0.672  32
2  1   89  66  23   94  28.1  0.167  21
3  0  137  40  35  168  43.1  2.288  33
4  5  116  74   0    0  25.6  0.201  30

The predictions are
[0. 1. 0. 1. 0.]


In [21]:
print("The actual values for the following 5 features:")
print(y.head())

The actual values for the following 5 features:
   1
0  0
1  1
2  0
3  1
4  0


## Controlling the # of leaf nodes

In [26]:
from sklearn.tree import DecisionTreeRegressor

# Create a function to return the mean absolute error
def find_mae(max_leaf_nodes, train_X, test_X, train_y, test_y):
    model = DecisionTreeRegressor(max_leaf_nodes = max_leaf_nodes, random_state=256)
    model.fit(train_X, train_y)
    
    predictions = model.predict(test_X)
    mae = mean_absolute_error(test_y, predictions)
    return mae

In [30]:
for max_leaf_nodes in [5, 50, 500, 5000]:
    mae = find_mae(max_leaf_nodes, train_X, test_X, train_y, test_y)
    print("Max leaf nodes: %d  \t\t Mean Absolute Error: %f" %(max_leaf_nodes, mae))

Max leaf nodes: 5  		 Mean Absolute Error: 0.359732
Max leaf nodes: 50  		 Mean Absolute Error: 0.325220
Max leaf nodes: 500  		 Mean Absolute Error: 0.338542
Max leaf nodes: 5000  		 Mean Absolute Error: 0.338542


Since 50 nodes has the least error, that's what I'll go with in the final tree

### Final Model

In [31]:
final_model = DecisionTreeRegressor(max_leaf_nodes=50, random_state=256)

In [33]:
final_model.fit(X, y) # Use all avaliable data

DecisionTreeRegressor(max_leaf_nodes=50, random_state=256)

In [36]:
predictions = model.predict(test_X)
mae = mean_absolute_error(test_y, predictions)

In [37]:
print(mae)

0.3645833333333333


# Iris Identification

In [23]:
import pandas as pd

In [38]:
iris = pd.read_csv('Iris.csv')

In [39]:
iris = iris.dropna(axis=0)

In [47]:
print(iris.describe())

               Id  SepalLengthCm  SepalWidthCm  PetalLengthCm  PetalWidthCm
count  150.000000     150.000000    150.000000     150.000000    150.000000
mean    75.500000       5.843333      3.054000       3.758667      1.198667
std     43.445368       0.828066      0.433594       1.764420      0.763161
min      1.000000       4.300000      2.000000       1.000000      0.100000
25%     38.250000       5.100000      2.800000       1.600000      0.300000
50%     75.500000       5.800000      3.000000       4.350000      1.300000
75%    112.750000       6.400000      3.300000       5.100000      1.800000
max    150.000000       7.900000      4.400000       6.900000      2.500000


In [40]:
iris.columns

Index(['Id', 'SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm',
       'Species'],
      dtype='object')

In [41]:
iris.head()

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa


In [58]:
species = iris.Species
y = []  # change to numbers
for s in species:
    if s == "Iris-setosa": y.append(0)
    elif s == "Iris-versicolor": y.append(1)
    else: y.append(2)

In [60]:
y[:5]

[0, 0, 0, 0, 0]

In [50]:
iris_features = ['Id', 'SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm']
X = iris[iris_features]

In [51]:
X.head()

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm
0,1,5.1,3.5,1.4,0.2
1,2,4.9,3.0,1.4,0.2
2,3,4.7,3.2,1.3,0.2
3,4,4.6,3.1,1.5,0.2
4,5,5.0,3.6,1.4,0.2


In [62]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=256)

In [64]:
from sklearn.tree import DecisionTreeRegressor
iris_model = DecisionTreeRegressor(random_state=256)
iris_model.fit(X_train, y_train)

DecisionTreeRegressor(random_state=256)

In [66]:
from sklearn.metrics import mean_absolute_error
predictions = iris_model.predict(X_test)
mae = mean_absolute_error(y_test, predictions)

In [67]:
print(mae)

0.02631578947368421


In [69]:
# Predict
print("Making predictions for the following 5 features:")
print(X.head())
print("\nThe predictions are")
print(iris_model.predict(X.head()))

Making predictions for the following 5 features:
   Id  SepalLengthCm  SepalWidthCm  PetalLengthCm  PetalWidthCm
0   1            5.1           3.5            1.4           0.2
1   2            4.9           3.0            1.4           0.2
2   3            4.7           3.2            1.3           0.2
3   4            4.6           3.1            1.5           0.2
4   5            5.0           3.6            1.4           0.2

The predictions are
[0. 0. 0. 0. 0.]


In [71]:
print("The actual values for the following 5 features:")
print(y[:5])

The actual values for the following 5 features:
[0, 0, 0, 0, 0]


In [77]:
iris_model = DecisionTreeRegressor(max_leaf_nodes=100, random_state=256)

In [78]:
iris_model.fit(X_train, y_train)

DecisionTreeRegressor(max_leaf_nodes=100, random_state=256)

In [79]:
predictions = iris_model.predict(X_test)
mae = mean_absolute_error(predictions, y_test) # order of arguments doesn't matter
print(mae)

0.02631578947368421
