In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# ___An Example Machine Learning Problem___
----------------

In [2]:
# Let's build a very simple fruit recognition model.
# This fruit dataset was created by Dr. Iain Murray from the University of Edinburgh.

In [6]:
# Load the tab-separated fruit measurements into a DataFrame.
fruits = pd.read_csv("../resources/assets/fruit_data_with_colors.txt", sep="\t")

In [14]:
# Preview the first five records.
fruits.head(n=5)

Unnamed: 0,fruit_label,fruit_name,fruit_subtype,mass,width,height,color_score
0,1,apple,granny_smith,192,8.4,7.3,0.55
1,1,apple,granny_smith,180,8.0,6.8,0.59
2,1,apple,granny_smith,176,7.4,7.2,0.6
3,2,mandarin,mandarin,86,6.2,4.7,0.8
4,2,mandarin,mandarin,84,6.0,4.6,0.79


In [15]:
# Count missing values per column; every column reports zero.
fruits.isnull().sum()

fruit_label      0
fruit_name       0
fruit_subtype    0
mass             0
width            0
height           0
color_score      0
dtype: int64

In [19]:
# Distinct fruit names present in the dataset.
fruits["fruit_name"].unique()

array(['apple', 'mandarin', 'orange', 'lemon'], dtype=object)

In [20]:
# Distinct fruit subtypes present in the dataset.
fruits["fruit_subtype"].unique()

array(['granny_smith', 'mandarin', 'braeburn', 'golden_delicious',
       'cripps_pink', 'spanish_jumbo', 'selected_seconds', 'turkey_navel',
       'spanish_belsan', 'unknown'], dtype=object)

In [23]:
# Column names of the dataset: the label/name/subtype columns plus the
# measured features (mass, width, height, color_score).
fruits.columns

Index(['fruit_label', 'fruit_name', 'fruit_subtype', 'mass', 'width', 'height',
       'color_score'],
      dtype='object')

In [26]:
# Show the label, name and subtype of the first ten records.
# `.loc[:10]` slices by label and includes the end point, so on the default
# integer index it returns eleven rows (0 through 10); `.head(10)` returns
# exactly ten.
fruits[["fruit_label", "fruit_name", "fruit_subtype"]].head(10)

Unnamed: 0,fruit_label,fruit_name,fruit_subtype
0,1,apple,granny_smith
1,1,apple,granny_smith
2,1,apple,granny_smith
3,2,mandarin,mandarin
4,2,mandarin,mandarin
5,2,mandarin,mandarin
6,2,mandarin,mandarin
7,2,mandarin,mandarin
8,1,apple,braeburn
9,1,apple,braeburn


In [27]:
# Each record in the table corresponds to an object.
# Each column corresponds to a fruit feature.
# Fruit label is a numeric encoding of the nominal (unordered) categorical fruit name feature.
# The colour score column holds a single real number in the range 0 to 1.0 that gives the fruit's colour an oversimplified numeric score:
    # red - 0.85 - 1.00
    # orange 0.75 - 0.85
    # yellow 0.65 - 0.75
    # green 0.45 - 0.65

# The records that have "unknown" in the fruit_subtype column are the candidates for the test dataset.

### ___&darr; This is an alternative to `train_test_split`, since we have to split the dataset based on a specific criterion.___

In [30]:
# Hold out every record whose subtype is "unknown" as the test set;
# all remaining records form the training set.
train = fruits[fruits["fruit_subtype"].ne("unknown")]
test = fruits[fruits["fruit_subtype"].eq("unknown")]

In [35]:
# (rows, columns) of the training and test partitions, in that order.
(train.shape, test.shape)

((49, 7), (10, 7))

In [36]:
# We have 49 records in the training dataset and 10 records in the test dataset.