### Imports 

In [1]:
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
import pandas as pd

### Load & Prep Data

The <a href='https://homepages.inf.ed.ac.uk/imurray2/teaching/oranges_and_lemons/'>Fruits Dataset</a> was originally created by Dr. Iain Murray from the University of Edinburgh and was more recently extended by the University of Michigan. It is a simple multi-class dataset with 4 columns (features) and 4 classes (fruits). The 4 classes are apple, orange, mandarin and lemon. The 4 features are the mass, width, height and color score of the fruit.

The color score feature encodes a fruit's color and its intensity as a value on a 0 - 1 color-spectrum scale. The table below shows which range corresponds to which color. <br><br>
<table align="left" style="width:50%">
    <tr>
        <th>Color</th>
        <th>Range</th>
    </tr>
    <tr>
        <td>Red</td>
        <td>0.85 - 1.00</td>
    </tr>
    <tr>
        <td>Orange</td>
        <td>0.75 - 0.85</td>
    </tr>
    <tr>
        <td>Yellow</td>
        <td>0.65 - 0.75</td>
    </tr>
    <tr>
        <td>Green</td>
        <td>0.45 - 0.65</td>
    </tr>
</table>

In [2]:
# The CSV is read without a header row: passing `names` makes pandas treat
# the first line of the file as data and label the columns ourselves.
df = pd.read_csv(
    './DATA/fruits.csv',
    names=['class', 'mass', 'width', 'height', 'color_score'],
)

In [3]:
# Preview the first five rows to confirm the columns were labelled as intended.
df.head()

Unnamed: 0,class,mass,width,height,color_score
0,apple,192,8.4,7.3,0.55
1,apple,180,8.0,6.8,0.59
2,apple,176,7.4,7.2,0.6
3,mandarin,86,6.2,4.7,0.8
4,mandarin,84,6.0,4.6,0.79


In [4]:
# (rows, columns) of the loaded frame.
df.shape

(59, 5)

In [5]:
# Distinct class labels present in the data, as a plain Python list.
df['class'].unique().tolist()

['apple', 'mandarin', 'orange', 'lemon']

In [6]:
# Separate the target label from the four numeric feature columns.
y = df['class']
X = df.loc[:, ['mass', 'width', 'height', 'color_score']]

##### Encode the classes into numerical values using Sklearn's LabelEncoder

In [7]:
# Learn the label -> integer mapping directly from the data instead of from a
# hand-maintained list, so a class added to the CSV cannot trigger an
# "unseen label" error at transform time.
# Reading from df['class'] (rather than from `y`, which this cell rebinds)
# keeps the cell safe to re-run: it never tries to encode already-encoded
# integers.
# LabelEncoder sorts the classes, so the integer codes are the same as when
# fitting on the explicit list of the four fruit names.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df['class'])

In [8]:
# Inspect the encoded target: one integer code per row of the dataset.
y

array([0, 0, 0, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

##### Split X, y into train and test sets

In [9]:
# Hold out 25% of the rows (scikit-learn's default, spelled out here) and fix
# the seed so the split is reproducible across runs.
# NOTE(review): the split is not stratified; with this few samples per class,
# consider passing stratify=y -- doing so would change the split shown below.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=123,
)

In [10]:
# (rows, feature columns) of the training split.
X_train.shape

(44, 4)

In [11]:
# (rows, feature columns) of the held-out test split.
X_test.shape

(15, 4)

##### Scale feature columns using Sklearn's MinMaxScaler

In [12]:
# Learn each feature's min/max from the training split only, then apply that
# same mapping to both splits so the test set does not influence the scaling.
scaler = MinMaxScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

In [13]:
# First training sample after scaling.
X_train[0]

array([0.32142857, 0.32352941, 0.50769231, 1.        ])

In [14]:
# Encoded class label of the first training sample.
y_train[0]

0

##### Combine Scaled X & y into Train and Test DataFrames 

In [15]:
# Wrap the scaled feature array and the encoded labels in labelled frames,
# then place them side by side with the label as the first column.
X_train = pd.DataFrame(
    data=X_train,
    columns=['mass', 'width', 'height', 'color_score'],
)
y_train = pd.DataFrame(data=y_train, columns=['class'])
train_df = pd.concat(objs=[y_train, X_train], axis='columns')
train_df.head()

Unnamed: 0,class,mass,width,height,color_score
0,0,0.321429,0.323529,0.507692,1.0
1,1,0.421429,0.411765,0.969231,0.323529
2,0,0.364286,0.382353,0.584615,0.970588
3,0,0.278571,0.352941,0.476923,0.852941
4,1,0.192857,0.058824,0.646154,0.352941


In [16]:
# Wrap the scaled test features and encoded test labels in labelled frames,
# then place them side by side with the label as the first column.
X_test = pd.DataFrame(
    data=X_test,
    columns=['mass', 'width', 'height', 'color_score'],
)
y_test = pd.DataFrame(data=y_test, columns=['class'])
test_df = pd.concat(objs=[y_test, X_test], axis='columns')
test_df.head()

Unnamed: 0,class,mass,width,height,color_score
0,1,0.142857,0.058824,0.538462,0.382353
1,3,0.371429,0.529412,0.646154,0.588235
2,0,0.314286,0.441176,0.569231,0.323529
3,1,0.157143,0.058824,0.676923,0.441176
4,3,0.457143,0.5,0.8,0.529412


##### Create a DataFrame for Batch Inference without the Class column

In [17]:
# Features-only copy of the test set: same rows, with the label column removed.
batch_test_df = test_df.drop(columns=['class'])
batch_test_df.head()

Unnamed: 0,mass,width,height,color_score
0,0.142857,0.058824,0.538462,0.382353
1,0.371429,0.529412,0.646154,0.588235
2,0.314286,0.441176,0.569231,0.323529
3,0.157143,0.058824,0.676923,0.441176
4,0.457143,0.5,0.8,0.529412


##### Write Train & Test Sets to Local Directory

In [18]:
# Persist each frame as a CSV with neither a header row nor an index column.
# train/test keep the label as their first column; batch_test has no label.
csv_outputs = (
    ('./DATA/train.csv', train_df),
    ('./DATA/test.csv', test_df),
    ('./DATA/batch_test.csv', batch_test_df),
)
for csv_path, frame in csv_outputs:
    frame.to_csv(csv_path, header=False, index=False)