In [53]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

## Create train test data set

In [85]:
dfColor = pd.read_csv("../data/clean/Colours.csv")
dfColor.head()

Unnamed: 0,Name0,Desc1,Desc2,Name1,HEX,R,G,B,Hue,Sat,Lum
0,black,Black,Black,Black,#000000,0,0,0,0.0,0.0,0.0
1,red,medium dark red,Dark red,Deep maroon,#820000,130,0,0,0.0,100.0,25.5
2,red,medium dark red,Dark red,Deep red,#850101,133,1,1,0.0,98.5,26.3
3,red,medium dark red,Dark red,Blood,#8a0303,138,3,3,0.0,95.7,27.6
4,red,medium dark red,Dark red,Darkred,#8B0000,139,0,0,0.0,100.0,27.3


In [86]:
dfColor['index'] = dfColor['HEX'].apply(lambda x: int(x.lstrip('#'),16))
dfColor.set_index('index', inplace=True, verify_integrity=True)
dfColor.head()

Unnamed: 0_level_0,Name0,Desc1,Desc2,Name1,HEX,R,G,B,Hue,Sat,Lum
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,black,Black,Black,Black,#000000,0,0,0,0.0,0.0,0.0
8519680,red,medium dark red,Dark red,Deep maroon,#820000,130,0,0,0.0,100.0,25.5
8716545,red,medium dark red,Dark red,Deep red,#850101,133,1,1,0.0,98.5,26.3
9044739,red,medium dark red,Dark red,Blood,#8a0303,138,3,3,0.0,95.7,27.6
9109504,red,medium dark red,Dark red,Darkred,#8B0000,139,0,0,0.0,100.0,27.3


In [29]:
dfColor.columns

Index(['Name0', 'Desc1', 'Desc2', 'Name1', 'HEX', 'R', 'G', 'B', 'Hue', 'Sat',
       'Lum', 'index'],
      dtype='object')

In [56]:
dfColor.drop(['Desc1', 'Desc2', 'Name1', 'HEX', 'R', 'G', 'B'], axis=1)

Unnamed: 0_level_0,Name0,Hue,Sat,Lum
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,black,0.0,0.0,0.0
8519680,red,0.0,100.0,25.5
8716545,red,0.0,98.5,26.3
9044739,red,0.0,95.7,27.6
9109504,red,0.0,100.0,27.3


In [117]:
train_set, test_set = train_test_split(dfColor, test_size=0.2, random_state=42)

In [76]:
dfColor['Name0'].value_counts()

brown     391
purple    319
grey      308
pink      295
blue      261
green     251
red       155
yellow    117
orange    107
black      62
cyan       48
white      36
Name: Name0, dtype: int64

In [77]:
train_set['Name0'].value_counts()

brown     304
purple    259
grey      247
pink      242
blue      205
green     204
red       122
yellow     97
orange     81
black      51
cyan       41
white      27
Name: Name0, dtype: int64

In [78]:
test_set['Name0'].value_counts()

brown     87
grey      61
purple    60
blue      56
pink      53
green     47
red       33
orange    26
yellow    20
black     11
white      9
cyan       7
Name: Name0, dtype: int64

In [80]:
train_set.to_csv("../data/interim/train.csv")
test_set.to_csv("../data/interim/test.csv")

## Categorical Attributes

In [43]:
encoder = LabelEncoder()
colour_cat = dfColor['Name0']
colour_encoded = encoder.fit_transform(colour_cat)
colour_encoded

array([0, 9, 9, ..., 9, 9, 9])

In [130]:
print(encoder.classes_)

['black' 'blue' 'brown' 'cyan' 'green' 'grey' 'orange' 'pink' 'purple'
 'red' 'white' 'yellow']


### OneHotEncoder

In [45]:
from sklearn.preprocessing import OneHotEncoder

In [46]:
encoder = OneHotEncoder()
colour_1hot = encoder.fit_transform(colour_encoded.reshape(-1,1))
colour_1hot.toarray()

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


array([[1., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 1., 0., 0.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 1., 0., 0.]])

In [7]:
from sklearn.preprocessing import LabelBinarizer

In [8]:
encoder = LabelBinarizer(sparse_output=True)
colour_1hot = encoder.fit_transform(colour_cat)
colour_1hot

<2351x12 sparse matrix of type '<class 'numpy.int64'>'
	with 2351 stored elements in Compressed Sparse Row format>

In [125]:
encoder = LabelEncoder()
colour_cat = train_set['Name0']
colour_encoded = encoder.fit_transform(colour_cat)
colour_encoded

array([7, 2, 9, ..., 4, 1, 4])

## Custom Transformer

In [67]:
from sklearn.base import BaseEstimator, TransformerMixin

In [82]:
class DataFrameSelector(BaseEstimator, TransformerMixin):
    def __init__(self, attribute_names):
        self.attribute_names = attribute_names
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X[self.attribute_names].values

In [114]:
class CustomLabelBinarizer(BaseEstimator, TransformerMixin):
    def __init__(self, sparse_output=False):
        self.sparse_output = sparse_output
    def fit(self, X, y=None):
        self.enc = LabelBinarizer(sparse_output=self.sparse_output)
        self.enc.fit(X)
        return self
    def transform(self, X, y=None):
        return self.enc.transform(X)

## Feature Scaling

In [74]:
from sklearn import impute

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import FeatureUnion

In [135]:
num_attribs = ['Hue', 'Sat', 'Lum']
cat_attribs = ['Name0']

In [137]:
num_pipeline = Pipeline([
    ('selector', DataFrameSelector(num_attribs)),
    ('imputer', impute.SimpleImputer(strategy="median")),
    ('std_scaler', StandardScaler()),
])

cat_pipeline = Pipeline([
    ('selector', DataFrameSelector(cat_attribs)),
    ('label_binerizer', CustomLabelBinarizer()),
])

full_pipeline = FeatureUnion(transformer_list=[
    ("num_pipeline", num_pipeline),
#    ("cat_pipeline", cat_pipeline),
])

In [138]:
colour_prepared = full_pipeline.fit_transform(train_set)
colour_prepared

array([[ 1.15946241,  1.1321029 ,  0.3771856 ],
       [-1.0435037 ,  1.1321029 ,  2.01112291],
       [-1.18512295,  0.79481375, -0.51676187],
       ...,
       [-0.03643348, -1.33370997, -0.71541686],
       [ 0.22320181,  1.1321029 , -0.99353385],
       [-0.79960388,  0.77189119, -0.293275  ]])

In [139]:
col_labels = train_set['Name0']

# Linear Regression

In [141]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error

In [143]:
tree_reg = DecisionTreeRegressor()
tree_reg.fit(colour_prepared, colour_encoded)

DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=None, splitter='best')

In [145]:
some_data = train_set.iloc[:5]
some_labels = colour_cat.iloc[:5]
some_data_prepared = full_pipeline.transform(some_data)
print("Predictions:\t", tree_reg.predict(some_data_prepared))
print("Labels:\t", list(some_labels))

Predictions:	 [7. 2. 9. 4. 5.]
Labels:	 ['pink', 'brown', 'red', 'green', 'grey']


In [146]:
encoder.classes_

array(['black', 'blue', 'brown', 'cyan', 'green', 'grey', 'orange',
       'pink', 'purple', 'red', 'white', 'yellow'], dtype=object)

In [147]:
col_predictions = tree_reg.predict(colour_prepared)
lin_mse = mean_squared_error(colour_encoded, col_predictions)
lin_rmse = np.sqrt(lin_mse)
lin_rmse

0.03261640365267211

In [148]:
from sklearn.model_selection import cross_val_score
scores = cross_val_score(tree_reg, colour_prepared, colour_encoded, scoring="neg_mean_squared_error", cv=10)
rmse_scores = np.sqrt(-scores)

In [149]:
def display_scores(scores):
    print("Scores", scores)
    print("Mean", scores.mean())
    print("Standard deviation:", scores.std())

In [150]:
display_scores(rmse_scores)

Scores [2.06155281 1.57270603 1.83928756 1.54713187 1.85369097 1.89062637
 1.7351191  1.67109341 1.69793366 1.5911992 ]
Mean 1.7460340990838983
Standard deviation: 0.15568322087435096


# Random Forest

In [151]:
from sklearn.ensemble import RandomForestClassifier

In [154]:
forrest_classifier = RandomForestClassifier()
forrest_classifier.fit(colour_prepared, col_labels)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [155]:
scores = cross_val_score(tree_reg, colour_prepared, colour_encoded, scoring="neg_mean_squared_error", cv=10)
rmse_scores = np.sqrt(-scores)

In [156]:
display_scores(rmse_scores)

Scores [2.00928695 1.67268417 1.66311679 1.5088392  1.67427342 2.06541944
 1.70418758 1.67903214 1.73205081 1.54368997]
Mean 1.7252580473324517
Standard deviation: 0.16971687909806307
