In [5]:
from sklearn import set_config
from numpy.random import seed
from tensorflow import keras
from keras import backend as K, callbacks, layers
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler, PowerTransformer
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
from sklearn.pipeline import Pipeline

from sklearn.base import TransformerMixin, BaseEstimator
import warnings
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import tensorflow as tf
import numpy as np

print("Tensorflow version: " + tf.__version__)
plt.rc('figure', autolayout=True)
plt.rc('axes', labelweight='bold', labelsize='large',
       titleweight='bold', titlesize=14, titlepad=10)
%matplotlib inline
warnings.filterwarnings('ignore')
seed(42)
tf.random.set_seed(42)
set_config(display='diagram')


AttributeError: partially initialized module 'pandas' has no attribute 'core' (most likely due to a circular import)

Data input


In [None]:
# Load the raw training data; the file is read with ';' as the field separator.
df = pd.read_csv('input/train.csv', sep = ';')

In [None]:
# Column names, dtypes and non-null counts of the raw frame.
df.info()

In [None]:
# Preview the first rows of the raw frame.
df.head()

In [None]:
# True if at least one cell anywhere in the frame is missing.
df.isnull().values.any()

In [None]:
def diagnostic_plots(df, variable,target):
    """Show four side-by-side diagnostic plots for one column of ``df``.

    Panels, left to right: histogram of ``variable`` (30 bins), scatter of
    ``variable`` against ``target``, box plot of ``variable``, and a bar plot
    with ``target`` on the x axis and ``variable`` on the y axis.

    Args:
        df (DataFrame): Frame containing both columns.
        variable (str): Name of the column to inspect.
        target (str): Name of the target column.
    """
    # One 15x4 inch figure holding a single row of four axes.
    fig, (ax_hist, ax_scatter, ax_box, ax_bar) = plt.subplots(1, 4, figsize=(15, 4))

    sns.histplot(df[variable], bins=30, color='r', ax=ax_hist)
    ax_hist.set_title('Histogram')

    ax_scatter.scatter(df[variable], df[target], color='g')
    ax_scatter.set_title('Scatterplot')

    sns.boxplot(y=df[variable], color='b', ax=ax_box)
    ax_box.set_title('Boxplot')

    sns.barplot(x=target, y=variable, data=df, ax=ax_bar)
    ax_bar.set_title('Barplot')

    plt.show()
# Diagnostic plots for every column except the categorical 'type' column.
for variable in df.drop(columns='type').columns:
    diagnostic_plots(df, variable, target='quality')

In [None]:
# Skewness of each numeric column. `numeric_only=True` is required because the
# frame still contains the categorical 'type' column: pandas >= 2.0 raises a
# TypeError on non-numeric columns instead of silently skipping them.
df.skew(axis = 0, numeric_only = True)

In [None]:
# Kurtosis of each numeric column. `numeric_only=True` is required because the
# frame still contains the categorical 'type' column: pandas >= 2.0 raises a
# TypeError on non-numeric columns instead of silently skipping them.
df.kurtosis(axis = 0, numeric_only = True)

In [None]:
# Count of wines per quality score, one panel per value of 'type'.
sns.catplot(
    data=df,
    x='quality',
    col='type',
    kind='count',
    palette='magma',
)


In [None]:
# Histogram grid of the frame's columns for a quick look at their distributions.
df.hist()

In [None]:
# Annotated correlation heatmap of the numeric columns.
plt.figure(figsize = (10,8))
sns.set(font_scale=1.25)
# Correlation is only defined for numeric data. The frame still holds the
# categorical 'type' column, which makes `df.corr()` raise on pandas >= 2.0,
# so the numeric columns are selected explicitly (works on every pandas version).
numeric_corr = df.select_dtypes(include='number').corr()
sns.heatmap(numeric_corr, cmap = 'YlGnBu', cbar_kws = {'shrink': 1}, annot = True, cbar = True, fmt = '.2f', annot_kws={'size': 10}, square = True) 

## Preprocessing


Both `ColumnTransformer` and `Pipeline` objects are themselves transformers, so they can be nested inside each other and combined further as needed.

In [None]:
# One single-step pipeline per preprocessing strategy:
#   transformer_cat    - one-hot encoding; categories unseen at fit time are ignored
#   transformer_minmax - rescaling to the [0, 1] range
#   transformer_std    - standardisation
#   transformer_yeoj   - Yeo-Johnson power transform, standardised afterwards
transformer_cat = make_pipeline(OneHotEncoder(handle_unknown='ignore'))
transformer_minmax = make_pipeline(MinMaxScaler(feature_range=(0, 1)))
transformer_std = make_pipeline(StandardScaler())
transformer_yeoj = make_pipeline(PowerTransformer(method='yeo-johnson', standardize=True))

In [None]:
# Column groups consumed by the preprocessing transformers.
# Every column except the categorical 'type' (this still includes 'quality').
features = list(df.drop('type', axis = 1).columns.values)
# Categorical column that gets one-hot encoded.
features_cat = ['type']
# Numeric columns that are standardised.
features_std = ['citric acid', 'total sulfur dioxide', 'density', 'pH']
# Numeric columns that get the Yeo-Johnson power transform.
features_yeoj = ['fixed acidity', 'volatile acidity', 'residual sugar', 'chlorides', 'free sulfur dioxide', 'sulphates', 'alcohol']
# All numeric predictors (everything except 'type' and the 'quality' target),
# in the frame's original column order.
features_minmax = df.drop(['type', 'quality'], axis = 1).columns.values.tolist()

# The two hand-written groups must cover the numeric predictors exactly;
# later cells rely on this when relabelling the transformed columns.
assert set(features_std + features_yeoj) == set(features_minmax), (
    "features_std + features_yeoj must cover exactly the numeric predictors"
)

print(features)
print(features_cat)
print(features_minmax)
print(features_std)
print(features_yeoj)

In [None]:
# Stand-alone column transformers: each one transforms a single feature group
# and passes every other column through unchanged.
cat = make_column_transformer(
    (transformer_cat, features_cat),
    remainder='passthrough'
)

std = make_column_transformer(
    (transformer_std, features_std),
    remainder='passthrough'
)

yeoj = make_column_transformer(
    (transformer_yeoj, features_yeoj),
    remainder='passthrough'
)

# First pipeline stage: applies all three group transforms in one pass.
# Output columns follow the declaration order below (Yeo-Johnson block,
# standardised block, one-hot block). No `remainder` is given, so columns that
# are not listed (e.g. 'quality') are not part of the output.
cols_transformer = make_column_transformer(
    (transformer_yeoj, features_yeoj),
    (transformer_std, features_std), 
    (transformer_cat, features_cat)
)

# Second pipeline stage: min-max scale the numeric columns produced by
# `cols_transformer`, addressed by position, and pass the one-hot columns
# through. The count is derived from the feature lists so that every numeric
# column is covered; the previous hard-coded range(10) stopped one column
# short of the 11 numeric outputs and left the last standardised feature unscaled.
n_numeric_outputs = len(features_yeoj) + len(features_std)
minmax = make_column_transformer(
    (transformer_minmax, list(range(n_numeric_outputs))),
    remainder='passthrough'
)

class Debug(BaseEstimator, TransformerMixin):
    """Pass-through transformer that prints what flows through a pipeline.

    Place it between two pipeline steps to print the shape and the first row
    of the intermediate data. The data itself is returned unchanged.
    """

    def transform(self, X):
        """Print ``X.shape`` and the first row of ``X``, then return ``X``.

        The shape is also stored on ``self.shape``.

        Args:
            X: Array-like or DataFrame exposing ``.shape``.

        Returns:
            The same object ``X``, unmodified.
        """
        print(X.shape)
        self.shape = X.shape
        # Nothing to preview for empty input.
        if X.shape[0] > 0:
            # `X[0]` is a *column* lookup on a DataFrame, so use positional
            # row access there; arrays keep plain indexing.
            first_row = X.iloc[0] if hasattr(X, 'iloc') else X[0]
            print(first_row)
        return X

    def fit(self, X, y=None, **fit_params):
        """Learn nothing; present only to satisfy the transformer interface."""
        return self
    
# Two-stage preprocessing pipeline: the per-group column transforms run first,
# then the min-max scaling step runs on their output.
pipe = Pipeline(
    steps = [
        ("cols transformer", cols_transformer),
        ("min max scaler", minmax),
    ]
)

# Scratch notes kept from development (column-index order after successive
# transformer stages - presumably from an earlier index-based setup; verify
# before relying on them). They are stored as comments because a bare string
# at the end of the cell was echoed as the cell's output.
#   12 0 1 2 3 4 5 6 7 8 9 10 11
#   2 6 7 8 12 0 1 3 4 5 9 10 11
#   0 1 3 4 5 9 10 11 2 6 7 8 12



In [None]:
# Fit the preprocessing pipeline on the complete raw frame and wrap the result
# in a DataFrame; at this point the columns carry default integer labels.
data = pd.DataFrame(pipe.fit_transform(df))

In [None]:
# Preview the transformed values (columns are still unlabelled here).
data.head()

In [None]:
# `cols_transformer` emits its blocks in declaration order: Yeo-Johnson
# features first, then the standardised features, then the one-hot columns.
# The labels must follow that same order, otherwise every numeric column
# ends up with the wrong name.
data.columns = features_yeoj + features_std + ['red', 'white']
quality = df['quality']
# Restore the original feature order, then append the target and the
# one-hot 'type' columns.
original_feature_order = df.drop(['type', 'quality'], axis = 1).columns.values.tolist()
data = pd.concat([data[original_feature_order],
                  quality, data[['red', 'white']]], axis = 1)
data.head()

In [None]:
# Same diagnostic plots on the transformed data, skipping the one-hot columns.
for variable in data.drop(columns=['red', 'white']).columns:
    diagnostic_plots(data, variable, target='quality')

In [None]:
# Skewness per column after preprocessing, for comparison with the raw data.
data.skew(axis = 0)

In [None]:
# Kurtosis per column after preprocessing, for comparison with the raw data.
data.kurtosis(axis = 0)

In [None]:
def split(data):
    """Partition ``data`` into train, validation and test subsets.

    The rows are first divided 80/20; the 20% hold-out is then cut in half,
    which gives an overall 80/10/10 split. Both steps use ``random_state=42``.

    Args:
        data (DataFrame): Data to split.

    Returns:
        tuple: ``(train, val, test)``.
    """
    train, holdout = train_test_split(data, test_size=0.2, random_state=42)
    val, test = train_test_split(holdout, test_size=0.5, random_state=42)
    return train, val, test

In [None]:
# Split the raw (untransformed) frame into train / validation / test subsets.
train, val, test = split(df)

In [None]:
# (rows, columns) of each subset, to check the split proportions.
train.shape, val.shape, test.shape

In [None]:
def split_target(data):
    """Separate the ``quality`` target from the remaining feature columns.

    Args:
        data (DataFrame): Frame that contains a ``quality`` column.

    Returns:
        tuple: ``(X, y)`` where ``X`` is ``data`` without the ``quality``
        column and ``y`` is the ``quality`` column.
    """
    predictors = data.drop(columns='quality')
    target = data['quality']
    return predictors, target

def transform_features(X, fit=True):
    """Run a feature frame through the preprocessing pipeline ``pipe``.

    Args:
        X (DataFrame): Feature columns (without the ``quality`` target).
        fit (bool): When True (the default) the pipeline is fitted on ``X``
            before transforming. Pass False for validation/test data so they
            are transformed with the parameters learned from the training
            data instead of being re-fitted on themselves.

    Returns:
        DataFrame: Transformed features, labelled in the order the pipeline
        emits them and carrying the same row index as ``X``.
    """
    values = pipe.fit_transform(X) if fit else pipe.transform(X)
    # The first pipeline stage emits the Yeo-Johnson block, then the
    # standardised block, then the one-hot columns; the min-max stage keeps
    # that order. Labelling in the frame's original column order would
    # attach the wrong names to the numeric columns.
    columns = features_yeoj + features_std + ['red', 'white']
    # Keep X's index so the features stay aligned with the target Series.
    return pd.DataFrame(values, index=X.index, columns=columns)

def split_transform(data, fit=True):
    """Split ``data`` into features and target, then transform the features.

    Args:
        data (DataFrame): Frame containing the features and ``quality``.
        fit (bool): Forwarded to ``transform_features``; use False for any
            data other than the training set.

    Returns:
        tuple: ``(X_transformed, y)``.
    """
    X, y = split_target(data)
    return transform_features(X, fit=fit), y

# Fit the preprocessing on the training split only, then reuse that fitted
# pipeline for the validation split so both are scaled consistently and no
# validation statistics leak into the preprocessing.
X_train, y_train = split_transform(train)
X_val, y_val = split_transform(val, fit=False)

X_train.shape, y_train.shape, X_val.shape, y_val.shape

In [None]:
# Preview the transformed training features.
X_train.head()


In [None]:
# Preview the transformed validation features.
X_val.head()

# Define model


In [None]:
from keras.layers import (
    Dense, Dropout, BatchNormalization
)
def build_model(hp):
    """Build and compile the regression network searched by KerasTuner.

    Architecture: 13 inputs -> five ReLU ``Dense`` layers of identical width
    -> ``Dense(1)``. Hidden layers 2-5 are each followed by ``Dropout``;
    layers 2-4 are additionally followed by ``BatchNormalization``.

    Tuned hyperparameters:
        units   -- width of every hidden layer, 256..1024 in steps of 32
        dropout -- rate of every dropout layer, 0.0..0.5 in steps of 0.1
        lr      -- Adam learning rate, log-sampled from 1e-4..1e-2

    Args:
        hp: KerasTuner ``HyperParameters`` object supplied by the tuner.

    Returns:
        A compiled ``keras.Model`` (MSE loss, RMSE metric).
    """
    # KerasTuner identifies hyperparameters by name, so requesting 'units' or
    # 'dropout' repeatedly yields the value that is already registered. Each
    # is therefore registered once and shared by all layers, which is the
    # search space the repeated per-layer calls produced as well. Use
    # distinct names (e.g. f'units_{i}') if per-layer tuning is wanted.
    units = hp.Int('units', min_value = 256, max_value = 1024, step = 32)
    dropout_rate = hp.Float('dropout', min_value = 0.0, max_value = 0.5, step = 0.1)
    learning_rate = hp.Float("lr", min_value = 1e-4, max_value = 1e-2, sampling = "log")

    # 13 input features - presumably the 11 numeric columns plus the two
    # one-hot 'type' columns; keep in sync with the preprocessing output.
    inputs = keras.Input(shape=(13,))

    # First hidden layer: no dropout or normalisation.
    x = Dense(units = units, activation = 'relu')(inputs)

    # Four further hidden layers, each followed by dropout; all but the last
    # are also batch-normalised.
    n_regularised_layers = 4
    for layer_idx in range(n_regularised_layers):
        x = Dense(units = units, activation = 'relu')(x)
        x = Dropout(dropout_rate)(x)
        if layer_idx < n_regularised_layers - 1:
            x = BatchNormalization()(x)

    # Single linear unit producing the predicted quality.
    outputs = layers.Dense(1)(x)
    model = keras.Model(inputs, outputs)
    model.compile(
        optimizer = keras.optimizers.Adam(learning_rate),
        loss = 'mse',
        metrics = [tf.keras.metrics.RootMeanSquaredError()],
    )
    return model

## Callbacks

In [None]:
# Both callbacks watch the validation loss, the same quantity the tuner
# optimises, rather than the training loss: the training loss keeps falling
# while a model overfits, so it is a poor signal for stopping or for choosing
# the weights to restore.

# Stop when the validation loss has not improved by at least 0.001 for 10
# consecutive epochs and roll back to the best weights seen.
early_stopping = callbacks.EarlyStopping(
    monitor = 'val_loss',
    patience = 10,
    min_delta = 0.001,
    restore_best_weights = True,
    verbose = 1
)

# Multiply the learning rate by 0.2 after 5 epochs without improvement.
# The floor has to sit below the tuned learning-rate range (1e-4 .. 1e-2):
# with the previous floor of 1e-3, trials starting at or below 1e-3 could
# never be reduced at all.
lr_schedule = callbacks.ReduceLROnPlateau(
    monitor = 'val_loss',
    patience = 5,
    factor = 0.2,
    min_lr = 1e-6,
    verbose = 1
)

In [None]:
# TensorBoard callback that writes its logs under ./logs.
from keras.callbacks import TensorBoard
tensorboard = TensorBoard(log_dir = './logs')

# Hyperband

In [None]:
import keras_tuner as kt
# `kerastuner` is the package's former import name, kept only as a deprecated
# alias; import from `keras_tuner`, the name already used on the line above.
from keras_tuner import Hyperband

# Hyperband search over the space defined in `build_model`, ranking trials by
# validation loss. Every configuration is trained 3 times
# (`executions_per_trial`), and `overwrite=True` discards any earlier results
# stored under my_dir/wine_quality.
tuner = Hyperband(
    build_model,
    objective='val_loss',
    max_epochs=1000,
    executions_per_trial=3,
    overwrite=True,
    directory='my_dir',
    project_name = 'wine_quality'
)

In [None]:
# Print the hyperparameters the tuner will search over and their ranges.
tuner.search_space_summary()

In [None]:
# Run the hyperparameter search: trials are trained on the training split and
# evaluated on the validation split, with early stopping, learning-rate
# reduction and TensorBoard logging attached to every trial.
# NOTE(review): Hyperband assigns its own epoch budget to each trial, so
# `epochs = 1000` is presumably overridden - confirm against the KerasTuner docs.
tuner.search(X_train, y_train, epochs = 1000, validation_data = (X_val, y_val), callbacks = [early_stopping, lr_schedule, tensorboard])