# IMPORT LIBRARIES

In [None]:
import sklearn.datasets
import pandas as pd
import numpy as np

import os

from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import RidgeClassifier
from sklearn.svm import SVC
from sklearn.tree  import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB


import pickle 

# IMPORT DATA

In [140]:
def data_connection():
    # data_connection.py
    # import sklearn.datasets
    # import pandas as pd

    # importing data from sklearn library
    data_dict = sklearn.datasets.load_iris()

    # Creating data frame with X values and with his features names
    df_data_raw = pd.DataFrame(data_dict.data, columns = data_dict.feature_names)

    # Creating the Y column
    df_data_raw['target'] = data_dict.target

    # Creating a dictionary with the Y value and his class
    dict_target_names = dict(zip(df_data_raw['target'].unique(), data_dict.target_names))
    
    # Creating a columns in the dataframe with the target classes names
    df_data_raw['class'] = df_data_raw['target'].map(dict_target_names)

    # Exporting the raw data dataframe to posterior use
    df_data_raw.to_csv(r'input/raw_data.csv', index=False)

data_connection()

# CREATING FOLDS

In [163]:
# data_splitter.py

    # import os

    # import pandas as pd
    # from sklearn import model_selection

CSV_NAME = 'raw_data.csv'
CSV_DIR = 'input'

# Reads raw data csv from the input folder
df = pd.read_csv(os.path.join(CSV_DIR,CSV_NAME))

# Shuffle
df = df.sample(frac=1, replace = False).reset_index(drop=True)

# Initialize the fold class
kf = model_selection.KFold(n_splits = 5)

# Create  kfold column
df['kfold'] = 0

# Fill the kfold column
for fold, (trn_, val_) in enumerate(kf.split(X=df)):
    df.loc[val_, 'kfold'] = fold

# Print distribution of classes in each fold
print(pd.pivot_table(df, index='kfold', columns='class', aggfunc='size').reset_index(drop=True))

# Export to input folder the train data splitted in N folds
df.to_csv('input/raw_data_splitted.csv', index = False)

class  setosa  versicolor  virginica
0           8          10         12
1           7          13         10
2           9          10         11
3          10          11          9
4          16           6          8


# FINDING BEST MODELS CANDIDATES

In [5]:
    # from sklearn.linear_model import LogisticRegression
    # from sklearn.linear_model import RidgeClassifier
    # from sklearn.svm import SVC
    # from sklearn.tree  import DecisionTreeClassifier
    # from sklearn.naive_bayes import GaussianNB

    # import pandas as pd
    # import numpy as np

def model_explorer(df, model):
    # Initialize Linear Regression Model class
    clf = model()

    # Training on fold 0 and test in folds 1,2,3,4
    X_train = df.loc[df['kfold'] == 0].iloc[:, :-3] 
    Y_train = df.loc[df['kfold'] == 0].loc[:, 'target']
    clf_fit = model().fit(X_train, Y_train)

    # Print coeficient of determinario R² for other folds of the model
    print('== RESULTS FROM {model}'.format(model = model.__name__))
    scores = np.array([])
    for fold in range(1,5):
        X = df.loc[df['kfold'] == fold].iloc[:, :-3] 
        Y = df.loc[df['kfold'] == fold].loc[:, 'target']
        score = clf_fit.score(X,Y)
        print('Score in fold {fold} is {score:.2%}.'.format(fold = fold, score = score))
        scores = np.append(arr = scores, values = score)
    print('Average is {avg:.2f} and deviation is {std:.2f}\n'.format(avg = np.mean(scores), std = np.std(scores)))


df = pd.read_csv('input/raw_data_splitted.csv')

model_explorer(df,LogisticRegression)
model_explorer(df,RidgeClassifier)
model_explorer(df,SVC)
model_explorer(df,DecisionTreeClassifier)
model_explorer(df,GaussianNB)

== RESULTS FROM LogisticRegression
Score in fold 1 is 86.67%.
Score in fold 2 is 83.33%.
Score in fold 3 is 90.00%.
Score in fold 4 is 100.00%.
Average is 0.90 and deviation is 0.06

== RESULTS FROM RidgeClassifier
Score in fold 1 is 73.33%.
Score in fold 2 is 73.33%.
Score in fold 3 is 86.67%.
Score in fold 4 is 86.67%.
Average is 0.80 and deviation is 0.07

== RESULTS FROM SVC
Score in fold 1 is 93.33%.
Score in fold 2 is 90.00%.
Score in fold 3 is 93.33%.
Score in fold 4 is 96.67%.
Average is 0.93 and deviation is 0.02

== RESULTS FROM DecisionTreeClassifier
Score in fold 1 is 86.67%.
Score in fold 2 is 96.67%.
Score in fold 3 is 93.33%.
Score in fold 4 is 96.67%.
Average is 0.93 and deviation is 0.04

== RESULTS FROM GaussianNB
Score in fold 1 is 93.33%.
Score in fold 2 is 93.33%.
Score in fold 3 is 93.33%.
Score in fold 4 is 100.00%.
Average is 0.95 and deviation is 0.03



Best models are: GaussianNB, DecisionTreeClassifier and SVC

# Exploring Hyper Parameters

Post Poned, business need this model in production ASAP.

# TRAINING AND SAVING

In [76]:
# Training an GaussianNB model and saving locally

from sklearn.naive_bayes import GaussianNB

import pandas as pd
import pickle

MODELS_DIRECTORY = 'models/'


# depois fazer isso aceitar hiperparametros de treino
def model_trainer_no_hp(df_train_data_preprocessed, model):
    X_train = df_train_data_preprocessed.iloc[:, :-3] 
    Y_train = df_train_data_preprocessed.loc[:, 'target']

    model_initialized = model()
    model_fitted = model_initialized.fit(X_train, Y_train)
    return model_fitted


# Save the fitted model in the models folder with name specified
def model_saver(model_fitted, models_folder, model_name):
    pickle.dump(model_fitted, open(models_folder + "{model_name}.pkl".format(model_name = model_name), 'wb'))

# testing both functions
train_data = pd.read_csv('input/raw_data_splitted.csv')
train_data = train_data.loc[train_data['kfold'] == 0]

model_trained = model_trainer_no_hp(train_data, GaussianNB)

model_saver(model_trained, MODELS_DIRECTORY, 'plain_gaussian')

# MODEL INFERENCE

In [173]:
# Loads Models and uses it based on entrance

    # import pickle 

MODELS_DIRECTORY = 'models/'

def model_loader(models_folder, models_name):
    return pickle.load(open(models_folder + models_name + '.pkl', 'rb'))
    
loaded_model = model_loader(MODELS_DIRECTORY, 'plain_gaussian')
X = np.array([5.5,2.3,4.0,1.3])
classes_name_dict = {0: 'setosa', 1: 'versicolor', 2: 'virginica'}

def model_predict(X, classes_name_dict, loaded_model):
    X = np.array([X])
    target_value = loaded_model.predict(X)[0].tolist()
    predicted_class = classes_name_dict[target_value]
    return [X[0].tolist(), target_value, predicted_class]

output = model_predict(X = X, classes_name_dict = classes_name_dict, loaded_model= loaded_model)

# Formatting output

In [172]:
features_names = ['sepal length (cm)',
                  'sepal width (cm)',
                  'petal length (cm)',
                  'petal width (cm)']

output_X_values_list = [float(i) for i in output[0][0]]
class_number = output[1]
class_name = output[2]

output_dict = {'query' : dict(zip(features_names, output_X_values_list)), 'class_number' : output_X_values_list, 'class_name' : class_name}
output_dict

{'query': {'sepal length (cm)': 5.5,
  'sepal width (cm)': 2.3,
  'petal length (cm)': 4.0,
  'petal width (cm)': 1.3},
 'class_number': [5.5, 2.3, 4.0, 1.3],
 'class_name': 'versicolor'}