In [None]:
import pickle
from datetime import date
from pathlib import Path

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn import datasets, ensemble
from sklearn import datasets
from sklearn import linear_model
from sklearn.datasets import make_regression
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import Ridge
from sklearn.linear_model import SGDRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import RobustScaler
from sklearn.svm import SVR

In [None]:
# Read the raw iris measurements from the CSV file that sits next to this notebook.
iris_csv_path = './iris-data.csv'
flowers = pd.read_csv(iris_csv_path)

In [None]:
# Summary statistics of the raw data.
flowers.describe()

In [None]:
# Column dtypes and non-null counts of the raw data.
flowers.info()

In [None]:
# Rows whose petal width is missing.
flowers[flowers['petal_width_cm'].isnull()]


In [None]:
# Discard every row that has at least one missing value, then re-check the summary statistics.
flowers = flowers.dropna(axis=0, how='any')
flowers.describe()

In [None]:
# Column dtypes and non-null counts after the rows with missing values were dropped.
flowers.info()

In [None]:
def plot_feature_by_class(data, feature):
    """Draw a horizontal box plot of one measurement, with one box per class label.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing `feature` and a `class` column.
    feature : str
        Name of the numeric column plotted on the x axis.

    Returns
    -------
    matplotlib.axes.Axes
        The axes the box plot was drawn on.
    """
    # A fresh figure per call keeps the plots from being drawn on top of each other.
    fig, ax = plt.subplots()
    sns.boxplot(data=data, x=feature, y="class", hue="class", dodge=False, ax=ax)
    ax.set_title(f"{feature} by class")
    return ax


# One box plot per measurement, in the order the four cells originally drew them.
for feature in ["sepal_width_cm", "sepal_length_cm", "petal_length_cm", "petal_width_cm"]:
    plot_feature_by_class(flowers, feature)
plt.show()

In [None]:
# Distinct class labels present in the data.
flowers['class'].unique()

### Missing values
### Encoding


In [None]:
# Map each of the five label spellings to an integer code.
# An exact lookup on the `class` column is used instead of a frame-wide regex
# replace: as a regex, 'versicolor' also matches inside 'Iris-versicolor', so the
# outcome would depend on the order of the dict, and every column would be scanned.
# A label that is not listed becomes NaN instead of staying as text.
class_codes = {'Iris-setosa': 1, 'Iris-setossa': 2, 'Iris-versicolor': 3, 'versicolor': 4,
               'Iris-virginica': 5}
flowers_encoding = flowers.copy()
flowers_encoding['class'] = flowers_encoding['class'].map(class_codes)

In [None]:
# Inspect the label-encoded copy.
flowers_encoding

In [None]:
# The 15 rows with the smallest sepal length, to look for outliers at the low end.
flowers.sort_values(by = 'sepal_length_cm').head(15)

In [None]:
# Remove five rows treated as sepal-length outliers, then list what is left ordered by sepal width.
# NOTE(review): 77-81 are positions in the current frame (rows with missing values
# were dropped earlier without resetting the index), not original row labels —
# confirm they point at the intended rows.
outlier_labels = flowers.index[[80, 81, 79, 78, 77]]
flowers_drop_sepal_lenght_outiers = flowers.drop(outlier_labels)
flowers_drop_sepal_lenght_outiers.sort_values(by='sepal_width_cm')

In [None]:
# Correlation matrix of the measurements once the outlier rows are removed.
# The frame still contains the text `class` column, which DataFrame.corr() refuses
# to process in pandas >= 2.0 unless it is told to use numeric columns only.
flowers_drop_sepal_lenght_outiers.corr(numeric_only=True)

In [None]:
# Pairwise correlations on the label-encoded copy.
# NOTE(review): `class` only takes part if the encoding step left it with a numeric dtype — confirm.
flowers_encoding.corr()

In [None]:
# Integer code for every class label spelling handled by this notebook.
encoding = {'Iris-setosa': 1 , 'Iris-setossa': 2, 'Iris-versicolor': 3, 'versicolor': 4,
       'Iris-virginica': 5}
def class_encoding(x):
    """Return the integer code of class label `x`.

    Parameters
    ----------
    x : hashable
        A class label as stored in the `class` column.

    Returns
    -------
    int or None
        The code from `encoding`, or None when `x` is not one of its keys.
    """
    # Direct dict lookup; replaces a manual scan that compared `x` with every key.
    return encoding.get(x)

In [None]:
# Encode the class label of every row of `flowers` itself.
# Taking the labels from `flowers_drop_sepal_lenght_outiers` (five rows shorter)
# leaves NaN in the rows that frame no longer has, because assignment aligns on the index.
# The cast keeps the codes as floats (1.0 ... 5.0), the dtype this column had before.
flowers['Class_encoding'] = flowers['class'].apply(class_encoding).astype(float)
flowers

In [None]:
# Column dtypes and non-null counts now that `Class_encoding` has been added.
flowers.info()

In [None]:
# Show only the rows labelled exactly 'Iris-versicolor'.
flowers[flowers['class'] == 'Iris-versicolor']


individual stats

In [None]:
# Rows that did not receive a class code.
flowers[flowers['Class_encoding'].isnull()]

In [None]:
# Rows that still have no class code are given 3.0, the code of 'Iris-versicolor'.
# NOTE(review): presumably these are the rows excluded as sepal-length outliers —
# confirm that all of them really are versicolor.
# Only `Class_encoding` is filled, so a NaN in a measurement column can never be
# silently turned into 3.0.
flowers = flowers.fillna({'Class_encoding': 3.0})
flowers.columns

In [None]:
# Numeric-only view of the data: the four measurements plus the class code.
model_columns = ['sepal_length_cm', 'sepal_width_cm', 'petal_length_cm',
                 'petal_width_cm', 'Class_encoding']
flowers_test = flowers.loc[:, model_columns]
flowers_test

In [None]:
# Columns passed to the one-hot step; only `Class_encoding` is expanded into
# indicator columns, the four measurements pass through unchanged.
cat_cols = ['sepal_length_cm', 'sepal_width_cm', 'petal_length_cm',
            'petal_width_cm', 'Class_encoding']
one_hot_input = flowers.loc[:, cat_cols]
flowers_one_hot_encoding = pd.get_dummies(one_hot_input, columns=['Class_encoding'])
flowers_one_hot_encoding.columns

### Scaling

Standardization (Z-score)

In [None]:
# Standardise every column with scikit-learn's StandardScaler:
# learn the per-column statistics first, then apply them to the same frame.
scaler_st = StandardScaler()
scaler_st.fit(flowers_one_hot_encoding)
scaled_flowers_one_st = scaler_st.transform(flowers_one_hot_encoding)
scaled_flowers_one_st

In [None]:
# Wrap the standardised array in a DataFrame again.
# The labels are taken from the frame that was scaled, so names and order always
# match the data; a hard-coded list breaks as soon as the set of class codes changes.
scaled_df1 = pd.DataFrame(scaled_flowers_one_st, columns=flowers_one_hot_encoding.columns)
scaled_df1

Normalization, MinMax

In [None]:
# Rescale every column with scikit-learn's MinMaxScaler:
# learn the per-column minimum and maximum first, then apply them to the same frame.
scaler_norm = MinMaxScaler()
scaler_norm.fit(flowers_one_hot_encoding)
scaled_flowers_one_norm = scaler_norm.transform(flowers_one_hot_encoding)
scaled_flowers_one_norm

In [None]:
# Wrap the min-max scaled array in a DataFrame again.
# The labels are taken from the frame that was scaled, so names and order always
# match the data; a hard-coded list breaks as soon as the set of class codes changes.
scaled_df2 = pd.DataFrame(scaled_flowers_one_norm, columns=flowers_one_hot_encoding.columns)
scaled_df2

---

# Bonus, proof tests

### Feature selection

### Feature engineering

In [None]:
# Numerical binning
#
# Example kept for reference only. It refers to a `vehicles` frame that is not
# defined anywhere above (presumably carried over from another notebook), so it is
# written as a comment; as a bare string literal it was echoed as the cell output.
#
# vehicles['num_bin'] = pd.cut(vehicles['Fuel Cost/Year'], bins=3, labels=["Low", "Mid", "High"])
# vehicles['num_bin'].unique()

### Modeling

In [None]:
# Target: the class code, kept as a one-column frame. Features: every other column.
y_col = ['Class_encoding']
y = flowers_test.loc[:, y_col]
X = flowers_test.drop(columns=y_col)

In [None]:
# Hold out 20 % of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Report the shape of each part on a single line.
split_parts = {'X_train': X_train, 'X_test': X_test, 'y_train': y_train, 'y_test': y_test}
print(", ".join(f"{part_name}: {part.shape}" for part_name, part in split_parts.items()))

In [None]:
%%time

# Model definition
# Instantiate the estimator to train; the commented-out lines are drop-in alternatives.

model = LinearRegression()
#model = linear_model.Lasso()
#model = Ridge()
#model = ElasticNet()

#model = SVR()
#model = SGDRegressor()

# Snapshot of the estimator's parameters, printed here and again after training.
hyperparameters = model.get_params()

print(type(model), '\n')
print('Model hyperparameters:', hyperparameters, '\n')

In [None]:
%%time

# Model training

model.fit(X_train, y_train)

print('Model:', model, '\n')
print('Model hyperparameters:', hyperparameters, '\n')
print('Model coefficients:', model.coef_, '\n')

In [None]:
%%time

# Model predictions
# Predict the target for the held-out feature rows.

predictions = model.predict(X_test)

# Show the container type returned by predict().
print(type(predictions))

In [None]:
# Visual check
# Side-by-side table of the true values, the predictions and their difference.

# The DataFrame constructor needs 1-D columns, so both inputs are flattened first;
# this works whether the target is a Series or a single-column DataFrame
# (in the latter case predict() returns a 2-D array as well).
ground_truth = np.ravel(y_test)
predicted = np.ravel(predictions)
check = pd.DataFrame(
    {'Ground truth': ground_truth, 'Predictions': predicted, 'Diff': ground_truth - predicted},
    index=y_test.index,  # keep the original row labels of the test split
)
check

In [None]:
# Plot ground truth against predictions, using the row label as the x axis.
# The index is moved into a column on a derived frame: resetting `check` in place
# would add one more index column every time this cell is re-run.
check_plot = check.reset_index()

check_plot.plot(x='index', y=['Ground truth', 'Predictions'], kind='line', figsize=(10, 3));

## Saved model

In [None]:
# Save model using joblib

filename = './models/linearregression_002.sav'
# joblib.dump does not create missing directories, so make sure the target folder
# exists; otherwise the cell fails on a fresh checkout.
Path(filename).parent.mkdir(parents=True, exist_ok=True)
joblib.dump(model, filename)
print('Your model has been saved with joblib!!!')

In [None]:
# Load model using joblib
# NOTE: joblib.load unpickles the file, so only load files you trust.

filename = './models/linearregression_002.sav'
loaded_model = joblib.load(filename)
# Print both coefficient sets so the in-memory and reloaded models can be compared by eye.
print('Model coefficients:', model.coef_, '\n')
print('Loaded model coefficients:', loaded_model.coef_)