<h1>End 2 End Machine Learning Workflow</h1>

In [None]:
import tensorflow as tf
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt

%matplotlib inline

<h3>1. Framing the problem</h3>

<h4>a. Frame the problem (supervised, unsupervised, reinforced, online / offline)</h4>

<h4>b. Performance measurement (minimum performance needed)</h4>

In [None]:
# Some metrics thingies
from sklearn.metrics import precision_score, recall_score, f1_score, roc_curve, roc_auc_score
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import confusion_matrix

precisions, recalls, thresholds = precision_recall_curve(y_train_5, y_scores)

# Precision vs recall
def plot_precision_recall_vs_threshold(precisions, recalls, thresholds):
    plt.plot(thresholds, precisions[:-1], "b--", label="Precision")
    plt.plot(thresholds, recalls[:-1], "g-", label="Recall")
    plt.xlabel("Threshold")
    plt.legend(loc="upper left")
    plt.ylim([0, 1])
    
pr, tpr, thresholds = roc_curve(y_train_5, y_scores)

# ROC
def plot_roc_curve(fpr, tpr, label=None):
    plt.plot(fpr, tpr, linewidth=2, label=label)
    plt.plot([0, 1], [0, 1], 'k--')
    plt.axis([0, 1, 0, 1])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')

# Confusion matrix
conf_matrix = confusion_matrix(y_train_5, y_scores)
plt.matshow(conf_matrix, cmap=plt.cm_gray)
plt.show()

# Plotting train and validation curves
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

def plot_learning_curves(model, X, y):
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2)
    train_errors, val_errors = [], []
    for m in range(1, len(X_train)):
        model.fit(X_train[:m], y_train[:m])
        y_train_predict = model.predict(X_train[:m])
        y_val_predict = model.predict(X_val)
        train_errors.append(mean_squared_error(y_train_predict, y_train[:m]))
        val_errors.append(mean_squared_error(y_val_predict, y_val))
    plt.plot(np.sqrt(train_errors), "r-+", linewidth=2, label="train")
    plt.plot(np.sqrt(val_errors), "b-", linewidth=3, label="val")

<h4>c. List of assumptions</h4>

<h3>2. Importing the datasets</h3>

<h4>a. Importing the datasets</h4>

_Importing the datasets into the memory_

In [None]:
ROOTDIR = r'C:\Users\amalf\OneDrive\Projects\05 - HandsonML\Dataset\handson-ml-master\datasets\housing'
filename = r'housing.csv'
dataset = os.path.join(ROOTDIR, filename)

df = pd.read_csv(dataset)

Do a quick check on the dataset

In [None]:
df.head()

In [None]:
df.describe()

In [None]:
df.info()

<h4>b. Check size and type of the data</h4>

Check categorical data if any<br/>
*ocean_proximity*

In [None]:
df['ocean_proximity'].value_counts()

Check for simple data distribution, scale the bins if necessary

In [None]:
df.hist(bins=50, figsize=(20,15))

<h4>c. Sample test set after shuffling (never look at it)</h4>

Using stratified sampling if the number of instances is small

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit

In [None]:
strashuff = StratifiedShuffleSplit(n_splits=1, test_size=0.1, train_size=0.9, random_state=42)
train_set = None
test_set = None
for train_ix, test_ix in strashuff.split(df, target):
    train_set = df.loc[train_ix]
    test_set = df.loc[test_ix]

<h3>3. Explore the datasets</h3>

<h4>a. Create a copy of the data for exploration</h4>

<h4>b. Study each attribute:</h4>

i. name

ii. type

iii. % of missing values

iv. noisiness

v. useful for performance measure?

vi. type of distribution

<h4>c. Visualize the data</h4>

<h4>d. Correlation between attributes</h4>

In [None]:
from pandas.plotting import scatter_matrix

In [None]:
attributes = ['median_house_value','median_income',
             'total_rooms','housing_median_age']
scatter_matrix(train_set[attributes], figsize=(12,8))

In [None]:
train_set.plot(kind="scatter", x="median_income", y="median_house_value", alpha=0.1)

<h3>4. Data preprocessing</h3>

<h4>a. Data cleaning</h4>

i. remove outliers

ii. fill in missing values (zero, mean, median)

In [None]:
from sklearn.preprocessing import Imputer

imp = Imputer(strategy="median")

<h4>b. Feature selection</h4>

In [None]:
from sklearn.ensemble import RandomForestClassifier

<h4>c. Feature engineering</h4>

i. discretize continuous values

ii. add promising transformation features (log, sqrt, power)

In [None]:
# PCA for visualization
from sklearn.decomposition import PCA
pca = PCA(n_components=0.95)

iii. aggreate features

<h4>d. Feature scaling</h4>

In [38]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Imputer

# Selecting specific dataframe columns
num_cols = ['']

pipeline = Pipeline([
    ('imputer', Imputer(strategy="median")),
    ('std_scaler'), StandardScaler()
])

# etc.

SyntaxError: invalid syntax (<ipython-input-38-c77641d8d913>, line 6)

<h3>5. Shortlist promising models</h3>

<h4>a. Train as much as dirty models, using sampling to speed up is recommended</h4>

<h4>b. Use N-folds cross val to determine mean and std of each model</h4>

In [None]:
from sklearn.model_selection import cross_val_score

# Negation of MSE
scores = cross_val_score(model, X, y, scoring="neg_mean_squared_error", cv=10)
rmse_scores = np.sqrt(-scores)

<h4>c. Analyze the most significant variables of each model</h4>

<h4>d. Short list top 5 of the models</h4>

<h3>6. Fine tune the models</h3>

<h4>a. Depending on the search space, define grid search / random search</h4>

In [None]:
# Grid search
from sklearn.model_selection import GridSearchCV

param_grid = [
    {'n_estimators':[3,10,30], 'max_features'=[2,4,6,10]},
    {'bootstrap':[False],'n_estimators':[3,10,30], 'max_features'=[2,4,6,10]}
]

# model = some model
grid_search = GridSearchCV(model, param_grid, cv=10, scoring="neg_mean_squared_error")
grid_search.fit(X,y)

grid_search.best_estimator_
grid_search.best_params_

# if the search space too large, use RandomSearchCV
from sklearn.model_selection import RandomizedSearchCV

# RandomForest can be used for feature importance rather than PCA
feature_importance = grid_search.best_estimator_.feature_importances_

<h4>b. Try ensemble methods</h4>

In [None]:
from sklearn.ensemble import BaggingClassifier, GradientBoostingClassifier, AdaBoostClassifier