<a href="https://colab.research.google.com/github/badalahmmed/Data-science/blob/master/iris_dataset_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Steps in Applied Machine Learning & Data Science :
 1. Load Library
 2. Load the dataset to which the Machine Learning algorithm will be applied
    Either
     1. load from a CSV file or
     2. load from a Database   
 3. Summarisation of Data to understand dataset (Descriptive Statistics) or Preliminary Analysis
 4. Visualisation of Data to understand dataset (Plots, Graphs etc.)
 5. Data pre-processing & Data transformation (split into train-test datasets)
 6. Application of a Machine Learning Algorithm to training dataset 
   1. setup a ML algorithm and parameter settings
   2. HPO using grid search, Bayesian search, and cross-validation with the training dataset
   3. training & fitting Algorithm with training Dataset
   4. evaluation of trained Algorithm (or Model) and result
   5. saving the trained model for future prediction
 7. Load the saved model and apply it to new dataset for prediction

#Load Library modules

In [None]:
pip install scikit-plot


In [None]:
pip install scikit-optimize

In [None]:
pip install sklearn

In [None]:
import pandas as pd
import pickle as pk
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import scikitplot as skplt
from   sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV,RepeatedStratifiedKFold
from   sklearn.ensemble import RandomForestClassifier
from   sklearn.metrics import accuracy_score, classification_report, cohen_kappa_score, confusion_matrix

#Load Dataset

###Mount Google Drive and set the paths of datasets


In [None]:
# Mount Google Drive so that files stored there are reachable under /content/drive.
from google.colab import drive
drive.mount('/content/drive')
import sys
# Put the dataset folder at the front of the module search path.
# NOTE(review): sys.path only influences `import`; the CSV is later read through a full path — confirm this line is needed.
sys.path.insert(0, '/content/drive/MyDrive/Random Forest Classifiers/Iris dataset analysis/Dataset')

###function for csv file loading

In [None]:
def load_csv_dataset_withoutFeatureName(filename):
    """Read a header-less, comma-separated Iris CSV and print a short overview.

    The five columns are named 'sepal_length', 'sepal_width', 'petal_length',
    'petal_width' and 'Species' (in that order).

    Parameters
    ----------
    filename : path or buffer accepted by ``pd.read_csv``.

    Returns
    -------
    tuple
        ``(feature_names, target, dataset)`` - the list of the four feature
        column names, the target column name and the loaded DataFrame.
    """
    feature_names = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
    target = 'Species'

    # The file has no header row, so the column names are supplied explicitly.
    dataset = pd.read_csv(filename, sep=',', names=feature_names + [target])

    # Shape, first rows and column index, each followed by a dotted rule.
    overview = (
        ("Data Shape", dataset.shape),
        ("Datasets's Head", dataset.head(5)),
        ("Column Names", dataset.columns),
    )
    for caption, value in overview:
        print(caption, value)
        print("...............................................")

    return feature_names, target, dataset

In [None]:
def load_csv_dataset(filename):
  """Read a comma-separated CSV that has a header row and print a short overview.

  Parameters
  ----------
  filename : path or buffer accepted by ``pd.read_csv``.

  Returns
  -------
  tuple
      ``(feature_names, target, dataset)`` - the first four column labels,
      the label of the last column, and the loaded DataFrame.
  """
  # Read from the argument. The previous version read the global `path`,
  # so the `filename` parameter was silently ignored.
  dataset = pd.read_csv(filename, sep = ',')
  #print data shape, some data and column names
  print("Data Shape", dataset.shape)
  print("...............................................")
  print( dataset.head(5))
  print("...............................................")
  print();print("Column Names",dataset.columns)
  print("...............................................")
  # Features are taken to be the first four columns and the target the last one.
  # NOTE(review): if the file carries a leading id column, the first four columns
  # are not the four measurements - confirm against the actual CSV layout.
  feature_names = dataset.columns[:4]
  target = dataset.columns[-1]
  return feature_names, target, dataset


In [None]:
# Location of the Iris CSV file on the mounted Google Drive.
path="/content/drive/MyDrive/Random Forest Classifiers/Iris dataset analysis/Dataset/Iris.csv"

In [None]:
# Load the CSV once; the feature labels, target label and DataFrame are reused by the cells below.
feature_names, target, dataset =load_csv_dataset(path)

#Preliminary data analysis


1. Information About Missing Values
2. Information about Feature Variables
3. Correlation
4. Ranking of Correlation Coefficients
5. Highly correlated variables (Absolute Correlations)
6. Information about the target variable

In [None]:
# -------------------------------------------------------------------------
# Helper modules for Descriptive Statistics
# -------------------------------------------------------------------------    
def get_redundant_pairs(df):
        """Return the column-label pairs on or below the diagonal.

        For columns c0..ck the result is the set of ``(c_i, c_j)`` with
        ``j <= i``: every self pair plus one ordering of each distinct pair.
        """
        labels = df.columns
        return {
            (labels[row], labels[col])
            for row in range(df.shape[1])
            for col in range(row + 1)
        }

def get_top_abs_correlations(df, n=5):
        """Return the ``n`` column pairs with the largest absolute correlation.

        Parameters
        ----------
        df : DataFrame whose columns are correlated pairwise with ``df.corr()``.
        n : number of pairs to return (default 5).

        Returns
        -------
        Series indexed by (column, column) pairs, sorted in descending order
        of absolute correlation and truncated to the first ``n`` entries.
        """
        # Rank by magnitude: without abs() a strong negative correlation would
        # sort to the bottom although the function promises absolute values.
        au_corr = df.corr().abs().unstack()
        # Drop self pairs and the mirrored duplicate of each pair.
        labels_to_drop = get_redundant_pairs(df)
        au_corr = au_corr.drop(labels=labels_to_drop).sort_values(ascending=False)
        return au_corr.iloc[:n]

def corrank(X):
        """Print every unordered column pair of ``X`` ranked by correlation.

        A two-column table ('pairs', 'corr') is printed, sorted by correlation
        in descending order and followed by a blank line. Nothing is returned.
        """
        import itertools

        # Compute the correlation matrix once; previously it was recomputed
        # for every pair inside the comprehension.
        corr = X.corr()
        rows = [[(i, j), corr.loc[i, j]]
                for i, j in itertools.combinations(corr.columns, 2)]
        df = pd.DataFrame(rows, columns=['pairs', 'corr'])
        print(df.sort_values(by='corr', ascending=False))
        print()

In [None]:
# -------------------------------------------------------------------------
#  function of descriptive statistics and correlation matrix 
# -------------------------------------------------------------------------    
def data_descriptiveStats(feature_names, target, dataset):
        """Print descriptive statistics for the features and the target.

        Printed sections, in order: missing-value counts, ``info()`` and
        ``describe()`` of the features, the Pearson correlation matrix, the
        ranked pairwise correlations, the top correlated pairs, and a
        description plus per-class counts of the target.

        Parameters
        ----------
        feature_names : labels of the feature columns in ``dataset``.
        target : label of the target column in ``dataset``.
        dataset : DataFrame holding both features and target.

        Note: sets the pandas option ``display.precision`` to 2, which stays
        in effect after the call.
        """
        features = dataset[feature_names]

        # Count Number of Missing Value on Each Column
        print(); print('Count Number of Missing Value on Each Column: ')
        print(); print(features.isnull().sum(axis=0))
        print(); print("Target Missing",dataset[target].isnull().sum(axis=0))
        print("...............................................")

        # Get Information on the feature variables
        print(); print('Get Information on the feature variables: ')
        # info() writes its report itself and returns None, so it must not be
        # wrapped in print() (that printed a stray "None").
        print(); features.info()
        print(); print(features.describe())
        print("...............................................")

        # correlation
        print();print("Corelation")
        # Use the fully qualified option name: the bare 'precision' pattern is
        # ambiguous or rejected on current pandas releases.
        pd.set_option('display.precision', 2)
        print(); print(features.corr(method ='pearson'))
        print("...............................................")

        # Ranking of Correlation Coefficients among Variable Pairs
        print(); print("Ranking of Correlation Coefficients:")
        corrank(features)
        print("...............................................")

        # Print Highly Correlated Variables
        print(); print("Highly correlated variables (Absolute Correlations):")
        print(); print(get_top_abs_correlations(features, 8))
        print("...............................................")

        # Get Information on the target
        print("Target Descriptions")
        print(); print(dataset[target].describe())
        print("Target groups")
        print(); print(dataset.groupby(target).size())

#Data Visualization


1. Box plot of each Numerical Features
2. Histogram plot of each Numerical Features
3. Correlation matrix of Numerical Features
4. Scatter Matrix Plot
5. Pie chart for target

In [None]:
#boxplot function
def Box_Plot(feature_names):
  """Draw one box plot per feature on a 2x2 grid and show the figure.

  Reads the columns from the module-level ``dataset`` DataFrame.
  """
  print(); print('BOX plot of each numerical features')
  plt.figure(figsize=(11,9))
  plt.suptitle('Box Plot')

  # Tick marks on the left and bottom only, with every tick label hidden.
  tick_options = dict(axis='both', left=True, top=False, right=False, bottom=True,
                      labelleft=False, labeltop=False, labelright=False, labelbottom=False)

  for position, col in enumerate(feature_names, start=1):
      plt.subplot(2, 2, position)
      plt.title(col)
      plt.axis('on')
      plt.tick_params(**tick_options)
      dataset[col].plot(kind='box', subplots=True, sharex=False, sharey=False)
  plt.show()

In [None]:
#histogram function
def Histogram_Plot(feature_names):
  """Draw one histogram per feature on a 2x2 grid and show the figure.

  Reads the columns from the module-level ``dataset`` DataFrame.
  """
  print(); print('Histogram of each Numerical Feature')
  plt.figure(figsize=(11,9))
  # The figure title used to be the copy-pasted 'Box Plot'.
  plt.suptitle('Histogram')
  for j, col in enumerate(feature_names, start=1):
      plt.subplot(2,2,j)
      plt.title( col)
      plt.axis('on')
      # Only the left tick marks are drawn; every tick label is hidden.
      plt.tick_params(axis='both', left=True, top=False, right=False, bottom=False,
                      labelleft=False, labeltop=False, labelright=False, labelbottom=False)
      dataset[col].hist()
  plt.show()

In [None]:
# correlation matrix
def Corelation_Matrix_Plot(dataset):
  """Show a heat map of the pairwise correlations of the numeric columns.

  Parameters
  ----------
  dataset : DataFrame; non-numeric columns are left out of the correlation.
  """
  print(); print('Heat of each Numerical Feature')
  fig, ax = plt.subplots()
  ax.set_title("correlation matrixplot or Heat Plot")
  # Restrict to numeric columns explicitly: corr() on a frame that still holds
  # a string column raises on current pandas instead of skipping it.
  corr = dataset.select_dtypes(include='number').corr()
  sns.heatmap(
      corr,
      vmin=-1, vmax=1, center=0,
      cmap=sns.diverging_palette(20, 220, n=200),
      square=True,
      ax=ax,  # draw on the axes created above rather than the implicit current axes
  )
  ax.set_xticklabels(
      ax.get_xticklabels(),
      rotation=45,
      horizontalalignment='right'
  )
  # Show the figure here, like the other plotting helpers do.
  plt.show()

In [None]:
def Pair_Plot(dataset, hue='Species'):
  """Show a seaborn pair plot (scatter matrix) of ``dataset``.

  Parameters
  ----------
  dataset : DataFrame passed to ``sns.pairplot``.
  hue : column used to colour the points. Defaults to 'Species', the value
      that was previously hard-coded, so existing calls behave the same.
  """
  # PairPlot using seaborn
  print(); print('Scatter Matrix Plot')
  sns.pairplot(dataset, hue = hue)
  plt.show()

In [None]:
def Pie_Chart(target):
  """Show the class distribution of ``target`` as a pie chart.

  Group sizes are taken from the module-level ``dataset``. The figure is
  written to 'Piefig.pdf' before it is shown.
  """
  print(); print('PIE Chart of for Target: ')
  plt.figure(figsize=(11,9))

  # One slice per distinct target value, sized by its row count.
  group_sizes = dataset.groupby(target).size()
  labels = list(group_sizes.keys())
  sizes = [group_sizes[key] for key in labels]

  # Plot PIE Chart with % in the first cell of a 2x2 grid.
  plt.subplot(2, 2, 1)
  plt.axis('on')
  plt.tick_params(axis='both', left=False, top=False, right=False, bottom=False,
                  labelleft=True, labeltop=True, labelright=False, labelbottom=False)
  plt.pie(sizes, labels=labels, autopct='%1.1f%%', shadow=True, startangle=140)
  plt.axis('equal')
  plt.savefig('Piefig.pdf', format='pdf')
  plt.show()

In [None]:
def Data_Visualization(dataset, feature_names, target):
  """Produce all exploratory plots in a fixed order.

  Order: box plots, histograms, correlation heat map, pair plot, pie chart.
  Only the heat map and the pair plot receive ``dataset`` as an argument;
  the other helpers are called with the feature names or the target only.
  """
  Box_Plot(feature_names)
  Histogram_Plot(feature_names)
  Corelation_Matrix_Plot(dataset)
  Pair_Plot(dataset)
  Pie_Chart(target)

In [None]:
# Render every exploratory plot for the loaded dataset.
Data_Visualization(dataset, feature_names,target)

#Data pre-processing & Data transformation (split into train-test datasets)


1. Split the dataset into train and test sets
2. Using PCA

###Split the dataset into train and test set

In [None]:
def data_split(feature_names, target, dataset, test_size=0.33, random_state=None):
        """Split ``dataset`` into train and test features and targets.

        Parameters
        ----------
        feature_names : labels of the feature columns.
        target : label of the target column.
        dataset : DataFrame holding features and target.
        test_size : share of rows held out for testing. Defaults to 0.33,
            the value that was previously hard-coded.
        random_state : seed forwarded to ``train_test_split``. The default
            None keeps the previous unseeded behaviour; pass an int for a
            reproducible split.

        Returns
        -------
        tuple
            ``(X_train, X_test, y_train, y_test)``.
        """
        # Data Transform - Split train : test datasets
        X_train, X_test, y_train, y_test = train_test_split(
            dataset.loc[:, feature_names],
            dataset.loc[:, target],
            test_size=test_size,
            random_state=random_state,
        )
        return X_train, X_test, y_train, y_test

# Hold out part of the data for the final evaluation; the rest is used for training and tuning.
X_train, X_test, y_train, y_test = data_split(feature_names, target, dataset)

###Apply PCA 

In [None]:
#Function for Standarize the data
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
def Standardarization(dataset, feature_names):
  """Return the selected feature columns after ``StandardScaler`` scaling.

  The scaler is fitted on the same values it transforms; the result is the
  array returned by ``fit_transform``.
  """
  raw_values = dataset.loc[:, feature_names].values
  scaler = StandardScaler()
  return scaler.fit_transform(raw_values)

#Function for Dimention reduction
def Dimention_Reduction(n_components, Standard_Dataset):
  """Project ``Standard_Dataset`` onto its principal components.

  Parameters
  ----------
  n_components : forwarded unchanged to ``PCA``.
  Standard_Dataset : array-like data to fit and transform.

  Returns
  -------
  DataFrame with one column per retained component, named
  'principal component 1', 'principal component 2', ...
  """
  pca = PCA(n_components)
  principalComponents = pca.fit_transform(Standard_Dataset)
  # Derive the column names from the transformed width. The previous fixed
  # two-name list raised for any n_components other than 2.
  column_names = ['principal component %d' % k
                  for k in range(1, principalComponents.shape[1] + 1)]
  principalDf = pd.DataFrame(data = principalComponents, columns = column_names)
  return principalDf


#Function for Visualize 2D projections
def Visualiza2DProjection(DimentionReducedDf):
  """Scatter-plot the first two principal components, coloured by species.

  ``DimentionReducedDf`` must provide the columns 'principal component 1',
  'principal component 2' and 'Species'. Only the three species listed
  below are drawn.
  """
  species_colors = (
      ('Iris-setosa', 'r'),
      ('Iris-versicolor', 'g'),
      ('Iris-virginica', 'b'),
  )

  ax = plt.figure(figsize = (8,8)).add_subplot(1,1,1)
  ax.set_xlabel('Principal Component 1', fontsize = 15)
  ax.set_ylabel('Principal Component 2', fontsize = 15)
  ax.set_title('2 Component PCA', fontsize = 20)

  # One scatter call per species so that each gets its own colour and legend entry.
  for species, color in species_colors:
      rows = DimentionReducedDf['Species'] == species
      xs = DimentionReducedDf.loc[rows, 'principal component 1']
      ys = DimentionReducedDf.loc[rows, 'principal component 2']
      ax.scatter(xs, ys, c = color, s = 50)

  ax.legend([species for species, _ in species_colors])
  ax.grid()

In [None]:
# Scale the features before PCA.
StandardData = Standardarization(dataset, feature_names)
# Keep the first two principal components.
principalDf=Dimention_Reduction(2,StandardData)
# Attach the Species column to the reduced data so the points can be coloured by class.
finalDf = pd.concat([principalDf, dataset[['Species']]], axis = 1) #add targets or Species with DimentionReduced Data
Visualiza2DProjection(finalDf)


#Application of a Machine Learning Algorithm to training dataset


1. setup a ML algorithm and parameter settings
2. Hyperparameter tuning by grid search, Bayesian search and k-fold cross-validation, then training the model with the best hyperparameters
3. evaluation of trained Algorithm (or Model) and result


###Train the model


In [None]:
def training_model(X_train, y_train):
        """Build the baseline ``RandomForestClassifier`` and return it unfitted.

        ``X_train`` and ``y_train`` are accepted but not used here; no fitting
        happens in this function.
        """
        settings = dict(
            n_estimators=100,
            criterion='gini',
            max_depth=8,
            min_samples_split=2,
            min_samples_leaf=1,
            min_weight_fraction_leaf=0.0,
            max_features=None,
            max_leaf_nodes=None,
            min_impurity_decrease=0.0,
            bootstrap=True,
            oob_score=False,
            warm_start=False,
            class_weight=None,
        )
        return RandomForestClassifier(**settings)

In [None]:
# Create the baseline classifier; it is not fitted by this call.
model = training_model(X_train, y_train)

##Hyperparameter Tuning

### Grid Search

In [None]:
def GridSearch(model, X_train,y_train,parameters, kfold):
  """Run a grid search over ``parameters`` and return the best estimator.

  Parameters
  ----------
  model : estimator to tune.
  X_train, y_train : training features and targets.
  parameters : parameter grid passed as ``param_grid``.
  kfold : cross-validation strategy passed as ``cv``.

  Returns
  -------
  The ``best_estimator_`` of the fitted search (refit is enabled).
  """
  search = GridSearchCV(
      estimator=model,
      param_grid=parameters,
      cv=kfold,
      verbose=1,
      n_jobs=-1,
      refit=True,
  )
  search.fit(X_train, y_train)

  # Results from Grid Search
  print("\n========================================================")
  print(" Results from Grid Search " )
  print("========================================================")
  findings = (
      ("\n The best estimator across ALL searched params:\n", search.best_estimator_),
      ("\n The best score across ALL searched params:\n", search.best_score_),
      ("\n The best parameters across ALL searched params:\n", search.best_params_),
  )
  for heading, value in findings:
      print(heading, value)
  print("\n ========================================================")

  return search.best_estimator_

###try to see the efficiencies of hpo search

###Bayesian Search

In [None]:
from skopt import BayesSearchCV
def BaysianSearch(model, X_train, y_train,parameters, kfold ):
  """Tune ``model`` with ``BayesSearchCV`` and return the best estimator.

  ``parameters`` is passed as the search spaces and ``kfold`` as the
  cross-validation strategy. The best score and parameters are printed.
  """
  # define the search
  optimizer = BayesSearchCV(
      estimator=model,
      search_spaces=parameters,
      n_jobs=-1,
      cv=kfold,
  )
  optimizer.fit(X_train, y_train)

  # report the best result
  print("Baysian score", optimizer.best_score_)
  print("Baysian Best param", optimizer.best_params_)
  return optimizer.best_estimator_

###Use cross validation

In [None]:
def cross_validatin_and_fitting(model, X_train, y_train, kfold):
        """Cross-validate ``model``, fit it on the training data and return it.

        Prints the per-fold accuracies with their mean and standard
        deviation, fits ``model`` in place, prints its parameters and shows a
        learning-curve plot.

        Parameters
        ----------
        model : estimator to evaluate and fit.
        X_train, y_train : training features and targets.
        kfold : cross-validation strategy passed as ``cv``.
        """
        fold_scores = cross_val_score(
            model, X_train, y_train,
            cv=kfold, scoring='accuracy', n_jobs=-1, verbose=1,
        )

        # Cross Validation Results
        print()
        print("Cross Validation results: ", fold_scores)
        print("CV Mean Accuracy: %f (Std: %f)"% (fold_scores.mean(), fold_scores.std()))

        # Final fitting of the Model
        model.fit(X_train, y_train)

        rule = '========================================================'
        print(); print(rule)
        print(); print(model.get_params(deep = True))
        print(); print(rule)

        # plot learning Curves
        skplt.estimators.plot_learning_curve(model, X_train, y_train, figsize=(8,6))
        plt.show()

        return model

###Apply all the search

In [None]:
  #Define search Space
  parameters = {'max_depth'     : [4,10,12,15],
                        'criterion'     : ['gini', 'entropy'],
                        'max_features'  : ['auto', 'sqrt', 'log2'],
                        'n_estimators'  : [50,100, 250, 500]
                        # Add more parameters here for tuning
                        # https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html
                        }
kfold = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)

In [None]:
# Baseline: cross-validate and fit the initial Random Forest.
model = cross_validatin_and_fitting(model, X_train, y_train,kfold)
# Keep the tuned estimators; previously the return values of both searches were discarded.
# `model` itself stays the baseline, so pass one of these to the evaluation step
# if the tuned model is the one to assess.
grid_best_model = GridSearch(model, X_train,y_train,parameters, kfold)
bayes_best_model = BaysianSearch(model, X_train, y_train,parameters, kfold)


##Evaluation methods

Evaluate techniques:
1. Confusion matrix
2. Accuracy
3. Precision
4. Recall
5. Specificity
6. F1 score
7. Precision-Recall or PR curve
8. ROC (Receiver Operating Characteristics) curve
9. PR vs ROC curve.

Some common terms to be clear with are:
1. True positives (TP): Predicted positive and are actually positive.
2. False positives (FP): Predicted positive and are actually negative.
3. True negatives (TN): Predicted negative and are actually negative.
4. False negatives (FN): Predicted negative and are actually positive.

In [None]:
# -----------------------------------------------
# Evaluate the skill of the Trained model
# -----------------------------------------------
def evaluate_model(model, X_test, y_test):
        """Report test-set metrics and diagnostic plots for a fitted model.

        Prints accuracy, Cohen's kappa, the confusion matrix and the
        classification report, then shows ROC, confusion-matrix and
        precision-recall plots.

        Parameters
        ----------
        model : fitted classifier providing ``predict`` and ``predict_proba``.
        X_test, y_test : held-out features and true labels.

        Returns
        -------
        The same ``model`` object that was passed in.
        """
        predicted_labels = model.predict(X_test)

        # Metric sections in the order they are printed.
        report_sections = (
            ('Accuracy : ', accuracy_score(y_test, predicted_labels)),
            ('Kappa Score : ', cohen_kappa_score(y_test, predicted_labels)),
            ('Confusion Matrix :\n', confusion_matrix(y_test, predicted_labels)),
            ('Classification Report :\n', classification_report(y_test, predicted_labels)),
        )

        print(); print('Evaluation of the trained model: ')
        for caption, value in report_sections:
            print(); print(caption, value)

        class_probabilities = model.predict_proba(X_test)

        # Add more plots here using scikit-plot
        # ROC curves
        skplt.metrics.plot_roc(y_test, class_probabilities, figsize=(8,6))
        plt.show()

        # Confusion matrix
        skplt.metrics.plot_confusion_matrix(y_test, predicted_labels, figsize=(8,6))
        plt.show()

        # precision recall curve
        skplt.metrics.plot_precision_recall(
            y_test, class_probabilities,
            title='Precision-Recall Curve', plot_micro=True,
            classes_to_plot=None, ax=None, figsize=(8,6),
            cmap='nipy_spectral', title_fontsize='large',
            text_fontsize='medium',
        )
        plt.show()

        # Add more ... ... ...

        return model

# Evaluate the fitted model on the held-out test split; evaluate_model returns the model it was given.
model = evaluate_model(model, X_test, y_test)