# Author: Kumar Awanish,
Content: Implementation of Binary Mode (Approach A) and Classifying Attacks (Approach B),
Technology used: Python3,Spark(PySpark),tsne

In [2]:
from pyspark import SparkContext
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.ml import Pipeline
from pyspark.ml.feature import  StringIndexer, VectorAssembler, IndexToString
from pyspark.sql.functions import *
from pyspark.context import SparkContext
from pyspark.serializers import MarshalSerializer
import time
import numpy as np
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import roc_curve, auc, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from pyspark import SparkContext
from pyspark.sql import SQLContext
import pandas as pd
from pyspark.sql.functions import *
from pyspark.sql.functions import when 
from functools import reduce
from pyspark.sql import DataFrame
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.feature import IndexToString, StringIndexer, VectorIndexer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from sklearn.metrics import classification_report

# Setting up the Spark environment

In [3]:
import os

# Driver memory is handed to the JVM launcher through PYSPARK_SUBMIT_ARGS, so the
# variable is exported before the SparkContext below is constructed.
memory = '20g'
pyspark_submit_args = ' --driver-memory {} pyspark-shell'.format(memory)
os.environ["PYSPARK_SUBMIT_ARGS"] = pyspark_submit_args

# Executor memory is supplied as a system property ahead of context creation.
SparkContext.setSystemProperty('spark.executor.memory', '6g')

# Local master with the application name 'example'; SQLContext wraps the context
# and is the entry point used by data_loading().
sc = SparkContext(master='local', appName='example')
sql_sc = SQLContext(sc)

# Loading the data

In [4]:
def data_loading(dataset):
    """
    Read a CSV file into a Spark dataframe and tidy up its label column.

    The file is read through the notebook-level ``sql_sc`` with a header row and
    schema inference switched on. The column named ' Label' (leading space) is
    exposed as 'Label', the 'External IP' column is dropped, and rows whose
    label equals the literal text ' Label' are removed.

    :param dataset: path of the CSV file to load
    :return: a Spark dataframe
    """
    csv_reader = sql_sc.read.format('csv').options(header='true', inferSchema='true')
    raw_frame = csv_reader.load(dataset)

    # Rename ' Label' -> 'Label'; every other column is passed through by name.
    projection = [
        col(name).alias('Label') if name == ' Label' else name
        for name in raw_frame.columns
    ]
    renamed = raw_frame.select(*projection)

    without_ip = renamed.drop('External IP')

    # Discard rows whose label is the header text itself.
    return without_ip.filter(without_ip.Label != ' Label')

# Visualisation of Labels

In [5]:
from matplotlib import pyplot as plt
%matplotlib inline
def data_plot(dataset):
    """
    Draw a log-scale bar chart showing how many records carry each label.

    :param dataset: a spark dataframe with a 'Label' column
    """
    # One (label, count) row per distinct label.
    rows = dataset.groupBy('Label').count().collect()
    categories = []
    counts = []
    for row in rows:
        categories.append(row[0])
        counts.append(row[1])

    bar_width = 0.55
    positions = np.arange(len(categories))

    # Figure number 1, sized 40 x 20 inches.
    plt.figure(1, [40, 20])
    plt.bar(positions, counts, width=bar_width, color='r')
    plt.ylabel('counts')
    plt.title('Response distribution')
    plt.xticks(positions + bar_width / 2., categories, fontsize=12)
    # Log scale keeps the small classes visible next to the dominant ones.
    plt.yscale('log')
    plt.show()

# Data Processing

In [5]:
def data_preprocessing(dataset):
    """
    Turn the loaded dataframe into a numeric, binary-labelled feature table.

    Every row whose label is one of the known attack names gets Label 0, every
    other row gets Label 1. All columns are then cast to float, nulls are
    replaced by 0.0, the per-label record counts are printed, and only the
    selected feature columns plus 'Label' are kept.

    :param dataset: a spark dataframe
    :return dataWithFeatures: a spark dataframe after preprocessing
    """
    # Columns retained in the result (the 22 selected features and the label).
    wanted_columns = {
        ' Flow Duration', ' Fwd IAT Min', ' Bwd IAT Mean', ' Fwd IAT Mean',
        'Init_Win_bytes_forward', ' Subflow Fwd Bytes', 'Total Length of Fwd Packets',
        ' ACK Flag Count', ' Active Min', 'Active Mean', ' Flow IAT Std',
        'Fwd PSH Flags', ' SYN Flag Count', 'Fwd Packets/s', ' Bwd Packet Length Std',
        ' Init_Win_bytes_backward', ' Fwd Packet Length Mean', ' Average Packet Size',
        ' Bwd Packets/s', ' PSH Flag Count', ' Flow IAT Min', ' Flow IAT Mean',
        'Label',
    }

    # Label values that are encoded as 0 (attack).
    attack_names = ["DoS Slowhttptest", 'Web Attack � Brute Force', 'Web Attack � Sql Injection',
                    'Web Attack � XSS', "SSH-Patator", "DoS GoldenEye", "Heartbleed", "DoS Hulk",
                    "DoS slowloris", "FTP-Patator", "Infiltration", "Bot", "PortScan", "DDoS"]

    # Attack -> 0, anything else -> 1.
    is_attack = dataset.Label.isin(attack_names)
    encoded = dataset.withColumn('Label', when(is_attack, 0).otherwise(1))

    # Cast every column to float, keeping the column names.
    as_float = encoded.select(*[col(name).cast("float").alias(name) for name in encoded.columns])
    labelled = as_float.filter(as_float.Label.isNotNull())
    final_data = labelled.na.fill(0.0)

    print(final_data.groupBy('Label').count().collect())

    kept = [name for name in final_data.columns if name in wanted_columns]
    return final_data.select(kept)

In [6]:
def sampling(dataWithFeatures):
    """
    Down-sample the Label-1 rows so that both classes end up at a similar size.

    All rows with Label 0 are kept (fraction 1.0); rows with Label 1 are kept
    with probability 471454 / 2647898. The per-label counts of the sampled
    dataframe are printed.

    :param dataWithFeatures: preprocessed spark dataframe with a 'Label' column
    :return dataWithSampling: the sampled spark dataframe
    """
    # 471454 matches the attack-record count reported elsewhere in this notebook;
    # 2647898 is presumably the benign-record count -- TODO confirm.
    attack_records = 471454.
    benign_records = 2647898
    fractions = {0: 1.0, 1: attack_records / benign_records}

    dataWithSampling = dataWithFeatures.sampleBy('Label', fractions=fractions)
    # Report on the dataframe that was just sampled (the original referenced an
    # undefined name here and raised NameError).
    print(dataWithSampling.groupBy('Label').count().collect())
    return dataWithSampling

In [7]:
def vectorAssembler(dataWithFeatures):
    """
    Add a single vector column holding all feature values of each row.

    The feature columns are assembled into the column 'features_new', which is
    the features column train_model() passes to the RandomForest classifier.

    :param dataWithFeatures: preprocessed spark dataframe
    :return dataWithFeatures: spark dataframe with 'features_new' followed by
        all original columns
    """
    cols = dataWithFeatures.columns

    # Exclude the label by name so it can never leak into the feature vector,
    # whatever its position. Without a 'Label' column, fall back to the
    # previous behaviour of treating the last column as the label.
    if "Label" in cols:
        assemblerInputs = [name for name in cols if name != "Label"]
    else:
        assemblerInputs = cols[0:-1]

    assembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features_new")

    # A one-stage pipeline: fit() prepares the stage, transform() appends the
    # assembled vector column.
    pipelineModel = Pipeline(stages=[assembler]).fit(dataWithFeatures)
    assembled = pipelineModel.transform(dataWithFeatures)

    # Keep the vector column first, then the original columns.
    return assembled.select(["features_new"] + cols)

# Train model

In [8]:
def train_test_data(dataWithFeatures):
    """
    Split a dataframe into a training part and an evaluation part.

    Uses the dataframe's randomSplit with weights 0.8 / 0.2 and a fixed seed of
    100, then prints the number of records in each part.

    :param dataWithFeatures: preprocessed spark dataframe
    :return : trainingData,testData
    """
    split_weights = [0.8, 0.2]
    # Fixed seed so repeated runs produce the same split.
    parts = dataWithFeatures.randomSplit(split_weights, seed=100)
    trainingData, testData = parts

    print("Number of records for training: {}".format(trainingData.count()))
    print("Number of records for evaluation: {}".format(testData.count()))
    return trainingData, testData

In [9]:
def train_model(train_data, model_path='BinaryModel'):
    """
    Train a RandomForest classifier, print the training time and save the model.

    :param train_data: training data obtained from train_test_data method; must
        contain the columns 'Label' and 'features_new'
    :param model_path: where the fitted model is written; defaults to
        'BinaryModel'. Pass a different path to keep several models side by side.
    :return rfModel: trained model using RF classifier
    """
    rf = RandomForestClassifier(labelCol="Label", featuresCol="features_new", numTrees=200, maxDepth=25)

    t = time.time()
    rfModel = rf.fit(train_data)
    elapsed_time = time.time() - t
    print(elapsed_time)

    # Overwrite instead of a plain save(): a plain save fails when the target
    # already exists, which broke every second call or notebook re-run.
    rfModel.write().overwrite().save(model_path)
    return rfModel

In [10]:
def results(rfModel, test_data=None):
    """
    Score a fitted binary model on test data and print its evaluation.

    Prints area under ROC, accuracy, test error, macro precision / recall / F1
    and a classification report with the classes named 'Attack' (label 0) and
    'Benign' (label 1).

    :param rfModel: fitted model exposing transform()
    :param test_data: spark dataframe to evaluate on. When omitted, the
        notebook-level ``testData`` is used, as before.
    :return input_array,outputrf: true labels and predicted labels, each as a
        numpy array of shape (n, 1)
    """
    if test_data is None:
        # Backward-compatible default: the test split defined in the notebook.
        test_data = testData

    # Make predictions.
    y_predictionsRf = rfModel.transform(test_data)

    # BinaryClassificationEvaluator's default metric is area under ROC, so it is
    # reported under that name rather than as accuracy.
    evaluatorRf = BinaryClassificationEvaluator(labelCol="Label")
    areaUnderRoc = evaluatorRf.evaluate(y_predictionsRf)

    # Collect label and prediction together so the two arrays are guaranteed to
    # be row-aligned (two independent collect() calls give no such guarantee).
    rows = y_predictionsRf.select('Label', 'prediction').collect()
    input_array = np.array([row[0] for row in rows]).reshape(-1, 1)
    outputrf = np.array([row[1] for row in rows]).reshape(-1, 1)
    print(input_array.shape, outputrf.shape)

    # Accuracy is the fraction of matching rows; NaN when there is nothing to score.
    accuracyRf = float(np.mean(input_array == outputrf)) if len(rows) else float('nan')

    print("Area under ROC of RandomForestClassifier = %g " % (areaUnderRoc))
    print("Accuracy of RandomForestClassifier = %g " % (accuracyRf))
    print ("Test Error in RandomForestClassifier = %g " % (1.0 - accuracyRf))
    print("Precision Score for RF model=%g"%(precision_score(input_array, outputrf, average='macro')))
    print("Recall Score for RF model=%g"%(recall_score(input_array, outputrf, average='macro') ))
    print("F1 Score for RF model=%g"%(f1_score(input_array, outputrf, average='macro')))
    print("Benign vs Attack result classification_report")
    print(classification_report(input_array, outputrf,target_names=['Attack','Benign']))
    return input_array,outputrf
    

# Data visualisation with t-SNE in Binary mode for test data, using true and predicted labels

In [13]:
#dataXX=dataWithFeatures.toPandas()
import time
import pickle
from sklearn.manifold import TSNE
#dataVisualisationBinaryMode=testData.toPandas()
#dataVisualisationBinaryMode.drop(['features', ' SYN Flag Count', ' PSH Flag Count',' ACK Flag Count','Fwd PSH Flags'],axis=1,inplace=True)
def tsneTrain(testData, max_rows=80000):
    """
    Project the test data to two dimensions with t-SNE and store the result.

    :param testData: spark dataframe (converted to pandas inside)
    :param max_rows: only the first ``max_rows`` rows are embedded; defaults to
        80000, the value previously hard-coded
    :return tsne_resultsRf: array with one 2-D point per embedded row
    """
    frame = testData.toPandas()

    # The assembled vector column is named 'features_new' by vectorAssembler();
    # 'features' is kept in the list for frames produced with the older name.
    # errors='ignore' avoids a KeyError when one of the names is absent.
    excluded_columns = ['features', 'features_new', ' SYN Flag Count', ' PSH Flag Count',
                        ' ACK Flag Count', 'Fwd PSH Flags']
    frame = frame.drop(excluded_columns, axis=1, errors='ignore')
    # NOTE(review): 'Label' is not excluded, so the true label is part of the
    # embedding input -- confirm this is intended.

    time_start = time.time()
    tsne = TSNE(n_components=2, verbose=1, perplexity=50,learning_rate=400.0,n_iter=1500)
    tsne_resultsRf = tsne.fit_transform(frame.iloc[:max_rows])
    print ('t-SNE done! Time elapsed: {} seconds'.format(time.time()-time_start))

    # The file is a pickle despite the .npy extension; read it back with pickle.
    with open('tsneTrainResults.npy','wb') as fout:
        pickle.dump(tsne_resultsRf,fout,pickle.DEFAULT_PROTOCOL)
    return tsne_resultsRf

In [14]:
%matplotlib inline
def plot(tsne_resultsRfPred,labelss,str):
    """
    Scatter-plot 2-D t-SNE points coloured by binary class.

    :param tsne_resultsRfPred: array of shape (n, 2) with the embedded points
    :param labelss: one class id per point (0 or 1), same length as the points
    :param str: title of the figure (parameter name kept for existing callers;
        it shadows the builtin inside this function only)
    """
    from matplotlib import pyplot as plt

    font_size = 10
    # Accept plain lists as well: the boolean masks below need an array.
    labelss = np.asarray(labelss)

    # (class id, colour, legend text). data_preprocessing() encodes attacks as 0
    # and benign traffic as 1, so the legend names follow that encoding.
    classes = [(0, 'r', 'Attack'), (1, 'g', 'Benign')]

    plt.figure(figsize=(30,10))
    for class_id, colour, name in classes:
        mask = labelss == class_id
        plt.scatter(tsne_resultsRfPred[mask, 0], tsne_resultsRfPred[mask, 1], c=colour, label=name, s=1.5)
    plt.title(str, fontsize=font_size,loc="center")
    plt.xlabel("Dimension 1", fontsize=font_size)
    plt.ylabel("Dimension 2", fontsize=font_size)
    plt.legend(loc=1,fontsize =font_size,bbox_to_anchor=(1.05, 1,), borderaxespad=-3.3)

# Steps to run Binary mode without sampling (Approach A)

In [1]:
#Step1: load data
dataset=data_loading("./CSVs/final.csv")
print("loading dataset done")

#Step2: plot labels
data_plot(dataset)
print("plot done")

#Step3: preprocess spark dataframe name dataset
dataWithFeatures=data_preprocessing(dataset)
print("preprocess done")

#Step4: create a feature indexer dataframe
dataFeature=vectorAssembler(dataWithFeatures)
print("create a feature indxer done")

#Step5: create train and test data
(trainingData, testData)=train_test_data(dataFeature)
print("create train and test data done")

#Step6: train Binarymode
rfModel=train_model(trainingData)
print("train Binarymode done")

#Step7: output the model results
input_array,outputrf=results(rfModel)
print("output the model results done")

#Step8: visualisation of binary mode
resultsTsne=tsneTrain(testData)
#flatten to convert from (x,1) to (x,)
flattenPredictedLabel=outputrf.round().flatten()
flattenTrueLabel=input_array.round().flatten()
# True labels go with the "Original" title and predictions with the "Predicted"
# title (the two label arrays were previously swapped between the titles).
# Only the first 80000 labels are used, matching the rows embedded by tsneTrain.
plot(resultsTsne,flattenTrueLabel[:80000],"Original Data Distribution on complete data ")
plot(resultsTsne,flattenPredictedLabel[:80000],"Predicted Data Distribution on complete data using RF Classifier ")

# Steps to run Binary mode with sampling

In [None]:

#Step1: create data with Sampling
# Requires `dataWithFeatures` from the preprocessing step of the unsampled run,
# so that cell has to be executed first.
dataWithSampling=sampling(dataWithFeatures)
print("Step1: create data with Sampling done")

#Step2: create a feature indexer dataframe
dataFeatureWithSampling=vectorAssembler(dataWithSampling)
print("Step2: create a feature indxer done")

#Step3: create train and test data
(trainingDataWithSampling, testDataWithSampling)=train_test_data(dataFeatureWithSampling)
print("step3: create train and test data done")

#Step4: train Binarymode
# NOTE(review): train_model() is called without a path, so this model is stored
# under the same 'BinaryModel' name as the model of the unsampled run.
rfModelWithSampling=train_model(trainingDataWithSampling)
print("Step4: train Binarymode done")

#Step5: output the model results
# NOTE(review): results() is given only the model; the data it scores is the
# notebook-level `testData` from the unsampled run, not `testDataWithSampling`
# created above -- confirm that this is the intended evaluation set.
input_arrayWithSampling,outputrfWithSampling=results(rfModelWithSampling)
print("Step5: output the model results done")

# Preparing attack only data

In [7]:
def data_for_attack_Classification(dataset):
    """
    Keep only attack records and the selected feature columns, as a pandas df.

    Rows whose label is not one of the known attack names (e.g. benign traffic)
    are filtered out, nulls in numeric columns are replaced by 0.0, and the
    result is collected to the driver as a pandas dataframe.

    :param dataset: a spark dataframe
    :return anomalyDataPandas: a pandas df with only attacks
    """
    # Columns retained in the result (the 22 selected features and the label).
    wanted_columns = {
        ' Flow Duration', ' Fwd IAT Min', ' Bwd IAT Mean', ' Fwd IAT Mean',
        'Init_Win_bytes_forward', ' Subflow Fwd Bytes', 'Total Length of Fwd Packets',
        ' ACK Flag Count', ' Active Min', 'Active Mean', ' Flow IAT Std',
        'Fwd PSH Flags', ' SYN Flag Count', 'Fwd Packets/s', ' Bwd Packet Length Std',
        ' Init_Win_bytes_backward', ' Fwd Packet Length Mean', ' Average Packet Size',
        ' Bwd Packets/s', ' PSH Flag Count', ' Flow IAT Min', ' Flow IAT Mean',
        'Label',
    }
    attackLabels=["DoS Slowhttptest",'Web Attack � Brute Force','Web Attack � Sql Injection','Web Attack � XSS',"SSH-Patator","DoS GoldenEye", "Heartbleed", "DoS Hulk", "DoS slowloris", "FTP-Patator", "Infiltration","Bot","PortScan","DDoS"]

    filterData=dataset.select([c for c in dataset.columns if c in wanted_columns])

    # Keep only rows whose label is a known attack name.
    anomalyData = filterData.where(col("Label").isin(attackLabels))
    anomalyData = anomalyData.filter(anomalyData.Label.isNotNull())
    anomalyData = anomalyData.na.fill(0.0)

    # The former groupBy/count/collect call was removed here: its result was
    # discarded, so it only cost an extra pass over the data.
    anomalyDataPandas=anomalyData.toPandas()
    print("Shape of data with only attack records"+str(anomalyDataPandas.shape))
    return anomalyDataPandas

In [8]:
def combineattacks(x):
    """
    Collapse the three web-attack labels into the single label "Web-Attack".

    :param x: a single label value
    :return: "Web-Attack" when x is one of the three web-attack labels,
        otherwise x unchanged
    """
    web_attack_labels = (
        'Web Attack � Brute Force',
        'Web Attack � XSS',
        'Web Attack � Sql Injection',
    )
    return "Web-Attack" if x in web_attack_labels else x

In [9]:
def data_processing_for_Attacks(anomalyDataPandas):
    """
    Split the attack dataframe into numeric features and encoded labels.

    Feature columns of dtype object are converted to numbers; values that cannot
    be converted become NaN. The shape of the feature frame and the total number
    of missing values are printed. Labels are one-hot encoded into float
    columns named 'Label_<value>'.

    :param anomalyDataPandas: pandas df with attacks
    :return X,target,data_labels: features, one-hot labels, original label column
    """
    data_labels = anomalyDataPandas['Label']

    # Everything except the label is a feature.
    X = anomalyDataPandas.drop(['Label'], axis=1)

    # Convert object-typed columns to numeric, coercing failures to NaN.
    object_columns = X.columns[X.dtypes == object]
    X[object_columns] = X[object_columns].apply(pd.to_numeric, errors='coerce', axis=0)
    print("shape of features in attack data" + str(X.shape))

    missing_total = X.isnull().sum().sum()
    print("missing values in dataframe = %g " % (missing_total))

    # One indicator column per distinct label, stored as floats.
    target = pd.get_dummies(pd.DataFrame(data_labels)).astype(float)
    return X, target, data_labels

In [10]:
#Splitting training and testing data
from sklearn.model_selection import train_test_split
#Random Forest Regressor
from sklearn.ensemble import RandomForestRegressor
def train_regressor_for_Attacks(X,target):
    """
    Split the data 80/20 (stratified on the target) and fit a RandomForest
    regressor on the training part.

    :param X,target: features, labels
    :return regrModel,X_test,y_test: trained model, test data
    """
    X_train, X_test, y_train, y_test = train_test_split(X, target, test_size=0.20, random_state=0,stratify=target)

    # Only the shape is reported here, so the header says so.
    print('y_train shape')
    print(y_train.shape)

    time_start = time.time()
    # criterion is left at its default (mean squared error). Spelling it as
    # 'mse' is rejected by newer scikit-learn releases, while the default gives
    # the same model on old and new versions.
    regrModel = RandomForestRegressor(max_depth=50,min_samples_leaf=20,n_estimators=200, random_state=0,n_jobs=-1)
    regrModel.fit(X_train,y_train)
    print ('model training time elapsed for regression: {} seconds'.format(time.time()-time_start))
    return regrModel,X_test,y_test


In [11]:
def results_For_RegrAttacks(regr,y_test,features=None):
    """
    Predict with a fitted regressor and print macro precision / recall / F1 and
    a classification report for the rounded predictions.

    :param regr: fitted model exposing predict()
    :param y_test: true targets; its ``columns`` are used as class names in the
        report
    :param features: feature rows to predict on. When omitted, the
        notebook-level ``X_test`` is used, as before.
    :return prediction_new: the raw (unrounded) predictions
    """
    if features is None:
        # Backward-compatible default: the test features defined in the notebook.
        features = X_test

    prediction_new=regr.predict(features)
    # Round once and reuse; each output is rounded to 0/1 independently.
    rounded = prediction_new.round()

    print("Precision Score for RF model without sampling=%g"%(precision_score(y_test,rounded, average='macro')))
    print("Recall Score for RF model without sampling=%g"%(recall_score(y_test,rounded, average='macro') ))
    print("F1 Score for RF model macro=%g"%(f1_score(y_test, rounded, average='macro')))
    print("without sampling Regressor classification_report")
    print(classification_report(y_test, rounded,target_names=y_test.columns))
    return prediction_new

# Preparing attack data with Sampling

In [17]:
from sklearn import preprocessing
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
def sampling_with_Encoder(data_labels,X):
    """
    Encode the labels as integers, split 80/20 (stratified) and oversample the
    training part with SMOTE.

    :param data_labels: label value of every row
    :param X: feature rows matching data_labels
    :return X_smote_result, y_smote_result,X_test_smote, y_test_smote:
        oversampled training data and the untouched test split
    """
    from collections import Counter

    le = preprocessing.LabelEncoder()
    le.fit(data_labels)
    print("Lables before encoding: ")
    print(le.classes_)
    y_withSampling=le.transform(data_labels)

    print("Lables after encoding: ")
    # Decode every class id the encoder actually learned instead of a fixed
    # 0..11 list, which failed whenever the data did not have exactly 12 classes.
    print(le.inverse_transform(np.arange(len(le.classes_))))

    X_train_smote, X_test_smote, y_train_smote, y_test_smote = train_test_split(X, y_withSampling, test_size=0.20, random_state=0,stratify=y_withSampling)

    # NOTE(review): `ratio` and `fit_sample` follow the imbalanced-learn API this
    # notebook was written against; confirm they exist in the installed version.
    sm = SMOTE(random_state=42,n_jobs=-1,ratio=0.8)
    X_smote_result, y_smote_result =sm.fit_sample(X_train_smote, y_train_smote)

    print("Lables values after sampling:")
    print(sorted(Counter(y_smote_result).items()))
    return X_smote_result, y_smote_result,X_test_smote, y_test_smote

In [13]:
def regrAttacksResults_withSampling(X_smote_result, y_smote_result,X_test_smote,y_test_smote):
    """
    Fit a RandomForest regressor on the oversampled data and print macro
    precision / recall / F1 and a classification report on the test split.

    :param X_smote_result, y_smote_result: oversampled training features/labels
    :param X_test_smote, y_test_smote: test features/labels
    :return regrSampling: the fitted regressor
    """
    time_start = time.time()
    # criterion is left at its default (mean squared error). Spelling it as
    # 'mse' is rejected by newer scikit-learn releases, while the default gives
    # the same model on old and new versions.
    regrSampling = RandomForestRegressor(max_depth=20,min_samples_leaf=10,n_estimators=100, random_state=0,n_jobs=-1)
    regrSampling.fit(X_smote_result, y_smote_result)
    print ('Training Time elapsed for Smote: {} seconds'.format(time.time()-time_start))

    # NOTE(review): the regressor predicts a real number that is rounded to the
    # nearest class id, which treats the encoded ids as ordered -- confirm.
    y_predicted_WithSampling=regrSampling.predict(X_test_smote)
    rounded = y_predicted_WithSampling.round()

    print("Precision Score for RF model with sampling for regressor=%g"%
      (precision_score(y_test_smote,rounded, average='macro')))
    print("Recall Score for RF model with sampling for regressor=%g"%
      (recall_score(y_test_smote,rounded, average='macro')))
    print("F1  Score for RF model with sampling for regressor=%g"%(f1_score(y_test_smote, rounded, average='macro')))
    print("with sampling classification_report")
    print(classification_report(y_test_smote, rounded))
    return regrSampling

# Steps to perform Attack Classification (Approach B)

In [14]:
# Load the raw CSV into a Spark dataframe.
dataset = data_loading("./CSVs/final.csv")
print("loading dataset done")

# Keep attack records only, then merge the three web-attack labels into one.
anomalyDataPandas = data_for_attack_Classification(dataset)
anomalyDataPandas["Label"] = anomalyDataPandas["Label"].map(combineattacks)
print("attack data preparation done")

# Numeric features, one-hot targets and the original label column.
X, target, data_labels = data_processing_for_Attacks(anomalyDataPandas)
print("preprocessing of attack data done")

# Train the RF regressor and report its scores on the held-out split.
regrModel, X_test, y_test = train_regressor_for_Attacks(X, target)
prediction_regrValue = results_For_RegrAttacks(regrModel, y_test)
print("RF regressor model tarining and results")

loading dataset done
Shape of data with only attack records(471454, 23)
['DDoS' 'PortScan' 'Bot' 'Infiltration' 'Web-Attack' 'FTP-Patator'
 'SSH-Patator' 'DoS slowloris' 'DoS Slowhttptest' 'DoS Hulk'
 'DoS GoldenEye' 'Heartbleed']
shape of features in attack data(471454, 22)
missing values in dataframe = 0 
y_train class distribution
(377163, 12)
model training time elapsed for regression: 410.3259468078613 seconds


  'precision', 'predicted', average, warn_for)


Precision Score for RF model without sampling=0.829424
Recall Score for RF model without sampling=0.825442
F1 Score for RF model macro=0.827414
without sampling Regressor classification_report
                        precision    recall  f1-score   support

             Label_Bot       0.99      0.99      0.99       393
            Label_DDoS       1.00      1.00      1.00      8367
   Label_DoS GoldenEye       1.00      1.00      1.00      2059
        Label_DoS Hulk       1.00      1.00      1.00     46215
Label_DoS Slowhttptest       0.99      0.99      0.99      1100
   Label_DoS slowloris       0.99      0.98      0.98      1159
     Label_FTP-Patator       1.00      1.00      1.00      1588
      Label_Heartbleed       0.00      0.00      0.00         2
    Label_Infiltration       0.00      0.00      0.00         7
        Label_PortScan       1.00      1.00      1.00     31786
     Label_SSH-Patator       1.00      0.99      0.99      1179
      Label_Web-Attack       0.99     

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [15]:
# Persist the trained RF regressor; the file holds a pickle despite its .npy name.
import pickle
with open('regrRF.npy', 'wb') as fout:
    pickle.dump(regrModel, fout, protocol=pickle.DEFAULT_PROTOCOL)

# Running Approach B with sampling

In [18]:
# Encode + split + oversample, then train and evaluate on the resulting data.
X_smote_result, y_smote_result, X_test_smote, y_test_smote = sampling_with_Encoder(data_labels, X)
regrSampling = regrAttacksResults_withSampling(
    X_smote_result=X_smote_result,
    y_smote_result=y_smote_result,
    X_test_smote=X_test_smote,
    y_test_smote=y_test_smote,
)

Lables before encoding: 
['Bot' 'DDoS' 'DoS GoldenEye' 'DoS Hulk' 'DoS Slowhttptest'
 'DoS slowloris' 'FTP-Patator' 'Heartbleed' 'Infiltration' 'PortScan'
 'SSH-Patator' 'Web-Attack']
Lables after encoding: 
['Bot' 'DDoS' 'DoS GoldenEye' 'DoS Hulk' 'DoS Slowhttptest'
 'DoS slowloris' 'FTP-Patator' 'Heartbleed' 'Infiltration' 'PortScan'
 'SSH-Patator' 'Web-Attack']


  if diff:


Lables values after sampling:
[(0, 147886), (1, 147886), (2, 147886), (3, 184858), (4, 147886), (5, 147886), (6, 147886), (7, 147886), (8, 147886), (9, 147886), (10, 147886), (11, 147886)]
Training Time elapsed for Smote: 1089.8444476127625 seconds
Precision Score for RF model with sampling for regressor=0.928491
Recall Score for RF model with sampling for regressor=0.973131
F1  Score for RF model with sampling for regressor=0.926602
with sampling classification_report
             precision    recall  f1-score   support

          0       0.99      1.00      0.99       393
          1       1.00      1.00      1.00      8367
          2       0.99      1.00      0.99      2059
          3       1.00      1.00      1.00     46215
          4       0.99      0.99      0.99      1100
          5       0.99      1.00      0.99      1159
          6       1.00      1.00      1.00      1588
          7       0.20      1.00      0.33         2
          8       1.00      0.71      0.83      

In [None]:
# Persist the RF regressor trained on oversampled data; the file holds a pickle
# despite its .npy name.
import pickle
with open('regrRFSampling.npy', 'wb') as fout:
    pickle.dump(regrSampling, fout, protocol=pickle.DEFAULT_PROTOCOL)