<a href="https://colab.research.google.com/github/apolanco18/ml-final-project/blob/master/ml_final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import argparse
import time
import csv
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn import metrics
import xgboost as xgb
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from google.colab import drive
from google.colab import auth
auth.authenticate_user()


class ParameterTuning(object):
  """Hyper-parameter sweeps for several classifiers, with CSV result logging.

  Each ``param_tuning_*`` method fits one model family over a fixed grid of
  settings, scores every fit with ``testModel`` and hands the collected
  results to the matching ``write_*_results`` method.
  """

  init = None

  # Folder that holds one sub-folder of CSV results per model family.
  _RESULTS_DIR = '/content/drive/My Drive/19-20SchoolYear/CS334/Machine Learning Project/Results/'

  # Keys of the ``stats`` dict, in the order they are written as CSV columns.
  _STAT_KEYS = ['train accuracy', 'test accuracy', 'macro f1 score', 'time taken']

  def __init__(self,init):
    """Store ``init`` on the instance; no other method of this class reads it."""
    self.init = init

  def model_metrics(self,totalTime,yTrain,yHatTrain,yTest,yHat):
    """Score one fitted model.

    Args:
      totalTime: ``time.time()`` stamp taken when the timed section started.
      yTrain, yHatTrain: true and predicted labels for the training split.
      yTest, yHat: true and predicted labels for the test split.

    Returns:
      ``(stats, matrix)`` where ``stats`` maps 'train accuracy',
      'test accuracy', 'macro f1 score' and 'time taken' to their values and
      ``matrix`` is the test-split confusion matrix.
    """
    # Stop the clock first so the metric computations are not timed.
    elapsed = time.time() - totalTime

    stats = {}
    stats['train accuracy'] = accuracy_score(yTrain, yHatTrain)
    stats['test accuracy'] = accuracy_score(yTest, yHat)
    stats['macro f1 score'] = f1_score(yTest, yHat, average='macro')
    stats['time taken'] = elapsed

    matrix = metrics.confusion_matrix(yTest, yHat)

    return stats,matrix

  def print_stats(self,matrix,stats):
    """Print the four entries of ``stats``; ``matrix`` is accepted but not printed."""
    print('Train Accuracy {}'.format(stats['train accuracy']))
    print('Test Accuracy {}'.format(stats['test accuracy']))
    print('Macro F1 Score {}'.format(stats['macro f1 score']))
    print('Time Taken {}'.format(stats['time taken']))

  def testModel(self,model,xTrain, xTest, yTrain, yTest): #use this to get data on an individual model
    """Fit ``model`` on the training split and score it on both splits.

    Note the argument order: both feature arrays come before both label
    arrays. The reported 'time taken' covers only the test-split prediction,
    because the clock is started after fitting.

    Returns:
      ``(stats, matrix)`` as produced by ``model_metrics``.
    """
    # Estimators expect 1-D labels; an (n, 1) column makes scikit-learn emit a
    # DataConversionWarning on every fit.
    yTrain = np.ravel(yTrain)

    model.fit(xTrain, yTrain)
    yHatTrain = model.predict(xTrain)

    totalTime = time.time()
    yHat = model.predict(xTest)

    return self.model_metrics(totalTime,yTrain,yHatTrain,yTest,yHat)

  def _write_results(self,subPath,type,paramKeys,data):
    """Write one sweep to ``_RESULTS_DIR + subPath + type + '.csv'``.

    Args:
      subPath: model sub-folder plus file-name prefix, e.g. 'KNN/knn_data_'.
      type: data-set variant label appended to the file name.
      paramKeys: hyper-parameter keys of each entry; they are also used as
        the leading column headers.
      data: list of dicts holding every key in ``paramKeys`` plus a 'stats'
        dict as returned by ``model_metrics``.
    """
    filepath = self._RESULTS_DIR + subPath + type + '.csv'

    # newline='' is what the csv module asks for, so the writer controls line
    # endings itself.
    with open(filepath, 'w', newline='') as f:
      fileWriter = csv.writer(f,delimiter=',',quotechar='"', quoting = csv.QUOTE_MINIMAL)
      fileWriter.writerow(list(paramKeys) + self._STAT_KEYS)
      for entry in data:
        params = [entry[key] for key in paramKeys]
        scores = [entry['stats'][key] for key in self._STAT_KEYS]
        fileWriter.writerow(params + scores)

  def knn(self,nn,xTrain,yTrain,xTest,yTest):
    """Fit and score a k-nearest-neighbours classifier with ``nn`` neighbours.

    Returns:
      ``(nn, stats)``.
    """
    model = KNeighborsClassifier(n_neighbors = nn)

    stats,matrix = self.testModel(model,xTrain,xTest,yTrain,yTest)

    print("KNN: {}".format(nn))
    self.print_stats(matrix,stats)

    return nn,stats

  def write_knn_results(self,type,data):
    """Write KNN sweep results (entries keyed 'nn' and 'stats') to CSV."""
    self._write_results('KNN/knn_data_',type,['nn'],data)

  def param_tuning_knn(self,type,xTrain,yTrain,xTest,yTest):
    """Sweep the neighbour count from 1 to 15 and write the results."""
    listOfData = []

    for i in range(1,16):
      nn,stats = self.knn(i,xTrain,yTrain,xTest,yTest)
      listOfData.append({'nn': nn, 'stats': stats})

    self.write_knn_results(type,listOfData)

  def logReg(self,penalty,xTrain,yTrain,xTest,yTest):
    """Fit and score an lbfgs logistic regression with the given penalty.

    Returns:
      ``(penalty, stats)``.
    """
    model = LogisticRegression(penalty = penalty, solver = 'lbfgs', multi_class = 'auto')

    stats,matrix = self.testModel(model,xTrain,xTest,yTrain,yTest)

    print("Logistic Regression - Penalty: {}".format(penalty))
    self.print_stats(matrix,stats)

    return penalty,stats

  def write_logReg_results(self,type,data):
    """Write logistic-regression results (entries keyed 'penalty' and 'stats') to CSV."""
    self._write_results('Logistic Regression/logReg_data_',type,['penalty'],data)

  def param_tuning_logReg(self,type,xTrain,yTrain,xTest,yTest):
    """Compare the 'none' and 'l2' penalties and write the results."""
    listOfData = []

    for candidate in ['none','l2']:
      penalty,stats = self.logReg(candidate,xTrain,yTrain,xTest,yTest)
      listOfData.append({'penalty': penalty, 'stats': stats})

    self.write_logReg_results(type,listOfData)

  def dt(self,criterion,maxDepth,minLeafSample,xTrain,yTrain,xTest,yTest):
    """Fit and score a decision tree with the given settings.

    Returns:
      ``(criterion, maxDepth, minLeafSample, stats)``.
    """
    model = DecisionTreeClassifier(criterion = criterion,max_depth = maxDepth, min_samples_leaf = minLeafSample)

    # testModel takes (model, xTrain, xTest, yTrain, yTest); passing the
    # arrays in this method's own parameter order would swap yTrain and xTest.
    stats,matrix = self.testModel(model,xTrain,xTest,yTrain,yTest)

    return criterion,maxDepth,minLeafSample,stats

  def write_dt_results(self,type,data):
    """Write decision-tree results to CSV.

    Entries are keyed 'criterion', 'max depth', 'min leaf samples' and 'stats'.
    """
    self._write_results('Decision Tree/dt_data_',type,['criterion','max depth','min leaf samples'],data)

  def param_tuning_dt(self,type,xTrain,yTrain,xTest,yTest):
    """Sweep gini trees over depth 1-25 and leaf sizes 20-100 (step 20)."""
    listOfData = []

    for g in ['gini']:
      for depth in range(1,26):
        for leafSize in range(20,101,20):
          criterion,maxDepth,minLeafSample,stats = self.dt(g,depth,leafSize,xTrain,yTrain,xTest,yTest)
          listOfData.append({
            'criterion': criterion,
            'max depth': maxDepth,
            'min leaf samples': minLeafSample,
            'stats': stats,
          })

    self.write_dt_results(type,listOfData)

  def naive(self,xTrain,yTrain,xTest,yTest):
    """Fit and score a Gaussian naive Bayes classifier; returns its ``stats``."""
    model = GaussianNB()

    stats,matrix = self.testModel(model,xTrain,xTest,yTrain,yTest)

    return stats

  def write_naive_results(self,type,data):
    """Write naive-Bayes results (entries keyed 'stats') to CSV."""
    self._write_results('Naive Bayes/naive_data_',type,[],data)

  def param_tuning_naive(self,type,xTrain,yTrain,xTest,yTest):
    """Run the single Gaussian naive Bayes fit and write its result."""
    stats = self.naive(xTrain,yTrain,xTest,yTest)

    self.write_naive_results(type,[{'stats': stats}])

  def write_neural_results(self,type,data):
    """Write neural-network results (entries keyed 'stats') to CSV."""
    self._write_results('Neural Network/neuralN_data_',type,[],data)

  def rf(self,numTrees,maxDepth,xTrain,yTrain,xTest,yTest):
    """Fit and score a random forest with the given size and depth.

    Returns:
      ``(numTrees, maxDepth, stats)``.
    """
    model = RandomForestClassifier(n_estimators= numTrees, max_depth= maxDepth)

    stats,matrix = self.testModel(model,xTrain, xTest, yTrain, yTest)

    return numTrees, maxDepth, stats

  def write_rf_results(self,type,data):
    """Write random-forest results to CSV.

    Entries are keyed 'number of trees', 'max depth' and 'stats'.
    """
    self._write_results('Random Forest/rf_data_',type,['number of trees','max depth'],data)

  def param_tuning_rf(self,type,xTrain,yTrain,xTest,yTest):
    """Sweep forests of 100-200 trees (step 25) over depths 2-24 (step 2)."""
    listOfData = []

    for treeCount in range(100,201,25):
      for depth in range(2,26,2):
        numTrees,maxDepth,stats = self.rf(treeCount,depth,xTrain,yTrain,xTest,yTest)
        listOfData.append({
          'number of trees': numTrees,
          'max depth': maxDepth,
          'stats': stats,
        })

    self.write_rf_results(type,listOfData)

  def xgb(self,eta,maxDepth,esti,xTrain,yTrain,xTest,yTest):
    """Fit and score an XGBoost classifier with the given settings.

    Returns:
      ``(eta, maxDepth, esti, stats)``.
    """
    # Inside a method body the bare name ``xgb`` resolves to the imported
    # module, not to this method: class scope is not searched from functions.
    booster = xgb.XGBClassifier(max_depth = maxDepth,eta = eta, n_estimators = esti)

    stats,matrix = self.testModel(booster,xTrain, xTest, yTrain, yTest)

    return eta,maxDepth,esti,stats

  def write_xgb_results(self,type,data):
    """Write XGBoost results to CSV.

    Entries are keyed 'eta', 'max depth', 'number of estimators' and 'stats'.
    """
    self._write_results('XGB Boost/xgb_data_',type,['eta','max depth','number of estimators'],data)

  def param_tuning_xgb(self,type,xTrain,yTrain,xTest,yTest):
    """Sweep eta 0.1-0.4, depths 2/4/6 and 5-7 estimators."""
    listOfData = []

    for rate in [.1,.2,.3,.4]:
      for depth in range(2,8,2):
        for count in range(5,8,1):
          eta,maxDepth,esti,stats = self.xgb(rate,depth,count,xTrain,yTrain,xTest,yTest)
          listOfData.append({
            'eta': eta,
            'max depth': maxDepth,
            'number of estimators': esti,
            'stats': stats,
          })

    self.write_xgb_results(type,listOfData)

  def test_all_models(self,val,type,xTrain,yTrain,xTest,yTest):
    """Run the sweep selected by ``val``.

    1 = KNN, 2 = logistic regression, 3 = decision tree, 4 = naive Bayes,
    5 = random forest, 6 = XGBoost. Any other value does nothing.
    """
    if val == 1:
      self.param_tuning_knn(type,xTrain,yTrain,xTest,yTest)
    elif val == 2:
      self.param_tuning_logReg(type,xTrain,yTrain,xTest,yTest)
    elif val == 3:
      self.param_tuning_dt(type,xTrain,yTrain,xTest,yTest)
    elif val == 4:
      self.param_tuning_naive(type,xTrain,yTrain,xTest,yTest)
    elif val == 5:
      self.param_tuning_rf(type,xTrain,yTrain,xTest,yTest)
    elif val == 6:
      self.param_tuning_xgb(type,xTrain,yTrain,xTest,yTest)
      
    

def main():
    """
    Seed NumPy, mount Drive, load every saved data-set variant and build the tuner.

    The individual sweeps are kept as commented-out snippets below; uncomment
    the one to run.
    """
    np.random.seed(0)

    drive.mount('/content/drive/')

    projectDir = '/content/drive/My Drive/19-20SchoolYear/CS334/Machine Learning Project/'

    def loadArray(name):
        # Every array lives in the project folder as <name>.npy.
        return np.load(projectDir + name + '.npy', encoding = 'bytes')

    # Raw features.
    xTrain = loadArray('xTrain')
    xTest = loadArray('xTest')

    # Scaled features.
    xTrainScale = loadArray('xTrainScale')
    xTestScale = loadArray('xTestScale')

    # Features after correlation-based elimination.
    xTrainFeatElim = loadArray('xTrainFeatElim')
    xTestFeatElim = loadArray('xTestFeatElim')

    # PCA-reduced features.
    xTrainPCA = loadArray('xTrainPCA')
    xTestPCA = loadArray('xTestPCA')

    # Labels are shared by every feature variant.
    yTrain = loadArray('yTrain')
    yTest = loadArray('yTest')

    tuning = ParameterTuning('Start')

    # types[i] names the variant stored as data[i] = [train features, test features].
    types = ['norm','scale','feat-elim','pca']
    data = [
        [xTrain,xTest],
        [xTrainScale,xTestScale],
        [xTrainFeatElim,xTestFeatElim],
        [xTrainPCA,xTestPCA],
    ]

    # --- KNN Model Parameter Tuning ---
    # for i in range(2,3):
    #   tuning.test_all_models(1,types[i],data[i][0],yTrain,data[i][1],yTest)

    # --- Logistic Regression Model Parameter Tuning ---
    # for i in range(0,4):
    #   tuning.test_all_models(2,types[i],data[i][0],yTrain,data[i][1],yTest)

    # --- Decision Tree Model Parameter Tuning ---
    # for i in range(0,4):
    #   tuning.test_all_models(3,types[i],data[i][0],yTrain,data[i][1],yTest)

    # --- Naive Bayes Model Parameter Tuning ---
    # for i in range(0,4):
    #   tuning.test_all_models(4,types[i],data[i][0],yTrain,data[i][1],yTest)

    # --- XGB Booster Model Parameter Tuning ---
    # for i in range(0,4):
    #   tuning.test_all_models(6,types[i],data[i][0],yTrain,data[i][1],yTest)

    # --- Neural Network Model Parameter Tuning ---
    # for i in range(0,4):
    #   neuralN = MLPClassifier()
    #
    #   stats,matrix = tuning.testModel(neuralN,data[i][0], data[i][1], yTrain, yTest)
    #
    #   temp = []
    #   temp2 = {}
    #   temp2['stats'] = stats
    #   temp.append(temp2)
    #   tuning.write_neural_results(types[i],temp)

    # --- Random Forest Model Parameter Tuning ---
    # for i in range(0,4):
    #   tuning.test_all_models(5,types[i],data[i][0],yTrain,data[i][1],yTest)

# Run the tuning driver only when this code is executed directly, not when it is imported.
if __name__ == "__main__":
    main()

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  'precision', 'predicted', average, warn_for)


In [0]:
import numpy as np
from google.colab import drive
from google.colab import auth
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import seaborn as sns
import matplotlib.pyplot as plt
auth.authenticate_user()
import xgboost as xgb

class Preprocess(object):
  """Feature preprocessing helpers: scaling, correlation-based elimination, PCA and plots."""

  def __init__(self):
    print("hello world")

  # Scales the data so that the features can be compared to each other
  def scale_data(self, xFeat, scaler=None):
    """Standardise ``xFeat``.

    Args:
      xFeat: 2-D feature array.
      scaler: optional already-fitted scaler. When given, ``xFeat`` is only
        transformed with it, so a test split can reuse the statistics learned
        on the training split. When omitted, a new ``StandardScaler`` is fit
        on ``xFeat`` itself.

    Returns:
      The scaled feature array.
    """
    if scaler is None:
      return StandardScaler().fit_transform(xFeat)

    return scaler.transform(xFeat)

  # Calculate Pearson Correlation Matrix
  def pearsonMatrix(self,xFeat):
    """Return the feature-by-feature Pearson correlation matrix of ``xFeat``."""
    # rowvar=False treats each column (feature) as one variable.
    return np.corrcoef(xFeat, rowvar=False)

  def graphMatrix(self,matrix):
    """Show ``matrix`` as a seaborn heatmap."""
    sns.set()
    sns.heatmap(matrix)
    plt.show()

  def correlated_columns(self,xFeat,threshold=.75):
    """Return the column indices that feature elimination would drop.

    A column is dropped when its absolute Pearson correlation with any
    earlier column is at least ``threshold``.

    Returns:
      Sorted list of integer column indices.
    """
    # atleast_2d covers the single-feature case, where corrcoef returns a scalar.
    matrix = np.atleast_2d(self.pearsonMatrix(xFeat))

    # Keep only pairs (i, t) with i < t, so the first feature of a correlated
    # pair is the one that survives.
    correlated = np.triu(np.abs(matrix) >= threshold, k=1)

    return np.flatnonzero(correlated.any(axis=0)).tolist()

  # Feature elimination: compute a Pearson correlation matrix, then drop the
  # highly correlated features to reduce the number of features
  def feature_elimination(self,xFeat,deleteColIndex=None):
    """Drop highly correlated columns from ``xFeat``.

    Args:
      xFeat: 2-D feature array.
      deleteColIndex: optional column indices to drop. Pass the result of
        ``correlated_columns`` computed on the training split so that the
        test split loses exactly the same columns. When omitted, the indices
        are computed from ``xFeat`` itself.

    Returns:
      A copy of ``xFeat`` without the dropped columns.
    """
    if deleteColIndex is None:
      deleteColIndex = self.correlated_columns(xFeat)

    return np.delete(xFeat,deleteColIndex,axis = 1)

  def pca(self,xFeatTrain,xFeatTest):
    """Fit PCA on the training features and project both splits.

    ``PCA(.95)`` keeps as many components as are needed to explain 95% of the
    training variance. The cumulative-variance curve is plotted as a side
    effect.

    Returns:
      ``(xTrainReduce, xTestReduce)``.
    """
    model = PCA(.95)
    model.fit(xFeatTrain)

    self.graph_variance_pca(model)

    return model.transform(xFeatTrain), model.transform(xFeatTest)

  def graph_variance_pca(self,model):
    """Plot cumulative explained variance against the number of components."""
    plt.plot(np.cumsum(model.explained_variance_ratio_))
    plt.xlabel('Number of Components')
    plt.ylabel('Cumulative Explained Variance')

  def graph_pca_components(self,xFeat,yFeat):
    """Scatter the first two columns of ``xFeat``, one series per class label.

    Plotting stops as soon as label 4 is reached, so with sorted labels
    0..10 only classes 0-3 are drawn.
    """
    xAxis = xFeat[:,0]
    yAxis = xFeat[:,1]

    labels = yFeat

    colorDict = {0:'red',1:'green',2:'black',3:'yellow',4:'#1004fb',5:'cyan',6:'magenta',7:'green',8:'#e1341e',9:'#3820df',10:'#d52ad1'}

    label = {0:'Benign',1:'Ack',2:'Combo',3:'Junk',4:'Scan 1',5:'Scan',6:'Syn 1',7:'Tcp',8:'Udp 1',9:'Udp',10:'Udp Plain 1'}

    marker = {0:'*',1:'o',2:'.',3:'v',4:'1',5:'8',6:'s',7:'p',8:'P',9:'h',10:'x'}

    fig,ax = plt.subplots(figsize = (7,5))
    fig.patch.set_facecolor('white')

    # np.unique returns the labels sorted, so the break cuts off every class from 4 on.
    for l in np.unique(labels):
      if l == 4:
        break
      ix = np.where(labels == l)
      ax.scatter(xAxis[ix],yAxis[ix], c = colorDict[l],s = 40, label = label[l],marker = marker[l], alpha = .5)

    plt.xlabel("First Principal Component",fontsize=14)
    plt.ylabel("Second Principal Component",fontsize=14)
    plt.legend()
    plt.show()

  def write_data_file(self,name,data):
    """Save ``data`` as ``<name>.npy`` in the project folder on Drive."""
    filepath = '/content/drive/My Drive/19-20SchoolYear/CS334/Machine Learning Project/' + name + '.npy'
    # np.save opens and closes the file itself; no separate handle is needed.
    np.save(filepath,data)



def main():
  """Build the scaled, feature-eliminated and PCA variants of the data and save them.

  Every transformation is learned on the training split only and then
  applied unchanged to the test split, so both splits stay in the same
  feature space.
  """
  drive.mount('/content/drive/')

  xTrain = np.load('/content/drive/My Drive/19-20SchoolYear/CS334/Machine Learning Project/xTrain.npy', encoding = 'bytes')
  xTest = np.load('/content/drive/My Drive/19-20SchoolYear/CS334/Machine Learning Project/xTest.npy', encoding = 'bytes')
  # Labels are only needed by the optional PCA scatter plot at the end.
  yTrain = np.load('/content/drive/My Drive/19-20SchoolYear/CS334/Machine Learning Project/yTrain.npy', encoding = 'bytes')
  yTest = np.load('/content/drive/My Drive/19-20SchoolYear/CS334/Machine Learning Project/yTest.npy', encoding = 'bytes')


  preprocess = Preprocess()


  # Fit the scaler on the training split only; the test split reuses its mean and variance.
  scaler = StandardScaler().fit(xTrain)
  xTrainScale = preprocess.scale_data(xTrain, scaler)
  xTestScale = preprocess.scale_data(xTest, scaler)

  # Pick the correlated columns on the training split and drop the same
  # columns from both splits.
  dropCols = preprocess.correlated_columns(xTrainScale)
  xTrainFeatElim = preprocess.feature_elimination(xTrainScale, dropCols)
  xTestFeatElim = preprocess.feature_elimination(xTestScale, dropCols)


  # preprocess.graphMatrix(preprocess.pearsonMatrix(xTrainScale))
  # preprocess.graphMatrix(preprocess.pearsonMatrix(xTestScale))

  # preprocess.graphMatrix(preprocess.pearsonMatrix(xTrainFeatElim))
  # preprocess.graphMatrix(preprocess.pearsonMatrix(xTestFeatElim))

  xTrainPCA,xTestPCA = preprocess.pca(xTrainScale,xTestScale)

  preprocess.write_data_file('xTrainScale',xTrainScale)
  preprocess.write_data_file('xTestScale',xTestScale)

  preprocess.write_data_file('xTrainFeatElim',xTrainFeatElim)
  preprocess.write_data_file('xTestFeatElim',xTestFeatElim)

  preprocess.write_data_file('xTrainPCA',xTrainPCA)
  preprocess.write_data_file('xTestPCA',xTestPCA)

  # preprocess.graph_pca_components(xTrainPCA,yTrain)

# Run the preprocessing driver only when this code is executed directly, not when it is imported.
if __name__ == "__main__":
  main()



KeyboardInterrupt: ignored

In [0]:
!pip install scikit-learn