# In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.collections as mat
import os
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics

# Feature table, read without a header row so columns are numbered 0, 1, 2, ...
features = pd.read_csv('./T2_data/malware_data.csv', header=None)

# Label table: column 0 is discarded and column 1 is exposed as 'label'.
labels = (
    pd.read_csv('./T2_data/malware_label.csv', header=None)
    .drop(0, axis=1)
    .rename(columns={1: 'label'})
)

# First row of each of the three sample windows used throughout this file.
mal1_index = 17000
mal2_index = 21000
mal3_index = 12000
indices = [mal1_index, mal2_index, mal3_index]

# Rows taken per window for training, and rows taken right after for testing.
mal_range = 50
mal_test_range = 30

def set_data(index1,index2):
    """Build the train/test subsets for the two given feature columns.

    For each of the three window starts (``mal1_index``, ``mal2_index``,
    ``mal3_index``) the first ``mal_range`` rows form the training part and
    the next ``mal_test_range`` rows form the test part.

    Returns:
        train_data: DataFrame with columns 'x', 'y' and 'labels'.
        test_data: DataFrame with columns 'x', 'y' and 'labels'.
        full_data: ndarray holding the two feature columns only, training
            rows first and test rows after them.
    """
    wanted_columns = [index1, index2]
    window_starts = (mal1_index, mal2_index, mal3_index)
    split_at = mal_range
    stop_at = mal_range + mal_test_range

    def stacked_features(lo, hi):
        # Rows [start+lo, start+hi) of every window, stacked in window order.
        return np.vstack(
            [features[start + lo:start + hi][wanted_columns].values for start in window_starts]
        )

    def stacked_labels(lo, hi):
        # Same row ranges as stacked_features, taken from the label table.
        return np.vstack(
            [labels[start + lo:start + hi].values for start in window_starts]
        )

    # Training part: snapshot the raw features before the label column is added.
    train_data = pd.DataFrame(stacked_features(0, split_at))
    full_data = np.array(train_data)
    train_data['labels'] = pd.DataFrame(stacked_labels(0, split_at))
    train_data = train_data.rename(columns={0: 'x', 1: 'y'})

    # Test part: appended to the feature snapshot, again before adding labels.
    test_data = pd.DataFrame(stacked_features(split_at, stop_at))
    full_data = np.concatenate((full_data, np.array(test_data)))
    test_data['labels'] = pd.DataFrame(stacked_labels(split_at, stop_at))
    test_data = test_data.rename(columns={0: 'x', 1: 'y'})

    return train_data, test_data, full_data

def classifier():
    """Run the complete analysis on feature columns 0 and 1.

    Steps, in order: build the train/test subsets, compute and print the
    per-window centroids, assign every test point to its nearest centroid and
    print the resulting score/accuracy, standardise and print the features,
    label-encode the targets, split the data, train and evaluate an MLP and a
    random forest, plot the points and centroids, and finally call
    ``get_best_accuracy``.

    Everything is reported through ``print`` and matplotlib; nothing is
    returned.
    """
    # DO NOT MODIFY THIS CELL - this cell is splitting the data to provide a suitable subset of data to work with for this task.
    # If you change this cell your output will differ from that expected and could impact your mark.

    train_data,test_data,full_data=set_data(0,1)

    train_data2=np.array(train_data)
    test_data2=np.array(test_data)

    ################# Centroids #####################
    # One centroid per sample window, computed from the training rows.
    centroids=np.array([get_centroid(start, mal_range) for start in indices])
    for number, centre in enumerate(centroids[:3], start=1):
        print("Centroid Point for Dataset "+str(number)+" is: ("+str(centre[0])+" , "+str(centre[1])+")")
    ###################################################
    plt.scatter(train_data['x'], train_data['y'],color='k')

    ###################### assign groups ############################
    group1,group2,group3,score=find_groups(centroids,test_data2,train_data2)
    # Number of test points whose label matches their nearest centroid's label.
    print("Score of Success of Classifier is: ("+str(score)+" )")
    # Accuracy = correct predictions / total predictions.
    accuracy=score/len(test_data2)
    print("Accuracy of Classifier is : ("+str(accuracy)+" )")
    ##############################################################

    # Replace the training centroids by the mean of each assigned test group.
    centroids=np.array([np.mean(group1,axis=0), np.mean(group2,axis=0), np.mean(group3,axis=0)])

    ######### standardise the full data ##################
    # NOTE(review): the scaled features are only printed; the models below are
    # trained on the unscaled 'x'/'y' columns, as in the original code.
    full_data=standarize(full_data)
    print("Scaled Data:")
    print("------------")
    print(full_data)
    #################################################

    ############ Label Encode ###################
    full_data_with_label = pd.concat([train_data, test_data])
    labelEncode(full_data_with_label)
    print("Encoded Labels Data:")
    print("--------------------")
    print(full_data_with_label)
    #############################################

    ###### Split the data ############
    # The target column must not be part of the model inputs: passing the
    # whole frame as X would let both classifiers read the answer directly.
    feature_frame = full_data_with_label.drop(columns=['labels'])
    X_train, X_test, y_train, y_test=split_data(feature_frame,full_data_with_label['labels'])
    print("Splitted Data:")
    print("--------------")
    print ("X_train: ")
    print(X_train)
    print ("y_train:")
    print(y_train)
    print("X_test: ")
    print(X_test)
    print ("y_test: ")
    print(y_test)
    ###################

    ###### MLP Classifier ############
    predicted_values=mpl_classifier(X_train,y_train,X_test)
    print("MPL Classifier Data:")
    print("--------------------")
    print("MLP Confusion Matrix:")
    print(confusion_matrix(y_test,predicted_values))
    print("MLP Classification Report:")
    print(classification_report(y_test,predicted_values))
    print("Accuracy:",metrics.accuracy_score(y_test, predicted_values))
    ##################################

    ###### Random Forest Classifier ############
    rf_prediction=random_forest(X_train,y_train,X_test)
    print("Random Forest Classifier:")
    print("-------------------------")
    print("RF Confusion Matrix:")
    print(confusion_matrix(y_test,rf_prediction))
    print("RF Classification Report:")
    print(classification_report(y_test,rf_prediction))
    print("Accuracy:",metrics.accuracy_score(y_test, rf_prediction))
    ##################################

    #### Plot the updated centroids ##
    plt.scatter(centroids[0,0],centroids[0,1],marker='x',color='r')
    plt.scatter(centroids[1,0],centroids[1,1],marker='x',color='g')
    plt.scatter(centroids[2,0],centroids[2,1],marker='x',color='b')
    plt.xlabel('Feature X')
    plt.ylabel('Feature Y')
    ##################

    #### Plot the groups of test data ##
    plt.scatter(group1[:,0], group1[:,1], color='r')
    plt.scatter(group2[:,0], group2[:,1], color='g')
    plt.scatter(group3[:,0], group3[:,1], color='b')
    ##################
    plt.show()

    ############ Best Accuracy ############
    print("Best Accuracy for Classifier:")
    print("-----------------------------")
    get_best_accuracy()
    #######################################

def get_centroid(mal_index,mal_range):
    """Return the mean point ``[x, y]`` of one window of feature columns 0 and 1.

    The window covers rows ``mal_index`` up to (not including)
    ``mal_index + mal_range`` of the module-level ``features`` table.
    """
    window = features[mal_index:mal_index+mal_range][[0,1]].values
    count = len(window)
    # Plain sequential sums per column, divided by the number of rows.
    mean_x = sum(window[:, 0]) / count
    mean_y = sum(window[:, 1]) / count
    return [mean_x, mean_y]

def find_groups(centroids,all_data,train_data,samples_per_class=None):
    """Assign each row of ``all_data`` to its nearest centroid and score the result.

    Args:
        centroids: array-like whose first three rows are ``(x, y)`` centroids.
        all_data: 2-D array whose rows are ``(x, y, label)``.
        train_data: 2-D array of ``(x, y, label)`` rows laid out as
            consecutive equally sized chunks, one chunk per centroid and in
            the same order; the label of a chunk's first row is used as the
            label of the matching centroid.
        samples_per_class: number of rows in each chunk of ``train_data``.
            Defaults to the module-level ``mal_range``.

    Returns:
        ``(group1, group2, group3, score)`` where each group is an array of
        shape ``(n, 2)`` holding the points assigned to that centroid (``n``
        may be 0) and ``score`` counts the rows whose label equals the label
        of their nearest centroid.
    """
    if samples_per_class is None:
        samples_per_class = mal_range

    # Convert once, outside the loop; only the first three centroids are used.
    centres = np.asarray(centroids, dtype=float)[:3, :2]
    groups = [[], [], []]
    score = 0

    for row in all_data:
        point = np.array([row[0], row[1]], dtype=float)
        # Euclidean distance from this point to every centroid.
        distances = np.sqrt(((centres - point) ** 2).sum(axis=1))
        nearest = int(np.argmin(distances))

        centroid_label = train_data[nearest * samples_per_class, 2]
        if row[2] == centroid_label:
            score += 1
        groups[nearest].append([row[0], row[1]])

    # reshape keeps an empty group two-dimensional, so callers can still
    # index columns with group[:, 0] / group[:, 1].
    group1, group2, group3 = (np.array(members).reshape(-1, 2) for members in groups)
    return group1,group2,group3,score

def standarize(full_data):
    """Fit a fresh ``StandardScaler`` on ``full_data`` and return the scaled copy."""
    return StandardScaler().fit_transform(full_data)

def labelEncode(df):
    """Replace ``df['labels']`` in place with integer codes from a ``LabelEncoder``.

    The frame is modified directly; nothing is returned.
    """
    encoded = LabelEncoder().fit_transform(df['labels'])
    df['labels'] = encoded

def split_data(full_data,labels):
    """Shuffle and split inputs and targets 75/25 with a fixed seed (104).

    Returns ``(X_train, X_test, y_train, y_test)``.
    """
    parts = train_test_split(
        full_data,
        labels,
        test_size=0.25,
        shuffle=True,
        random_state=104,
    )
    return tuple(parts)

def mpl_classifier(x_train,y_train,x_test):
    """Train a three-hidden-layer MLP on the training data and predict ``x_test``.

    ``y_train`` must expose ``.values`` (e.g. a pandas Series/DataFrame); it is
    flattened to one dimension before fitting.
    """
    network = MLPClassifier(
        hidden_layer_sizes=(12, 13, 14),
        activation='relu',
        solver='adam',
        max_iter=2500,
    )
    flat_targets = y_train.values.ravel()
    network.fit(x_train, flat_targets)
    return network.predict(x_test)

def random_forest(x_train,y_train,x_test):
    """Train a 100-tree random forest on the training data and predict ``x_test``."""
    forest = RandomForestClassifier(n_estimators=100)
    # fit() returns the estimator itself, so prediction can be chained.
    return forest.fit(x_train, y_train).predict(x_test)

def get_best_accuracy():
    """Search adjacent feature-column pairs for one reaching 80% MLP accuracy.

    Pairs ``(0, 1), (1, 2), ..., (254, 255)`` are tried in order. For each pair
    the data is rebuilt with ``set_data``, label-encoded, split, and an MLP is
    trained and scored. The search stops at the first pair whose accuracy is
    at least 0.80 and prints that accuracy together with the 1-based numbers
    of the two columns that produced it. If no pair qualifies, a failure
    message is printed instead. Nothing is returned.
    """
    target_accuracy = 0.80
    for i in range(255):
        j = i + 1
        train_data, test_data, _ = set_data(i, j)
        full_data_with_label = pd.concat([train_data, test_data])
        labelEncode(full_data_with_label)

        # Keep the target out of the model inputs; otherwise the network can
        # read the answer straight from the 'labels' column.
        feature_frame = full_data_with_label.drop(columns=['labels'])
        X_train, X_test, y_train, y_test = split_data(
            feature_frame, full_data_with_label['labels']
        )

        ###### MLP Classifier ############
        predicted_values = mpl_classifier(X_train, y_train, X_test)
        accuracy = metrics.accuracy_score(y_test, predicted_values)
        ##################################

        # Report in the same iteration that produced the score, so the
        # printed columns are the ones actually used and a hit on the very
        # last pair is not lost.
        if accuracy >= target_accuracy:
            print("Best Accuracy For ML is : ",accuracy)
            print("Used Feature Columns: "+str(i+1)+" and "+str(j+1))
            return
    print("Can't achieve 80% ")

def main():
    """Script entry point: run the full classification walkthrough."""
    classifier()


if __name__ == "__main__":
    main()