In [3]:
import pandas as pd
import time
import numpy as np
import math
import re
import random
import pickle

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from scipy.sparse import csr_matrix, hstack, vstack

In [4]:
# Load the unlabeled test set into `dft`; the file is read as UTF-16-LE.
# NOTE(review): absolute, machine-specific path — the notebook cannot be re-run
# on another machine without editing this line.
dft = pd.read_csv('/Users/AmitBer/Downloads/macro_competition/test_dataset_without_labels.csv', encoding='utf-16-le')

# Show the loaded frame as the cell output.
dft

Unnamed: 0,vba_code
0,"Private Sub getCIA(C As String, I As String, A..."
1,Private Sub Workbook_Open()\n Application.C...
2,'APMP\n'KILL\nPrivate Sub Document_Open()\n ...
3,Private Sub Workbook_Activate()\n Call AddM...
4,Private Sub CheckBox1_Click()\n\nEnd Sub\n\nPr...
...,...
10625,Private Sub CommandButton1_Click()\nDim s As A...
10626,Sub AutoOpen()\n\nMyMacro\n\nEnd Sub\n\nSub Do...
10627,Private Sub CommandButton1_Click()\n With A...
10628,Private Const FOSMgcqLTaUItxmhNrJ As String = ...


In [5]:
# Function to calculate the density of distinct declared names
def count_variables(text):
    """Return the number of distinct declared names per character of ``text``.

    A "declared name" is the single word that directly follows one of the
    keywords ``Dim``, ``Private``, ``Public`` or ``Static`` (so for
    ``Private Sub Foo()`` the captured word is ``Sub``).

    Returns 0 for empty text, otherwise ``distinct names / len(text)``.
    """
    declared_names = re.findall(r'\b(?:Dim|Private|Public|Static)\s+(\w+)', text)

    script_length = len(text)
    if script_length == 0:
        # Nothing to normalise by.
        return 0

    # Duplicates are collapsed before counting.
    return len(set(declared_names)) / script_length

# Function to build the Count of Variables feature column
def add_count_of_variables_columnz(dataframe):
    """Apply ``count_variables`` to every entry of ``dataframe['vba_code']``.

    Returns the result of the element-wise ``apply``.
    """
    vba_sources = dataframe['vba_code']
    return vba_sources.apply(count_variables)



# Function to calculate the average length of quoted values in Set/Dim assignments
def calculate_avg_variable_assignment_length(text):
    """Return the mean length of quoted values assigned in ``Set``/``Dim`` statements.

    Only statements of the form ``Set name = value`` or ``Dim name = value``
    are considered, and only values containing a double quote contribute.
    The measured length includes the surrounding quote characters.

    Returns 0 when no quoted assignment is found.
    """
    quoted_lengths = [
        len(assigned_value)
        for _name, assigned_value in re.findall(
            r'\b(?:Set|Dim)\s+(\w+)\s*=\s*(".+?"|\w+)', text
        )
        if '"' in assigned_value
    ]

    # Guard against dividing by zero when nothing qualified.
    if not quoted_lengths:
        return 0

    return sum(quoted_lengths) / len(quoted_lengths)

def add_avg_variable_assignment_length_columnz(dataframe):
    """Apply ``calculate_avg_variable_assignment_length`` to each ``vba_code`` entry.

    Returns the result of the element-wise ``apply``.
    """
    vba_sources = dataframe['vba_code']
    return vba_sources.apply(calculate_avg_variable_assignment_length)


def mal_word(text):
    """Flag a script that contains one of a fixed list of suspicious substrings.

    The allow-list is checked first: if any allow-listed substring occurs in
    ``text`` the result is 0 regardless of the deny-list. Otherwise the result
    is 1 when any deny-listed substring occurs, and 0 when none does.
    Matching is plain, case-sensitive substring containment.
    """
    allow_list = ('Private Sub ComboBox11_DropButtonClick()',)
    deny_list = (
        'Emoji',
        '"Hacked!"',
        'UserVersion',
        "'donwload",
        'Private Sub Sample()',
        'On Error Resume Next',
    )

    # An allow-listed marker overrides every deny-listed one.
    if any(marker in text for marker in allow_list):
        return 0

    return 1 if any(marker in text for marker in deny_list) else 0

def add_mal_words_columnz(dataframe):
    """Apply ``mal_word`` to every entry of ``dataframe['vba_code']``.

    Returns the result of the element-wise ``apply``.
    """
    vba_sources = dataframe['vba_code']
    return vba_sources.apply(mal_word)


def check_hexadecimal_encoding(text):
    """Return 1 if ``text`` contains a whole word made only of hex digits, else 0.

    Any standalone run of the characters 0-9, a-f, A-F counts, so plain decimal
    numbers (``1``) and words such as ``Add`` also produce 1.
    NOTE(review): presumably the trained model saw this same definition —
    confirm before tightening the pattern.
    """
    hex_word = re.search(r'\b[0-9a-fA-F]+\b', text)
    if hex_word is None:
        return 0
    return 1

def check_base64_encoding(text):
    """Return 1 if the base64-shaped pattern is found anywhere in ``text``, else 0.

    Every part of the pattern between the two ``\\b`` anchors is optional, so it
    can match the empty string at a word boundary. In practice any text that
    contains at least one word character yields 1; only empty text or text with
    no word characters yields 0.
    NOTE(review): presumably the trained model saw this same definition —
    confirm before tightening the pattern.
    """
    found = re.search(
        r'\b(?:[A-Za-z0-9+/]{4})*(?:[A-Za-z0-9+/]{2}==|[A-Za-z0-9+/]{3}=)?\b',
        text,
    )
    return int(found is not None)

def add_check_hexadecimal_encoding_columnz(dataframe):
    """Apply ``check_hexadecimal_encoding`` to every ``vba_code`` entry.

    Returns the result of the element-wise ``apply``.
    """
    vba_sources = dataframe['vba_code']
    return vba_sources.apply(check_hexadecimal_encoding)

def add_check_base64_encoding_columnz(dataframe):
    """Apply ``check_base64_encoding`` to every ``vba_code`` entry.

    Returns the result of the element-wise ``apply``.
    """
    vba_sources = dataframe['vba_code']
    return vba_sources.apply(check_base64_encoding)

In [9]:
# Directory holding the pickled model artifacts.
# NOTE(review): absolute, machine-specific path — must be edited to run elsewhere.
path = "/Users/AmitBer/Downloads/"

# SECURITY: pickle.load executes arbitrary code embedded in the file; only load
# artifacts that come from a trusted source.
with open(f'{path}top_class.pkl', 'rb') as f:
    classifier = pickle.load(f)
    
# Same caveat applies to the pickled vectorizer.
with open(f'{path}vec.pkl', 'rb') as f:
    tfidf_vectorizer = pickle.load(f)  

In [10]:
# Show the unpickled vectorizer as the cell output (sanity check of what was loaded).
tfidf_vectorizer

In [11]:
# Transform only (no fitting happens here): vectorize the test scripts with the
# vectorizer loaded from vec.pkl.
test_tfidf = tfidf_vectorizer.transform(dft['vba_code'])


In [12]:

# Hand-crafted per-script features, each computed from dft['vba_code'] and
# converted to a NumPy array.
number_of_variables_test = np.array(add_count_of_variables_columnz(dft))

avg_variable_assignment_length_test = np.array(add_avg_variable_assignment_length_columnz(dft))

mal_words_test = np.array(add_mal_words_columnz(dft))

# NOTE(review): the next two names are crossed relative to the helpers they call:
# the "hexadecimal" variable holds the base64 flag and the "base64" variable holds
# the hexadecimal flag. Swapping the assignments would change the order in which
# these columns are stacked for the classifier — confirm against the training
# notebook before changing anything here.
check_hexadecimal_encodingn_test = np.array(add_check_base64_encoding_columnz(dft))

check_base64_encodingn_test = np.array(add_check_hexadecimal_encoding_columnz(dft))


In [21]:
# Stack the TF-IDF matrix and the hand-crafted features side by side.
# Each feature array is reshaped into a single sparse column; the order listed
# below is the column order handed to the classifier.
test_combined = hstack(
    [test_tfidf]
    + [
        csr_matrix(feature_values.reshape(-1, 1))
        for feature_values in (
            number_of_variables_test,
            avg_variable_assignment_length_test,
            mal_words_test,
            check_hexadecimal_encodingn_test,
            check_base64_encodingn_test,
        )
    ]
)

# Predict a label for every row of the combined matrix.
test_predictions = classifier.predict(test_combined)




In [60]:
# Name the single column when the frame is built, so that everything derived
# from `df` — including a CSV export — carries the 'prediction' header rather
# than the default integer column label 0.
df = pd.DataFrame(test_predictions, columns=['prediction'])

In [61]:
# Write the predictions to test_predictions.csv, resolved relative to the
# current working directory. index=False is not passed, so the row index is
# written as an extra leading column.
df.to_csv("test_predictions.csv")

In [62]:
# Label the single column 'prediction'. This cell runs after the CSV export
# above, so it only affects the in-memory frame, not the file already written.
df.columns = ['prediction']

In [63]:
# df.drop(df.columns[0], axis=1)

In [64]:
# Show the predictions frame as the cell output.
df

Unnamed: 0,prediction
0,white
1,white
2,mal
3,white
4,white
...,...
10625,white
10626,mal
10627,white
10628,mal
