In [39]:

# Import necessary libraries
import pandas as pd
import numpy as np
import os
path = os.path.join("data" , "malware_detection_dataset.csv")
df = pd.read_csv(path , index_col = False)
# Display the first few rows of the dataset

df.head()

import numpy as np
from sklearn.metrics import accuracy_score

# Gini impurity function
import numpy as np

def gini_impurity(classes):
    total = np.sum(classes)
    if total == 0:
        return 0  # If no samples, return 0 impurity
    probs = np.array(classes) / total
    return 1 - np.sum(probs ** 2)

def split_data(data, feature, value):
    data = np.array(data)
    left = data[data[:, feature] <= value]
    right = data[data[:, feature] > value]
    return left.tolist(), right.tolist()


# Function to calculate Gini for a split
def calculate_split_gini(left, right):
    n = len(left) + len(right)
    if n == 0:
        return 0  # Avoid division by zero
    
    # Calculate Gini impurity for non-empty nodes
    if len(left) == 0:
        gini_left = 0
    else:
        gini_left = gini_impurity([row[-1] for row in left])
    
    if len(right) == 0:
        gini_right = 0
    else:
        gini_right = gini_impurity([row[-1] for row in right])
    
    # Weighted average of Gini impurities
    return (len(left) / n) * gini_left + (len(right) / n) * gini_right

# Function to find the best split
import random

def find_best_split(data, n_samples=10):
    best_gini = float('inf')
    best_split = None
    n_features = len(data[0]) - 1  # Assuming the last column is the label
    
    for feature in range(n_features):
        feature_values = [row[feature] for row in data]
        
        # Randomly sample `n_samples` values from the feature's unique values
        values = random.sample(list(set(feature_values)), min(len(set(feature_values)), n_samples))
        
        for value in values:
            left, right = split_data(data, feature, value)
            gini = calculate_split_gini(left, right)
            
            if gini < best_gini:
                best_gini = gini
                best_split = (feature, value)
    
    return best_split


# Function to check if a node is pure (all samples have the same class)
def is_pure(data):
    classes = [row[-1] for row in data]
    return len(set(classes)) == 1

# Function to create a leaf node (return the majority class)
def leaf_node(data):
    classes = [row[-1] for row in data]
    return max(set(classes), key=classes.count)

# Recursive function to build the decision tree
def build_tree(data, max_depth, min_samples_split=2, depth=0):
    # Stop if max depth is reached, the node is pure, or there's not enough data to split
    if depth == max_depth or is_pure(data) or len(data) < min_samples_split:
        return leaf_node(data)
    
    # Find the best split
    feature, value = find_best_split(data)
    
    # Split the data
    left, right = split_data(data, feature, value)
    
    # Recursively build the left and right subtrees
    return {
        'feature': feature,
        'value': value,
        'left': build_tree(left, max_depth, min_samples_split, depth + 1),
        'right': build_tree(right, max_depth, min_samples_split, depth + 1)
    }


# Function to predict for a single sample
def predict_one(tree, sample):
    if isinstance(tree, dict):
        feature = tree['feature']
        value = tree['value']
        if sample[feature] <= value:
            return predict_one(tree['left'], sample)
        else:
            return predict_one(tree['right'], sample)
    else:
        return tree

# Function to predict for the entire dataset
def predict(tree, data):
    return [predict_one(tree, sample) for sample in data]



# Prepare the dataset by separating features and target labels
X = df.drop(columns=['is_malicious' , 'Unnamed: 0'])
y = df['is_malicious']

# Convert the dataframe into a list of lists for manual implementation
data = X.values.tolist()
labels = y.values.tolist()

# Append the labels to the data
for i in range(len(data)):
    data[i].append(labels[i])

# Build the decision tree using a maximum depth of 3 for simplicity
tree = build_tree(data, max_depth=3)

# Make predictions using the built tree on the original dataset
predictions = predict(tree, X.values.tolist())

# Evaluate the accuracy of our model
accuracy = accuracy_score(y, predictions)

# Output the accuracy of the model
print(f"Model Accuracy: {accuracy * 100:.2f}%")


Model Accuracy: 99.89%
