In [1]:
import pandas as pd
import numpy as np
import chardet
import os
from skimage.transform import resize
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
import cv2

## Ransomware Samples

In [2]:
# Sniff the CSV's character encoding from its raw bytes so pandas can decode it.
with open('Updated_Samples_23_features.csv', 'rb') as raw_file:
    raw_bytes = raw_file.read()  # reads the whole file; a prefix would do if it is large
result = chardet.detect(raw_bytes)

# Load the samples using the detected encoding.
df = pd.read_csv('Updated_Samples_23_features.csv', encoding=result['encoding'])

# Preview the first ten rows.
df.head(10)

Unnamed: 0,generic,apistats,processes,processtree,summary,Directory_Label
0,[{'process_path': 'C:\\Users\\Administrator\\A...,"{'664': {'CreateToolhelp32Snapshot': 1, 'LdrUn...",[{'process_path': 'C:\\Windows\\System32\\lsas...,"[{'track': False, 'pid': 516, 'process_name': ...",{'regkey_read': ['HKEY_LOCAL_MACHINE\\SOFTWARE...,8Base
1,[{'process_path': 'C:\\Windows\\System32\\lsas...,{},[{'process_path': 'C:\\Windows\\System32\\lsas...,"[{'track': False, 'pid': 504, 'process_name': ...",{},BianLian
2,[{'process_path': 'C:\\Users\\Administrator\\A...,"{'1756': {'GetNativeSystemInfo': 1, 'DeviceIoC...",[{'process_path': 'C:\\Windows\\System32\\lsas...,"[{'track': False, 'pid': 516, 'process_name': ...",{'file_created': ['C:\\Users\\Administrator\\D...,BlackBasta
3,[{'process_path': 'C:\\Windows\\System32\\lsas...,"{'2424': {'LdrUnloadDll': 2, 'DeviceIoControl'...",[{'process_path': 'C:\\Windows\\System32\\lsas...,"[{'track': False, 'pid': 512, 'process_name': ...","{'file_recreated': ['\\Device\\KsecDD'], 'regk...",BlackCat
4,[{'process_path': 'C:\\Users\\Administrator\\A...,"{'2684': {'GetForegroundWindow': 8, 'NtAllocat...",[{'process_path': 'C:\\Windows\\System32\\lsas...,"[{'track': False, 'pid': 508, 'process_name': ...",{'regkey_read': ['HKEY_LOCAL_MACHINE\\SYSTEM\\...,CL0P
5,[{'process_path': 'C:\\Windows\\System32\\lsas...,"{'2836': {'LdrUnloadDll': 3, 'CoUninitialize':...",[{'process_path': 'C:\\Windows\\System32\\lsas...,"[{'track': False, 'pid': 504, 'process_name': ...",{'file_recreated': ['\\??\\MountPointManager']...,MedusaLocker
6,[{'process_path': 'C:\\Windows\\SysWOW64\\cmd....,"{'2316': {'NtDuplicateObject': 1, 'DeviceIoCon...",[{'process_path': 'C:\\Windows\\System32\\lsas...,"[{'track': False, 'pid': 508, 'process_name': ...","{'file_recreated': ['\\Device\\KsecDD'], 'regk...",NoEscape
7,[{'process_path': 'C:\\Users\\Administrator\\A...,"{'2232': {'NtDuplicateObject': 7, 'LdrUnloadDl...",[{'process_path': 'C:\\Windows\\System32\\lsas...,"[{'track': False, 'pid': 508, 'process_name': ...","{'dll_loaded': ['kernel32', 'api-ms-win-core-l...",Play
8,[{'process_path': 'C:\\Users\\Administrator\\A...,"{'800': {'LdrUnloadDll': 1, 'NtDeviceIoControl...",[{'process_path': 'C:\\Windows\\System32\\lsas...,"[{'track': False, 'pid': 500, 'process_name': ...",{'file_opened': ['C:\\Windows\\System32\\en-US...,Royal


In [3]:
# Summarise the loaded frame: column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9 entries, 0 to 8
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   generic          9 non-null      object
 1   apistats         9 non-null      object
 2   processes        9 non-null      object
 3   processtree      9 non-null      object
 4   summary          9 non-null      object
 5   Directory_Label  9 non-null      object
dtypes: object(6)
memory usage: 564.0+ bytes


In [4]:
# Give the target column a short name. Rebinding instead of inplace=True keeps
# the cell chainable and avoids mutating a frame that an earlier cell displayed.
df = df.rename(columns={"Directory_Label": "label"})


In [5]:
# Preview the first rows to confirm 'Directory_Label' now appears as 'label'.
df.head()

Unnamed: 0,generic,apistats,processes,processtree,summary,label
0,[{'process_path': 'C:\\Users\\Administrator\\A...,"{'664': {'CreateToolhelp32Snapshot': 1, 'LdrUn...",[{'process_path': 'C:\\Windows\\System32\\lsas...,"[{'track': False, 'pid': 516, 'process_name': ...",{'regkey_read': ['HKEY_LOCAL_MACHINE\\SOFTWARE...,8Base
1,[{'process_path': 'C:\\Windows\\System32\\lsas...,{},[{'process_path': 'C:\\Windows\\System32\\lsas...,"[{'track': False, 'pid': 504, 'process_name': ...",{},BianLian
2,[{'process_path': 'C:\\Users\\Administrator\\A...,"{'1756': {'GetNativeSystemInfo': 1, 'DeviceIoC...",[{'process_path': 'C:\\Windows\\System32\\lsas...,"[{'track': False, 'pid': 516, 'process_name': ...",{'file_created': ['C:\\Users\\Administrator\\D...,BlackBasta
3,[{'process_path': 'C:\\Windows\\System32\\lsas...,"{'2424': {'LdrUnloadDll': 2, 'DeviceIoControl'...",[{'process_path': 'C:\\Windows\\System32\\lsas...,"[{'track': False, 'pid': 512, 'process_name': ...","{'file_recreated': ['\\Device\\KsecDD'], 'regk...",BlackCat
4,[{'process_path': 'C:\\Users\\Administrator\\A...,"{'2684': {'GetForegroundWindow': 8, 'NtAllocat...",[{'process_path': 'C:\\Windows\\System32\\lsas...,"[{'track': False, 'pid': 508, 'process_name': ...",{'regkey_read': ['HKEY_LOCAL_MACHINE\\SYSTEM\\...,CL0P


## Preprocessing

In [6]:
# Names of the dataframe columns that are treated as textual features.
text_features = [
    'generic',
    'apistats',
    'processes',
    'processtree',
    'summary',
]

In [7]:
# Render every sample (one dataframe row) as a 256x256 grayscale PNG.
#
# For each row, every non-empty textual feature is TF-IDF encoded on its own,
# the resulting vectors are concatenated, min-max scaled to [0, 1], resized to
# the image dimensions and written to the working directory as
# data_image_<label>.png.
#
# NOTE(review): the vectorizer is fitted on one document at a time
# (fit_transform([text])), so every call builds its own vocabulary and the IDF
# weight is the same for every term. Pixel columns are therefore not aligned
# across samples -- confirm this is intended.

# Output image geometry (loop-invariant, so defined once).
image_width = 256
image_height = 256

# Filenames written during this run; used so that two rows sharing a label do
# not silently overwrite each other's image.
saved_filenames = set()

for index, row in df.iterrows():
    # One vectorizer per row; fit_transform refits it for each feature below.
    vectorizer = TfidfVectorizer(stop_words=None)
    encoded_texts = []
    for feature in text_features:
        text = row[feature]
        # Skip missing (NaN / non-string) cells as well as blank strings.
        if not isinstance(text, str) or not text.strip():
            continue
        try:
            # Use TF-IDF vectorization for each textual feature
            encoded_text = vectorizer.fit_transform([text]).toarray()
        except ValueError:
            # Raised when the text yields no usable tokens (seen for "{}" cells).
            print(f"Skipping feature {feature} for index {index} due to ValueError.")
            continue
        encoded_texts.append(encoded_text)

    if not encoded_texts:
        # np.concatenate raises on an empty list; there is nothing to draw.
        print(f"Skipping index {index}: no textual feature could be encoded.")
        continue

    encoded_data = np.concatenate(encoded_texts, axis=1)

    # Min-max scale to [0, 1]. A constant vector would divide by zero and
    # produce NaNs, so it is mapped to all zeros instead.
    value_min = encoded_data.min()
    value_range = encoded_data.max() - value_min
    if value_range > 0:
        encoded_data_normalized = (encoded_data - value_min) / value_range
    else:
        encoded_data_normalized = np.zeros_like(encoded_data, dtype=float)

    # Resize the encoded data to match image dimensions
    resized_data = resize(encoded_data_normalized, (image_height, image_width))

    # Convert the [0, 1] floats to 8-bit grayscale pixel values.
    image = (resized_data * 255).astype(np.uint8)

    # Get the label value from the dataframe
    label_value = row['label']

    # Save the image with a filename based on the label value; when the same
    # label was already written in this run, disambiguate with the row index.
    filename = f"data_image_{label_value}.png"
    if filename in saved_filenames:
        filename = f"data_image_{label_value}_{index}.png"
    saved_filenames.add(filename)

    # cv2.imwrite reports failure through its return value, not an exception.
    if cv2.imwrite(filename, image):
        print(f"Image saved: {filename}")
    else:
        print(f"Failed to save image: {filename}")

Image saved: data_image_8Base.png
Skipping feature apistats for index 1 due to ValueError.
Skipping feature summary for index 1 due to ValueError.
Image saved: data_image_BianLian.png
Image saved: data_image_BlackBasta.png
Image saved: data_image_BlackCat.png
Image saved: data_image_CL0P.png
Image saved: data_image_MedusaLocker.png
Image saved: data_image_NoEscape.png
Image saved: data_image_Play.png
Image saved: data_image_Royal.png


### Prepare images

In [None]:
# Load every image in image_dir and turn it into a (1, 28, 28, 1) uint8
# grayscale sample; the samples are stacked into one array at the end.

# Define your image directory
image_dir = 'Ransom_Images_sample/'

# Size every image is resized to, as (width, height).
target_size = (28, 28)

# Get list of all entries in the directory.
# NOTE: os.listdir returns entries in arbitrary order; the order is kept as-is
# so anything that pairs labels with this listing stays aligned.
image_files = os.listdir(image_dir)

# Processed images and the names of the files they came from, index-aligned.
images_ready = []
loaded_files = []

# Process each image file
for image_file in image_files:
    # Load the image (3-channel BGR by default).
    img = cv2.imread(os.path.join(image_dir, image_file))
    if img is None:
        # cv2.imread returns None instead of raising for sub-directories and
        # for non-image or corrupt files; cv2.resize would then fail.
        print(f"Skipping unreadable image: {image_file}")
        continue

    # Resize the image to 28x28
    img_resized = cv2.resize(img, target_size)

    # Convert the image to grayscale
    img_gray = cv2.cvtColor(img_resized, cv2.COLOR_BGR2GRAY)

    # Add a leading batch axis and a trailing channel axis: (1, 28, 28, 1).
    img_ready = img_gray[np.newaxis, ..., np.newaxis]

    # Add the processed image to the list
    images_ready.append(img_ready)
    loaded_files.append(image_file)

# Keep image_files index-aligned with images_ready when entries were skipped.
image_files = loaded_files

# Convert the list of processed images to a numpy array
images_ready = np.array(images_ready)

In [None]:
# Drop the singleton axes that were added when each image was prepared, while
# always keeping axis 0 as the sample axis. A bare np.squeeze would also remove
# axis 0 when exactly one image was loaded, changing the array's rank.
images_ready = np.asarray(images_ready)
singleton_axes = tuple(
    axis for axis in range(1, images_ready.ndim) if images_ready.shape[axis] == 1
)
if singleton_axes:
    images_ready = np.squeeze(images_ready, axis=singleton_axes)

In [110]:
# Inspect the prepared image array.
# NOTE(review): echoing the full array produces very long output; checking
# images_ready.shape / images_ready.dtype would be a more compact sanity check.
images_ready

array([[[[[101],
          [ 66],
          [ 58],
          ...,
          [ 86],
          [122],
          [127]],

         [[101],
          [ 66],
          [ 58],
          ...,
          [ 86],
          [122],
          [127]],

         [[101],
          [ 66],
          [ 58],
          ...,
          [ 86],
          [122],
          [127]],

         ...,

         [[101],
          [ 66],
          [ 58],
          ...,
          [ 86],
          [122],
          [127]],

         [[101],
          [ 66],
          [ 58],
          ...,
          [ 86],
          [122],
          [127]],

         [[101],
          [ 66],
          [ 58],
          ...,
          [ 86],
          [122],
          [127]]]],



       [[[[146],
          [ 90],
          [  0],
          ...,
          [  2],
          [ 86],
          [ 86]],

         [[146],
          [ 90],
          [  0],
          ...,
          [  2],
          [ 86],
          [ 86]],

         [[146],
          [ 