# Laboratorio 4 - Familias de Malware
### Universidad del Valle de Guatemala
### Security Data Science

Andrés de la Roca - 20332

### I. Creacion del dataset / Extraccion de caractersiticas

In [112]:
import pefile
import os
import pandas as pd

#PE Header data
def extract_pe_header_features(file_path):
    pe = pefile.PE(file_path)

    machine_type = pe.FILE_HEADER.Machine
    number_of_sections = pe.FILE_HEADER.NumberOfSections
    time_date_stamp = pe.FILE_HEADER.TimeDateStamp
    characteristics = pe.FILE_HEADER.Characteristics

    return machine_type, number_of_sections, time_date_stamp, characteristics

# Section features
def extract_section_features(file_path):
    pe = pefile.PE(file_path)

    section_features = []
    for section in pe.sections:
        section_name = section.Name.decode('utf-8').rstrip('\x00')
        size_section = section.SizeOfRawData
        characteristics = section.Characteristics

        section_features.append((section_name, size_section, characteristics))

    return section_features

# Imported functions
def extract_imported_functions(file_path):
    pe = pefile.PE(file_path)

    imported_functions = []
    for entry in pe.DIRECTORY_ENTRY_IMPORT:
        dll = entry.dll.decode('utf-8')
        for function in entry.imports:
            if function.name is not None:
                imported_functions.append(function.name.decode('utf-8'))

    return imported_functions

# Exported functions
def extract_exported_functions(file_path):
    pe = pefile.PE(file_path)

    exported_functions = []
    if hasattr(pe, 'DIRECTORY_ENTRY_EXPORT'):
        for exp in pe.DIRECTORY_ENTRY_EXPORT.symbols:
            exported_functions.append(exp.name.decode('utf-8'))

    return exported_functions

# Dataset creation function
def create_dataset(path):
    dataset = []

    for filename in os.listdir(path):
        file_path = os.path.join(path, filename)

        try:

            if os.path.isfile(file_path):
                pe_header_features = extract_pe_header_features(file_path)
                section_features = extract_section_features(file_path)
                imported_functions = extract_imported_functions(file_path)
                exported_functions = extract_exported_functions(file_path)

                entry = {
                    'File_Path': file_path,
                    'PE_Header_Features': pe_header_features,
                    'Section_Features': section_features,
                    'Imported_Functions': imported_functions,
                    'Exported_Functions': exported_functions
                }

                dataset.append(entry)
        except pefile.PEFormatError:
            print('Error reading file: {}'.format(file_path))
            continue

    
    data = pd.DataFrame(dataset)

    return data

In [113]:
path = "./MALWR"

mw_data = create_dataset(path)

print(mw_data.head())


Error reading file: ./MALWR/.DS_Store
                                       File_Path          PE_Header_Features  \
0  ./MALWR/AAAz2E1B6940985A23E5639450F8391820655  (332, 3, 1319015770, 8462)   
1   ./MALWR/NBV_8B75BCBFF174C25A0161F30758509A44   (332, 3, 1242321160, 271)   
2       ./MALWR/65018CD542145A3792BA09985734C12A   (332, 3, 1195429813, 271)   
3       ./MALWR/8442AE37B91F279A9F06DE4C60B286A3   (332, 3, 1263576056, 271)   
4  ./MALWR/AL65_DB05DF0498B59B42A8E493CF3C10C578  (332, 3, 1319015949, 8462)   

                                    Section_Features  \
0  [(UPX0, 0, 3758096512), (UPX1, 342528, 3758096...   
1  [(UPX0, 0, 3758096512), (UPX1, 4096, 375809644...   
2  [(UPX0, 0, 3758096512), (UPX1, 4096, 375809644...   
3  [(UPX0, 0, 3758096512), (UPX1, 3584, 375809644...   
4  [(UPX0, 0, 3758096512), (UPX1, 342528, 3758096...   

                                  Imported_Functions  \
0  [RegSaveKeyA, BitBlt, LoadLibraryA, GetProcAdd...   
1  [LoadLibraryA, ExitProcess, G

### II. Preprocesamiento de datos

In [114]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_extraction.text import CountVectorizer

# 1. Normalización para las características numéricas del PE Header
numeric_features = mw_data['PE_Header_Features']
numeric_features = pd.DataFrame(numeric_features.to_list(), columns=['Machine_Type', 'Number_of_Sections', 'TimeDateStamp', 'Characteristics'])
scaler = MinMaxScaler()
mw_data[['Machine_Type', 'Number_of_Sections', 'TimeDateStamp', 'Characteristics']] = scaler.fit_transform(numeric_features)

mw_data.drop(columns=['PE_Header_Features'], inplace=True)

mw_data.head()

Unnamed: 0,File_Path,Section_Features,Imported_Functions,Exported_Functions,Machine_Type,Number_of_Sections,TimeDateStamp,Characteristics
0,./MALWR/AAAz2E1B6940985A23E5639450F8391820655,"[(UPX0, 0, 3758096512), (UPX1, 342528, 3758096...","[RegSaveKeyA, BitBlt, LoadLibraryA, GetProcAdd...","[RundllInstall, RundllUninstall, ServiceInstal...",0.0,0.0,0.999999,1.0
1,./MALWR/NBV_8B75BCBFF174C25A0161F30758509A44,"[(UPX0, 0, 3758096512), (UPX1, 4096, 375809644...","[LoadLibraryA, ExitProcess, GetProcAddress, Vi...",[],0.0,0.0,0.379422,0.0
2,./MALWR/65018CD542145A3792BA09985734C12A,"[(UPX0, 0, 3758096512), (UPX1, 4096, 375809644...","[LoadLibraryA, ExitProcess, GetProcAddress, Vi...",[],0.0,0.0,0.0,0.0
3,./MALWR/8442AE37B91F279A9F06DE4C60B286A3,"[(UPX0, 0, 3758096512), (UPX1, 3584, 375809644...","[LoadLibraryA, ExitProcess, GetProcAddress, Vi...",[],0.0,0.0,0.551407,0.0
4,./MALWR/AL65_DB05DF0498B59B42A8E493CF3C10C578,"[(UPX0, 0, 3758096512), (UPX1, 342528, 3758096...","[RegSaveKeyA, BitBlt, LoadLibraryA, GetProcAdd...","[RundllInstall, RundllUninstall, ServiceInstal...",0.0,0.0,1.0,1.0


In [115]:
# 2. Codificación One-Hot para el nombre de las secciones

section_dummies = pd.get_dummies(mw_data['Section_Features'].apply(pd.Series).stack(), prefix='Section')
mw_data = pd.concat([mw_data, section_dummies], axis=1)

mw_data.drop(columns=['Section_Features'], inplace=True)

mw_data.head()

Unnamed: 0,File_Path,Imported_Functions,Exported_Functions,Machine_Type,Number_of_Sections,TimeDateStamp,Characteristics,"Section_('.data', 1536, 3221225536)","Section_('.rdata', 3072, 1073741888)","Section_('.rsrc', 512, 3221225536)",...,"Section_('UPX0', 0, 3758096512)","Section_('UPX1', 3584, 3758096448)","Section_('UPX1', 4096, 3758096448)","Section_('UPX1', 11776, 3758096448)","Section_('UPX1', 283648, 3758096448)","Section_('UPX1', 292352, 3758096448)","Section_('UPX1', 342528, 3758096448)","Section_('UPX1', 346112, 3758096448)","Section_('UPX2', 512, 3221225536)","Section_('UPX2', 1024, 3221225536)"
0,./MALWR/AAAz2E1B6940985A23E5639450F8391820655,"[RegSaveKeyA, BitBlt, LoadLibraryA, GetProcAdd...","[RundllInstall, RundllUninstall, ServiceInstal...",0.0,0.0,0.999999,1.0,,,,...,,,,,,,,,,
1,./MALWR/NBV_8B75BCBFF174C25A0161F30758509A44,"[LoadLibraryA, ExitProcess, GetProcAddress, Vi...",[],0.0,0.0,0.379422,0.0,,,,...,,,,,,,,,,
2,./MALWR/65018CD542145A3792BA09985734C12A,"[LoadLibraryA, ExitProcess, GetProcAddress, Vi...",[],0.0,0.0,0.0,0.0,,,,...,,,,,,,,,,
3,./MALWR/8442AE37B91F279A9F06DE4C60B286A3,"[LoadLibraryA, ExitProcess, GetProcAddress, Vi...",[],0.0,0.0,0.551407,0.0,,,,...,,,,,,,,,,
4,./MALWR/AL65_DB05DF0498B59B42A8E493CF3C10C578,"[RegSaveKeyA, BitBlt, LoadLibraryA, GetProcAdd...","[RundllInstall, RundllUninstall, ServiceInstal...",0.0,0.0,1.0,1.0,,,,...,,,,,,,,,,


In [116]:
# mw_data= mw_data.fillna(0)  # Por ejemplo, rellenar con ceros para valores nulos


In [117]:
# 3. Representación de Bag-of-Words para funciones importadas y exportadas
print(mw_data['Imported_Functions'])

# Join imported functions only if they are not empty
try:
    imported_functions = mw_data['Imported_Functions'].apply(lambda x: ' '.join(x) if x else '')
    exported_functions = mw_data['Exported_Functions'].apply(lambda x: ' '.join(x) if x else '')
except:
    pass
# imported_functions = mw_data['Imported_Functions'].apply(lambda x: ' '.join(x) if x else '')
# exported_functions = mw_data['Exported_Functions'].apply(lambda x: ' '.join(x) if x else '')

# Join imported functions on a single string

print(imported_functions.head())
# imported_functions = ' '.join(imported_functions)
# exported_functions = ' '.join(exported_functions)


# print(imported_functions)

# vectorizer = CountVectorizer()
# imported_bow = vectorizer.fit_transform(imported_functions)
# exported_bow = vectorizer.fit_transform(exported_functions)

# imported_bow = pd.DataFrame(imported_bow.toarray(), columns=vectorizer.get_feature_names_out())
# exported_bow = pd.DataFrame(exported_bow.toarray(), columns=vectorizer.get_feature_names_out())

# mw_data = pd.concat([mw_data, imported_bow, exported_bow], axis=1)

# # mw_data.drop(columns=['Imported_Functions', 'Exported_Functions'], inplace=True)

# mw_data.head()

0          [RegSaveKeyA, BitBlt, LoadLibraryA, GetProcAdd...
1          [LoadLibraryA, ExitProcess, GetProcAddress, Vi...
2          [LoadLibraryA, ExitProcess, GetProcAddress, Vi...
3          [LoadLibraryA, ExitProcess, GetProcAddress, Vi...
4          [RegSaveKeyA, BitBlt, LoadLibraryA, GetProcAdd...
                                 ...                        
(38, 1)                                                  NaN
(38, 2)                                                  NaN
(39, 0)                                                  NaN
(39, 1)                                                  NaN
(39, 2)                                                  NaN
Name: Imported_Functions, Length: 161, dtype: object
0    [RegSaveKeyA, BitBlt, LoadLibraryA, GetProcAdd...
1    [LoadLibraryA, ExitProcess, GetProcAddress, Vi...
2    [LoadLibraryA, ExitProcess, GetProcAddress, Vi...
3    [LoadLibraryA, ExitProcess, GetProcAddress, Vi...
4    [RegSaveKeyA, BitBlt, LoadLibraryA, GetProcAdd...
N

In [118]:
# # 3. Representación de Bag-of-Words para funciones importadas y exportadas
# vectorizer = CountVectorizer()
# imported_functions_bow = vectorizer.fit_transform(malware_dataframe['Imported_Functions'].apply(lambda x: ' '.join(x)))
# exported_functions_bow = vectorizer.fit_transform(malware_dataframe['Exported_Functions'].apply(lambda x: ' '.join(x)))

# # Concatenar las representaciones Bag-of-Words al DataFrame
# malware_dataframe = pd.concat([malware_dataframe, pd.DataFrame(imported_functions_bow.toarray(), columns=vectorizer.get_feature_names_out(prefix='Imported_'))], axis=1)
# malware_dataframe = pd.concat([malware_dataframe, pd.DataFrame(exported_functions_bow.toarray(), columns=vectorizer.get_feature_names_out(prefix='Exported_'))], axis=1)

# # Eliminar las columnas originales de funciones importadas y exportadas
# malware_dataframe = malware_dataframe.drop(['Imported_Functions', 'Exported_Functions'], axis=1)

# # 4. Manejar valores nulos si es necesario
# malware_dataframe = malware_dataframe.fillna(0)  # Por ejemplo, rellenar con ceros para valores nulos

# # Ahora, el DataFrame 'malware_dataframe' está preprocesado y listo para ser utilizado en algoritmos de aprendizaje no supervisado
# print(malware_dataframe.head())