In [2]:
import pandas as pd
import json
import re

# Load the dataset
file_path = './data/extracted_data.csv'  # Replace with your file path
data = pd.read_csv(file_path)

# Step 1: General Cleaning
# Remove duplicate rows based on relevant columns
cleaned_data = data.drop_duplicates(subset=['exploit_link', 'description'])

# Standardize the date format
cleaned_data['date'] = pd.to_datetime(cleaned_data['date'], errors='coerce')

# Remove unnecessary columns (e.g., 'title')
cleaned_data = cleaned_data.drop(columns=['title'])

# Drop rows with missing or invalid 'file_data'
cleaned_data = cleaned_data.dropna(subset=['file_data'])

# Step 2: Clean and Normalize the `file_data` Column
def clean_file_data(value):
    if pd.isna(value):
        return None
    try:
        # Attempt to parse the value as JSON
        parsed = json.loads(value)
        if isinstance(parsed, dict):
            return parsed
        else:
            return value
    except (json.JSONDecodeError, TypeError):
        # Return None for invalid JSON, else keep the original value
        return value if isinstance(value, str) else None

# Apply the cleaning function to the 'file_data' column
cleaned_data['file_data'] = cleaned_data['file_data'].apply(clean_file_data)

# Step 3: Remove special characters from the `file_data` field based on specific logic
def clean_special_characters_from_file_data(text):
    if not isinstance(text, str):
        return text
    lines = text.splitlines()
    cleaned_lines = []
    for line in lines:
        # If the line has only special characters and does not represent a comment, remove it
        if re.match(r'^\s*[!@#$%^&*()_+=\[\]{}|;:\'",.<>?/\\`~\-]+\s*$', line):
            continue
        cleaned_lines.append(line)
    return "\n".join(cleaned_lines)

# Apply the cleaning function to the `file_data` field
cleaned_data['file_data'] = cleaned_data['file_data'].apply(clean_special_characters_from_file_data)

# Step 4: Further Processing - Split Description and File Data
# Define a function to split the description into 'device_detail' and 'exploited_component'
def split_description(description):
    if ' - ' in description:
        parts = description.split(' - ', 1)
        return parts[0], parts[1]
    return description, ''  # If no separator is found, return the whole description as device_detail

# Apply the function to create new columns
cleaned_data['device_detail'], cleaned_data['exploited_component'] = zip(*cleaned_data['description'].apply(split_description))

# Step 5: Define a function to separate file_data into 'details' and 'code' (moving sources to 'details')
def split_file_data(file_data):
    if not isinstance(file_data, str):
        return "", ""

    lines = file_data.split('\n')
    details = []
    code = []

    for line in lines:
        stripped_line = line.strip()

        # Skip lines that appear to be URLs, comments, or descriptions
        if re.match(r'^\s*(#|\/\/|\/\*|https?://|www\.|\.com|\.org)', stripped_line):
            details.append(stripped_line)
            continue
        
        # Skip lines that describe the code or refer to sources
        elif re.match(r'^\s*(description:|desc:|summary:|source:|src:|sources:|source path:)', stripped_line, re.IGNORECASE):
            details.append(stripped_line)
            continue
        
        # Skip lines that are just commands or script metadata, such as echo or curl
        elif re.match(r'^\s*(echo|curl|wget|sh|bash|python|perl|ruby|chmod|chown|mkdir|rm|cat|ls|tar|grep|awk|sed|lsblk|cp|mv)', stripped_line):
            code.append(stripped_line)
            continue

        # Treat any line that looks like a code snippet
        elif re.match(r'^\s*(if|for|while|function|return|class|def|try|catch|import|from|exec|require|open|read|write)', stripped_line):
            code.append(stripped_line)
            continue

        # Treat remaining lines as code
        elif stripped_line and not stripped_line.startswith(('description:', 'desc:', 'summary:', 'source:', 'src:', 'sources:', 'source path:')):
            code.append(stripped_line)
    
    # Join the details and code separately
    details = "\n".join(details).strip()
    code = "\n".join(code).strip()
    return details, code

# Apply the function to create new 'details' and 'code' columns
cleaned_data['details'], cleaned_data['code'] = zip(*cleaned_data['file_data'].apply(split_file_data))

# Step 6: Drop unnecessary columns after splitting
cleaned_data = cleaned_data.drop(columns=['exploit_link', 'description', 'file_data'])

# Step 7: Save the cleaned and processed dataset to a new CSV file
output_path = './data/processed_extracted_code_data.csv'  # Path for saving only the extracted code
cleaned_data.to_csv(output_path, index=False)

print(f"Processed data (only code) has been saved to {output_path}")


Processed data (only code) has been saved to ./data/processed_extracted_code_data.csv


  cleaned_data['date'] = pd.to_datetime(cleaned_data['date'], errors='coerce')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleaned_data['date'] = pd.to_datetime(cleaned_data['date'], errors='coerce')
