In [1]:
import io

import pandas as pd


# Load the dataset
file_path = "/mnt/data/Microsoft_malware_dataset_min.csv"
df = pd.read_csv(file_path)

# Display basic information about the dataset.
# DataFrame.info() writes its report to a buffer (stdout by default) and
# returns None, so the text is captured explicitly; otherwise `df_info`
# would just hold None.
info_buffer = io.StringIO()
df.info(buf=info_buffer)
df_info = info_buffer.getvalue()

df_head = df.head()
df_description = df.describe(include='all')
missing_values = df.isnull().sum()   # per-column count of missing values
duplicate_rows = df.duplicated().sum()   # number of fully duplicated rows

# Print the textual report, then let the notebook render the remaining summaries.
print(df_info)
df_head, df_description, missing_values, duplicate_rows



In [None]:
# Label column: it is excluded from outlier filtering so that filtering
# cannot remove an entire class.
target_col = "HasDetections"

# Step 1: Drop duplicates
df_cleaned = df.drop_duplicates()

# Step 2: Handle missing values
# Numerical columns are filled with their median, categorical columns with
# their mode. The fill values are collected first and applied in a single
# DataFrame.fillna call: `df[col].fillna(..., inplace=True)` operates on a
# temporary object and is not guaranteed to update `df_cleaned`
# (it is a no-op under pandas Copy-on-Write).
numeric_cols = df_cleaned.select_dtypes(include="number").columns
categorical_cols = df_cleaned.select_dtypes(include=["object"]).columns

fill_values = {}
for col in numeric_cols:
    if df_cleaned[col].isnull().any():
        median = df_cleaned[col].median()
        # An all-NaN column has a NaN median; there is nothing useful to fill with.
        if pd.notna(median):
            fill_values[col] = median

for col in categorical_cols:
    if df_cleaned[col].isnull().any():
        modes = df_cleaned[col].mode()
        # mode() is empty for an all-NaN column, so indexing it would raise.
        if not modes.empty:
            fill_values[col] = modes.iloc[0]

if fill_values:
    df_cleaned = df_cleaned.fillna(value=fill_values)

# Step 3: Encode categorical columns using one-hot encoding
df_encoded = pd.get_dummies(df_cleaned, drop_first=True)

# Step 4: Remove outliers with the IQR method (original numerical feature columns only).
# One-hot dummy columns and the label are left out of the check.
candidate_cols = [col for col in numeric_cols if col != target_col]
Q1 = df_encoded[candidate_cols].quantile(0.25)
Q3 = df_encoded[candidate_cols].quantile(0.75)
IQR = Q3 - Q1

# A zero IQR collapses both fences onto a single value, so every row holding
# any other value would be flagged; such columns are skipped.
iqr_cols = IQR.index[IQR > 0]
lower_fence = (Q1 - 1.5 * IQR)[iqr_cols]
upper_fence = (Q3 + 1.5 * IQR)[iqr_cols]

checked_values = df_encoded[iqr_cols]
outlier_rows = ((checked_values < lower_fence) | (checked_values > upper_fence)).any(axis=1)
df_no_outliers = df_encoded[~outlier_rows]

# Final shape after preprocessing
df_no_outliers.shape


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler

# Separate features and target
X = df_no_outliers.drop("HasDetections", axis=1)
y = df_no_outliers["HasDetections"]

# A ROC curve needs both classes; fail early with a clear message instead of
# an IndexError from predict_proba(...)[:, 1] further down.
if y.nunique() < 2:
    raise ValueError(
        "Target 'HasDetections' has fewer than two classes after preprocessing; "
        "cannot train a classifier or compute a ROC curve."
    )

# Train-test split (done BEFORE scaling so the test rows stay unseen)
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Standardize the features.
# The scaler is fit on the training rows only and then applied to the test
# rows; fitting it on the full data would leak test-set statistics into training.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)
# Full matrix scaled with the training statistics, kept under its previous name.
X_scaled = scaler.transform(X)

# Train Decision Tree with default parameters
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_train, y_train)

# Predict probabilities (second column of predict_proba)
y_prob = dt_model.predict_proba(X_test)[:, 1]

# Compute ROC curve and AUC
fpr, tpr, _ = roc_curve(y_test, y_prob)
roc_auc = auc(fpr, tpr)

# Plot ROC Curve
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC Curve (AUC = {roc_auc:.2f})')
# Diagonal reference line from (0, 0) to (1, 1)
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve - Decision Tree')
plt.legend(loc="lower right")
plt.grid(True)
plt.tight_layout()
plt.show()
