In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# Replace '/content/drive/MyDrive/' with the path to your CSV file in Google Drive
file_path = '/content/drive/MyDrive/output.csv'

In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder
import joblib

In [5]:
# Read CSV file into a pandas DataFrame
df = pd.read_csv(file_path)

In [7]:
df['Label'].unique()

array(['FileInfector', 'Adware', 'Riskware', 'Trojan', 'Zeroday',
       'Backdoor', 'Banker', 'Dropper', 'NoCategory', 'PUA', 'Ransomware',
       'SMS', 'Scareware', 'Spy', 'Benign'], dtype=object)

In [8]:
df

Unnamed: 0.1,Unnamed: 0,0,1,2,3,4,5,6,7,8,...,9495,9496,9497,9498,9499,9500,9501,9502,9503,Label
0,0,0004e68a0c0b4818c4e52d4f414d9dd2,49,0,0,1,0,0,0,6,...,0,0,0,0,0,0.0,0.0,0.0,0.0,FileInfector
1,1,00b4a1d896a0513fc1ffe333e1c313f5,75,0,0,14,6,3,3,23,...,0,0,0,0,0,0.0,0.0,0.0,0.0,FileInfector
2,2,00c525d5599ab0212e15cda186234260,1,0,0,144,1,3,0,17,...,0,0,0,0,0,0.0,0.0,0.0,0.0,FileInfector
3,3,013377a94a52afc0fee128086d341e17,87,1,0,9,1,0,0,0,...,0,0,0,0,0,0.0,0.0,0.0,0.0,FileInfector
4,4,018ebddf0cbc44f26546f1b14d5bd612,1,0,0,144,1,3,0,17,...,0,0,0,0,0,0.0,0.0,0.0,0.0,FileInfector
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
384699,31749,05B54DE933E044A3EEA1685F228801C250F77761098DD2...,521,4,0,126,10,3,6,32,...,0,0,0,0,0,0.0,0.0,0.0,0.0,Benign
384700,31750,05B54F7600FAB04CC94F2FB25A529961C4A787EBF9E40F...,349,0,0,25,7,0,0,0,...,0,0,0,0,0,0.0,0.0,0.0,0.0,Benign
384701,31751,05B54FFB5CC39AF6134184C920F9802ADE67B95CA53556...,1023,0,0,42,3,2,5,10,...,0,0,0,0,0,0.0,0.0,0.0,0.0,Benign
384702,31752,05B550A50612DA8D01F13D84E00E9A3143A6E76D8F9341...,295,4,0,27,9,0,0,2,...,0,0,0,0,0,0.0,0.0,0.0,0.0,Benign


In [9]:
X = df.drop(columns=['Unnamed: 0', '0' ,'Label'])
y = df['Label']

In [10]:
df.fillna(0, inplace=True)

In [11]:
X.iloc[1]

1       75.0
2        0.0
3        0.0
4       14.0
5        6.0
        ... 
9499     0.0
9500     0.0
9501     0.0
9502     0.0
9503     0.0
Name: 1, Length: 9503, dtype: float64

In [12]:
y.iloc[1]

'FileInfector'

In [13]:
# Encoding
le = LabelEncoder()
y = le.fit_transform(y)

In [14]:
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.pipeline import Pipeline

In [25]:
# Create an imputer to replace NaN values with the mean
imputer = SimpleImputer(strategy='mean')

# Create a feature selection transformer
selector = SelectKBest(score_func=chi2, k=800)

# Create a pipeline to sequentially apply imputation and feature selection
pipeline = Pipeline([
    ('imputer', imputer),
    ('selector', selector)
])

# Fit the pipeline on your data
X_selected = pipeline.fit_transform(X, y)

In [26]:
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.2, random_state=42)

In [27]:
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier.fit(X_train, y_train)

In [28]:
y_pred = rf_classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.9333645260654267


In [29]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.94      0.91      0.92      9442
           1       0.80      0.70      0.75       304
           2       0.77      0.81      0.79       185
           3       0.97      1.00      0.98     32269
           4       0.78      0.55      0.65       483
           5       0.79      0.23      0.36       135
           6       0.73      0.35      0.47       465
           7       0.92      0.58      0.71       447
           8       0.70      0.92      0.80      1196
           9       0.96      0.96      0.96     19394
          10       0.95      0.89      0.92       626
          11       0.91      0.73      0.81       341
          12       0.92      0.87      0.90       759
          13       0.80      0.94      0.87      5474
          14       0.83      0.68      0.75      5421

    accuracy                           0.93     76941
   macro avg       0.85      0.74      0.78     76941
weighted avg       0.93   

In [30]:
# Save the trained model
joblib.dump(rf_classifier, "rf_model.pkl")

['rf_model.pkl']

In [31]:
# Save the LabelEncoder
joblib.dump(le, "label_encoder.pkl")

['label_encoder.pkl']

In [32]:
# Save the SelectKBest object
joblib.dump(selector, "select_k_best.pkl")

['select_k_best.pkl']