In [43]:
import pandas as pd
import numpy as np

In [44]:
file_path = 'C:/Users/xk_am/Downloads/train.csv'
data = pd.read_csv(file_path)
df = data.copy()
df.head()

Unnamed: 0,flow_time,header_size,packet_duration,overall_rate,src_rate,dst_rate,fin_packets,urg_packets,rst_packets,max_value,...,syn_flags,rst_flags,psh_flags,ack_flags,protocol_http,protocol_https,protocol_tcp,protocol_udp,protocol_icmp,label
0,0.041268,15499.0,64.0,7805.845961,7805.845961,0.0,0.0,0.0,0.0,50.0,...,0,0,0,0,0,0,0,1,0,DDoS
1,0.018393,3702.54,64.0,6728.994198,6728.994198,0.0,0.0,0.0,0.01,54.28,...,0,0,0,0,0,0,0,1,0,DDoS
2,0.0,182.0,64.0,38.559448,38.559448,0.0,0.0,0.0,0.0,182.0,...,0,0,0,0,0,0,0,1,0,DoS
3,0.109292,35027.55,62.72,6783.234241,6783.234241,0.0,0.0,0.03,0.11,65.11,...,0,0,0,0,0,0,0,1,0,DoS
4,0.0,162.0,64.0,2.305494,2.305494,0.0,0.0,0.0,0.0,162.0,...,0,0,0,0,0,0,0,1,0,DoS


In [45]:
def add_all_features(df, group_key=None, eps=1e-6):
    df2 = df.copy()

    if group_key is not None:
        agg_defs = {
            'flow_time':       ['mean','std','min','max'],
            'header_size':     ['mean','std'],
            'packet_duration': ['mean','max'],
            'fin_packets':     ['sum'],
            'urg_packets':     ['sum'],
            'rst_packets':     ['sum'],
            'syn_flags':       ['sum'],
            'ack_flags':       ['mean'],
            'protocol_http':   ['sum'],
            'protocol_https':  ['sum'],
        }

        for col, funcs in agg_defs.items():
            for fn in funcs:
                df2[f"{col}_{fn}"] = df2.groupby(group_key)[col].transform(fn)

        if 'label' in df2.columns:
            modes = df2.groupby(group_key)['label'].agg(lambda x: x.mode().iloc[0])
            df2['label_mode'] = df2[group_key].map(modes)
    total_packets = df2["fin_packets"] + df2["urg_packets"] + df2["rst_packets"]
    df2["packet_rate"] = total_packets / (df2["packet_duration"] + eps)

    flag_cols = ['fin_flags', 'syn_flags', 'rst_flags', 'psh_flags', 'ack_flags']
    df2['total_flags'] = df2[flag_cols].sum(axis=1)
    df2['flags_per_packet'] = df2['total_flags'] / (total_packets + eps)
    df2['syn_ack_ratio'] = df2['syn_flags'] / (df2['ack_flags'] + eps)
    df2['rate_asymmetry'] = (
        (df2['src_rate'] - df2['dst_rate']) /
        (df2['src_rate'] + df2['dst_rate'] + eps)
    )
    df2['header_to_payload_ratio'] = (
        df2['header_size'] /
        ((df2['overall_rate'] / (df2['packet_rate'] + eps)) + eps)
    )

    return df2

df = add_all_features(df)

In [46]:
df.shape

(938583, 28)

In [47]:
target = df['label']

In [48]:
numerical_columns = df.select_dtypes(include=np.number).columns
for col in numerical_columns:
    if (df[col] < 0).any():
        df[col] = df[col].clip(lower=0)

In [49]:
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()

In [50]:
df['label'] = label_encoder.fit_transform(df['label'])

In [51]:
target = df['label'].value_counts()

In [52]:
X = df.drop(columns=['label'])
y = df['label']

In [53]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [54]:
from sklearn.preprocessing import QuantileTransformer

scaler = QuantileTransformer(output_distribution='normal')

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
from sklearn.feature_selection import VarianceThreshold

#apply variance thresholding
vt = VarianceThreshold(threshold=0.01)
X_train_vt = vt.fit_transform(X_train_scaled)

#get the selected feature columns
selected_vt_cols = X.columns[vt.get_support()]

#create df with the selected columns for train and test
X_train = pd.DataFrame(X_train_vt, columns=selected_vt_cols)
X_test = pd.DataFrame(X_test_scaled, columns=X.columns)
X_test = X_test[selected_vt_cols]

print(f"Remaining features after Variance Thresholding: {len(X_train.columns)}")

Remaining features after Variance Thresholding: 26


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train_scaled, y_train)
y_pred_rf = rf_model.predict(X_test_scaled)
acc_rf = accuracy_score(y_test, y_pred_rf)

print(f"Random Forest Accuracy: {acc_rf:.4f}")

KeyboardInterrupt: 

In [None]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

improved_params = {
    'learning_rate':     0.18295938520166618,
    'max_depth':         16,
    'subsample':         0.9821770843130437,
    'colsample_bytree':  0.8700184979100425,
    'gamma':             0.18617821128570886,
    'reg_alpha':         0.452075050028747287,
    'reg_lambda':        5.879866631339201,
    'min_child_weight':  2,
    'n_estimators':      400,
    'tree_method':      'gpu_hist',
    'device':           'gpu',
    'objective':        'multi:softprob',
}
xgb_model = XGBClassifier(
    **improved_params,
    random_state=42,
    use_label_encoder=False,
    eval_metric='logloss'
)
xgb_model.fit(X_train_scaled, y_train)

y_pred_xgb = xgb_model.predict(X_test_scaled)
acc_xgb = accuracy_score(y_test, y_pred_xgb)
print(f"XGBoost Accuracy: {acc_xgb:.16f}")


    E.g. tree_method = "hist", device = "cuda"

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)

    E.g. tree_method = "hist", device = "cuda"

  if len(data.shape) != 1 and self.num_features() != data.shape[1]:


XGBoost Accuracy: 0.9147493301086209


In [None]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score

gb_model = GradientBoostingClassifier(random_state=42)
gb_model.fit(X_train_scaled, y_train)
y_pred_gb = gb_model.predict(X_test_scaled)
acc_gb = accuracy_score(y_test, y_pred_gb)
print(f"Gradient Boosting Accuracy: {acc_gb:.4f}")

In [None]:
from sklearn.tree import DecisionTreeClassifier
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_train_scaled, y_train)
y_pred_dt = dt_model.predict(X_test_scaled)
acc_dt = accuracy_score(y_test, y_pred_dt)
print(f"Decision Tree Accuracy: {acc_dt:.16f}")

In [35]:
from lightgbm import LGBMClassifier
lgb_clf = {
    "device": "cpu",
    "objective": "multiclass",
    "num_class": len(set(y_train)),
    "learning_rate": 0.02,
    "n_estimators": 500,
    "max_depth": 9,
    "num_leaves": 63,
    "subsample": 0.9,
    "colsample_bytree": 0.9,
    "min_child_samples": 10,
    "reg_alpha": 0.05,
    "reg_lambda": 0.5,
    "random_state": 42,
    "eval_metric": "multi_logloss",
    "verbose": -1
}
lgbm_model = LGBMClassifier(**lgb_clf)
lgbm_model.fit(X_train_scaled, y_train)
y_pred_lgbm = lgbm_model.predict(X_test_scaled)
acc_lgbm = accuracy_score(y_test, y_pred_lgbm)
print(f"LightGBM Accuracy: {acc_lgbm:.16f}")



LightGBM Accuracy: 0.9121763079529291


In [None]:
test = pd.read_csv('/content/drive/My Drive/ML project/test.csv')
test = add_all_features(test)

In [None]:
test_ids = test['Id']
test_features = test.drop(columns=['Id'])

#same preprocessing on test data
for col in numerical_columns:
    if col in test_features.columns and (test_features[col] < 0).any():
        test_features[col] = test_features[col].clip(lower=0)

#scaling
test_scaled = scaler.transform(test_features)

#predict and save
preds_rf = rf_model.predict(test_scaled)
preds_rf_decoded = label_encoder.inverse_transform(preds_rf)
submission_rf = pd.DataFrame({'Id': test_ids, 'label': preds_rf_decoded})
submission_rf.to_csv(
    '/content/drive/My Drive/ML project/submission_random_forest2.csv',
    index=False
)
print("Random Forest submission file created: submission_random_forest.csv")

# XGB submission
preds_xgb = xgb_model.predict(test_scaled)
preds_xgb_decoded = label_encoder.inverse_transform(preds_xgb)
submission_xgb = pd.DataFrame({'Id': test_ids, 'label': preds_xgb_decoded})
submission_xgb.to_csv(
    '/content/drive/My Drive/ML project/submission_xgboost.csv',
    index=False
)
print("XGBoost submission file created: submission_xgboost.csv")

#gradient boosting submission
preds_gb = gb_model.predict(test_scaled)
preds_gb_decoded = label_encoder.inverse_transform(preds_gb)
submission_gb = pd.DataFrame({'Id': test_ids, 'label': preds_gb_decoded})
submission_gb.to_csv(
    '/content/drive/My Drive/ML project/submission_gradient_boosting.csv',
    index=False
)
print("Gradient Boosting submission file created: submission_gradient_boosting.csv")


#lightGBM submission
preds_lgbm = lgbm_model.predict(test_scaled)
preds_lgbm_decoded = label_encoder.inverse_transform(preds_lgbm)
submission_lgbm = pd.DataFrame({'Id': test_ids, 'label': preds_lgbm_decoded})
submission_lgbm.to_csv(
    '/content/drive/My Drive/ML project/submission_lightgbm.csv',
    index=False
)
print("LightGBM submission file created: submission_lightgbm.csv")

#decision tree submission
preds_dt = dt_model.predict(test_scaled)
preds_dt_decoded = label_encoder.inverse_transform(preds_dt)
submission_dt = pd.DataFrame({'Id': test_ids, 'label': preds_dt_decoded})
submission_dt.to_csv(
    '/content/drive/My Drive/ML project/submission_decision_tree.csv',
    index=False
)
print("Decision Tree submission file created: submission_decision_tree.csv")
