In [55]:
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Step 1: Load the data
print("Loading data...")
train_data = pd.read_csv('Train_Data.csv')
test_data = pd.read_csv('Test_Data.csv')

print("Train data columns:", train_data.columns)
print("Test data columns:", test_data.columns)

# Identify the correct name of the target column (case-sensitive)
target_column = [col for col in train_data.columns if col.lower() == 'attack'][0]
print(f"Target column name: {target_column}")

# Step 2: Preprocess the data
print("Preprocessing data...")

# Identify numeric and categorical columns (excluding the target column)
numeric_features = train_data.drop(target_column, axis=1).select_dtypes(include=['int64', 'float64']).columns
categorical_features = train_data.drop(target_column, axis=1).select_dtypes(include=['object']).columns

# Create preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('num', SimpleImputer(strategy='mean'), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse=False), categorical_features)
    ])

# Create a pipeline with the preprocessor and the classifier
model = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', XGBClassifier(n_estimators=100, learning_rate=0.1, max_depth=5, 
                                 random_state=42, use_label_encoder=False, eval_metric='logloss'))
])

# Step 3: Prepare the target variable
le = LabelEncoder()
y_train = le.fit_transform(train_data[target_column])

# Step 4: Fit the model
print("Training model...")
X_train = train_data.drop(target_column, axis=1)
model.fit(X_train, y_train)

# Step 5: Make predictions on the test set
print("Making predictions...")
y_pred = model.predict(test_data)

# Step 6: Create submission DataFrame
print("Creating submission file...")
submission = pd.DataFrame({'attack': y_pred})
submission['attack'] = le.inverse_transform(submission['attack'])

# Step 7: Save the submission to a CSV file
submission.to_csv('neptune_submission.csv', index=False)
print("Submission file has been created.")

Loading data...
Train data columns: Index(['duration', 'protocoltype', 'service', 'flag', 'srcbytes', 'dstbytes',
       'land', 'wrongfragment', 'urgent', 'hot', 'numfailedlogins', 'loggedin',
       'numcompromised', 'rootshell', 'suattempted', 'numroot',
       'numfilecreations', 'numshells', 'numaccessfiles', 'numoutboundcmds',
       'ishostlogin', 'isguestlogin', 'count', 'srvcount', 'serrorrate',
       'srvserrorrate', 'rerrorrate', 'srvrerrorrate', 'samesrvrate',
       'diffsrvrate', 'srvdiffhostrate', 'dsthostcount', 'dsthostsrvcount',
       'dsthostsamesrvrate', 'dsthostdiffsrvrate', 'dsthostsamesrcportrate',
       'dsthostsrvdiffhostrate', 'dsthostserrorrate', 'dsthostsrvserrorrate',
       'dsthostrerrorrate', 'dsthostsrvrerrorrate', 'lastflag', 'attack'],
      dtype='object')
Test data columns: Index(['duration', 'protocoltype', 'service', 'flag', 'srcbytes', 'dstbytes',
       'land', 'wrongfragment', 'urgent', 'hot', 'numfailedlogins', 'loggedin',
       'numcompro

Parameters: { "use_label_encoder" } are not used.



Making predictions...
Creating submission file...
Submission file has been created.
