In [13]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.svm import OneClassSVM
from io import BytesIO
import pickle
import os

In [14]:
# Second cell - Function to process uploaded file
def process_traffic_data(file_path, start_time=None):
    """
    Load a traffic capture CSV, derive absolute timestamps and drop duplicates.

    Parameters
    ----------
    file_path : str, path-like or file-like
        Anything accepted by ``pandas.read_csv``.
    start_time : datetime.datetime, optional
        Wall-clock instant that corresponds to ``Time == 0``. When omitted,
        the log is assumed to have started 24 hours before the current time.
        Pass a fixed value to obtain reproducible timestamps.

    Returns
    -------
    pandas.DataFrame
        The loaded rows with exact duplicate rows removed. If the file has a
        ``Time`` column (interpreted as seconds since the start of the log),
        a ``Timestamp`` column is added; rows whose ``Time`` is missing or
        not numeric get ``NaT`` instead of aborting the whole load.
    """
    # Load dataset
    df = pd.read_csv(file_path)

    # Print basic info
    print("Dataset Preview:")
    print(df.head())
    print(f"Dataset shape: {df.shape}")

    # Convert the relative Time offsets into absolute timestamps
    if 'Time' in df.columns:
        if start_time is None:
            start_time = datetime.now() - timedelta(hours=24)  # Assume log starts 24 hours ago
        # Vectorised conversion: a row-wise timedelta(seconds=x) raises on NaN
        # and is slow on large captures; unparseable values become NaT here.
        elapsed_seconds = pd.to_numeric(df["Time"], errors="coerce")
        df["Timestamp"] = pd.Timestamp(start_time) + pd.to_timedelta(elapsed_seconds, unit="s")

    # Remove duplicate rows
    original_count = len(df)
    df = df.drop_duplicates()
    print(f"Removed {original_count - len(df)} duplicate records.")

    return df

In [15]:
# Third cell - Generate traffic analysis
def generate_traffic_analysis(df):
    """
    Count requests per minute, per hour and per day.

    The ``Timestamp`` column of ``df`` is coerced to datetime in place
    (unparseable entries become ``NaT``). Each returned frame has a
    ``Timestamp`` column holding the start of the bucket and one count
    column: ``Minute_Request_Count``, ``Hourly_Request_Count`` or
    ``Daily_Request_Count``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(traffic_per_minute, traffic_per_hour, traffic_per_day)``.
    """
    # Normalise the column first so the .dt accessor is always available
    df["Timestamp"] = pd.to_datetime(df["Timestamp"], errors="coerce")
    stamps = df["Timestamp"]

    def count_by(freq, count_name):
        # Floor every timestamp to its bucket, then count rows per bucket
        buckets = stamps.dt.floor(freq)
        return df.groupby(buckets).size().reset_index(name=count_name)

    per_minute = count_by("min", "Minute_Request_Count")
    per_hour = count_by("h", "Hourly_Request_Count")
    per_day = count_by("D", "Daily_Request_Count")

    return per_minute, per_hour, per_day

In [16]:
# Fourth cell - Detect traffic anomalies
def detect_traffic_anomalies(traffic_df, count_column, threshold=3):
    """
    Flag anomalous request counts using the Z-score method.

    Parameters
    ----------
    traffic_df : pandas.DataFrame
        Aggregated traffic frame; it is modified in place.
    count_column : str
        Name of the column holding the request counts.
    threshold : float, optional
        A row is an anomaly when the absolute Z-score is strictly greater
        than this value (default 3).

    Returns
    -------
    pandas.DataFrame
        The same ``traffic_df`` object with two added columns: ``Z_Score``
        (float) and ``Anomaly`` (bool).
    """
    counts = traffic_df[count_column]
    spread = counts.std()

    if pd.isna(spread) or spread == 0:
        # A single bucket (std is NaN) or identical counts (std is 0) would
        # turn every Z-score into NaN; no value deviates, so report 0 instead.
        traffic_df["Z_Score"] = pd.Series(0.0, index=traffic_df.index)
    else:
        traffic_df["Z_Score"] = (counts - counts.mean()) / spread

    # Flag anomalies where |Z-score| exceeds the threshold
    traffic_df["Anomaly"] = traffic_df["Z_Score"].abs() > threshold

    return traffic_df

In [17]:
# Fifth cell - Generate plot functions that save to files rather than displaying
def generate_traffic_plots(traffic_per_minute, traffic_per_hour, traffic_per_day, output_dir="plots"):
    """
    Render the minute, hour and day traffic series as line plots and save them.

    Parameters
    ----------
    traffic_per_minute, traffic_per_hour, traffic_per_day : pandas.DataFrame
        Frames with a ``Timestamp`` column and, respectively, a
        ``Minute_Request_Count``, ``Hourly_Request_Count`` or
        ``Daily_Request_Count`` column.
    output_dir : str, optional
        Directory that receives ``minute_traffic.png``, ``hour_traffic.png``
        and ``day_traffic.png``. It is created if missing.
    """
    # exist_ok avoids the check-then-create race of os.path.exists + makedirs
    os.makedirs(output_dir, exist_ok=True)

    def save_line_plot(traffic_df, count_column, filename, xlabel, ylabel, title, line_style):
        # Draw one series on its own figure and write it to output_dir
        fig, ax = plt.subplots(figsize=(12, 4))
        try:
            ax.plot(traffic_df["Timestamp"], traffic_df[count_column], **line_style)
            ax.set_xlabel(xlabel)
            ax.set_ylabel(ylabel)
            ax.set_title(title)
            ax.legend()
            fig.savefig(f"{output_dir}/{filename}")
        finally:
            # Always release the figure, even when plotting or saving fails
            plt.close(fig)

    save_line_plot(
        traffic_per_minute, "Minute_Request_Count", "minute_traffic.png",
        "Time", "Requests per Minute", "Website Traffic Per Minute",
        {"color": "blue", "label": "Minute Traffic"},
    )
    save_line_plot(
        traffic_per_hour, "Hourly_Request_Count", "hour_traffic.png",
        "Time", "Requests per Hour", "Website Traffic Per Hour",
        {"color": "purple", "marker": 'o', "linestyle": '-', "label": "Hourly Traffic"},
    )
    save_line_plot(
        traffic_per_day, "Daily_Request_Count", "day_traffic.png",
        "Date", "Requests per Day", "Website Traffic Per Day",
        {"color": "orange", "marker": 'o', "linestyle": '-', "label": "Daily Traffic"},
    )

    print(f"Traffic plots saved to {output_dir} directory")

In [18]:
# Sixth cell - Generate anomaly plots
def generate_anomaly_plots(traffic_per_minute, traffic_per_hour, traffic_per_day, output_dir="plots"):
    """
    Save scatter plots of the traffic counts coloured by their anomaly flag.

    Parameters
    ----------
    traffic_per_minute, traffic_per_hour, traffic_per_day : pandas.DataFrame
        Frames with ``Timestamp``, the matching request-count column and an
        ``Anomaly`` column that can be cast to int (0 = normal, 1 = anomaly).
    output_dir : str, optional
        Directory that receives ``minute_anomalies.png``,
        ``hour_anomalies.png`` and ``day_anomalies.png``. It is created if
        missing.
    """
    # exist_ok avoids the check-then-create race of os.path.exists + makedirs
    os.makedirs(output_dir, exist_ok=True)

    def plot_anomalies(df, count_column, title, filename):
        fig, ax = plt.subplots(figsize=(12, 6))
        try:
            # Pin the colour scale to [0, 1]. With autoscaling, a frame whose
            # flags are all identical (all normal or all anomalous) collapses
            # the range and every point is drawn in the same low-end colour.
            points = ax.scatter(
                df["Timestamp"],
                df[count_column],
                c=df["Anomaly"].astype(int),
                cmap="coolwarm",
                vmin=0,
                vmax=1,
                label="Requests",
            )
            ax.set_xlabel("Timestamp")
            ax.set_ylabel(count_column)
            ax.set_title(title)
            fig.colorbar(points, ax=ax, label="Anomaly")
            fig.savefig(f"{output_dir}/{filename}")
        finally:
            # Always release the figure, even when plotting or saving fails
            plt.close(fig)

    # Plot anomalies
    plot_anomalies(traffic_per_minute, "Minute_Request_Count", "Minute-Level Traffic Anomalies", "minute_anomalies.png")
    plot_anomalies(traffic_per_hour, "Hourly_Request_Count", "Hourly-Level Traffic Anomalies", "hour_anomalies.png")
    plot_anomalies(traffic_per_day, "Daily_Request_Count", "Daily-Level Traffic Anomalies", "day_anomalies.png")

    print(f"Anomaly plots saved to {output_dir} directory")


In [19]:
# Seventh cell - Detect packet anomalies using One-Class SVM
def detect_packet_anomalies(df, model_path="svm_anomaly_model.pkl", scaler_path="feature_scaler.pkl"):
    """
    Use One-Class SVM to detect packet-level anomalies.

    Parameters
    ----------
    df : pandas.DataFrame
        Packet records. Must contain ``Length``; ``Time_Difference``,
        ``Source``, ``Destination`` and ``Protocol`` are used when present.
        The frame is modified in place: the categorical columns are
        overwritten with integer label codes and an ``Anomaly`` column is
        added.
    model_path : str, optional
        Where the fitted SVM is pickled.
    scaler_path : str, optional
        Where the fitted scaler is pickled.

    Returns
    -------
    tuple
        ``(df, features)`` where ``df['Anomaly']`` is 1 for rows the model
        predicts as outliers and 0 otherwise, and ``features`` lists the
        columns fed to the model.
    """
    # Feature Selection - Keep relevant numerical features
    features = ['Length']
    if 'Time_Difference' in df.columns:
        features.append('Time_Difference')

    # Encode categorical features
    # NOTE(review): the fitted encoders are discarded, so the pickled model
    # cannot be applied to new data with the same codes - confirm whether
    # they should be persisted alongside the model.
    categorical_cols = ['Source', 'Destination', 'Protocol']
    for col in categorical_cols:
        if col in df.columns:
            df[col] = LabelEncoder().fit_transform(df[col])
            features.append(col)

    # Standardize features
    scaler = StandardScaler()
    X = scaler.fit_transform(df[features])

    # Train One-Class SVM model
    svm_model = OneClassSVM(kernel='rbf', gamma='auto', nu=0.01)
    svm_model.fit(X)

    # Predict anomalies: the model returns -1 for outliers, mapped here to 1
    predictions = svm_model.predict(X)
    df['Anomaly'] = (predictions == -1).astype("int64")

    # Save the SVM model and scaler for future use; the context managers
    # guarantee the files are flushed and closed even if pickling fails.
    with open(model_path, "wb") as model_file:
        pickle.dump(svm_model, model_file)
    with open(scaler_path, "wb") as scaler_file:
        pickle.dump(scaler, scaler_file)

    return df, features

In [20]:
# Categorize anomalies - Improved Dynamic Version
def categorize_anomalies(df, frequency_ratio=0.05, min_request_threshold=5):
    """
    Add descriptive category columns to the packet records.

    Parameters
    ----------
    df : pandas.DataFrame
        Packet records with a ``Length`` column. ``Protocol`` and ``Source``
        are used when present. The frame is modified in place.
    frequency_ratio : float, optional
        Fraction of the total record count a single source must exceed to be
        labelled a bot suspect (default 0.05, i.e. 5%).
    min_request_threshold : int, optional
        Lower bound for that request threshold (default 5).

    Returns
    -------
    pandas.DataFrame
        The same frame with ``Anomaly_Category``, ``Protocol_Category``,
        ``Request_Frequency`` and ``Source_Category`` columns added.
    """
    # Categorize based on Length (strict comparisons; a missing length
    # satisfies neither condition and is labelled "Normal Size")
    length = df["Length"]
    df["Anomaly_Category"] = np.select(
        [(length > 1000).to_numpy(), (length < 50).to_numpy()],
        ["Large Packet", "Small Packet"],
        default="Normal Size",
    )

    # Categorize based on Protocol
    # NOTE(review): these keys are matched against whatever integers the
    # Protocol column holds at this point - confirm they line up with the
    # codes produced upstream.
    protocol_mapping = {
        6: "TCP",
        17: "UDP",
        1: "ICMP",
        9: "ARP",
        303: "Unknown/Custom"
    }
    if "Protocol" in df.columns:
        df["Protocol_Category"] = df["Protocol"].map(protocol_mapping).fillna("Other")
    else:
        # Protocol is optional elsewhere in the pipeline; stay consistent
        df["Protocol_Category"] = "Other"

    # Categorize based on Request Frequency
    # Dynamic threshold: frequency_ratio (default 5%) of total records, with
    # a floor of min_request_threshold (default 5)
    dynamic_threshold = max(min_request_threshold, int(len(df) * frequency_ratio))
    print(f"Dynamic bot detection threshold set to {dynamic_threshold} requests.")

    # Calculate request frequency per source
    if "Source" in df.columns:
        df["Request_Frequency"] = df["Source"].map(df["Source"].value_counts())
    else:
        # Without a source there is nothing to count; nobody is a suspect
        df["Request_Frequency"] = 0
    df["Source_Category"] = np.where(
        (df["Request_Frequency"] > dynamic_threshold).to_numpy(), "Bot Suspect", "Normal"
    )

    return df


In [21]:
def identify_bot_suspects(df):
    """
    Flag sources that issue many rapid-fire requests.

    Needs both ``Time_Difference`` and ``Source``; if either is absent a
    message is printed and ``df`` is returned untouched. Otherwise two
    boolean columns are added in place: ``Fast_Request`` (time difference
    below 0.5) and ``Bot_Suspect`` (the row's source has more than 50 fast
    requests).
    """
    needed = {"Time_Difference", "Source"}
    if not needed.issubset(df.columns):
        print("Required columns missing for bot detection.")
        return df

    # A request counts as "fast" when it follows the previous one by < 0.5
    is_fast = df["Time_Difference"] < 0.5
    df["Fast_Request"] = is_fast

    # Number of fast requests issued by each source
    fast_per_source = df.groupby("Source")["Fast_Request"].sum()

    # Sources above the cut-off are treated as suspected bots
    flagged_sources = fast_per_source.index[fast_per_source > 50]

    # Propagate the verdict back to every row of a flagged source
    df["Bot_Suspect"] = df["Source"].isin(flagged_sources)

    return df


In [22]:
# Ninth cell - Generate anomaly report
def generate_anomaly_report(df, output_dir="reports"):
    """
    Write the anomaly CSV/JSON reports and return a summary.

    Parameters
    ----------
    df : pandas.DataFrame
        Packet records with an ``Anomaly`` column (1 = anomaly) and the
        ``Anomaly_Category``, ``Protocol_Category`` and ``Source_Category``
        columns; a missing column raises ``KeyError``.
    output_dir : str, optional
        Directory that receives ``outliers_detected.csv``,
        ``final_anomalies_report.csv`` and ``anomaly_summary.json``. It is
        created if missing.

    Returns
    -------
    tuple
        ``(anomaly_summary, outliers)``: the summary dict that was written to
        JSON and the frame of rows where ``Anomaly == 1``.
    """
    # Imported before any file is opened so an import problem cannot leave a
    # half-written summary behind.
    import json

    def category_counts(column):
        # value_counts as a plain {label: int} dict that json can serialise
        return {label: int(count) for label, count in outliers[column].value_counts().items()}

    # exist_ok avoids the check-then-create race of os.path.exists + makedirs
    os.makedirs(output_dir, exist_ok=True)

    # Get outliers only
    outliers = df[df['Anomaly'] == 1]

    # Save outliers and full report
    outliers.to_csv(f"{output_dir}/outliers_detected.csv", index=False)
    df.to_csv(f"{output_dir}/final_anomalies_report.csv", index=False)

    # Calculate statistics; the mean of an empty column is NaN, which would
    # be written as the non-standard JSON token NaN, so report 0.0 instead.
    anomaly_percentage = float(df["Anomaly"].mean() * 100) if len(df) else 0.0
    print(f"Detected {anomaly_percentage:.2f}% of traffic as anomalies.")

    # Anomaly summary
    anomaly_summary = {
        "total_records": len(df),
        "anomalies_detected": int(df["Anomaly"].sum()),
        "anomaly_percentage": anomaly_percentage,
        "anomaly_categories": category_counts("Anomaly_Category"),
        "protocol_categories": category_counts("Protocol_Category"),
        "source_categories": category_counts("Source_Category")
    }

    # Save summary as JSON
    with open(f"{output_dir}/anomaly_summary.json", "w") as f:
        json.dump(anomaly_summary, f, indent=4)

    print(f"Reports saved to {output_dir} directory")
    return anomaly_summary, outliers

In [23]:
# Tenth cell - Main function to process data
def main(file_path, output_dir="output"):
    """
    Run the whole pipeline on one CSV file and collect the results.

    Plots are written under ``<output_dir>/plots`` and reports under
    ``<output_dir>/reports``; the directories themselves are created by the
    plotting and reporting steps.

    Returns
    -------
    dict
        ``processed_data`` (the annotated packet frame), ``traffic_analysis``
        (``minute`` / ``hour`` / ``day`` aggregate frames),
        ``anomaly_summary`` and ``outliers``.
    """
    # Output locations handed to the plotting / reporting steps
    plots_dir = f"{output_dir}/plots"
    reports_dir = f"{output_dir}/reports"

    # Stage 1: load and clean the uploaded file
    packets = process_traffic_data(file_path)

    # Stage 2: aggregate at three resolutions and score each aggregate
    per_minute, per_hour, per_day = generate_traffic_analysis(packets)
    per_minute = detect_traffic_anomalies(per_minute, "Minute_Request_Count")
    per_hour = detect_traffic_anomalies(per_hour, "Hourly_Request_Count")
    per_day = detect_traffic_anomalies(per_day, "Daily_Request_Count")

    # Stage 3: traffic plots first, anomaly plots second
    aggregates = (per_minute, per_hour, per_day)
    generate_traffic_plots(*aggregates, plots_dir)
    generate_anomaly_plots(*aggregates, plots_dir)

    # Stage 4: packet-level detection, then categorisation and bot marking
    packets, _features_used = detect_packet_anomalies(packets)
    packets = identify_bot_suspects(categorize_anomalies(packets))

    # Stage 5: write the reports
    anomaly_summary, outliers = generate_anomaly_report(packets, reports_dir)

    print("Processing complete!")
    print(f"Results saved to {output_dir} directory")

    traffic_analysis = {
        "minute": per_minute,
        "hour": per_hour,
        "day": per_day
    }
    return {
        "processed_data": packets,
        "traffic_analysis": traffic_analysis,
        "anomaly_summary": anomaly_summary,
        "outliers": outliers
    }


In [24]:
# Eleventh cell - Example usage (when running in Jupyter)
# Run this cell to test the functionality
if __name__ == "__main__":
    import sys

    # Input used for a quick trial run; point this at your own capture file
    sample_file = "anomalies_dataset.csv"

    # Bail out with a hint when the file is absent, otherwise run the pipeline
    if not os.path.exists(sample_file):
        print(f"File '{sample_file}' not found. Please provide a valid file path.")
    else:
        results = main(sample_file)
        print("Analysis completed successfully!")

File 'anomalies_dataset.csv' not found. Please provide a valid file path.
