<a href="https://colab.research.google.com/github/arpitarumma/DataPrivacySecurity/blob/main/Forensic_Log_Analyser.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install gradio pandas numpy scikit-learn matplotlib seaborn


Collecting gradio
  Downloading gradio-5.23.1-py3-none-any.whl.metadata (16 kB)
Collecting aiofiles<24.0,>=22.0 (from gradio)
  Downloading aiofiles-23.2.1-py3-none-any.whl.metadata (9.7 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Downloading fastapi-0.115.12-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.5.0-py3-none-any.whl.metadata (3.0 kB)
Collecting gradio-client==1.8.0 (from gradio)
  Downloading gradio_client-1.8.0-py3-none-any.whl.metadata (7.1 kB)
Collecting groovy~=0.1 (from gradio)
  Downloading groovy-0.1.2-py3-none-any.whl.metadata (6.1 kB)
Collecting pydub (from gradio)
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting python-multipart>=0.0.18 (from gradio)
  Downloading python_multipart-0.0.20-py3-none-any.whl.metadata (1.8 kB)
Collecting ruff>=0.9.3 (from gradio)
  Downloading ruff-0.11.2-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (25 kB)
Collecting safehttpx<0.2.0,>=0.1.6 

In [2]:
import re
import pandas as pd
import numpy as np
import gradio as gr
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.ensemble import IsolationForest
from io import StringIO

# Function to parse log data
def parse_logs(logs):
    """Parse syslog-style lines into a DataFrame.

    Each line is expected to look like ``"Mon DD HH:MM:SS source: message"``.
    Lines that do not match this shape are skipped.

    Args:
        logs: An iterable of log lines, or a single string holding one or
            more newline-separated lines.

    Returns:
        pandas.DataFrame with the columns ``Timestamp``, ``Source``,
        ``Message`` and ``IP``. ``IP`` is the first dotted-quad found in the
        message, or ``"Unknown"`` when the message contains none.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(logs, str):
        logs = logs.splitlines()

    # The day of month may have one or two digits; single-digit days are
    # commonly padded with an extra space ("Mar  5"), so allow 1-2 spaces.
    line_pattern = re.compile(r"(\w{3} {1,2}\d{1,2} \d{2}:\d{2}:\d{2}) (.+?): (.+)")
    ip_pattern = re.compile(r'\b(?:\d{1,3}\.){3}\d{1,3}\b')

    parsed_data = []
    for log in logs:
        match = line_pattern.search(log)
        if not match:
            continue
        timestamp, source, message = match.groups()
        ip_match = ip_pattern.search(message)
        ip = ip_match.group(0) if ip_match else "Unknown"
        parsed_data.append([timestamp, source, message, ip])

    return pd.DataFrame(parsed_data, columns=["Timestamp", "Source", "Message", "IP"])

# Example log lines (replace with uploaded logs).
# The literal starts and ends with a newline, so splitting on "\n" below
# yields empty first/last entries; parse_logs skips lines that do not match.
log_data = """
Mar 25 14:02:23 server sshd[1234]: Failed password for root from 192.168.1.100 port 22 ssh2
Mar 25 14:03:45 server sshd[5678]: Accepted password for user1 from 10.0.0.5 port 54000 ssh2
Mar 25 14:04:12 server sudo: user2 : TTY=pts/1 ; PWD=/home/user2 ; COMMAND=/bin/rm -rf /
Mar 25 14:05:02 server kernel: Firewall blocked IP 172.16.5.5
"""

# Parse the sample logs line by line and print the resulting table as a quick check
log_df = parse_logs(log_data.split("\n"))
print(log_df)


         Timestamp             Source  \
0  Mar 25 14:02:23  server sshd[1234]   
1  Mar 25 14:03:45  server sshd[5678]   
2  Mar 25 14:04:12        server sudo   
3  Mar 25 14:05:02      server kernel   

                                             Message             IP  
0  Failed password for root from 192.168.1.100 po...  192.168.1.100  
1  Accepted password for user1 from 10.0.0.5 port...       10.0.0.5  
2  user2 : TTY=pts/1 ; PWD=/home/user2 ; COMMAND=...        Unknown  
3                     Firewall blocked IP 172.16.5.5     172.16.5.5  


In [3]:
# Function to detect anomalies
def detect_anomalies(df, contamination=0.2, random_state=42):
    """Flag unusual log entries with an Isolation Forest.

    The ``Source`` and ``IP`` columns are integer-encoded and used as the
    only two features of the model.

    Args:
        df: DataFrame with at least the columns ``Source`` and ``IP``.
            It is not modified; the result is built on a copy.
        contamination: Passed to ``IsolationForest(contamination=...)``.
            Defaults to 0.2.
        random_state: Seed passed to ``IsolationForest``. Defaults to 42.

    Returns:
        A copy of ``df`` with four extra columns:
        ``Source_Code`` and ``IP_Code`` (integer codes from ``pd.factorize``),
        ``Anomaly_Score`` (the value returned by ``fit_predict``) and
        ``Anomaly`` (``"Anomalous"`` where the score is -1, else ``"Normal"``).
        An empty input yields an empty frame with the same extra columns.
    """
    # Work on a copy so the caller's frame is left untouched.
    result = df.copy()

    # Convert categorical data to numerical format
    result["Source_Code"] = pd.factorize(result["Source"])[0]
    result["IP_Code"] = pd.factorize(result["IP"])[0]

    # There is nothing to fit on zero rows; return the expected schema instead.
    if result.empty:
        result["Anomaly_Score"] = np.empty(0, dtype=int)
        result["Anomaly"] = np.empty(0, dtype=object)
        return result

    # Train Isolation Forest
    features = result[["Source_Code", "IP_Code"]]
    iso_forest = IsolationForest(
        n_estimators=100,
        contamination=contamination,
        random_state=random_state,
    )
    result["Anomaly_Score"] = iso_forest.fit_predict(features)

    # Label anomalies: -1 is treated as anomalous, every other value as normal
    result["Anomaly"] = np.where(result["Anomaly_Score"] == -1, "Anomalous", "Normal")
    return result


In [4]:
# Function to plot anomalies
def plot_anomalies(df):
    """Draw a two-panel summary of the anomaly labels.

    The left panel is a pie chart of the ``Anomaly`` value counts; the right
    panel is a count plot of ``Source`` split by ``Anomaly``.

    Args:
        df: DataFrame with the columns ``Anomaly`` (values ``"Normal"`` /
            ``"Anomalous"``) and ``Source``.

    Returns:
        The matplotlib figure holding both panels.
    """
    # One colour per label, shared by both panels so they always agree.
    palette = {"Normal": "lightblue", "Anomalous": "salmon"}

    fig, (pie_ax, bar_ax) = plt.subplots(1, 2, figsize=(12, 5))

    # Pie chart for anomalies. Colours are looked up per label because
    # value_counts() orders by frequency, not by label.
    anomaly_counts = df["Anomaly"].value_counts()
    pie_ax.pie(
        anomaly_counts,
        labels=anomaly_counts.index,
        autopct='%1.1f%%',
        colors=[palette[label] for label in anomaly_counts.index],
    )
    pie_ax.set_title("Anomaly Distribution")

    # Bar chart for source distribution
    sns.countplot(data=df, x="Source", hue="Anomaly", ax=bar_ax, palette=palette)
    bar_ax.set_title("Log Source vs. Anomalies")
    # Rotate the labels on the bar chart's own axes instead of relying on
    # pyplot's implicit "current axes".
    bar_ax.tick_params(axis="x", labelrotation=45)

    fig.tight_layout()
    return fig


In [6]:
def analyze_logs(log_text):
    """Parse pasted log text, label anomalies and build the summary figure.

    Args:
        log_text: Raw log text with one entry per line. ``None``, an empty
            string or whitespace-only text is treated as "no logs".

    Returns:
        A ``(DataFrame, figure)`` tuple. When there is no input, or no line
        can be parsed, the result is ``(empty DataFrame, None)``.
    """
    # Nothing to analyse: skip parsing and modelling entirely.
    if not log_text or not log_text.strip():
        return pd.DataFrame(), None

    # splitlines() also handles "\r\n" endings, which split("\n") would leave
    # as a trailing "\r" at the end of every parsed message.
    df = parse_logs(log_text.splitlines())

    if df.empty:
        return pd.DataFrame(), None  # Return empty outputs if no line could be parsed

    df = detect_anomalies(df)
    fig = plot_anomalies(df)

    return df, fig
# Gradio front end: a multi-line textbox feeds analyze_logs, whose two return
# values are shown as a table and a plot.
iface = gr.Interface(
    title="🔍 Forensic Log Analyzer",
    description="Paste system logs to detect anomalies using Machine Learning.",
    fn=analyze_logs,
    inputs=gr.Textbox(
        lines=10,
        placeholder="Paste your logs here...",
    ),
    outputs=[
        gr.Dataframe(),
        gr.Plot(),
    ],
)

# Start serving the interface
iface.launch()


Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://c61b4800dcd2befc78.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


