# 📊 Log Analysis Dashboard
This notebook provides insights and visualizations based on the log data collected through the pipeline.
---

In [None]:
# Required libraries
import sqlite3
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

In [None]:
# Load every row of the siem_logs table into a DataFrame.
# The connection is closed explicitly once the read finishes: using a
# sqlite3 connection as a `with` context manager only manages transactions,
# it does not close the connection, so try/finally is used instead.
conn = sqlite3.connect('../sqlite/siem_logs.db')
query = 'SELECT * FROM siem_logs'
try:
    df = pd.read_sql_query(query, conn)
finally:
    # Release the database handle even if the query fails.
    conn.close()
df.head()

## 🧼 Basic Cleaning & Info

In [None]:
# Parse the 'timestamp' column into pandas datetimes, store it back on the
# frame, then print the frame summary.
parsed_timestamps = pd.to_datetime(df['timestamp'])
df['timestamp'] = parsed_timestamps
df.info()

## 📊 Device Distribution

In [None]:
# Count rows per distinct 'device' value and show each value's share of the
# total as an interactive pie chart.
device_counts = df['device'].value_counts()
fig = px.pie(
    names=device_counts.index,
    values=device_counts.values,
    title='Device Usage Distribution',
)
fig.show()

## 📈 Volume of Logs Over Time

In [None]:
# Index the frame by timestamp so it can be resampled into hourly buckets.
# The guard keeps this cell re-runnable: once 'timestamp' has been moved into
# the index it is no longer a column, and an unconditional set_index would
# raise KeyError on a second execution.
# `df` intentionally stays timestamp-indexed afterwards; the export cell
# calls df.reset_index() to turn the index back into a column.
if 'timestamp' in df.columns:
    df = df.set_index('timestamp')

# Lowercase 'h' is the hourly frequency alias; the uppercase 'H' spelling is
# deprecated in recent pandas releases.
logs_per_hour = df.resample('1h').size()

# Line chart of the number of log rows falling in each one-hour bucket.
ax = logs_per_hour.plot(figsize=(12, 6), title='Log Volume Over Time')
ax.set_xlabel('Time')
ax.set_ylabel('Number of Logs')
ax.grid(True)
plt.tight_layout()
plt.show()

## 👤 Top 10 Users by Number of Events

In [None]:
# Take the ten most frequent 'user' values and draw them as horizontal bars
# (value_counts() sorts by count, descending, so head(10) is the top ten).
user_event_counts = df['user'].value_counts()
top_users = user_event_counts.head(10)

# NOTE(review): newer seaborn releases deprecate passing `palette` without
# `hue` — confirm against the installed seaborn version before upgrading.
ax = sns.barplot(x=top_users.values, y=top_users.index, palette='viridis')
ax.set_title('Top 10 Users')
ax.set_xlabel('Number of Events')
ax.set_ylabel('User')
plt.tight_layout()
plt.show()

## 🌍 IP Geolocation (Example-based)

In [None]:
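# Optional: geolocation lookup (illustrative only — the Faker-generated IPs are random, so the results are not meaningful). Uncomment to run; requires the ip2geotools package.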
# from ip2geotools.databases.noncommercial import DbIpCity
# sample_ips = df['ip_address'].dropna().unique()[:5]
# for ip in sample_ips:
#     response = DbIpCity.get(ip, api_key='free')
#     print(f'{ip} -> {response.city}, {response.country}')

## 🚨 Number of Anomalous Events

In [None]:
# Bar chart of how many rows carry each distinct value of 'anomalous'.
anomaly_counts = df['anomalous'].value_counts()
ax = anomaly_counts.plot(kind='bar', title='Anomalous vs Normal Events')
plt.xticks(rotation=0)  # keep the category labels horizontal
ax.set_ylabel('Count')
ax.grid(True)
plt.tight_layout()
plt.show()

## ✅ Status Distribution

In [None]:
# Horizontal bar chart of how many rows carry each distinct 'status' value.
status_counts = df['status'].value_counts()
ax = status_counts.plot(
    kind='barh',
    color='skyblue',
    title='Event Status Distribution',
)
ax.set_xlabel('Count')
plt.tight_layout()
plt.show()

## 💾 Export to CSV

In [None]:
# Move the index back into a regular column, then write the frame to a CSV
# file in the current working directory (index=False omits the row-number
# index that reset_index() leaves behind).
export_frame = df.reset_index()
export_frame.to_csv('processed_logs.csv', index=False)
print('Exported to processed_logs.csv')