# SIH PS1 - Threat Analysis Experiments

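This notebook contains experiments and analysis for the cybersecurity threat detector. It loads the sample and training data, exercises the text and URL analysis utilities, visualizes the training set, trains a baseline text classifier, and benchmarks processing speed.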

## Setup

In [None]:
import sys
import os
sys.path.append('..')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from utils.text_processing import TextProcessor
from utils.url_parser import URLParser
from utils.data_loader import DataLoader

# Initialize utilities
# One shared instance of each project helper imported from utils above.
# Later cells reuse these same objects: text_processor.extract_features,
# url_parser.calculate_url_risk, and the data_loader accessors.
text_processor = TextProcessor()
url_parser = URLParser()
data_loader = DataLoader()

# Reached only if the imports and constructors above ran without raising.
print("✅ Setup complete!")

## Data Loading and Exploration

In [None]:
# Pull both datasets from the shared loader. sample_data is not used again in
# this cell, but the call is kept in case the loader relies on it.
sample_data = data_loader.load_sample_data()
training_df = data_loader.get_training_data()

# Summary statistics reported by the loader, one "name: value" line each.
print("📊 Data Statistics:")
stats = data_loader.get_data_stats()
for stat_name in stats:
    print(f"  {stat_name}: {stats[stat_name]}")

# The trailing bare expression makes Jupyter render the preview as a table.
print("\n📋 Sample Data:")
training_df.head()

## Text Analysis Experiments

In [None]:
# Test text processing on sample data
sample_texts = [
    "URGENT: Your account will be suspended! Click here now!",
    "Thank you for your recent purchase from Amazon.",
    "Congratulations! You've won $1,000,000 in our lottery!"
]

# Maximum number of characters of each message echoed back in the report.
PREVIEW_CHARS = 50

print("🔍 Text Analysis Results:")
for i, text in enumerate(sample_texts, 1):
    features = text_processor.extract_features(text)
    # Only append an ellipsis when the message was actually cut; otherwise a
    # short message would look truncated when it is shown in full.
    preview = text if len(text) <= PREVIEW_CHARS else f"{text[:PREVIEW_CHARS]}..."
    print(f"\nText {i}: {preview}")
    print(f"  Suspicious Score: {features['suspicious_score']:.2f}")
    print(f"  Urgency Words: {features['urgency_words']}")
    print(f"  URLs Found: {len(features['urls'])}")

## URL Analysis Experiments

In [None]:
# Run the URL risk scorer over a handful of hand-picked example URLs
sample_urls = [
    "http://192.168.1.1/login",
    "https://amazon.com/orders",
    "http://bit.ly/suspicious-link",
    "https://secure-bank-verify.com/urgent-update"
]

print("🔗 URL Analysis Results:")
for position, url in enumerate(sample_urls, start=1):
    # calculate_url_risk returns a mapping; this report reads three of its keys.
    assessment = url_parser.calculate_url_risk(url)
    print(f"\nURL {position}: {url}")
    print(f"  Risk Score: {assessment['risk_score']:.2f}")
    print(f"  Risk Level: {assessment['risk_level']}")
    print(f"  Issues: {assessment['issues']}")

## Data Visualization

In [None]:
# Build a 2x2 dashboard of the training data using explicit Axes objects
fig, axes = plt.subplots(2, 2, figsize=(12, 8))
(ax_scores, ax_labels), (ax_types, ax_box) = axes

# Top-left: histogram of risk scores
ax_scores.hist(training_df['risk_score'], bins=20, alpha=0.7, color='skyblue')
ax_scores.set_title('Risk Score Distribution')
ax_scores.set_xlabel('Risk Score')
ax_scores.set_ylabel('Frequency')

# Top-right: number of rows per label
label_counts = training_df['label'].value_counts()
label_counts.plot(kind='bar', color=['green', 'orange', 'red'], ax=ax_labels)
ax_labels.set_title('Label Distribution')
plt.setp(ax_labels.get_xticklabels(), rotation=45)

# Bottom-left: share of each content type
type_counts = training_df['type'].value_counts()
type_counts.plot(kind='pie', autopct='%1.1f%%', ax=ax_types)
ax_types.set_title('Content Type Distribution')

# Bottom-right: spread of risk scores within each label
sns.boxplot(data=training_df, x='label', y='risk_score', ax=ax_box)
ax_box.set_title('Risk Score by Label')
plt.setp(ax_box.get_xticklabels(), rotation=45)

fig.tight_layout()
plt.show()

## Model Training Experiments

In [None]:
# Prepare data for machine learning
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

# Baseline experiment: TF-IDF features + random forest on the text rows only.
text_data = training_df[training_df['type'] == 'text'].copy()

# train_test_split needs at least two rows to produce a non-empty train set
# and a non-empty test set; with fewer it raises instead of reaching `else`.
if len(text_data) >= 2:
    y = text_data['label']

    # Split the RAW text first. Fitting the vectorizer before splitting would
    # let test documents shape the vocabulary and IDF weights (data leakage),
    # which inflates the reported accuracy.
    texts_train, texts_test, y_train, y_test = train_test_split(
        text_data['content'], y, test_size=0.2, random_state=42
    )

    # Learn the vocabulary from the training rows only, then apply the same
    # fitted transform to the held-out rows.
    vectorizer = TfidfVectorizer(max_features=1000, stop_words='english')
    X_train = vectorizer.fit_transform(texts_train)
    X_test = vectorizer.transform(texts_test)

    # Train model
    model = RandomForestClassifier(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)

    # Evaluate on the held-out rows
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)

    print(f"🎯 Model Accuracy: {accuracy:.2f}")
    print("\n📊 Classification Report:")
    print(classification_report(y_test, y_pred))
else:
    print("⚠️ Not enough text data for ML experiment")

## Performance Testing

In [None]:
import time

# Micro-benchmark: run each utility over the same input repeated N times.
BENCHMARK_ITEMS = 100
test_texts = ["This is a test message"] * BENCHMARK_ITEMS
test_urls = ["https://example.com"] * BENCHMARK_ITEMS

# time.perf_counter() is the clock intended for measuring short durations:
# it is monotonic and uses the highest available resolution, whereas
# time.time() is wall-clock time that can be coarse or be adjusted mid-run.

# Text processing speed
start_time = time.perf_counter()
for text in test_texts:
    text_processor.extract_features(text)
text_time = time.perf_counter() - start_time

# URL processing speed
start_time = time.perf_counter()
for url in test_urls:
    url_parser.calculate_url_risk(url)
url_time = time.perf_counter() - start_time

# Derive the reported figures from the actual list sizes so the output stays
# correct if BENCHMARK_ITEMS is changed.
text_ms_each = text_time / len(test_texts) * 1000
url_ms_each = url_time / len(test_urls) * 1000
total_items = len(test_texts) + len(test_urls)
total_time = text_time + url_time
# Guard against a zero elapsed time so the cell cannot raise ZeroDivisionError.
throughput = total_items / total_time if total_time > 0 else float("inf")

print("⚡ Performance Results:")
print(f"  Text Processing: {text_time:.3f}s for {len(test_texts)} texts ({text_ms_each:.1f}ms per text)")
print(f"  URL Processing: {url_time:.3f}s for {len(test_urls)} URLs ({url_ms_each:.1f}ms per URL)")
print(f"  Total Throughput: {throughput:.1f} items/second")

## Conclusion

This notebook demonstrates the capabilities of our threat detection utilities:

- ✅ Text processing and feature extraction
- ✅ URL analysis and risk assessment
- ✅ Data loading and management
- ✅ Performance benchmarking
- ✅ Machine learning experiments

Use this notebook to experiment with new features and improve the detection algorithms.