In [None]:
import pandas as pd
import numpy as np
import json
from datetime import datetime, timedelta

# Synthetic data setup for the notebook: streaming events, database
# performance logs and API logs.
#
# All random draws come from NumPy's legacy global RNG, seeded once below.
# The ORDER of the np.random calls therefore determines every generated value;
# do not reorder or insert draws without expecting all datasets to change.
np.random.seed(42)

# Fixed anchor for every generated timestamp. The previous datetime.now()
# anchor made the data (and every hour-of-day aggregate computed from it)
# differ on each run even though the RNG is seeded.
REFERENCE_TIME = datetime(2025, 9, 6, 9, 7, 5)

# Dataset sizes, named once instead of being repeated as literals.
N_EVENTS = 1000
N_DB_LOG_ROWS = 5000
N_API_LOGS = 800

# Create streaming event data
events = []
for i in range(N_EVENTS):
    properties = {
        'device_type': np.random.choice(['mobile', 'desktop', 'tablet']),
        'page_path': np.random.choice(['/home', '/products', '/checkout']),
        'session_length': np.random.randint(60, 3600)
    }
    # Only some events carry a purchase_value, so the metadata schema is
    # deliberately ragged (the key is absent rather than null).
    if np.random.random() > 0.7:
        properties['purchase_value'] = round(np.random.uniform(20, 300), 2)

    event = {
        'event_id': f'evt_{i}',
        # Whole-hour offsets within the 72 hours before REFERENCE_TIME.
        'timestamp': (REFERENCE_TIME - timedelta(hours=np.random.randint(0, 72))).isoformat(),
        'user_id': f'user_{np.random.randint(100, 999)}',
        'event_type': np.random.choice(['view', 'click', 'purchase']),
        # Nested properties are stored as a JSON string, not as a dict.
        'metadata': json.dumps(properties)
    }
    events.append(event)

# Create database performance logs
db_logs = pd.DataFrame({
    'timestamp': pd.date_range('2024-01-01', periods=N_DB_LOG_ROWS, freq='1min'),
    'operation': np.random.choice(['SELECT', 'INSERT', 'UPDATE'], N_DB_LOG_ROWS, p=[0.7, 0.2, 0.1]),
    'duration_ms': np.random.lognormal(mean=4, sigma=1, size=N_DB_LOG_ROWS),
    'table_name': np.random.choice(['users', 'orders', 'products'], N_DB_LOG_ROWS),
    'rows_processed': np.random.poisson(lam=25, size=N_DB_LOG_ROWS),
    # randint's upper bound is exclusive, so connection ids are 1..19.
    'connection_id': np.random.randint(1, 20, N_DB_LOG_ROWS)
})

# Create API log data
api_logs = []
for i in range(N_API_LOGS):
    log_entry = {
        # Whole-minute offsets within the 24 hours before REFERENCE_TIME.
        'timestamp': REFERENCE_TIME - timedelta(minutes=np.random.randint(0, 1440)),
        'endpoint': np.random.choice(['/api/users', '/api/orders', '/api/metrics']),
        'status_code': np.random.choice([200, 400, 500], p=[0.8, 0.15, 0.05]),
        'response_time': np.random.exponential(150)
    }
    # payload_size exists only for successful responses; error entries omit
    # the key entirely.
    if log_entry['status_code'] == 200:
        log_entry['payload_size'] = np.random.randint(100, 5000)
    api_logs.append(log_entry)


In [None]:
# Flatten each event: parse its JSON 'metadata' string and lift the parsed
# keys to top-level columns next to the event's own fields. Keys missing from
# an event's metadata (e.g. purchase_value) become NaN in that row.
flattened_event_records = [
    dict(raw_event, **json.loads(raw_event['metadata']))
    for raw_event in events
]
# The raw JSON string is redundant once its contents are columns.
events_df = pd.DataFrame(flattened_event_records).drop(columns='metadata')
events_df

Unnamed: 0,event_id,timestamp,user_id,event_type,device_type,page_path,session_length,purchase_value
0,evt_0,2025-09-05T10:07:05.428849,user_230,click,tablet,/home,1354,187.62
1,evt_1,2025-09-03T18:07:05.429049,user_376,view,mobile,/products,2451,79.45
2,evt_2,2025-09-04T09:07:05.429173,user_574,purchase,desktop,/products,2360,
3,evt_3,2025-09-05T19:07:05.429277,user_289,click,tablet,/products,2107,
4,evt_4,2025-09-03T18:07:05.429366,user_604,purchase,tablet,/products,622,
...,...,...,...,...,...,...,...,...
995,evt_995,2025-09-04T03:07:05.550387,user_535,view,mobile,/checkout,1236,
996,evt_996,2025-09-03T16:07:05.550488,user_374,purchase,desktop,/home,1267,284.52
997,evt_997,2025-09-03T23:07:05.550587,user_887,view,tablet,/products,762,
998,evt_998,2025-09-03T20:07:05.550669,user_711,purchase,mobile,/checkout,3050,


In [None]:
# Rows whose duration exceeds the 95th percentile of their own operation type.
#
# The threshold is broadcast back onto every row with transform(), so no
# groupby.apply() over the grouping column is needed. That apply() pattern is
# deprecated in pandas and, once grouping columns are excluded from apply(),
# would silently drop the 'operation' column from the result.
p95_duration_by_operation = (
    db_logs
    .groupby('operation')['duration_ms']
    .transform(lambda durations: durations.quantile(0.95))
)
outliers = (
    db_logs[db_logs['duration_ms'] > p95_duration_by_operation]
    # Stable sort keeps the original row order inside each operation, giving
    # the same operation-by-operation layout the grouped result had.
    .sort_values('operation', kind='stable')
    .reset_index(drop=True)
)
outliers

  outliers = db_logs.groupby('operation').apply(lambda x: x[x['duration_ms'] > x['duration_ms'].quantile(0.95)]).reset_index(drop=True)


Unnamed: 0,timestamp,operation,duration_ms,table_name,rows_processed,connection_id
0,2024-01-01 04:04:00,INSERT,454.519305,orders,27,3
1,2024-01-01 04:30:00,INSERT,320.470662,orders,25,18
2,2024-01-01 05:11:00,INSERT,442.922905,orders,24,7
3,2024-01-01 06:36:00,INSERT,319.404156,users,25,8
4,2024-01-01 07:03:00,INSERT,668.653003,products,15,4
...,...,...,...,...,...,...
245,2024-01-04 06:17:00,UPDATE,453.737756,products,27,15
246,2024-01-04 07:35:00,UPDATE,411.054132,orders,22,16
247,2024-01-04 08:55:00,UPDATE,647.423038,users,26,14
248,2024-01-04 10:28:00,UPDATE,395.197038,products,29,1


In [None]:
# Trailing one-hour mean response time per endpoint.
#
# A time-based rolling window needs a sorted datetime index, hence
# set_index + sort_index before grouping. The window is given as a Timedelta
# rather than the '1H' string because the upper-case 'H' offset alias is
# deprecated in pandas.
api_response_trends = (
    pd.DataFrame(api_logs)
    .set_index('timestamp')
    .sort_index()
    .groupby('endpoint')['response_time']
    .rolling(pd.Timedelta(hours=1))
    .mean()
    # Turn the (endpoint, timestamp) index back into ordinary columns.
    .reset_index()
)
api_response_trends

  api_response_trends = pd.DataFrame(api_logs).set_index('timestamp').sort_index().groupby('endpoint')['response_time'].rolling('1H').mean().reset_index()


Unnamed: 0,endpoint,timestamp,response_time
0,/api/metrics,2025-09-05 09:12:05.650176,73.154751
1,/api/metrics,2025-09-05 09:23:05.632523,44.343489
2,/api/metrics,2025-09-05 09:33:05.616716,126.406599
3,/api/metrics,2025-09-05 09:41:05.592310,129.882460
4,/api/metrics,2025-09-05 09:52:05.642154,122.557410
...,...,...,...
795,/api/users,2025-09-06 08:46:05.596963,141.860881
796,/api/users,2025-09-06 08:46:05.619485,130.593739
797,/api/users,2025-09-06 08:55:05.623587,133.415380
798,/api/users,2025-09-06 08:57:05.585620,137.561812


In [None]:
# For every event, record the Python type name of each parsed metadata value.
metadata_type_rows = [
    {field: type(value).__name__ for field, value in json.loads(evt['metadata']).items()}
    for evt in events
]
# A key absent from an event shows up as NaN; label it 'missing' so absence
# counts as its own variant. nunique() then gives, per metadata key, how many
# distinct type variants were observed across all events.
schema_evolution = pd.DataFrame(metadata_type_rows).fillna('missing').nunique()
print(schema_evolution)

device_type       1
page_path         1
session_length    1
purchase_value    2
dtype: int64


In [None]:
# Per (operation, connection) summary of query duration and row volume.
# The dict-of-lists spec yields two-level column labels:
# (source column, statistic).
connection_aggregations = {
    'duration_ms': ['mean', 'count'],
    'rows_processed': ['sum', 'mean'],
}
connection_perf = (
    db_logs
    .groupby(['operation', 'connection_id'])
    .agg(connection_aggregations)
    .round(2)
)
connection_perf

Unnamed: 0_level_0,Unnamed: 1_level_0,duration_ms,duration_ms,rows_processed,rows_processed
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,count,sum,mean
operation,connection_id,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
INSERT,1,98.19,49,1291,26.35
INSERT,2,70.32,62,1537,24.79
INSERT,3,101.66,66,1628,24.67
INSERT,4,120.86,51,1254,24.59
INSERT,5,67.68,40,1001,25.02
INSERT,6,84.94,38,971,25.55
INSERT,7,85.42,52,1299,24.98
INSERT,8,94.83,47,1206,25.66
INSERT,9,95.07,54,1349,24.98
INSERT,10,84.49,60,1535,25.58


In [None]:
# Share of each event type within every hour of the day (each row sums to ~1).
#
# The hour/event_type count table is built once; its row totals are the
# per-hour event counts, so the events frame is not rebuilt and the timestamps
# are not re-parsed just to obtain the denominator.
event_counts_by_hour = (
    pd.DataFrame(events)
    .assign(hour=lambda frame: pd.to_datetime(frame['timestamp']).dt.hour)
    .groupby(['hour', 'event_type'])
    .size()
    # Hours in which an event type never occurred get an explicit 0.
    .unstack(fill_value=0)
)
hourly_patterns = (
    event_counts_by_hour
    .div(event_counts_by_hour.sum(axis=1), axis=0)
    .round(3)
)
hourly_patterns

event_type,click,purchase,view
hour,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.194,0.29,0.516
1,0.235,0.294,0.471
2,0.157,0.529,0.314
3,0.324,0.441,0.235
4,0.366,0.293,0.341
5,0.295,0.364,0.341
6,0.349,0.302,0.349
7,0.311,0.289,0.4
8,0.333,0.296,0.37
9,0.324,0.324,0.351


In [None]:
# Share of each status code per endpoint (each row sums to ~1).
#
# The endpoint/status count table is built once; its row totals are the
# per-endpoint request counts, so the API log frame is not constructed and
# grouped a second time just for the denominator.
status_counts_by_endpoint = (
    pd.DataFrame(api_logs)
    .groupby(['endpoint', 'status_code'])
    .size()
    # Status codes an endpoint never returned get an explicit 0.
    .unstack(fill_value=0)
)
error_breakdown = (
    status_counts_by_endpoint
    .div(status_counts_by_endpoint.sum(axis=1), axis=0)
    .round(3)
)
print(error_breakdown)

status_code     200    400    500
endpoint                         
/api/metrics  0.789  0.151  0.060
/api/orders   0.827  0.140  0.033
/api/users    0.772  0.167  0.061


In [None]:
# Flag queries that took more than twice the recent average duration.
anomaly_flags = (
    db_logs
    .sort_values('timestamp')
    # Mean over a window of up to 100 rows; it stays NaN until at least
    # 10 observations are available.
    .assign(
        rolling_mean=lambda logs: logs['duration_ms'].rolling(window=100, min_periods=10).mean()
    )
    # Comparisons against NaN are False, so the warm-up rows are never flagged.
    .assign(
        is_anomaly=lambda logs: logs['duration_ms'] > 2 * logs['rolling_mean']
    )
)
anomaly_flags

Unnamed: 0,timestamp,operation,duration_ms,table_name,rows_processed,connection_id,rolling_mean,is_anomaly
0,2024-01-01 00:00:00,SELECT,24.332462,users,29,14,,False
1,2024-01-01 00:01:00,SELECT,42.570106,products,43,2,,False
2,2024-01-01 00:02:00,UPDATE,131.133131,users,26,15,,False
3,2024-01-01 00:03:00,INSERT,74.318827,orders,12,3,,False
4,2024-01-01 00:04:00,SELECT,174.374916,products,33,13,,False
...,...,...,...,...,...,...,...,...
4995,2024-01-04 11:15:00,SELECT,33.548063,products,24,1,118.375177,False
4996,2024-01-04 11:16:00,SELECT,100.455488,users,21,18,117.345192,False
4997,2024-01-04 11:17:00,SELECT,72.276571,products,31,18,112.769809,False
4998,2024-01-04 11:18:00,INSERT,183.055281,orders,15,8,111.222979,False


In [None]:
# Memory-optimised copy of db_logs: every numeric column is downcast to the
# smallest dtype that holds its values; non-numeric columns are untouched.
#
# assign() replaces the numeric columns in place, so the original column order
# and the downcast dtypes are both preserved. The earlier combine_first()
# approach re-aligned on the union of columns (sorting them alphabetically)
# and could promote the downcast columns back to the wider common dtype.
#
# NOTE: float columns are downcast to float32 where possible, which keeps only
# about 7 significant digits; skip the float downcast if full precision matters.
numeric_column_names = db_logs.select_dtypes(include='number').columns
optimized_df = db_logs.assign(**{
    column_name: pd.to_numeric(
        db_logs[column_name],
        downcast='integer' if pd.api.types.is_integer_dtype(db_logs[column_name]) else 'float',
    )
    for column_name in numeric_column_names
})
optimized_df

Unnamed: 0,connection_id,duration_ms,operation,rows_processed,table_name,timestamp
0,14,24.332462,SELECT,29,users,2024-01-01 00:00:00
1,2,42.570107,SELECT,43,products,2024-01-01 00:01:00
2,15,131.133133,UPDATE,26,users,2024-01-01 00:02:00
3,3,74.318825,INSERT,12,orders,2024-01-01 00:03:00
4,13,174.374924,SELECT,33,products,2024-01-01 00:04:00
...,...,...,...,...,...,...
4995,1,33.548065,SELECT,24,products,2024-01-04 11:15:00
4996,18,100.455490,SELECT,21,users,2024-01-04 11:16:00
4997,18,72.276573,SELECT,31,products,2024-01-04 11:17:00
4998,8,183.055283,INSERT,15,orders,2024-01-04 11:18:00


In [None]:
# Hour-of-day pipeline health: event volume, distinct users, and the fraction
# of events that are purchases.
hourly_aggregations = {
    'event_id': 'count',
    'user_id': 'nunique',
    # The mean of a boolean mask is the share of rows equal to 'purchase'.
    'event_type': lambda event_types: (event_types == 'purchase').mean(),
}
metric_column_names = {
    'event_id': 'total_events',
    'user_id': 'unique_users',
    'event_type': 'purchase_rate',
}
pipeline_metrics = (
    pd.DataFrame(events)
    .assign(hour=lambda frame: pd.to_datetime(frame['timestamp']).dt.hour)
    .groupby('hour')
    .agg(hourly_aggregations)
    .rename(columns=metric_column_names)
    .round(3)
)
pipeline_metrics

Unnamed: 0_level_0,total_events,unique_users,purchase_rate
hour,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,31,30,0.29
1,34,34,0.294
2,51,51,0.529
3,34,33,0.441
4,41,38,0.293
5,44,43,0.364
6,43,43,0.302
7,45,44,0.289
8,54,53,0.296
9,37,37,0.324
