In [None]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

# --- Step 1: Load CSV safely (handle commas in penalty_reason) ---
# The file is split by hand instead of with pd.read_csv because the last
# column (penalty_reason) may itself contain commas. Each line is split on
# its first three commas only, so everything after the third comma stays
# together as penalty_reason.
column_names = ["case_name", "case_url", "total_penalty", "penalty_reason"]

with open("lawsuit_data_leakages.csv", "r", encoding="utf-8") as f:
    lines = f.readlines()

parsed_rows = []
for line in lines[1:]:  # skip header
    line = line.strip()
    if not line:
        continue  # blank line: nothing to parse
    parts = line.split(",", len(column_names) - 1)
    # Pad short rows up to the full column count so every row has exactly
    # one value per column (padding a single "" is not enough when two or
    # more fields are missing).
    parts += [""] * (len(column_names) - len(parts))
    parsed_rows.append(parts)

df = pd.DataFrame(parsed_rows, columns=column_names)

# Convert total_penalty to numeric; values that cannot be parsed become 0
df['total_penalty'] = pd.to_numeric(df['total_penalty'], errors='coerce').fillna(0)

# Keep only penalties > 0 (this also drops the rows coerced to 0 above)
df = df[df['total_penalty'] > 0].reset_index(drop=True)

# --- Step 2: Create dummy variables for case types ---
# Each indicator column is 1 when penalty_reason contains the keyword
# (case-insensitive match) and 0 otherwise.
case_type_keywords = {
    'is_automotive_manufacturing': "automotive manufacturing",
    'is_production': "production",
    'is_consumer': "consumer",
}
for dummy_col, keyword in case_type_keywords.items():
    keyword_found = df['penalty_reason'].str.contains(keyword, case=False)
    df[dummy_col] = keyword_found.astype(int)

# --- Step 3: Robust regression ---
# Regress total_penalty on the three case-type dummies, with a constant
# column added by sm.add_constant, using RLM with the Huber T norm.
predictor_cols = ['is_automotive_manufacturing', 'is_production', 'is_consumer']
y = df['total_penalty']
X = sm.add_constant(df[predictor_cols])

huber_norm = sm.robust.norms.HuberT()
robust_model = sm.RLM(y, X, M=huber_norm).fit()

print("=== Robust Regression ===")
print(robust_model.summary())

# --- Step 4: Monte Carlo simulation for mean total penalty ---
# Residual resampling: in each of n_sim rounds, draw len(y) residuals with
# replacement, add them to the fitted values, and record the mean of the
# resulting simulated penalties.
pred = robust_model.predict(X)
residuals = y - pred
n_sim = 10000

# Seeded generator so the printed mean / median / CI are reproducible
# from one run to the next.
rng = np.random.default_rng(42)

# Extract plain NumPy arrays once so the loop does no pandas work.
pred_values = np.asarray(pred, dtype=float)
residual_values = np.asarray(residuals, dtype=float)
n_obs = len(residual_values)

sim_means = np.empty(n_sim)
for i in range(n_sim):
    sampled_res = rng.choice(residual_values, size=n_obs, replace=True)
    sim_means[i] = (pred_values + sampled_res).mean()

print("\n=== Monte Carlo Simulation ===")
print(f"Mean penalty per case: ${sim_means.mean():,.0f}")
print(f"Median penalty per case: ${np.median(sim_means):,.0f}")
print(f"95% CI: ${np.percentile(sim_means,2.5):,.0f} - ${np.percentile(sim_means,97.5):,.0f}")

# --- Step 5: Simple statistical tests ---
# Two-sample t-test without the equal-variance assumption (equal_var=False)
# comparing total_penalty between automotive and non-automotive cases.
is_auto = df['is_automotive_manufacturing']
auto_penalties = df.loc[is_auto == 1, 'total_penalty']
non_auto_penalties = df.loc[is_auto == 0, 'total_penalty']

t_stat, p_val = stats.ttest_ind(
    auto_penalties,
    non_auto_penalties,
    equal_var=False,
)

print("\n=== T-test: Automotive vs Non-Automotive ===")
print(f"T-statistic: {t_stat:.3f}, P-value: {p_val:.3f}")

# --- Step 6: Visualizations ---
sns.set(style="whitegrid")

# 1. Histogram of total penalties (30 bins, with a KDE overlay)
fig_hist, ax_hist = plt.subplots(figsize=(8, 5))
sns.histplot(
    df['total_penalty'],
    bins=30,
    kde=True,
    color="skyblue",
    ax=ax_hist,
)
ax_hist.set_title("Distribution of Total Penalties")
ax_hist.set_xlabel("Total Penalty ($)")
ax_hist.set_ylabel("Count")
plt.show()

# 2. Boxplot by case type (log scale)
# Reshape to long form (one row per penalty / case-type flag pair) and keep
# only the rows where the flag is set, so each box shows the penalties of
# the cases tagged with that type.
case_type_cols = ['is_automotive_manufacturing', 'is_production', 'is_consumer']
df_melt = df.melt(
    id_vars='total_penalty',
    value_vars=case_type_cols,
    var_name='Case_Type',
    value_name='Present',
).loc[lambda long_df: long_df['Present'] == 1]

fig_box, ax_box = plt.subplots(figsize=(8, 5))
sns.boxplot(x='Case_Type', y='total_penalty', data=df_melt, palette="Set2", ax=ax_box)
ax_box.set_yscale('log')
ax_box.set_title("Total Penalties by Case Type (Log Scale)")
ax_box.set_ylabel("Total Penalty ($, log scale)")
plt.show()

# 3. Monte Carlo distribution
# Histogram (50 bins, with a KDE overlay) of the simulated mean penalties.
fig_mc, ax_mc = plt.subplots(figsize=(8, 5))
sns.histplot(sim_means, bins=50, kde=True, color="orange", ax=ax_mc)
ax_mc.set_title("Monte Carlo Simulation: Distribution of Mean Total Penalties")
ax_mc.set_xlabel("Mean Total Penalty ($)")
ax_mc.set_ylabel("Frequency")
plt.show()
