In [9]:
import pandas as pd
import requests
import datetime
import os
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from collections import Counter
from dotenv import load_dotenv
from scipy import stats
from io import StringIO

# --- Load env variables ---
load_dotenv()
STATS_URL = os.getenv("STATS_URL")

# --- Config ---
DATA_DIR = "lotto_data"
os.makedirs(DATA_DIR, exist_ok=True)

# --- Fetch and Save Weekly Data ---
today = datetime.date.today()
filename = os.path.join(DATA_DIR, f"results-{today}.csv")

response = requests.get(STATS_URL)
response.raise_for_status()

# Parse HTML tables with proper StringIO wrapping
tables = pd.read_html(StringIO(response.text))
lotto_df = tables[0]  # Adjust index if needed
lotto_df.to_csv(filename, index=False)

print(f"Saved weekly results to {filename}")

# --- Load All Weekly Data ---
all_data = []
for file in os.listdir(DATA_DIR):
    if file.endswith(".csv"):
        df = pd.read_csv(os.path.join(DATA_DIR, file))
        all_data.append(df)

merged_df = pd.concat(all_data, ignore_index=True)

# --- Preprocess ---
numbers_col = merged_df.columns[0]
merged_df[numbers_col] = merged_df[numbers_col].astype(str)

ball_pool = []
for row in merged_df[numbers_col]:
    balls = [int(b.strip()) for b in row.split() if b.strip().isdigit()]
    ball_pool.extend(balls)

# --- Frequency Analysis ---
ball_counts = Counter(ball_pool)
ball_freq_df = pd.DataFrame(ball_counts.items(), columns=["Ball", "Frequency"]).sort_values(by="Frequency", ascending=False)

print("\nTop 10 most frequent balls:")
print(ball_freq_df.head(10))

# --- Statistical Analysis ---
print("\nStatistical Summary:")
print(f"Total draws analyzed: {len(all_data)}")
print(f"Total balls drawn: {len(ball_pool)}")
print(f"Most frequent ball: {ball_freq_df.iloc[0]['Ball']} (appeared {ball_freq_df.iloc[0]['Frequency']} times)")
print(f"Least frequent ball: {ball_freq_df.iloc[-1]['Ball']} (appeared {ball_freq_df.iloc[-1]['Frequency']} times)")
print(f"Mean frequency: {np.mean(ball_freq_df['Frequency']):.2f}")
print(f"Standard deviation: {np.std(ball_freq_df['Frequency']):.2f}")

# --- Chi-square test ---
chi2, p = stats.chisquare(ball_freq_df['Frequency'])
print(f"\nChi-square test for uniform distribution: p-value = {p:.4f}")
print("The balls do NOT appear uniformly distributed" if p < 0.05 else "The balls appear uniformly distributed")

# --- Visualization 1: Frequency Bar Chart ---
plt.figure(figsize=(15, 8))
sns.barplot(data=ball_freq_df, x="Ball", y="Frequency", palette="viridis")
plt.title("Ball Draw Frequency", fontsize=16)
plt.xticks(rotation=45)
plt.tight_layout()
plt.savefig("frequency_chart.png")
plt.close()

# --- Visualization 2: Heatmap ---
ball_numbers = sorted(ball_freq_df['Ball'])
co_matrix = pd.DataFrame(0, index=ball_numbers, columns=ball_numbers)

for row in merged_df[numbers_col]:
    balls = [int(b) for b in row.split()]
    for i in range(len(balls)):
        for j in range(i+1, len(balls)):
            co_matrix.loc[balls[i], balls[j]] += 1
            co_matrix.loc[balls[j], balls[i]] += 1

plt.figure(figsize=(12, 10))
sns.heatmap(co_matrix, cmap="YlOrRd", annot=True, fmt="d")
plt.title("Ball Co-occurrence", fontsize=16)
plt.tight_layout()
plt.savefig("co_occurrence_heatmap.png")
plt.close()

# --- Visualization 3: Distribution Plot ---
plt.figure(figsize=(12, 6))
sns.histplot(ball_freq_df['Frequency'], bins=15, kde=True)
plt.axvline(np.mean(ball_freq_df['Frequency']), color='r', linestyle='--', 
            label=f'Mean: {np.mean(ball_freq_df["Frequency"]):.1f}')
plt.title("Frequency Distribution", fontsize=16)
plt.legend()
plt.tight_layout()
plt.savefig("distribution_plot.png")
plt.close()

# --- Prediction ---
top_n = 6
predicted = sorted(ball_freq_df.head(top_n)["Ball"].tolist())
cold = sorted(ball_freq_df.tail(top_n)["Ball"].tolist())

print(f"\nPredicted hot numbers: {predicted}")
print(f"Predicted cold numbers: {cold}")

# --- Pair Analysis ---
print("\nTop 5 number pairs:")
print(co_matrix.unstack().sort_values(ascending=False).head(5))

Saved weekly results to lotto_data/results-2025-06-14.csv

Top 10 most frequent balls:
   Ball  Frequency
0     1          1
1     2          1
2     3          1
3     4          1
4     5          1
5     6          1
6     7          1
7     8          1
8     9          1
9    10          1

Statistical Summary:
Total draws analyzed: 1
Total balls drawn: 42
Most frequent ball: 1 (appeared 1 times)
Least frequent ball: 42 (appeared 1 times)
Mean frequency: 1.00
Standard deviation: 0.00

Chi-square test for uniform distribution: p-value = 1.0000
The balls appear uniformly distributed



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(data=ball_freq_df, x="Ball", y="Frequency", palette="viridis")



Predicted hot numbers: [1, 2, 3, 4, 5, 6]
Predicted cold numbers: [37, 38, 39, 40, 41, 42]

Top 5 number pairs:
1  1    0
   2    0
   3    0
   4    0
   5    0
dtype: int64
