<a href="https://colab.research.google.com/github/YonggunJung/colab/blob/main/1202LottoGPT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# save as lotto_predict.py and run: python lotto_predict.py
import pandas as pd
import numpy as np
from itertools import combinations
from sklearn.ensemble import RandomForestClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import train_test_split

# ---------- Configuration ----------
# Input CSV: each row holds the 6 drawn numbers, comma separated.
# The file is parsed with header=None, i.e. the first line is treated as data.
CSV_FILE = "/content/drive/MyDrive/Colab Notebooks/로또/data/lotto2.csv"
NUM_NUMBERS = 6            # how many numbers make up one draw
MAX_NUMBER = 45            # largest number that can be drawn
CANDIDATES_TO_PRINT = 10   # how many Method-B combinations are reported

# ---------- Load data ----------
df = pd.read_csv(CSV_FILE, header=None)
# Normalisation: sort the numbers inside each draw (optional; remove if the
# original column order matters).
rows = np.sort(df.values.astype(int), axis=1)

# ---------- Basic statistics: per-number and per-pair frequencies ----------
# Both tables have MAX_NUMBER + 1 slots so a number can be used directly as an
# index; slot 0 is never reported.
table_size = MAX_NUMBER + 1
counts = np.zeros(table_size, dtype=int)
pair_counts = np.zeros((table_size, table_size), dtype=int)

# Pass 1: how often each number was drawn.
for draw in rows:
    for number in draw:
        counts[number] += 1

# Pass 2: how often each unordered pair appeared in the same draw.
# The table is kept symmetric so it can be looked up in either order.
for draw in rows:
    for first, second in combinations(draw, 2):
        pair_counts[first, second] += 1
        pair_counts[second, first] += 1

freq_df = pd.DataFrame({"number": np.arange(1, MAX_NUMBER+1), "count": counts[1:]})
freq_df = freq_df.sort_values("count", ascending=False)

print("Top 10 frequent numbers:")
top_frequent = freq_df.head(10)
print(top_frequent.to_string(index=False))

# ---------- ML preparation: build input features ----------
# Simple idea: represent every draw as a one-hot vector over 1..MAX_NUMBER.
# The model then learns t -> t+1 (X = one-hot of round i, y = one-hot of round i+1).
def row_to_onehot(row, max_number=None):
    """Encode one draw as a 0/1 indicator vector.

    Args:
        row: Iterable of drawn numbers, each expected in ``1..max_number``.
        max_number: Length of the returned vector. Defaults to the
            module-level ``MAX_NUMBER`` when omitted.

    Returns:
        Integer ``numpy`` array of length ``max_number`` where position
        ``n - 1`` is 1 for every number ``n`` in ``row`` and 0 elsewhere.
        An empty ``row`` yields an all-zero vector.

    Raises:
        ValueError: If any number lies outside ``1..max_number``.
    """
    size = MAX_NUMBER if max_number is None else max_number
    idx = np.asarray(row, dtype=int).ravel() - 1
    # Without this check a value of 0 (or below) becomes a negative index and
    # silently marks a number at the far end of the vector instead of failing.
    if idx.size and (idx.min() < 0 or idx.max() >= size):
        raise ValueError(f"numbers must be within 1..{size}, got {(idx + 1).tolist()}")
    v = np.zeros(size, dtype=int)
    v[idx] = 1
    return v

# Pair every draw with its successor: X holds round i, Y holds round i+1.
if len(rows) < 2:
    raise ValueError("need at least two draws to build (previous, next) training pairs")
X = np.vstack([row_to_onehot(r) for r in rows[:-1]])
Y = np.vstack([row_to_onehot(r) for r in rows[1:]])

# Train/test split (simple random split with a fixed seed).
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# One-vs-Rest RandomForest (multi-label): one binary forest per number.
clf = OneVsRestClassifier(RandomForestClassifier(n_estimators=200, random_state=42))
clf.fit(X_train, Y_train)
score = clf.score(X_test, Y_test)
print(f"Model accuracy (subset match fraction): {score:.4f}")

# Use the most recent draw to predict the "next" one.
last_onehot = row_to_onehot(rows[-1]).reshape(1,-1)
# For multi-label targets OneVsRestClassifier.predict_proba returns one
# positive-class probability per label with shape (n_samples, n_labels), so
# row 0 is the per-number probability vector. This replaces a manual loop over
# clf.estimators_ whose fallback would have silently substituted hard 0/1
# predictions for probabilities.
probs = clf.predict_proba(last_onehot)[0]

# Probability per number, most likely first.
num_probs = pd.DataFrame({
    "number": np.arange(1, MAX_NUMBER+1),
    "prob": probs
}).sort_values("prob", ascending=False)

print("\nTop 10 numbers by model-predicted probability:")
print(num_probs.head(10).to_string(index=False))

# ---------- Candidate generation ----------
# Method A: simply take the NUM_NUMBERS most probable numbers.
# .tolist() yields plain Python ints, so the printed list stays "[1, 29, ...]"
# instead of "[np.int64(1), ...]" on NumPy 2.
top6_simple = num_probs.head(NUM_NUMBERS)["number"].tolist()
print("\nMethod A (top-6 by prob):", top6_simple)

# Method B: score a combination by the sum of its predicted probabilities plus
# a weighted bonus for how often its pairs were drawn together historically.
pair_weight = 0.3  # weight of the pair co-occurrence term (tunable)

# number -> predicted probability, built once here instead of on every
# combo_score call (the search calls it for thousands of combinations).
# It is a snapshot of num_probs as of this point in the script.
_prob_by_number = num_probs.set_index("number")["prob"]

def combo_score(combo):
    """Return the Method-B score of one combination of numbers.

    The score is ``sum(prob) + pair_weight * (pair co-occurrences / draws)``.

    Args:
        combo: Iterable of numbers; each must be present in ``num_probs``.

    Returns:
        The combined score as a float.

    Raises:
        KeyError: If a number in ``combo`` has no entry in ``num_probs``.
    """
    members = list(combo)
    # Sum of the individual predicted probabilities.
    p_sum = _prob_by_number.loc[members].sum()
    # Historical co-occurrence count summed over every pair in the combination.
    pc = sum(pair_counts[a, b] for a, b in combinations(members, 2))
    # Scale by the number of draws (not by a theoretical maximum); the epsilon
    # only guards against division by zero when there are no draws.
    normalized_pair = pc / (len(rows) + 1e-9)
    return p_sum + pair_weight * normalized_pair

# Search: enumerate every NUM_NUMBERS-sized combination of the topN most
# probable numbers and rank the combinations by combo_score.
topN = 15
# .tolist() yields plain Python ints, so printed combinations read
# "(1, 29, ...)" rather than "(np.int64(1), ...)" on NumPy 2.
candidates = num_probs.head(topN)["number"].tolist()
combos = list(combinations(candidates, NUM_NUMBERS))
scores = [(combo, combo_score(combo)) for combo in combos]
scores.sort(key=lambda x: x[1], reverse=True)

# The same top slice is both printed and saved.
best = scores[:CANDIDATES_TO_PRINT]

print(f"\nMethod B top {CANDIDATES_TO_PRINT} combos (from top{topN} search):")
for combo, sc in best:
    print(combo, f"score={sc:.4f}")

# Save results: one row for Method A (no score) followed by the Method B rows.
records = [{"method":"A_top6", "numbers":",".join(map(str,top6_simple)), "score": None}]
records.extend(
    {"method":"B_search", "numbers": ",".join(map(str,c)), "score": s}
    for c, s in best
)
out_df = pd.DataFrame(records)
out_df.to_csv("predicted_candidates.csv", index=False)
print("\nSaved predicted_candidates.csv")


Top 10 frequent numbers:
 number  count
     34    181
     12    176
     13    174
     27    173
     33    171
     18    171
     40    170
     45    170
     14    169
     37    169
Model accuracy (subset match fraction): 0.0000

Top 10 numbers by model-predicted probability:
 number  prob
      1 0.410
     29 0.295
     44 0.285
     13 0.240
      3 0.225
     41 0.215
     22 0.210
      2 0.200
     26 0.180
     12 0.180

Method A (top-6 by prob): [1, 29, 44, 13, 3, 41]

Method B top 10 combos (from top15 search):
(1, 29, 44, 13, 3, 41) score=1.7424
(1, 29, 44, 13, 3, 22) score=1.7359
(1, 29, 44, 13, 3, 2) score=1.7296
(1, 29, 44, 13, 41, 22) score=1.7171
(1, 29, 44, 13, 3, 12) score=1.7139
(1, 29, 44, 13, 41, 2) score=1.7134
(1, 29, 44, 13, 3, 33) score=1.7091
(1, 29, 44, 13, 3, 26) score=1.7089
(1, 29, 44, 13, 3, 24) score=1.7084
(1, 29, 44, 13, 22, 2) score=1.7059

Saved predicted_candidates.csv
