# Importing Libraries

In [141]:
import pandas as pd
import numpy as np
import xgboost as xgb

from sklearn.model_selection import train_test_split
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from collections import Counter

# Loading Data

In [142]:
# Load the raw draw results. Later cells read the 'Number', 'Date',
# 'Draw Number' and 'Prize' columns from this frame.
df = pd.read_csv("results.csv")

# Data Cleaning

In [143]:
def count_unique_digits_with_leading_zeros(number):
    """Count the distinct characters of ``number`` written as a 4-character string.

    The value is converted with ``str`` and left-padded with zeros to width 4,
    so 188 is treated as "0188" and the padding zero counts as a digit.
    """
    padded = str(number).zfill(4)
    distinct_chars = {char for char in padded}
    return len(distinct_chars)

def most_frequent_digit(number):
    """Return the digit that occurs most often in ``number`` padded to 4 characters.

    The value is converted with ``str`` and zero-padded to width 4 before
    counting. When several digits share the highest count, the smallest of
    them is returned.
    """
    padded = str(number).zfill(4)

    winner, winner_count = None, 0
    # Visiting candidates in ascending order with a strict '>' keeps the
    # smallest digit whenever counts tie.
    for candidate in sorted(set(padded)):
        occurrences = padded.count(candidate)
        if occurrences > winner_count:
            winner, winner_count = candidate, occurrences

    return int(winner)

# Zero-padded 4-character text form of every drawn number, computed once
number_as_text = df['Number'].astype(str).str.zfill(4)

# Summary features derived from the drawn number itself
df['distinct_values'] = df['Number'].apply(count_unique_digits_with_leading_zeros)
df['most_frequent_digit'] = df['Number'].apply(most_frequent_digit)

# When all four digits differ there is no "most frequent" digit, so the
# leading digit is stored instead.
all_distinct = df['distinct_values'] == 4
df.loc[all_distinct, 'most_frequent_digit'] = number_as_text.str[0].astype(int)

# Keep the number as padded text from here on
df['Number'] = number_as_text

# One integer target column per digit position
for position, column in enumerate(['Digit_1', 'Digit_2', 'Digit_3', 'Digit_4']):
    df[column] = df['Number'].str[position].astype(int)

df


Unnamed: 0,Draw Number,Number,Prize,Date,distinct_values,most_frequent_digit,Digit_1,Digit_2,Digit_3,Digit_4
0,5304,4111,1,23/3/25,2,1,4,1,1,1
1,5304,4035,2,23/3/25,4,4,4,0,3,5
2,5304,4379,3,23/3/25,4,4,4,3,7,9
3,5304,0188,4,23/3/25,3,8,0,1,8,8
4,5304,2497,4,23/3/25,4,2,2,4,9,7
...,...,...,...,...,...,...,...,...,...,...
317,5291,6285,5,22/2/25,4,6,6,2,8,5
318,5291,6335,5,22/2/25,3,3,6,3,3,5
319,5291,9099,5,22/2/25,2,9,9,0,9,9
320,5291,9306,5,22/2/25,4,9,9,3,0,6


In [144]:
# Parse the day/month/two-digit-year strings into timestamps
df['Date'] = pd.to_datetime(df['Date'], format='%d/%m/%y', dayfirst=True)

# Calendar components of the draw date
date_parts = df['Date'].dt
df['Year'] = date_parts.year
df['Month'] = date_parts.month
df['Day'] = date_parts.day

# dayofweek is 0 for Monday, so shift by one to get Monday = 1 ... Sunday = 7
df['Weekday'] = date_parts.dayofweek + 1

df

Unnamed: 0,Draw Number,Number,Prize,Date,distinct_values,most_frequent_digit,Digit_1,Digit_2,Digit_3,Digit_4,Year,Month,Day,Weekday
0,5304,4111,1,2025-03-23,2,1,4,1,1,1,2025,3,23,7
1,5304,4035,2,2025-03-23,4,4,4,0,3,5,2025,3,23,7
2,5304,4379,3,2025-03-23,4,4,4,3,7,9,2025,3,23,7
3,5304,0188,4,2025-03-23,3,8,0,1,8,8,2025,3,23,7
4,5304,2497,4,2025-03-23,4,2,2,4,9,7,2025,3,23,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
317,5291,6285,5,2025-02-22,4,6,6,2,8,5,2025,2,22,6
318,5291,6335,5,2025-02-22,3,3,6,3,3,5,2025,2,22,6
319,5291,9099,5,2025-02-22,2,9,9,0,9,9,2025,2,22,6
320,5291,9306,5,2025-02-22,4,9,9,3,0,6,2025,2,22,6


In [145]:
# Columns fed to the models.
# NOTE(review): 'distinct_values' and 'most_frequent_digit' are computed from
# 'Number', the same column the Digit_* targets are extracted from, so they
# carry target information that would not be known before a draw. Confirm
# whether they should stay in the feature set.
selected_feature = [
    'Draw Number',
    'distinct_values',
    'most_frequent_digit',
    'Year',
    'Month',
    'Day',
    'Weekday',
    'Prize',
]

# One target column per digit position of the drawn number
target_cols = [
    'Digit_1',
    'Digit_2',
    'Digit_3',
    'Digit_4',
]

# Data Splitting

## Split into train, validation, and test sets

In [146]:
# Hold out 15% of the rows as the final test set; the remainder is split
# again into train/validation in the next cell.
train_validate_df, test_df = train_test_split(df, test_size=0.15, random_state=42)

X_test, y_test = test_df[selected_feature], test_df[target_cols]

for label, frame in (('X test shape:', X_test), ('y test shape:', y_test)):
    print(label, frame.shape)

X test shape: (49, 8)
y test shape: (49, 4)


In [147]:
# Features and target digits of the non-test rows
X, y = train_validate_df[selected_feature], train_validate_df[target_cols]

# 80/20 split of the remaining rows into training and validation sets
X_train, X_validate, y_train, y_validate = train_test_split(
    X, y, test_size=0.2, random_state=42
)

for label, frame in (('X train shape:', X_train), ('y train shape:', y_train)):
    print(label, frame.shape)

for label, frame in (('X validate shape:', X_validate), ('y validate shape:', y_validate)):
    print(label, frame.shape)

X train shape: (218, 8)
y train shape: (218, 4)
X validate shape: (55, 8)
y validate shape: (55, 4)


# Training the model

## Extra Trees

In [154]:
et_model_config = {
    'n_estimators': 100,
    'max_depth': 20,
    'criterion': 'gini',
    'random_state': 42,
    'bootstrap': True
}

# One independent Extra Trees classifier is fitted per digit position
et_model = ExtraTreesClassifier(**et_model_config)
et_multi_model = MultiOutputClassifier(et_model)
et_multi_model.fit(X_train, y_train)

# predict_proba returns one (n_samples, n_classes) array per target digit
y_pred_probability_et = et_multi_model.predict_proba(X_validate)

# Probability columns follow each fitted estimator's classes_, which is not
# guaranteed to be 0..9 (a digit absent from y_train for a position has no
# column). Look the labels up instead of using the column index as the digit.
et_digit_classes = [estimator.classes_ for estimator in et_multi_model.estimators_]

# Prepare list of per-row result dicts
results = []

# Process each row in validation set
for idx in range(len(X_validate)):
    row_results = {}

    # Number of digit positions matched by the top-1 / top-2 guesses
    pred_1_score = 0
    pred_2_score = 0

    for digit in range(4):  # Iterate over all 4 digit columns
        actual_value = y_validate.iloc[idx, digit]
        row_results[f'Digit_{digit+1}'] = actual_value  # Actual value

        # Column positions of the two highest probabilities, best first
        digit_proba = y_pred_probability_et[digit][idx]
        top_2_indices = np.argsort(digit_proba)[-2:][::-1]
        # Translate column positions into the digit labels they stand for
        top_2_digits = et_digit_classes[digit][top_2_indices]

        # Store predictions
        row_results[f'Pred_1_Digit_{digit+1}'] = int(top_2_digits[0])
        row_results[f'Pred_2_Digit_{digit+1}'] = int(top_2_digits[1])

        # Assign scores (1 if correct, 0 if incorrect)
        row_results[f'Pred_1_Digit_{digit+1}_Score'] = int(actual_value == top_2_digits[0])
        row_results[f'Pred_2_Digit_{digit+1}_Score'] = int(actual_value == top_2_digits[1])

        # Update total score counters
        pred_1_score += row_results[f'Pred_1_Digit_{digit+1}_Score']
        pred_2_score += row_results[f'Pred_2_Digit_{digit+1}_Score']

    # Fraction of the 4 positions predicted correctly (0.0 - 1.0)
    row_results["pred_1_match_score"] = pred_1_score / 4
    row_results["pred_2_match_score"] = pred_2_score / 4

    results.append(row_results)

# Convert to DataFrame
et_df = pd.DataFrame(results)

# Concatenate the per-position digits into 4-character number strings
et_df["number"] = et_df[[f"Digit_{i}" for i in range(1, 5)]].astype(str).agg("".join, axis=1)
et_df["pred_1"] = et_df[[f"Pred_1_Digit_{i}" for i in range(1, 5)]].astype(str).agg("".join, axis=1)
et_df["pred_2"] = et_df[[f"Pred_2_Digit_{i}" for i in range(1, 5)]].astype(str).agg("".join, axis=1)

def digit_frequency_match_percentage(num1, num2, width=4):
    """Return the fraction of digits two numbers share, ignoring position.

    Both values are converted with ``str`` and zero-padded to ``width``
    characters. Each digit contributes the smaller of its two occurrence
    counts, so "0025" vs "0220" shares two 0s and one 2.

    Parameters
    ----------
    num1, num2 : int or str
        The numbers to compare.
    width : int, default 4
        Number of digits each value is padded to; also the denominator.

    Returns
    -------
    float
        Shared digit count divided by ``width`` (a fraction in 0.0 - 1.0 for
        inputs no longer than ``width``, not a 0-100 percentage).
    """
    counter1 = Counter(str(num1).zfill(width))
    counter2 = Counter(str(num2).zfill(width))

    # Counter intersection keeps min(count1, count2) for every digit
    matching_digits = sum((counter1 & counter2).values())

    return matching_digits / width

# Position-independent digit overlap between the actual number and each guess
for sim_col, pred_col in (("pred_1_sim_perc", "pred_1"), ("pred_2_sim_perc", "pred_2")):
    et_df[sim_col] = [
        digit_frequency_match_percentage(actual, predicted)
        for actual, predicted in zip(et_df["number"], et_df[pred_col])
    ]

# Reduce the frame to the summary columns
report_columns = ["number", "pred_1", "pred_1_match_score", "pred_1_sim_perc",
                  "pred_2", "pred_2_match_score", "pred_2_sim_perc"]
et_df = et_df[report_columns]

et_df

Unnamed: 0,number,pred_1,pred_1_match_score,pred_1_sim_perc,pred_2,pred_2_match_score,pred_2_sim_perc
0,7162,7053,0.25,0.25,6284,0.0,0.5
1,1399,5887,0.0,0.0,9465,0.0,0.25
2,4720,4674,0.25,0.5,3841,0.0,0.25
3,3136,3651,0.25,0.75,1105,0.25,0.25
4,25,220,0.5,0.75,3001,0.25,0.5
5,2794,2153,0.25,0.25,1096,0.25,0.25
6,7538,7316,0.25,0.5,5647,0.0,0.5
7,7590,7903,0.25,0.75,6058,0.0,0.5
8,5097,5107,0.5,0.75,4292,0.25,0.25
9,9802,9903,0.5,0.5,6034,0.0,0.25


## Random Forest

In [149]:
# One independent Random Forest classifier is fitted per digit position
rf_model = RandomForestClassifier(n_estimators=100, random_state=1337)
rf_multi_model = MultiOutputClassifier(rf_model)

rf_multi_model.fit(X_train, y_train)

# predict_proba returns one (n_samples, n_classes) array per target digit
y_pred_probability_rf = rf_multi_model.predict_proba(X_validate)

# Probability columns follow each fitted estimator's classes_, which is not
# guaranteed to be 0..9 (a digit absent from y_train for a position has no
# column). Look the labels up instead of using the column index as the digit.
rf_digit_classes = [estimator.classes_ for estimator in rf_multi_model.estimators_]

# Prepare list of per-row result dicts
results = []

# Process each row in validation set
for idx in range(len(X_validate)):
    row_results = {}

    # Store actual values and get top 2 predictions for each digit
    for digit in range(4):  # Iterate over all 4 digit columns
        actual_value = y_validate.iloc[idx, digit]
        row_results[f'Digit_{digit+1}'] = actual_value  # Actual value

        # Column positions of the two highest probabilities, best first
        digit_proba = y_pred_probability_rf[digit][idx]
        top_2_indices = np.argsort(digit_proba)[-2:][::-1]
        # Labels and probabilities for those two columns
        top_2_digits = rf_digit_classes[digit][top_2_indices]
        top_2_probs = digit_proba[top_2_indices]

        # Store predictions in separate columns
        row_results[f'Pred_1_Digit_{digit+1}'] = int(top_2_digits[0])
        row_results[f'Prob_1_Digit_{digit+1}'] = float(top_2_probs[0])
        row_results[f'Pred_2_Digit_{digit+1}'] = int(top_2_digits[1])
        row_results[f'Prob_2_Digit_{digit+1}'] = float(top_2_probs[1])

    results.append(row_results)

# Convert to DataFrame
rf_df = pd.DataFrame(results)

# Concatenate the per-position digits into 4-character number strings
rf_df["Number"] = rf_df[[f"Digit_{i}" for i in range(1, 5)]].astype(str).agg("".join, axis=1)

# The *_Prob columns are the mean of the four per-digit probabilities, i.e. an
# average confidence score rather than the probability of the whole number.
rf_df["Predicted_Number_1"] = rf_df[[f"Pred_1_Digit_{i}" for i in range(1, 5)]].astype(str).agg("".join, axis=1)
rf_df["Predicted_Number_1_Prob"] = rf_df[[f"Prob_1_Digit_{i}" for i in range(1, 5)]].mean(axis=1)

rf_df["Predicted_Number_2"] = rf_df[[f"Pred_2_Digit_{i}" for i in range(1, 5)]].astype(str).agg("".join, axis=1)
rf_df["Predicted_Number_2_Prob"] = rf_df[[f"Prob_2_Digit_{i}" for i in range(1, 5)]].mean(axis=1)

def unordered_digit_match(num1, num2, width=4):
    """Return 1 if both numbers contain the same digits in any order, else 0.

    Both values are converted with ``str`` and zero-padded to ``width``
    characters before comparing digit counts, so 25 and "0025" are treated as
    the same number (consistent with the other digit helpers in this notebook).
    """
    digits1 = Counter(str(num1).zfill(width))
    digits2 = Counter(str(num2).zfill(width))
    return int(digits1 == digits2)  # Compare digit counts

# 1 when a guess contains exactly the same digits as the actual number (any order)
for score_col, pred_col in (("Pred_1_Score", "Predicted_Number_1"),
                            ("Pred_2_Score", "Predicted_Number_2")):
    rf_df[score_col] = [
        unordered_digit_match(actual, predicted)
        for actual, predicted in zip(rf_df["Number"], rf_df[pred_col])
    ]

# Reduce the frame to the summary columns
report_columns = ["Number", "Predicted_Number_1", "Predicted_Number_1_Prob", "Pred_1_Score",
                  "Predicted_Number_2", "Predicted_Number_2_Prob", "Pred_2_Score"]
rf_df = rf_df[report_columns]

rf_df

Unnamed: 0,Number,Predicted_Number_1,Predicted_Number_1_Prob,Pred_1_Score,Predicted_Number_2,Predicted_Number_2_Prob,Pred_2_Score
0,7162,7053,0.706667,0,6284,0.189167,0
1,1399,5887,0.557917,0,9499,0.17125,0
2,4720,4674,0.455,0,3841,0.2075,0
3,3136,3651,0.29875,0,1445,0.19,0
4,25,20,0.46875,0,3202,0.24,0
5,2794,2153,0.6975,0,1422,0.08125,0
6,7538,7316,0.3925,0,5944,0.2125,0
7,7590,6903,0.4225,0,7058,0.280458,0
8,5097,4107,0.615,0,6292,0.19625,0
9,9802,6934,0.359167,0,8003,0.209583,0


## XGBoost

In [150]:
xgb_model_config = {
    'objective': "multi:softmax",  # Multi-class classification
    "num_class": 10,  # Number of target classes
    'n_estimators': 100,  # Number of trees (boosting rounds)
    'learning_rate': 0.1,  # Learning rate (eta)
    'max_depth': 5,  # Max depth of trees
    'min_child_weight': 2,  # Minimum sum of instance weight in child
    "eval_metric": "mlogloss",  # Multi-class log loss
    'random_state': 42
}

# One independent XGBoost classifier is fitted per digit position
xgb_model = xgb.XGBClassifier(**xgb_model_config)
xgb_multi = MultiOutputClassifier(xgb_model)

xgb_multi.fit(X_train, y_train)

# predict_proba returns one (n_samples, n_classes) array per target digit
y_pred_probability_xgb = xgb_multi.predict_proba(X_validate)

# Map probability columns to digit labels through each fitted estimator's
# classes_ instead of assuming that column i stands for digit i.
xgb_digit_classes = [estimator.classes_ for estimator in xgb_multi.estimators_]

# Prepare list of per-row result dicts
results = []

# Process each row in validation set
for idx in range(len(X_validate)):
    row_results = {}

    # Store actual values and get top 2 predictions for each digit
    for digit in range(4):  # Iterate over all 4 digit columns
        actual_value = y_validate.iloc[idx, digit]
        row_results[f'Digit_{digit+1}'] = actual_value  # Actual value

        # Column positions of the two highest probabilities, best first
        digit_proba = y_pred_probability_xgb[digit][idx]
        top_2_indices = np.argsort(digit_proba)[-2:][::-1]
        # Labels and probabilities for those two columns
        top_2_digits = xgb_digit_classes[digit][top_2_indices]
        top_2_probs = digit_proba[top_2_indices]

        # Store predictions in separate columns
        row_results[f'Pred_1_Digit_{digit+1}'] = int(top_2_digits[0])
        row_results[f'Prob_1_Digit_{digit+1}'] = float(top_2_probs[0])
        row_results[f'Pred_2_Digit_{digit+1}'] = int(top_2_digits[1])
        row_results[f'Prob_2_Digit_{digit+1}'] = float(top_2_probs[1])

    results.append(row_results)

# Convert to DataFrame
xgb_df = pd.DataFrame(results)

# Concatenate the per-position digits into 4-character number strings
xgb_df["Number"] = xgb_df[[f"Digit_{i}" for i in range(1, 5)]].astype(str).agg("".join, axis=1)

# The *_Prob columns are the mean of the four per-digit probabilities, i.e. an
# average confidence score rather than the probability of the whole number.
xgb_df["Predicted_Number_1"] = xgb_df[[f"Pred_1_Digit_{i}" for i in range(1, 5)]].astype(str).agg("".join, axis=1)
xgb_df["Predicted_Number_1_Prob"] = xgb_df[[f"Prob_1_Digit_{i}" for i in range(1, 5)]].mean(axis=1)

xgb_df["Predicted_Number_2"] = xgb_df[[f"Pred_2_Digit_{i}" for i in range(1, 5)]].astype(str).agg("".join, axis=1)
xgb_df["Predicted_Number_2_Prob"] = xgb_df[[f"Prob_2_Digit_{i}" for i in range(1, 5)]].mean(axis=1)

def unordered_digit_match(num1, num2, width=4):
    """Return 1 if both numbers contain the same digits in any order, else 0.

    Both values are converted with ``str`` and zero-padded to ``width``
    characters before comparing digit counts, so 25 and "0025" are treated as
    the same number (consistent with the other digit helpers in this notebook).
    """
    digits1 = Counter(str(num1).zfill(width))
    digits2 = Counter(str(num2).zfill(width))
    return int(digits1 == digits2)  # Compare digit counts

# 1 when a guess contains exactly the same digits as the actual number (any order)
for score_col, pred_col in (("Pred_1_Score", "Predicted_Number_1"),
                            ("Pred_2_Score", "Predicted_Number_2")):
    xgb_df[score_col] = [
        unordered_digit_match(actual, predicted)
        for actual, predicted in zip(xgb_df["Number"], xgb_df[pred_col])
    ]

# Reduce the frame to the summary columns
report_columns = ["Number", "Predicted_Number_1", "Predicted_Number_1_Prob", "Pred_1_Score",
                  "Predicted_Number_2", "Predicted_Number_2_Prob", "Pred_2_Score"]
xgb_df = xgb_df[report_columns]

xgb_df

Unnamed: 0,Number,Predicted_Number_1,Predicted_Number_1_Prob,Pred_1_Score,Predicted_Number_2,Predicted_Number_2_Prob,Pred_2_Score
0,7162,7253,0.628264,0,8084,0.13447,0
1,1399,9899,0.460378,0,5086,0.235684,0
2,4720,4673,0.489588,0,3014,0.158575,0
3,3136,5603,0.277024,0,3145,0.217461,0
4,25,0,0.674428,0,1228,0.166628,0
5,2794,2153,0.528044,0,1490,0.148518,0
6,7538,7604,0.526999,0,8396,0.13581,0
7,7590,7053,0.614272,0,8908,0.127264,0
8,5097,5197,0.560782,0,3205,0.183324,0
9,9802,9953,0.550757,0,8034,0.173004,0


# Testing the model

In [151]:
# Make Predictions on Test Data
y_test_pred = et_multi_model.predict(X_test)

# Convert Predictions to String Format
predicted_numbers = [''.join(map(str, digits)) for digits in y_test_pred]

# Get Actual Numbers from Test Data
actual_numbers = [''.join(map(str, row)) for row in y_test.values]

# Compare Predictions with Actual Values
correct_predictions = sum([1 if pred == actual else 0 for pred, actual in zip(predicted_numbers, actual_numbers)])
total_predictions = len(test_df)

# Calculate Accuracy
accuracy = correct_predictions / total_predictions * 100

# Display Results
print(f"Predicted Numbers: {predicted_numbers}")
print(f"Actual Numbers:    {actual_numbers}")
print(f"Correct Predictions: {correct_predictions} out of {total_predictions}")
print(f"Lottery Number Accuracy: {accuracy:.2f}%")

Predicted Numbers: ['0202', '3801', '0523', '5792', '9326', '1312', '1403', '8251', '8468', '7796', '3277', '2966', '2415', '0956', '2491', '1874', '6057', '6966', '2672', '2497', '6234', '6903', '7058', '9605', '8407', '9075', '7903', '9586', '6207', '9953', '3052', '7316', '2402', '5802', '0951', '6673', '6445', '7053', '3834', '0478', '6703', '6684', '9336', '6302', '6131', '2610', '1186', '4847', '1695']
Actual Numbers:    ['8700', '3972', '0505', '5310', '9014', '0603', '0567', '8376', '8425', '9848', '9757', '6906', '2607', '0876', '2387', '2718', '8123', '5676', '2512', '2540', '6261', '6913', '9325', '8471', '7325', '9128', '7963', '4784', '7630', '9746', '3659', '7890', '3044', '5247', '1950', '7669', '3855', '7940', '3485', '0638', '5917', '6056', '3433', '6285', '8181', '2041', '0409', '4260', '1597']
Correct Predictions: 0 out of 49
Lottery Number Accuracy: 0.00%


In [152]:
# Make Predictions on Test Data
y_test_pred = et_multi_model.predict_proba(X_test)

# Prepare DataFrame to store results
results = []

# Process each row in validation set
for idx in range(len(X_test)):
    row_results = {}

    # Store actual values and get top 2 predictions for each digit
    for digit in range(4):  # Iterate over all 4 digit columns
        actual_value = y_test.iloc[idx, digit]
        row_results[f'Digit_{digit+1}'] = actual_value  # Actual value

        # Get top 2 predictions and their probabilities for the current digit
        digit_proba = y_test_pred[digit][idx]  # Probabilities for this digit
        top_2_indices = np.argsort(digit_proba)[-2:][::-1]  # Get top 2 class indices
        top_2_probs = np.sort(digit_proba)[-2:][::-1]  # Get top 2 probabilities

        # Store predictions in separate columns
        row_results[f'Pred_1_Digit_{digit+1}'] = int(top_2_indices[0])
        row_results[f'Prob_1_Digit_{digit+1}'] = float(top_2_probs[0])
        row_results[f'Pred_2_Digit_{digit+1}'] = int(top_2_indices[1])
        row_results[f'Prob_2_Digit_{digit+1}'] = float(top_2_probs[1])

    results.append(row_results)

# Convert to DataFrame
results_df = pd.DataFrame(results)

results_df

Unnamed: 0,Digit_1,Pred_1_Digit_1,Prob_1_Digit_1,Pred_2_Digit_1,Prob_2_Digit_1,Digit_2,Pred_1_Digit_2,Prob_1_Digit_2,Pred_2_Digit_2,Prob_2_Digit_2,Digit_3,Pred_1_Digit_3,Prob_1_Digit_3,Pred_2_Digit_3,Prob_2_Digit_3,Digit_4,Pred_1_Digit_4,Prob_1_Digit_4,Pred_2_Digit_4,Prob_2_Digit_4
0,8,0,0.45,1,0.38,7,2,0.505,9,0.17,0,0,0.44,4,0.12,0,2,0.42,8,0.18
1,3,3,0.97,5,0.02,9,8,0.474611,1,0.435389,7,0,0.464611,6,0.435389,2,1,0.464611,2,0.445389
2,0,0,0.33,2,0.26,5,5,0.48,0,0.13,0,2,0.46,4,0.13,5,3,0.235,5,0.185
3,5,5,0.76,6,0.1275,3,7,0.66,8,0.13,1,9,0.71,4,0.16,0,2,0.66,3,0.11
4,9,9,0.49,7,0.21,0,3,0.34,0,0.23,1,2,0.37,7,0.27,4,6,0.365,5,0.2
5,0,1,0.33,3,0.31,6,3,0.46,8,0.12,0,1,0.42,0,0.13,3,2,0.36,0,0.26
6,0,1,0.54,0,0.36,5,4,0.572857,9,0.18,6,0,0.74,5,0.123,7,3,0.565,4,0.171
7,8,8,0.395,9,0.315,3,2,0.49,4,0.2,7,5,0.4,3,0.32781,6,1,0.270714,6,0.235952
8,8,8,0.325,7,0.23,4,4,0.32,3,0.13,2,6,0.3,9,0.171667,5,8,0.31,3,0.13
9,9,7,0.35,9,0.25,8,7,0.478333,3,0.12,4,9,0.5,3,0.1325,8,6,0.52,9,0.13


In [153]:
# Column groups, one entry per digit position
actual_cols = [f"Digit_{i}" for i in range(1, 5)]
pred_1_cols = [f"Pred_1_Digit_{i}" for i in range(1, 5)]
prob_1_cols = [f"Prob_1_Digit_{i}" for i in range(1, 5)]
pred_2_cols = [f"Pred_2_Digit_{i}" for i in range(1, 5)]
prob_2_cols = [f"Prob_2_Digit_{i}" for i in range(1, 5)]

# Concatenate the per-position digits into 4-character number strings
results_df["Number"] = results_df[actual_cols].astype(str).agg("".join, axis=1)

# The *_Prob columns are the mean of the four per-digit probabilities
results_df["Predicted_Number_1"] = results_df[pred_1_cols].astype(str).agg("".join, axis=1)
results_df["Predicted_Number_1_Prob"] = results_df[prob_1_cols].mean(axis=1)

results_df["Predicted_Number_2"] = results_df[pred_2_cols].astype(str).agg("".join, axis=1)
results_df["Predicted_Number_2_Prob"] = results_df[prob_2_cols].mean(axis=1)

def unordered_digit_match(num1, num2, width=4):
    """Return 1 if both numbers contain the same digits in any order, else 0.

    Both values are converted with ``str`` and zero-padded to ``width``
    characters before comparing digit counts, so 25 and "0025" are treated as
    the same number (consistent with the other digit helpers in this notebook).
    """
    digits1 = Counter(str(num1).zfill(width))
    digits2 = Counter(str(num2).zfill(width))
    return int(digits1 == digits2)  # Compare digit counts

# 1 when a guess contains exactly the same digits as the actual number (any order)
for score_col, pred_col in (("Pred_1_Score", "Predicted_Number_1"),
                            ("Pred_2_Score", "Predicted_Number_2")):
    results_df[score_col] = [
        unordered_digit_match(actual, predicted)
        for actual, predicted in zip(results_df["Number"], results_df[pred_col])
    ]

# Summary view of the test-set predictions
report_columns = ["Number", "Predicted_Number_1", "Predicted_Number_1_Prob", "Pred_1_Score",
                  "Predicted_Number_2", "Predicted_Number_2_Prob", "Pred_2_Score"]
final_df = results_df[report_columns]

final_df

Unnamed: 0,Number,Predicted_Number_1,Predicted_Number_1_Prob,Pred_1_Score,Predicted_Number_2,Predicted_Number_2_Prob,Pred_2_Score
0,8700,202,0.45375,0,1948,0.2125,0
1,3972,3801,0.593458,0,5162,0.334042,0
2,505,523,0.37625,0,2045,0.17625,0
3,5310,5792,0.6975,0,6843,0.131875,0
4,9014,9326,0.39125,0,7075,0.2275,0
5,603,1312,0.3925,0,3800,0.205,0
6,567,1403,0.604464,0,954,0.2085,0
7,8376,8251,0.388929,0,9436,0.26969,0
8,8425,8468,0.31375,0,7393,0.165417,0
9,9848,7796,0.462083,0,9339,0.158125,0
