In [16]:
!rm -rf alphalib
!git clone https://github.com/alpha2phi-platform/alphalib.git

Cloning into 'alphalib'...
remote: Enumerating objects: 1769, done.[K
remote: Counting objects: 100% (213/213), done.[K
remote: Compressing objects: 100% (151/151), done.[K
remote: Total 1769 (delta 129), reused 134 (delta 60), pack-reused 1556[K
Receiving objects: 100% (1769/1769), 231.17 MiB | 20.42 MiB/s, done.
Resolving deltas: 100% (1102/1102), done.


In [19]:
import pandas as pd
import numpy as np

# Create a DataFrame from the given data
df = pd.read_csv("alphalib/data/time_series_data.csv")

# Convert DrawDate to datetime format
df['DrawDate'] = pd.to_datetime(df['DrawDate'], format='%Y%m%d')

# Extract year, month, and day as separate features
df['Year'] = df['DrawDate'].dt.year
df['Month'] = df['DrawDate'].dt.month
df['Day'] = df['DrawDate'].dt.day

# Calculate the difference between the drawn numbers
df['DrawnNo1_2_Diff'] = df['DrawnNo1'] - df['DrawnNo2']
df['DrawnNo1_3_Diff'] = df['DrawnNo1'] - df['DrawnNo3']
df['DrawnNo1_4_Diff'] = df['DrawnNo1'] - df['DrawnNo4']
df['DrawnNo1_5_Diff'] = df['DrawnNo1'] - df['DrawnNo5']
df['DrawnNo1_6_Diff'] = df['DrawnNo1'] - df['DrawnNo6']

# Calculate the sum, mean, min, and max of the drawn numbers
df['DrawnNo_Sum'] = df[['DrawnNo1', 'DrawnNo2', 'DrawnNo3', 'DrawnNo4', 'DrawnNo5', 'DrawnNo6']].sum(axis=1)
df['DrawnNo_Mean'] = df[['DrawnNo1', 'DrawnNo2', 'DrawnNo3', 'DrawnNo4', 'DrawnNo5', 'DrawnNo6']].mean(axis=1)
df['DrawnNo_Min'] = df[['DrawnNo1', 'DrawnNo2', 'DrawnNo3', 'DrawnNo4', 'DrawnNo5', 'DrawnNo6']].min(axis=1)
df['DrawnNo_Max'] = df[['DrawnNo1', 'DrawnNo2', 'DrawnNo3', 'DrawnNo4', 'DrawnNo5', 'DrawnNo6']].max(axis=1)

# Calculate the frequency count of each drawn number
# Create columns for frequency count of each drawn number
for i in range(1, 56):
    col_name = f'DrawnNo{str(i)}_Count'
    df[col_name] = (df.iloc[:, 1:7] == i).sum(axis=1)

# Calculate the consecutive differences between drawn numbers
for i in range(1, 6):
    col_name = f'DrawnNo{i}_{i+1}_Diff'
    df[col_name] = df[f'DrawnNo{i}'] - df[f'DrawnNo{i+1}']

# Calculate the cumulative sum of drawn numbers
df['DrawnNo1_CumSum'] = df['DrawnNo1'].cumsum()
df['DrawnNo2_CumSum'] = df['DrawnNo2'].cumsum()
df['DrawnNo3_CumSum'] = df['DrawnNo3'].cumsum()
df['DrawnNo4_CumSum'] = df['DrawnNo4'].cumsum()
df['DrawnNo5_CumSum'] = df['DrawnNo5'].cumsum()
df['DrawnNo6_CumSum'] = df['DrawnNo6'].cumsum()

# Calculate the differences between cumulative sums of drawn numbers
for i in range(1, 6):
    col_name = f'DrawnNo{i}_CumSum_Diff'
    df[col_name] = df[f'DrawnNo{i}_CumSum'] - df[f'DrawnNo{i+1}_CumSum']

# Calculate the product of drawn numbers
df['DrawnNo1_2_Product'] = df['DrawnNo1'] * df['DrawnNo2']
df['DrawnNo1_3_Product'] = df['DrawnNo1'] * df['DrawnNo3']
df['DrawnNo1_4_Product'] = df['DrawnNo1'] * df['DrawnNo4']
df['DrawnNo1_5_Product'] = df['DrawnNo1'] * df['DrawnNo5']
df['DrawnNo1_6_Product'] = df['DrawnNo1'] * df['DrawnNo6']

# Drop the original DrawnNo columns
df = df.drop(columns=['DrawnNo1', 'DrawnNo2', 'DrawnNo3', 'DrawnNo4', 'DrawnNo5', 'DrawnNo6'])

# Display the final DataFrame
display(df.head())

Unnamed: 0,DrawDate,Year,Month,Day,DrawnNo1_2_Diff,DrawnNo1_3_Diff,DrawnNo1_4_Diff,DrawnNo1_5_Diff,DrawnNo1_6_Diff,DrawnNo_Sum,...,DrawnNo1_CumSum_Diff,DrawnNo2_CumSum_Diff,DrawnNo3_CumSum_Diff,DrawnNo4_CumSum_Diff,DrawnNo5_CumSum_Diff,DrawnNo1_2_Product,DrawnNo1_3_Product,DrawnNo1_4_Product,DrawnNo1_5_Product,DrawnNo1_6_Product
0,2009-10-31,2009,10,31,-3,-7,-13,-32,-33,190,...,-3,-4,-6,-19,-1,340,408,510,833,850
1,2009-11-01,2009,11,1,-4,-17,-22,-49,-50,148,...,-7,-17,-11,-46,-2,5,18,23,50,51
2,2009-11-03,2009,11,3,-4,-8,-10,-43,-50,145,...,-11,-21,-13,-79,-9,45,65,75,240,275
3,2009-11-04,2009,11,4,-19,-27,-32,-38,-41,211,...,-30,-29,-18,-85,-12,252,324,369,423,450
4,2009-11-07,2009,11,7,-2,-7,-10,-16,-21,158,...,-32,-34,-21,-91,-17,323,408,459,561,646


In [14]:
import pandas as pd
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from sklearn.preprocessing import MinMaxScaler

df = pd.read_csv("alphalib/data/time_series_data.csv")

# Convert DrawDate column to datetime format
df["DrawDate"] = pd.to_datetime(df["DrawDate"], format="%Y%m%d")

# Sort data by date
df.sort_values(by="DrawDate", inplace=True)

print(df.head())

# Extract features and target variable
X = df["DrawDate"]
y = df[["DrawnNo1", "DrawnNo2", "DrawnNo3", "DrawnNo4", "DrawnNo5", "DrawnNo6"]]

# Split data into train and test sets
split_index = int(len(df) * 0.8)
train_data = y.iloc[:split_index]
test_data = y.iloc[split_index:]

# Scale the data using MinMaxScaler
scaler = MinMaxScaler()
train_data_scaled = scaler.fit_transform(train_data)
test_data_scaled = scaler.transform(test_data)


# Define function to create LSTM model
def create_model(n_inputs, n_outputs):
    model = Sequential()
    model.add(LSTM(64, input_shape=(None, n_inputs)))
    model.add(Dense(n_outputs, activation="linear"))
    return model


# Define function to prepare data for LSTM model
def prepare_data(data, n_steps):
    X, y = [], []
    for i in range(len(data) - n_steps):
        X.append(data[i : i + n_steps])
        y.append(data[i + n_steps])
    X = np.array(X)
    y = np.array(y)
    return X, y


# Define parameters for LSTM model
n_steps = 3
n_inputs = 6
n_outputs = 6

# Prepare data for LSTM model
X_train, y_train = prepare_data(train_data_scaled, n_steps)
X_test, y_test = prepare_data(test_data_scaled, n_steps)

# Reshape input data for LSTM model
X_train = X_train.reshape((X_train.shape[0], X_train.shape[1], n_inputs))
X_test = X_test.reshape((X_test.shape[0], X_test.shape[1], n_inputs))

# Create LSTM model
model = create_model(n_inputs, n_outputs)

# Compile model
model.compile(optimizer="adam", loss="mse")

# Fit the model to the training data
model.fit(X_train, y_train, epochs=200, batch_size=32, verbose=0)

# Predict the next 5 numbers for each date in the test data
predicted_data_scaled = model.predict(X_test)
predicted_data = scaler.inverse_transform(predicted_data_scaled)

# Convert predicted data to dataframe
predicted_df = pd.DataFrame(predicted_data, columns=y.columns)

# Get the last date in the train data
last_date = df.iloc[split_index - 1]["DrawDate"]

# Generate dates for the next 5 days
next_dates = pd.date_range(start=last_date + pd.Timedelta(days=1), periods=5)

# Print the predicted numbers for the next 5 dates
print("Predicted numbers for the next 5 dates:")
for i, date in enumerate(next_dates):
    print("Date: ", date.strftime("%Y%m%d"))
    print("Numbers: ", predicted_df.iloc[i].values)
    print()


    DrawDate  DrawnNo1  DrawnNo2  DrawnNo3  DrawnNo4  DrawnNo5  DrawnNo6
0 2009-10-31        17        20        24        30        49        50
1 2009-11-01         1         5        18        23        50        51
2 2009-11-03         5         9        13        15        48        55
3 2009-11-04         9        28        36        41        47        50
4 2009-11-07        17        19        24        27        33        38
Predicted numbers for the next 5 dates:
Date:  20200130
Numbers:  [ 9.067684 15.992386 25.171766 32.084656 40.22879  49.30197 ]

Date:  20200131
Numbers:  [ 8.344676 15.795557 24.206947 32.4729   40.80946  48.708317]

Date:  20200201
Numbers:  [ 7.894725 15.037336 24.675018 32.372482 40.700195 47.88097 ]

Date:  20200202
Numbers:  [ 9.376757 18.193699 26.47369  34.56417  41.34014  48.50751 ]

Date:  20200203
Numbers:  [ 8.157286 14.28464  23.129143 30.49098  38.974888 48.39154 ]

