## Dataset Generation for Section 5.1 Experiments

This notebook generates the training and test datasets for the experiments in **Section 5.1**, which predict $a_p \bmod 2$ from the sequence $(a_q)_{q \ne p,\, q < 100}$.

### Data Source

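The required data is loaded from the file [`ecq6.txt`](https://zenodo.org/records/15777475), which contains the sequence $(a_q)_{q < 100}$ for elliptic curves $E$ with conductor $N(E) < 10^7$. The code below opens `ecq6.txt` by relative path, so place the file in the notebook's working directory before running it.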

In [None]:
#Imports and basic functions
import pandas as pd
import numpy as np

def encode_integer(val, base=1000, digit_sep=" "):
    """Encode an integer as a sign token followed by its digits in `base`.

    The result is the sign ('+' or '-') followed by the digits of ``abs(val)``
    in the given base, most significant first, all joined by `digit_sep`.
    Digits are written without zero padding, e.g. ``1000`` in base 1000 is
    ``'+ 1 0'``. Zero is encoded as the sign '+' followed by the digit '0'.

    Parameters
    ----------
    val : int
        Integer to encode.
    base : int, default 1000
        Base of the digit expansion; must be at least 2.
    digit_sep : str, default " "
        Separator placed between the sign and every digit.

    Returns
    -------
    str
        The encoded integer.

    Raises
    ------
    ValueError
        If `base` is smaller than 2 (the digit loop would never terminate
        or would divide by zero).
    """
    if base < 2:
        raise ValueError(f"base must be >= 2, got {base}")
    if val == 0:
        # Use digit_sep here as well so zero is formatted like every other value.
        return digit_sep.join(['+', '0'])
    sgn = '+' if val >= 0 else '-'
    val = abs(val)
    # Collect digits least-significant first, then reverse together with the sign.
    r = []
    while val > 0:
        r.append(str(val % base))
        val = val // base
    r.append(sgn)
    r.reverse()
    return digit_sep.join(r)

def encode_integer_array(arr, base=1000, digit_sep=" "):
    """Encode every integer of `arr` with `encode_integer` and join the results.

    Each element is encoded using the given `base` and `digit_sep`; the
    encoded elements are then concatenated with a single space between them.
    """
    pieces = []
    for value in arr:
        pieces.append(encode_integer(value, base, digit_sep))
    return ' '.join(pieces)
    
def encode_pinteger(val, p):
    """Return '+ ' followed by str(p) and str(val) concatenated.

    No separator is inserted between `p` and `val`, so e.g. ``val=1, p=3``
    gives ``'+ 31'``.
    NOTE(review): presumably the fused "p then value" text is meant to be a
    single token; confirm against the consumer of this encoding.
    """
    return f"+ {p!s}{val!s}"

In [None]:
import os

from sklearn.model_selection import train_test_split

# Fix a random seed so the train/test split is reproducible.
seed = 42

# Primes below 100; the dataset has one column a_q per prime q in this list.
ps = [2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47, 53, 59, 61, 67, 71, 73, 79, 83, 89, 97]

# Tunable settings for this cell.
DATA_PATH = "ecq6.txt"      # raw data file, resolved relative to the working directory
TEST_SIZE = 10000           # number of rows held out for the test file
MAX_TRAIN_SIZE = 2000000    # the training file is truncated to this many rows

column_names = ['a_' + str(q) for q in ps]

# Read the raw a_q sequences ONCE: they are identical for every target prime,
# so parsing the file inside the loop below would only repeat the same work.
rows = []
with open(DATA_PATH, 'r') as file:
    for line in file:
        line = line.strip()
        if not line:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            continue
        parts = line.split(':')
        # The colon-separated field at index 7 is parsed as a bracketed,
        # comma-separated list of integers; keep one value per prime in `ps`.
        rows.append(list(map(int, parts[7].strip('[]').split(',')))[:len(ps)])

# Rows shorter than len(ps) are padded with NaN by pandas; drop them and
# restore an integer dtype so the values are encoded as integers, not floats.
aps_raw = pd.DataFrame(rows, columns=column_names).dropna().astype('int64')
del rows

# The encoded *raw* a_q values are the model inputs and do not depend on the
# target prime, so encode every column once up front.
aps_encoded = pd.DataFrame(
    {name: aps_raw[name].apply(encode_integer) for name in column_names},
    index=aps_raw.index,
)

# For each prime p, generate the dataset to predict a_p mod 2 from the raw a_q, q != p.
for p in ps:
    print(f"Prime is {p}")
    target_column = 'a_' + str(p)

    # Input: the prefix "V<number of input values>" followed by the encoded raw
    # a_q for every q != p.
    inputs = f"V{len(ps)-1} " + aps_encoded.drop(columns=[target_column]).agg(' '.join, axis=1)
    # Output: a_p reduced mod 2. (Previously the reduction was applied to the
    # input columns and the target was left raw, which contradicted both the
    # stated task and the "raw_aps_but_p_to_ap_mod_2" dataset name.)
    outputs = aps_raw[target_column].apply(lambda x: encode_integer(x % 2))
    df = pd.DataFrame({'input': inputs, 'output': outputs})

    # Save the full dataset, creating the output directory if needed.
    directory = f"raw_aps_but_{p}_to_a{p}_mod_2"
    os.makedirs(directory, exist_ok=True)
    df.to_csv(f"{directory}/raw_aps_but_{p}_to_a{p}_mod_2.txt", sep='\t', index=False, header=False)

    # Shuffle and split; the same seed gives the same row partition for every p.
    df_train, df_test = train_test_split(df, test_size=TEST_SIZE, random_state=seed, shuffle=True)
    df_train = df_train[:MAX_TRAIN_SIZE]
    df_train.to_csv(f"{directory}/raw_aps_but_{p}_to_a{p}_mod_2_train.txt", sep='\t', index=False, header=False)
    df_test.to_csv(f"{directory}/raw_aps_but_{p}_to_a{p}_mod_2_test.txt", sep='\t', index=False, header=False)

    # Free the per-prime frames before building the next dataset.
    # (The stray `break` that stopped the loop after p = 2 has been removed so
    # that a dataset is produced for every prime, as the loop intends.)
    del df, df_train, df_test