## Dataset Generation for Section 5.2 Experiments

This notebook generates the training and test datasets for the experiments in **Section 5.2**, which predict $a_p \bmod 2$ from the sequence $(a_q \bmod 2)_{q \ne p,\, q < 100}$, with duplicate rows removed.

### Data Source

The required data is loaded from the file [`ECQ7apmod2_1e3_unique.txt`](https://zenodo.org/records/15660733), which contains the sequence $(a_q \bmod 2)_{q < 100}$ for elliptic curves $E$ with conductor $N(E) < 10^7$.


In [None]:
#Imports and basic functions
import pandas as pd
import numpy as np

def encode_integer(val, base=1000, digit_sep=" "):
    """Encode a number as a sign token followed by its base-`base` digits.

    The digits are listed most-significant first and are NOT zero-padded
    (e.g. 1000 in base 1000 becomes '+ 1 0'). All tokens, including the
    leading sign, are joined with `digit_sep`.

    Args:
        val: The number to encode.
        base: Radix used to split `val` into digits.
        digit_sep: Separator placed between consecutive tokens.

    Returns:
        The encoded string; zero is always encoded as '+ 0'.
    """
    # Zero has no digits to peel off, so it gets a fixed encoding
    # (which deliberately ignores `digit_sep`).
    if val == 0:
        return '+ 0'

    sign_token = '+' if val >= 0 else '-'
    remaining = abs(val)

    # Collect digits least-significant first, then flip them when joining.
    digits = []
    while remaining > 0:
        remaining, digit = divmod(remaining, base)
        digits.append(str(digit))

    return digit_sep.join([sign_token] + digits[::-1])
    
def encode_pinteger(val, p):
    """Return '+ ' followed by str(p) and str(val), concatenated without a separator."""
    return f"+ {p!s}{val!s}"

In [None]:
# Primes below 100. They name the a_q columns, and one dataset is generated per prime.
ps = [2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47, 53, 59, 61, 67, 71, 73, 79, 83, 89, 97]

# Settings a reader might want to tune
SOURCE_FILE = 'ECQ7apmod2_1e3_unique.txt'
SEED = 42                       # random_state used for every sampling/shuffling step
TEST_ROWS_PER_STRATUM = 2500    # rows drawn per (conductor decade, label) pair
TEST_COND_DECADES = (4.0, 5.0, 6.0)  # values of floor(log10(conductor)) represented in the test set

column_names = ['a_' + str(q) for q in ps] + ['conductor', 'rank']

# Parse the source file ONCE: only the "p does not divide the conductor" filter
# depends on the prime, so re-reading the file for every prime is wasted work.
# Each line is ':'-separated; field 0 is stored as 'conductor', field 4 as 'rank',
# and field 7 is a bracketed, comma-separated list of integers
# (presumably the a_q mod 2 values, one per prime in `ps` -- verify against the data file).
data = []
with open(SOURCE_FILE, 'r') as file:
    for line in file:
        parts = line.strip().split(':')
        if parts == ['']:
            # Blank line (e.g. trailing newline at end of file)
            continue
        if parts[0] == '' or parts[4] == '':
            # Rows with a missing conductor or rank are unusable. Skipping them here
            # avoids computing `None % p` and makes a later dropna() unnecessary.
            continue
        a = int(parts[0])
        d = int(parts[4])
        L3 = list(map(int, parts[7].strip('[]').split(',')))
        data.append(L3 + [a, d])

curves = pd.DataFrame(data, columns=column_names)
# Order of magnitude of the conductor, used to stratify the test set
curves['cond10'] = np.floor(np.log10(curves['conductor']))


# For each prime p, generate the dataset to predict a_p mod 2
for p in ps:
    print(f"Prime is {p}")
    target = 'a_' + str(p)
    feature_cols = ['a_' + str(q) for q in ps if q != p]

    # Keep only the curves whose conductor is not divisible by p
    df = curves.loc[curves['conductor'] % p != 0].reset_index(drop=True)

    # Encode every input column as text; the target column stays numeric
    for col in feature_cols:
        df[col] = df[col].apply(encode_integer)
    df["data_type"] = "V" + str(len(ps) - 1)

    # Balance the dataframe: cap BOTH classes at the size of the smaller one.
    # Class 1 is only re-sampled when it is the larger class, so the result is
    # unchanged in the case where class 0 is the majority.
    n_per_class = df[target].value_counts().min()
    dfc0 = df[df[target] == 0].sample(n_per_class, random_state=SEED)
    dfc1 = df[df[target] == 1]
    if len(dfc1) > n_per_class:
        dfc1 = dfc1.sample(n_per_class, random_state=SEED)

    dfc = pd.concat([dfc0, dfc1], axis=0)
    dfc = dfc.sample(frac=1, random_state=SEED).reset_index(drop=True)

    # Create the test set: for each conductor decade, draw TEST_ROWS_PER_STRATUM rows
    # per label (i.e. 2 * TEST_ROWS_PER_STRATUM rows per decade).
    test_parts = []
    for decade in TEST_COND_DECADES:
        for label in (0, 1):
            stratum = dfc[(dfc['cond10'] == decade) & (dfc[target] == label)]
            if len(stratum) < TEST_ROWS_PER_STRATUM:
                raise ValueError(
                    f"Only {len(stratum)} rows with cond10 == {decade} and {target} == {label}; "
                    f"need {TEST_ROWS_PER_STRATUM} for the test set"
                )
            test_parts.append(stratum.sample(n=TEST_ROWS_PER_STRATUM, random_state=SEED))
    df_test_cond = pd.concat(test_parts)

    # Create the training set by excluding the indices used in the test set
    # (dfc has a fresh unique RangeIndex, so dropping by label is exact)
    df_train = dfc.drop(df_test_cond.index)

    # Column layout of the data files: data_type, encoded inputs, then the target last
    out_cols = ['data_type'] + feature_cols + [target]
    df_train1 = df_train[out_cols]
    df_test_cond1 = df_test_cond[out_cols]

    print(df_train1)

    # Training dataset: all string columns joined by spaces as input, target as output
    dftoint_train = pd.DataFrame()
    dftoint_train['input'] = df_train1.iloc[:, :-1].agg(' '.join, axis=1)
    dftoint_train['output'] = df_train1[target]
    dftoint_train.to_csv("apmod2_to_a"+str(p)+"mod2_distinct_ecq7_train_check.txt", sep='\t', index=False, header=False)

    # Test dataset, same layout
    dftoint_test = pd.DataFrame()
    dftoint_test['input'] = df_test_cond1.iloc[:, :-1].agg(' '.join, axis=1)
    dftoint_test['output'] = df_test_cond1[target]
    dftoint_test.to_csv("apmod2_to_a"+str(p)+"mod2_distinct_ecq7_test_check.txt", sep='\t', index=False, header=False)

    # Delete intermediate dataframes before the next prime to limit peak memory
    del df
    del dfc
    del df_train1
    del df_train
    del dftoint_test
    del dftoint_train