In [1]:
from google.colab import drive

# Mount Google Drive so the project folder is reachable from this Colab runtime.
drive.mount('/content/drive')

# Change directory to your code
%cd /content/drive/MyDrive/addition
# Use the shell escapes below (not %pwd): a magic's return value is only
# displayed when it is the last expression of the cell, so %pwd here was silent.
!pwd   # verify you're in the right place
!ls    # should show train.py, data_generate.py, configuration_files, etc.

Mounted at /content/drive
/content/drive/MyDrive/addition
configuration_files	       legacy_code	   result_analysis.ipynb
configurator.py		       llama_adapter.py    result_analysis.py
data			       llama_tokenizer.py  result_analysis_script
data_generate.py	       main_utilities.py   results
data_generation_script	       model.py		   startHere2.ipynb
error_examples		       model_rope.py	   startHere3.ipynb
evaluation.py		       model_t5bias.py	   startHere.ipynb
extra_result_analysis_scripts  __pycache__	   statistical_measurements.py
gsm_test		       README.md	   train.py


# I. Generate Data (choose one synthetic task)

## Addition

In [None]:
# Generate the 4-operand addition dataset: 1,000,000 train / 10,000 test /
# 10,000 validation examples, written under the given experiment name.
!python data_generate.py \
  --task addition \
  --num_operands 4 \
  --experiment_name 4_operands_0_to_999_uniform \
  --train_size 1000000 \
  --test_size 10000 \
  --val_size 10000 \
  --train_eval True \
  --sample-size 10000 \
  --generate_reverse True

#### Ablation in Addition (e.g. randomize thousands-place of the output)

In [None]:
# Ablation dataset: same sizes as the uniform addition run, with
# `--randomize thousands` (randomizes the thousands place of the output).
!python data_generate.py \
  --task addition \
  --randomize thousands \
  --num_operands 4 \
  --experiment_name 4_operands_0_to_999_output_randomize_thousands \
  --train_size 1000000 \
  --test_size 10000 \
  --val_size 10000 \
  --train_eval True \
  --sample-size 10000 \
  --generate_reverse True


#### Addition with scratchpad (form 1)

In [None]:
# Scratchpad form 1: identical to the uniform addition run except for
# `--reasoning_mode 1` and the experiment name.
!python data_generate.py \
  --task addition \
  --reasoning_mode 1 \
  --num_operands 4 \
  --experiment_name 4_operands_0_to_999_uniform_scratchpad1 \
  --train_size 1000000 \
  --test_size 10000 \
  --val_size 10000 \
  --train_eval True \
  --sample-size 10000 \
  --generate_reverse True

#### Addition with scratchpad (form 2)

In [None]:
# Scratchpad form 2: identical to the uniform addition run except for
# `--reasoning_mode 2` and the experiment name.
!python data_generate.py \
  --task addition \
  --reasoning_mode 2 \
  --num_operands 4 \
  --experiment_name 4_operands_0_to_999_uniform_scratchpad2 \
  --train_size 1000000 \
  --test_size 10000 \
  --val_size 10000 \
  --train_eval True \
  --sample-size 10000 \
  --generate_reverse True

## Multiplication

In [None]:
# Multiplication dataset: first operand up to 40 digits, second up to 1 digit,
# with 1,000,000 train / 10,000 test / 10,000 validation examples.
!python data_generate.py \
  --task multiplication \
  --experiment_name 40_digit_times_1_digit \
  --train_size 1000000 \
  --test_size 10000 \
  --val_size 10000 \
  --a_max_digits 40 \
  --b_max_digits 1 \
  --train_eval True \
  --sample-size 10000 \
  --generate_reverse True

## Comparison (Balanced data)

In [None]:
# Balanced comparison dataset; no explicit split sizes are passed here, so the
# script's own defaults apply.
!python data_generate.py \
  --task comparison \
  --experiment_name comparison_bal \
  --train_eval True \
  --sample-size 5000

## Sorting (Doubly balanced data)


In [None]:
# Doubly balanced sorting dataset; no explicit split sizes are passed here, so
# the script's own defaults apply.
!python data_generate.py \
  --task sorting \
  --experiment_name 4_operands_sorting_doubly_balanced \
  --train_eval True \
  --sample-size 5000

# II. Let's Start Training!

#### The `.txt` argument passed to `train.py` is the configuration file

## 4 Operands Addition

#### Reverse Output format

In [None]:
# Config: 4-operand addition, reversed output format.
!python train.py 4_operands_addition_reversed.txt

#### Plain output format

In [None]:
# Config: 4-operand addition, plain output format.
!python train.py 4_operands_addition_plain.txt

#### NanoGPT Scaling (20M)

In [None]:
# Config: NanoGPT scaling run (20M), plain-output 4-operand addition.
!python train.py 20M_4_operands_addition_plain.txt

#### NanoGPT Scaling (100M)

In [None]:
# Config: NanoGPT scaling run (100M), plain-output 4-operand addition.
!python train.py 100M_4_operands_addition_plain.txt

#### Scratchpad Form 1

In [None]:
# Config: plain-output 4-operand addition with scratchpad form 1.
!python train.py 4_operands_addition_plain_scratchpad1.txt

#### Scratchpad Form 2

In [None]:
# Config: plain-output 4-operand addition with scratchpad form 2.
!python train.py 4_operands_addition_plain_scratchpad2.txt

#### Pythia Finetuning

In [None]:
# Config: Pythia fine-tuning on plain-output 4-operand addition.
!python train.py 4_operands_addition_plain_pythia.txt

##

## Simple Multiplication

In [None]:
# Config: simple multiplication with reversed output; the file name suggests
# the 40-digit x 1-digit setup — confirm against the config.
!python train.py 40_1_digits_mul_reversed.txt

## Comparison

In [None]:
# Config: comparison task (balanced data, per the file name).
!python train.py comparison_bal.txt

## Sorting

In [None]:
# Config: 4-operand sorting task (doubly balanced data, per the file name).
!python train.py 4_operands_sorting_doubly_bal.txt

## Alternative format of input sequence (Slicing)

In [None]:
# Alternative input-sequence format: `--batch slicing`, plain-output config.
!python train.py slicing_addition_4_operand_plain.txt --batch slicing

In [None]:
# Alternative input-sequence format: `--batch slicing`, reversed-output config.
!python train.py slicing_addition_4_operand_reverse.txt --batch slicing

## Alternative Positional Encoding

In [None]:
# Reversed-output addition config with the positional encoding set to RoPE
# via `--PE` (presumably backed by model_rope.py — confirm in train.py).
!python train.py 4_operands_addition_reversed.txt --PE RoPE

In [None]:
# Reversed-output addition config with the positional encoding set to t5
# via `--PE` (presumably backed by model_t5bias.py — confirm in train.py).
!python train.py 4_operands_addition_reversed.txt --PE t5

## Greedy Decoding

In [None]:
# Reversed-output addition config, run with the `--greedy` (greedy decoding) flag.
!python train.py 4_operands_addition_reversed.txt --greedy

In [None]:
# Plain-output addition config, run with the `--greedy` (greedy decoding) flag.
!python train.py 4_operands_addition_plain.txt --greedy

# III. Result Analysis

## Addition Task

#### Digitwise Error Rates (4 operands addition)

In [5]:
# Digit-wise error analysis over the reversed-output test results CSV.
# NOTE(review): the path goes through `reverse_out_test_garbage`, while the
# other analysis cells read from `reverse_out` — confirm this is the intended run.
!python result_analysis_script/digitwise_error.py \
  results/4_operands_0_to_999_uniform/reverse_out_test_garbage/4_operands_0_to_999_uniform_reverse/test_reverse_results.csv

  df = pd.read_csv(CSV_PATH)
[(0, 'pred_iter_0'), (2000, 'pred_iter_2000'), (4000, 'pred_iter_4000'), (6000, 'pred_iter_6000'), (8000, 'pred_iter_8000'), (10000, 'pred_iter_10000'), (12000, 'pred_iter_12000'), (14000, 'pred_iter_14000'), (16000, 'pred_iter_16000'), (18000, 'pred_iter_18000'), (20000, 'pred_iter_20000'), (22000, 'pred_iter_22000'), (24000, 'pred_iter_24000'), (26000, 'pred_iter_26000'), (28000, 'pred_iter_28000'), (30000, 'pred_iter_30000'), (32000, 'pred_iter_32000'), (34000, 'pred_iter_34000'), (36000, 'pred_iter_36000'), (38000, 'pred_iter_38000'), (40000, 'pred_iter_40000'), (42000, 'pred_iter_42000'), (44000, 'pred_iter_44000'), (46000, 'pred_iter_46000'), (48000, 'pred_iter_48000'), (50000, 'pred_iter_50000'), (52000, 'pred_iter_52000'), (54000, 'pred_iter_54000'), (56000, 'pred_iter_56000'), (58000, 'pred_iter_58000'), (60000, 'pred_iter_60000'), (62000, 'pred_iter_62000'), (64000, 'pred_iter_64000'), (66000, 'pred_iter_66000'), (68000, 'pred_iter_68000'), (70000

#### Fit Normal

In [None]:
# Early-training window (dense-eval run): iterations 1000 to 1800 in steps of
# 200, with the diff range limited to [-800, 800].
!python result_analysis_script/fit_normal.py \
  --input results/4_operands_0_to_999_uniform/reverse_out_early_dense_eval/early_dense_eval_for_normal_distr_4_operands_0_to_999_uniform_reverse/test_reverse_results.csv \
  --iter-start 1000 \
  --iter-end 1800 \
  --iter-step 200 \
  --diff-min -800 \
  --diff-max 800


In [None]:
# Mid-training window: iterations 8000 to 12000 in steps of 2000, with the
# diff range limited to [-100, 100].
!python result_analysis_script/fit_normal.py \
  --input results/4_operands_0_to_999_uniform/reverse_out/4_operands_0_to_999_uniform_reverse/test_reverse_results.csv \
  --iter-start 8000 \
  --iter-end 12000 \
  --iter-step 2000 \
  --diff-min -100 \
  --diff-max 100


In [None]:
# Late-training window: iterations 60000 to 64000 in steps of 2000, with the
# diff range limited to [-20, 20].
!python result_analysis_script/fit_normal.py \
  --input results/4_operands_0_to_999_uniform/reverse_out/4_operands_0_to_999_uniform_reverse/test_reverse_results.csv \
  --iter-start 60000 \
  --iter-end 64000 \
  --iter-step 2000 \
  --diff-min -20 \
  --diff-max 20


#### Mutual Information Plot

In [None]:
# Plot mutual-information metrics from the mi_metrics.csv of the
# `reverse_out_complete_MI_1M_lines` run.
!python result_analysis_script/plot_mi_metrics.py \
  results/4_operands_0_to_999_uniform/reverse_out_complete_MI_1M_lines/4_operands_0_to_999_uniform_reverse/mi_metrics.csv

#### Scaling, Scratchpad, and Pythia finetuning

In [None]:
# Compare the scaled NanoGPT, Pythia and scratchpad runs.
# Each `--test` takes four values: a label, a results CSV, a number and a
# True/False flag. Presumably the number is an iteration count and the flag
# marks scratchpad runs (only those pass True) — confirm against the script.
# NOTE(review): the 20M path uses `20M4_operands...` (no underscore) while the
# 100M path uses `100M_4_operands...` — confirm the directory name on disk.
!python result_analysis_script/scaling_scratchpad_finetuning.py \
  --test "20M NanoGPT" results/4_operands_0_to_999_uniform/20M_reverse_out/20M4_operands_0_to_999_uniform_reverse/test_reverse_results.csv 20000 False \
  --test "100M NanoGPT" results/4_operands_0_to_999_uniform/100M_reverse_out/100M_4_operands_0_to_999_uniform_reverse/test_reverse_results.csv 50000 False \
  --test "Pythia 1B" results/4_operands_0_to_999_uniform/plain_out_pythia/4_operands_addition_plain_pythia_1b/test_results.csv 6000 False \
  --test "Scratchpad D" results/4_operands_0_to_999_uniform_scratchpad1/plain_out/4_operands_0_to_999_uniform_plain_scratchpad1/test_scratchpad1_results.csv 4500 True \
  --test "Scratchpad A + D" results/4_operands_0_to_999_uniform_scratchpad2/plain_out/4_operands_0_to_999_uniform_plain_scratchpad2_1/test_scratchpad2_results.csv 500 True

## Simple Multiplication Task

#### Digitwise Error (Simple multiplication, Colormap)

In [None]:
# Digit-wise error colormap for the 40-digit x 1-digit multiplication run,
# limited with `--max_steps 3000`.
!python result_analysis_script/mul_digitwise_error_colormap.py \
  results/40_digit_times_1_digit/reverse_out/40_digit_times_1_digit/test_reverse_results.csv \
  --max_steps 3000

## Comparison Task




#### Comparison Error Rate (Contrast Pairs)

In [9]:
# Contrast-pair error rates: one results CSV per digit place (thousands,
# hundreds, tens, units, per the file names). The plot is saved next to the
# inputs under the name given by --output_file_name (see the cell output).
!python result_analysis_script/comparison_error_rate.py \
  results/comparison_bal/comparison_bal_1/thousands_diff_only_results.csv \
  results/comparison_bal/comparison_bal_1/hundreds_diff_only_results.csv \
  results/comparison_bal/comparison_bal_1/tens_diff_only_results.csv \
  results/comparison_bal/comparison_bal_1/units_diff_only_results.csv \
  --output_file_name contrast_pair_error_rate

Saved plot to results/comparison_bal/comparison_bal_1/contrast_pair_error_rate


## Sorting Task

#### Sorting Subskill from 10% to 90% Range

In [None]:
# Passes five result CSVs (test plus four digitwise_* variants) and five
# --mode values; presumably each mode pairs with the CSV in the same position
# — confirm against sorting_acc_10_90_range.py.
!python result_analysis_script/sorting_acc_10_90_range.py \
  --csv \
    results/4_operands_sorting_doubly_balanced/conflicting_same_control_exp_correction/4_operands_sorting_doubly_balanced_conflicting_same_correction/test_results.csv \
    results/4_operands_sorting_doubly_balanced/conflicting_same_control_exp_correction/4_operands_sorting_doubly_balanced_conflicting_same_correction/digitwise_random_results.csv \
    results/4_operands_sorting_doubly_balanced/conflicting_same_control_exp_correction/4_operands_sorting_doubly_balanced_conflicting_same_correction/digitwise_thousand_results.csv \
    results/4_operands_sorting_doubly_balanced/conflicting_same_control_exp_correction/4_operands_sorting_doubly_balanced_conflicting_same_correction/digitwise_hundred_results.csv \
    results/4_operands_sorting_doubly_balanced/conflicting_same_control_exp_correction/4_operands_sorting_doubly_balanced_conflicting_same_correction/digitwise_ten_results.csv \
  --positions 1,2,3,4 \
  --mode length first second third fourth


#### Sorting Mixing Error

In [None]:
# Mixing-error analysis on the `1_3_same_2_4_agreeing_v2` results CSV of the
# sorting control experiment (v2).
!python result_analysis_script/mixing_error.py \
  results/4_operands_sorting_doubly_balanced/conflicting_same_control_exp_correction_v2/4_operands_sorting_doubly_balanced_conflicting_same_correction_v2/1_3_same_2_4_agreeing_v2_results.csv
