# IMDB (DistilBERT) Publication Suite

Fine-tunes DistilBERT on IMDB across multiple seeds to compare AdamW against SGD with momentum. Per-run CSVs are saved in `results/`, together with a paired statistical comparison CSV.

In [None]:
# Optional: install dependencies (e.g. when running on Kaggle).
# The command is left commented out; uncomment the next line to run it.
# !pip -q install transformers datasets accelerate

In [None]:
# Report the Python and PyTorch versions and which device will be used.
import sys
import torch

# Probe CUDA once and reuse the answer for both the flag and the device name.
_cuda_ok = torch.cuda.is_available()
for _label, _value in (
    ('Python:', sys.version),
    ('Torch:', torch.__version__),
    ('CUDA available:', _cuda_ok),
    ('Device:', 'cuda' if _cuda_ok else 'cpu'),
):
    print(_label, _value)

In [None]:
# Quick multi-seed run: launches nlp_publication.py with the --quick flag
# through the notebook's shell escape (`!`).
# NOTE(review): what --quick shortens (seeds, epochs, data size) is defined in
# nlp_publication.py, not in this notebook — confirm there.
!python nlp_publication.py --quick

In [None]:
# Preview the last rows of the statistical comparison CSV, if it has been written.
import os
import pandas as pd

stats_path = 'results/imdb_statistical_comparisons_publication.csv'
if not os.path.exists(stats_path):
    # Nothing to show until a run has produced the comparison file.
    print('No stats CSV found yet.')
else:
    stats_df = pd.read_csv(stats_path)
    display(stats_df.tail())

In [None]:
# Plot test_acc per epoch for the most recently written results file.
import glob
import os

import matplotlib.pyplot as plt
import pandas as pd

files = sorted(glob.glob('results/NN_DistilBERT_IMDB_*_publication.csv'))
if files:
    # Choose by modification time: the alphabetically last filename is not
    # necessarily the latest run. The path itself breaks ties so the choice
    # stays deterministic (and matches the old pick when times are equal).
    last = max(files, key=lambda path: (os.path.getmtime(path), path))
    df = pd.read_csv(last)
    if 'test_acc' not in df.columns:
        print('No test_acc column in last results file.')
    elif 'epoch' not in df.columns:
        # The x-axis needs this column; report it instead of raising KeyError.
        print('No epoch column in last results file.')
    else:
        plt.figure(figsize=(6, 4))
        plt.plot(df['epoch'], df['test_acc'], marker='o')
        plt.title(f'Test Accuracy per Epoch\n{last}')
        plt.xlabel('epoch')
        plt.ylabel('test_acc')
        plt.grid(True, alpha=0.3)
        plt.show()
else:
    print('No results files found yet.')

## Resume runs
To resume an interrupted run from its saved checkpoints, pass `--resume` together with the checkpoint directory (`--ckpt-dir`):

````
!python nlp_publication.py --seeds 1,2,3 --epochs 5 --batch-size 16 --resume --ckpt-dir checkpoints
````