# To set up the environment, I have exported my working env to a .yml file.

Just run `conda env create -f environment.yml` to install my environment.

If you would rather, you could create the environment from scratch.

In [5]:
import os
import sys
import json
import random
from collections import defaultdict

# Set project root once. Later cells join "config" and "results" onto this
# path, so everything is resolved relative to it.
PROJECT_ROOT = "./"
# Make the project root the working directory.
os.chdir(PROJECT_ROOT)
# Put the project root on the import path for the project-local imports
# (definition, model, tool, engine) used in the "Set up" section.
sys.path.append(PROJECT_ROOT)

import yaml
import torch
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
from matplotlib import cm
from tqdm.auto import tqdm
from sklearn.metrics import roc_curve, auc, log_loss, accuracy_score


# Dataset Download

First, you need to download the dataset used in the paper from this anonymous Dropbox link (it may take a minute).

In [4]:
import requests, zipfile, io
import os

def download_and_unzip_dropbox(dropbox_url, extract_to="."):
    """
    Download a zip archive from a Dropbox share link and extract it.

    The link is turned into a direct-download link by forcing its `dl` query
    parameter to `1`. The parameter is rewritten wherever it sits in the
    query string (`?dl=0` as well as `...&dl=0`) and is added when missing;
    every other query parameter is kept.

    Args:
        dropbox_url: Dropbox share URL pointing at a zip archive.
        extract_to: Directory the archive is extracted into.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        zipfile.BadZipFile: If the downloaded payload is not a zip archive.
    """
    # Local import so this cell does not depend on an import elsewhere.
    from urllib.parse import parse_qsl, urlencode, urlsplit, urlunsplit

    # Rebuild the query string with exactly one `dl=1` entry. Plain string
    # replacement of "?dl=0" misses links where `dl` is not the first
    # parameter and would append a second "?" to them.
    parts = urlsplit(dropbox_url)
    query = [
        (key, value)
        for key, value in parse_qsl(parts.query, keep_blank_values=True)
        if key != "dl"
    ]
    query.append(("dl", "1"))
    download_url = urlunsplit(parts._replace(query=urlencode(query)))

    # Download; the timeout keeps the cell from hanging forever on a stalled
    # connection (it bounds waiting for the server, not the total transfer).
    response = requests.get(download_url, timeout=60)
    response.raise_for_status()

    # Unzip straight from memory.
    with zipfile.ZipFile(io.BytesIO(response.content)) as zip_ref:
        zip_ref.extractall(extract_to)
    print(f"Extracted to {os.path.abspath(extract_to)}")

# Share link of the dataset archive. It is passed without `extract_to`, so
# the archive is unpacked into the current working directory (the default ".").
dropbox_url = "https://dl.dropboxusercontent.com/scl/fi/bk0gcvfy2ytni5eeu4gah/dataset.zip?rlkey=da701q1fc3xas3s1et76h5f7a&st=ft5099v2&dl=0"
download_and_unzip_dropbox(dropbox_url)

Extracted to /u/li19/moss_workshop_code


# Set up

In [6]:
from definition import *
from model.device_check import *
import tool.dynamic as dynamic
from engine.tweet import FixTensorFusion

In [7]:
# Lock every random number generator so the experiment is replicable.
seed = 1

random.seed(seed)                 # Python's built-in random module
np.random.seed(seed)              # NumPy's global generator
torch.manual_seed(seed)           # PyTorch (was called twice; once is enough)
torch.cuda.manual_seed(seed)
torch.cuda.manual_seed_all(seed)  # if you are using multi-GPU

# Disable cuDNN autotuning and request deterministic cuDNN kernels.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

In [8]:
# Root folder for all experiment outputs; the training function creates one
# sub-folder per experiment here and the plotting cells read from it.
RESULTS_ROOT = os.path.join(PROJECT_ROOT, "results")

In [9]:
# Use the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Training

In [10]:
def main_train(config, exp_version=None, reset=False, nTrials=1, verbose=True, synth=False, shorten=False):
    """
    Train and test one fusion model on a series of growing dataset subsets.

    For every trial, the engine named in the config is trained once per
    subset count 2**j (j = 5..15, or j = 13..15 when `shorten` is set) and
    the test accuracy of each run is appended to
    `<RESULTS_ROOT>/<experiment>/test_performances.json`.

    Args:
        config: File name of the YAML config inside `<PROJECT_ROOT>/config/tweet`.
        exp_version: Optional tag appended to the experiment folder as `_e_<tag>`.
        reset: Currently unused; kept so existing calls keep working.
        nTrials: Number of repetitions. When it is not 1, `_t<n>` is appended
            to the experiment folder name.
        verbose: Print the current subset and forward the flag to the engine.
        synth: Value written to `cfg["data"]["synthetic"]`.
        shorten: Only run the three largest subset counts.
    """
    import copy  # local import: only this function needs it

    # Load configuration file
    with open(os.path.join(PROJECT_ROOT, "config", "tweet", config), 'r') as f:
        # NOTE(review): FullLoader can build more than plain data types; fine
        # for trusted local configs, use yaml.safe_load if they are plain YAML.
        cfg = yaml.load(f, Loader=yaml.FullLoader)
    print(cfg)
    cfg["data"]["synthetic"] = synth
    base_name = cfg["exp_name"]

    # Subset counts are powers of two: 2**5 .. 2**15, or 2**13 .. 2**15 when shortened.
    first_exponent = 13 if shorten else 5

    for n in range(nTrials):
        exp_name = base_name
        if nTrials != 1:
            exp_name += "_t{}".format(n)

        if exp_version is not None:
            exp_name += "_e_{}".format(exp_version)

        # Path to save output files, like losses, scores, figures etc.
        report_path = os.path.join(RESULTS_ROOT, exp_name)
        os.makedirs(report_path, exist_ok=True)

        # Test accuracies of this trial, one entry per subset in ascending order
        test_performances = []

        # Progress tracking file path
        progress_file = os.path.join(report_path, "latest_subset.json")

        for exponent in range(first_exponent, 16):
            i = 2 ** exponent

            # Deep copy: a shallow copy would share the nested "data" dict,
            # so the per-subset edits below would leak into `cfg`.
            cur_cfg = copy.deepcopy(cfg)
            cur_exp_name = "subset{}".format(i)
            cur_cfg["exp_name"] = cur_exp_name
            cur_cfg["data"]["batch_size"] = 16
            cur_cfg["data"]["num_subsets"] = i

            # Path to save output files for the current subset
            cur_report_path = os.path.join(report_path, cur_exp_name)
            os.makedirs(cur_report_path, exist_ok=True)

            if verbose:
                print(f"Current exp: {cur_exp_name}")

            # Initialize and run training
            p = dynamic.import_string(cur_cfg["engine"])(cur_cfg, cur_report_path)
            p.verbose = verbose  # honour the caller's flag (was hard-coded to True)
            p.train(cur_cfg["train"])
            test_loss, test_acc = p.test()
            print("Subset size {}, test accuracy {}".format((i + 1) * 20, test_acc))
            test_performances.append(test_acc)

            # Save test performances after every subset so partial runs are kept
            with open(os.path.join(report_path, "test_performances.json"), "w") as outfile:
                outfile.write(json.dumps(test_performances))

            # Update and save progress to the latest_subset file
            with open(progress_file, "w") as outfile:
                json.dump({"latest_subset": i + 1}, outfile)


# Set fusion method and data source here

To train all models listed in the paper, you must run the training code with every config, both with `synth = True` and with `synth = False`.

Not all settings must be run to plot results; for the easiest test, do a single training run and plot its results.

In [None]:
# Pick the fusion config (index into all_cfg) and the data source to train on.
all_cfg = ["concat_tweet.yaml", "tensorfusion_tweet.yaml", "product_tweet.yaml"]
cfg = all_cfg[0]
synth = False

# Tag that main_train appends to the experiment folder name as "_e_<exp>".
exp = "synth" if synth else "basic"

main_train(config=cfg, exp_version=exp, reset=False, nTrials=1, synth=synth, shorten=True)

# Accuracy Plot

In [None]:
# Collect the saved test accuracies of every "tweet" experiment folder.
folders = [f for f in os.listdir(RESULTS_ROOT) if "tweet" in f]
all_data = {}
for f in folders:
    log_file = os.path.join(RESULTS_ROOT, f, "test_performances.json")
    # isfile() also covers entries that are plain files rather than folders,
    # which would make os.listdir() on them raise.
    if not os.path.isfile(log_file):
        continue
    with open(log_file, 'r') as file:
        all_data[f] = json.load(file)

window_size = 2  # Smoothing window size

colors = sorted(
            mcolors.BASE_COLORS, key=lambda c: tuple(mcolors.rgb_to_hsv(mcolors.to_rgb(c))))

# One group per (fusion method, data source); index = 2 * fusion + is_synth.
labels = ["concat_base","concat_synth","prod_base","prod_synth","tfuse_base","tfuse_synth"]
all_sets = [[] for _ in labels]
for t, scores in all_data.items():
    idx = 0
    if "prod" in t:
        idx += 2
    elif "tensorfusion" in t:
        idx += 4

    if "synth" in t:
        idx += 1
    all_sets[idx].append(scores)

# Runs inside one group are stacked into a 2-D array, so they must have the
# same number of subset scores (e.g. do not mix shortened and full runs).
for label, runs in zip(labels, all_sets):
    if len({len(run) for run in runs}) > 1:
        raise ValueError(
            f"Runs in group '{label}' have different numbers of subset scores: "
            f"{sorted({len(run) for run in runs})}"
        )

all_sets = [np.array(subset) for subset in all_sets]

all_means = []
all_stds = []

for model_data_pair in all_sets:
    if model_data_pair.size == 0:
        # No runs for this group: keep an empty placeholder instead of taking
        # the mean of nothing (NaN plus a RuntimeWarning).
        all_means.append(np.array([]))
        all_stds.append(np.array([]))
        continue
    all_means.append(model_data_pair.mean(axis=0))
    all_stds.append(model_data_pair.std(axis=0))

# x positions sized from the groups that actually have data; the first group
# (concat / real) may well be empty.
x = np.arange(max((len(m) for m in all_means if m.size), default=0))

In [None]:
# Mean test accuracy (with std error bars) per fusion method and data source.
fig, ax = plt.subplots(figsize=(10, 5))

for mean, std, label in zip(all_means, all_stds, labels):
    # Dashed lines for synthetic data, solid for real data
    line_style = 'dashed' if "synth" in label else 'solid'

    # One colour per fusion method
    if "concat" in label:
        color = "blue"
    elif "tfuse" in label:
        color = 'red'
    else:
        color = 'green'

    show_label = label.replace("_", " ").replace("synth", "synthetic").replace("tfuse", "tensorfusion").replace("prod", "product").replace("base", "real")

    # Skip groups without any (non-NaN) data
    if mean.size == 0 or np.isnan(mean).all():
        continue

    print(mean)
    ax.errorbar(
        # x positions derived from this series, so groups with a different
        # number of subsets than the first group still plot.
        np.arange(np.size(mean)),
        mean,
        label=show_label,
        yerr=std,
        capsize=2,
        elinewidth=2,
        markeredgewidth=1,
        linestyle=line_style,
        color=color
    )

ax.set_xlabel("Dataset Size in Chunks of 2^(x+5)")
ax.set_ylabel("Validation Accuracy")
ax.legend()
plt.show()

# ROC Plot

In [None]:
# Directory that holds one folder per trained experiment
base_dir = RESULTS_ROOT

# Substrings that identify the fusion method and the data source in a folder name
fusion_methods = ["concat", "prodconcat", "tensorfusion"]
environments = ["e_demo", "e_basic", "e_synth"]

# One (initially empty) list of folders per "<fusion>_<env>" combination
group_data = {}
for fusion in fusion_methods:
    for env in environments:
        group_data[f"{fusion}_{env}"] = []

# Every sub-directory of base_dir is treated as a trial folder
trial_folders = []
for folder in os.listdir(base_dir):
    if os.path.isdir(os.path.join(base_dir, folder)):
        trial_folders.append(folder)

# Assign each trial folder to the groups whose identifiers appear in its name
for folder in trial_folders:
    for fusion in fusion_methods:
        if fusion not in folder:
            continue
        # "concat" is a substring of "prodconcat": keep the two groups apart
        if fusion == "concat" and "prodconcat" in folder:
            continue
        for env in environments:
            if env in folder:
                group_data[f"{fusion}_{env}"].append(folder)


In [None]:

# Subset folder names ("subset32" ... "subset32768") and the size encoded in each
subset_folders = [f"subset{2 ** exponent}" for exponent in range(5, 16)]
sizes = [int(name.replace("subset", "")) for name in subset_folders]

# Rescale the sizes to [0, 1] and map them onto the viridis colormap
_smallest, _largest = min(sizes), max(sizes)
normalized_sizes = (np.array(sizes) - _smallest) / (_largest - _smallest)
colors = cm.viridis(normalized_sizes)  # Gradient colors for subsets

# Function to compute ROC curve for subsets within a group
def group_subset_roc(group_folders):
    """
    Compute the trial-averaged ROC curve of every subset for one group.

    For each folder in `group_folders` and each name in the module-level
    `subset_folders`, `<base_dir>/<folder>/<subset>/<subset>_test_results.json`
    is read when it exists; a message is printed and the entry skipped when
    it does not. Column 1 of the stored `ground_truth` and `predict` arrays
    is fed to `roc_curve` (presumably one-hot labels and class-1 scores;
    confirm against the engine that writes the file).

    Returns:
        dict mapping subset name -> (fpr grid of 100 points in [0, 1],
        TPR interpolated onto that grid and averaged over trials). Subsets
        without any result file are absent from the dict.
    """
    fpr_grid = np.linspace(0, 1, 100)
    tprs_by_subset = defaultdict(list)

    for folder in group_folders:
        for subset in subset_folders:
            test_results_path = os.path.join(base_dir, folder, subset, f"{subset}_test_results.json")
            if os.path.exists(test_results_path):
                with open(test_results_path, 'r') as f:
                    data = json.load(f)

                # Column 1 holds the labels / scores used for the curve
                y_true = np.array(data['ground_truth'])[:, 1]
                y_score = np.array(data['predict'])[:, 1]

                fpr, tpr, _ = roc_curve(y_true, y_score)
                # Resample onto the shared grid so trials can be averaged
                tprs_by_subset[subset].append(np.interp(fpr_grid, fpr, tpr))
            else:
                print(f"Test results file not found: {test_results_path}")

    # Average the resampled TPRs across trials; each subset gets its own grid copy
    return {
        subset: (fpr_grid.copy(), np.mean(tprs, axis=0))
        for subset, tprs in tprs_by_subset.items()
    }


# Trial-averaged ROC curves per group. Stored under its own name so the
# `all_means` list of the accuracy section is not overwritten.
roc_means = {key: group_subset_roc(folders) for key, folders in group_data.items()}

# Plot one panel per group that actually has trial folders. Taking the names
# from group_data (instead of re-deriving them from folder names) avoids
# KeyErrors for folders with a "_t<N>_" part or unrelated folders, and
# avoids drawing the same group once per trial.
groups = [name for name, folders in group_data.items() if folders]

# Plotting: at least the original 2x3 grid, more rows if more groups exist
n_cols = 3
n_rows = max(2, -(-len(groups) // n_cols))  # ceiling division
fig, axes = plt.subplots(n_rows, n_cols, figsize=(12, 3 * n_rows))
axes = axes.ravel()

for ax, group_name in zip(axes, groups):
    mean_results = roc_means[group_name]

    # One colour per fusion method ("concat" is also part of "prodconcat",
    # so the more specific names are checked first)
    if "prodconcat" in group_name:
        color = 'green'
    elif "tensorfusion" in group_name:
        color = 'red'
    else:
        color = "blue"

    for position, subset in enumerate(subset_folders, start=1):
        if subset not in mean_results:
            continue
        mean_fpr, mean_tpr = mean_results[subset]

        # Larger subsets are drawn more opaque
        alpha = position / 15

        ax.plot(mean_fpr, mean_tpr, color=color, alpha=alpha, lw=2,
                label=f"N={subset[6:]}")

    # Plot random guess line and formatting
    ax.plot([0, 1], [0, 1], color="gray", linestyle="--", lw=1)
    ax.set_title(group_name.replace("_", " ").replace(" e ", " ").replace("synth", "synthetic").replace("tfuse", "tensorfusion").replace("prodconcat", "product concat").replace("basic", "real"), fontsize=14)
    ax.set_xlim([0.0, 1.0])
    ax.set_ylim([0.0, 1.05])
    ax.set_xlabel("False Positive Rate")
    ax.set_ylabel("True Positive Rate")
    ax.legend(loc="lower right", fontsize=8)
    ax.grid(alpha=0.3)

# Hide panels that have no group to show
for ax in axes[len(groups):]:
    ax.set_visible(False)

# Adjust layout
plt.tight_layout()
plt.show()

# Robustness Plot

In [None]:
# Subset folder names ordered by dataset size: subset32, subset64, ..., subset32768
subsets_order = [f"subset{2 ** exponent}" for exponent in range(5, 16)]

def parse_model_name(model_name):
    """
    Parse a model directory name of the form
        tweet_<FUSION_TYPE>_e_<DATA_SOURCE>
    or, when several trials were run,
        tweet_<FUSION_TYPE>_t<NUMBER>_e_<DATA_SOURCE>

    The fusion type is the second "_"-separated token; the data source is
    the token right after the "e" marker. Looking for the marker (instead of
    a fixed position) keeps names with a trial part from yielding "e" as the
    data source.

    Returns:
        (fusion_type, data_source), or (None, None) when the name does not
        contain an "e" marker followed by a data source.
    """
    parts = model_name.split("_")
    try:
        # Start at index 2: parts[0] is the prefix, parts[1] the fusion type
        marker = parts.index("e", 2)
    except ValueError:
        return None, None

    if marker + 1 >= len(parts):
        # Name ends with the marker; there is no data source token
        return None, None

    return parts[1], parts[marker + 1]

def find_test_performance(test_perf_path,trials):
    """
    Given a path to test_performances.json, return both the maximum performance score
    and the subset that gave that best performance.

    Args:
        test_perf_path: JSON file holding a list of scores, one per subset,
            written in ascending subset-size order.
        trials: Subset folder names (e.g. "subset64"). They may come in any
            order (such as straight from os.listdir); they are sorted by the
            number embedded in the name before being matched to the scores.

    Returns:
        (max_score, best_subset). best_subset is "Unknown subset" when there
        are fewer folder names than the index of the best score.

    Raises:
        ValueError: If the file contains no scores.
    """
    with open(test_perf_path, 'r') as f:
        perf_data = json.load(f)

    if not perf_data:
        raise ValueError(f"No scores found in {test_perf_path}")

    def _subset_size(name):
        # Number embedded in the folder name; names without digits sort last
        digits = "".join(ch for ch in name if ch.isdigit())
        return int(digits) if digits else float("inf")

    # Scores are stored smallest subset first, so the folder names must be in
    # that order too; directory listings give no such guarantee.
    ordered_trials = sorted(trials, key=_subset_size)

    max_score = max(perf_data)
    best_index = perf_data.index(max_score)
    if best_index < len(ordered_trials):
        best_subset = ordered_trials[best_index]
    else:
        best_subset = "Unknown subset"

    return max_score, best_subset

def get_best_models(models_root="models"):
    """
    Find the best-scoring experiment folder per (fusion_type, data_source).

    Every sub-directory of `models_root` whose name parses via
    parse_model_name and that contains a test_performances.json is scored
    with find_test_performance; the highest score per key wins. Each folder
    that was scored is printed, followed by a summary line per key.

    Returns:
        dict mapping (fusion_type, data_source) -> (best_score, model_path, best_subset)
    """
    best_models = {}

    for entry in os.listdir(models_root):
        candidate_dir = os.path.join(models_root, entry)
        if not os.path.isdir(candidate_dir):
            continue

        # Skip folders that do not follow the naming pattern
        fusion_type, data_source = parse_model_name(entry)
        if not (fusion_type and data_source):
            continue

        # Skip folders without saved performances
        perf_file = os.path.join(candidate_dir, "test_performances.json")
        if not os.path.exists(perf_file):
            continue

        # Subset folders of this experiment (json files such as latest_subset.json excluded)
        trials = [name for name in os.listdir(candidate_dir) if "subset" in name and ".json" not in name]
        score, best_subset = find_test_performance(perf_file, trials)

        # Keep this folder if it is the first or the best one for its key
        key = (fusion_type, data_source)
        current = best_models.get(key)
        if current is None or score > current[0]:
            best_models[key] = (score, candidate_dir, best_subset)
        print(entry)

    # Print out the best models
    for (fusion_type, data_source), (score, path, subset) in best_models.items():
        print(f"For (fusion_type={fusion_type}, data_source={data_source}), best model: {path} with score {score} on {subset}")
    return best_models
    
# Scan the results folder once; the robustness `trial` function reads this
# module-level dict.
best_models = get_best_models(RESULTS_ROOT)

In [None]:
def get_config_for(fusion_type, data_source, conf_root=None):
    """
    Load the YAML configuration that belongs to a fusion type.

    Args:
        fusion_type: One of "concat", "prodconcat", "tensorfusion".
        data_source: Not used for the lookup; accepted so callers can pass
            the full (fusion_type, data_source) key.
        conf_root: Directory holding the config files. Defaults to
            `<PROJECT_ROOT>/config/tweet`, the same location the training
            function reads from, instead of a machine-specific absolute path.

    Returns:
        (cfg_data, cfg_train, config): the "data" section, the "train"
        section (each an empty dict when missing) and the full config.

    Raises:
        ValueError: If `fusion_type` is unknown.
        FileNotFoundError: If the config file does not exist.
    """
    if conf_root is None:
        conf_root = os.path.join(PROJECT_ROOT, "config", "tweet")

    # Map each fusion_type to the corresponding configuration file
    config_map = {
        "concat": "concat_tweet.yaml",
        "prodconcat": "product_tweet.yaml",
        "tensorfusion": "tensorfusion_tweet.yaml"
    }

    if fusion_type not in config_map:
        raise ValueError(f"Unknown fusion_type '{fusion_type}'. Supported types: {list(config_map.keys())}")

    config_file = os.path.join(conf_root, config_map[fusion_type])

    if not os.path.exists(config_file):
        raise FileNotFoundError(f"Configuration file {config_file} not found.")

    with open(config_file, 'r') as f:
        config = yaml.safe_load(f)

    # Extract the data and train configurations (empty dict when a key is missing)
    cfg_data = config.get('data', {})
    cfg_train = config.get('train', {})

    return cfg_data, cfg_train, config


def add_noise_to_batch(input_tuple, noise_level=0.1, apply_to='text'):
    """
    Add scaled random noise to one or both modalities of a batch.

    The noise for a modality is `randn * scale + shift` with that modality's
    (scale, shift) constants, multiplied by `noise_level` and added to the
    embeddings.

    Args:
        input_tuple: (Vv, Tt, Y, V). Vv and Tt are converted to float tensors;
            Y and V are passed through untouched.
        noise_level: Multiplier applied to the generated noise.
        apply_to: 'text' (noise on Tt), 'image' (noise on Vv) or 'both'.
            Any other value returns the batch without noise.

    Returns:
        (Vv, Tt, Y, V) with the selected modality/modalities perturbed.
    """
    Vv, Tt, Y, V = input_tuple

    # Convert to float tensor for consistent operations
    Vv = Vv.float()
    Tt = Tt.float()

    # (scale, shift) of the noise per modality
    v_params = (0.0333, 6.1028)  # image
    t_params = (0.0185, 0.3924)  # text

    def _perturb(tensor, params):
        # Draw noise shaped like `tensor` and add it, scaled by noise_level
        scale, shift = params
        noise = torch.randn_like(tensor) * scale + shift
        return tensor + (noise * noise_level)

    if apply_to == 'text':
        # Text noise uses the text constants, as the 'both' branch does;
        # this branch previously used the image constants.
        return (Vv, _perturb(Tt, t_params), Y, V)
    if apply_to == 'image':
        return (_perturb(Vv, v_params), Tt, Y, V)
    if apply_to == 'both':
        # Image noise is drawn before text noise (same order as before)
        Vv_noisy = _perturb(Vv, v_params)
        Tt_noisy = _perturb(Tt, t_params)
        return (Vv_noisy, Tt_noisy, Y, V)

    # No noise if apply_to is invalid
    return (Vv, Tt, Y, V)


def evaluate_with_noise(engine, loader, noise_levels=[0.0, 0.05, 0.1, 0.2], apply_to='text'):
    """
    Measure cross-entropy loss and accuracy of `engine` at several noise levels.

    For every noise level the whole `loader` is iterated once: each batch is
    perturbed with add_noise_to_batch, pushed through engine.forward_pass,
    and the labels and outputs are collected. Class indices are obtained with
    argmax over axis 1 of both the labels and the outputs.

    Returns:
        dict mapping noise level -> {'ce_loss': ..., 'accuracy': ...}
    """
    engine.set_eval()
    results = {}

    for nl in noise_levels:
        label_batches = []
        output_batches = []

        with torch.no_grad():
            for input_tuple in loader:
                output, labels = engine.forward_pass(
                    add_noise_to_batch(input_tuple, noise_level=nl, apply_to=apply_to)
                )
                label_batches.append(labels.long().cpu().data.numpy())
                output_batches.append(output.cpu().data.numpy())

        # Stack the per-batch arrays into one array each
        ground_truth = np.concatenate(label_batches, axis=0)
        predict = np.concatenate(output_batches, axis=0)

        results[nl] = {
            'ce_loss': log_loss(ground_truth, predict),
            # argmax turns the per-class rows into class indices
            'accuracy': accuracy_score(np.argmax(ground_truth, axis=1), np.argmax(predict, axis=1)),
        }

    return results


def trial(noise_levels = [0.0, 0.05, 0.1, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95], apply_noise_to = 'text'):
    """
    Evaluate every best model on the validation set under increasing noise.

    Iterates over the module-level `best_models` dict, restores each model's
    checkpoint (`<model_path>/<best_subset>/model.pth.tar`) and runs
    evaluate_with_noise on the engine's validation set. Entries whose
    checkpoint file is missing are reported and skipped.

    Args:
        noise_levels: Noise multipliers to evaluate.
        apply_noise_to: 'text', 'image' or 'both' (see add_noise_to_batch).

    Returns:
        dict mapping (fusion_type, data_source) -> results of evaluate_with_noise.
    """
    experiment_results = {}
    print(f"Running with noise on {apply_noise_to}")
    for (fusion_type, data_source), (best_score, model_path, best_subset) in tqdm(best_models.items()):
        # Check for the checkpoint first, so no dataset or model is built for
        # an entry that would be skipped anyway.
        subset_path = os.path.join(model_path, best_subset, "model.pth.tar")
        if not os.path.exists(subset_path):
            print("Skipping: ", subset_path)
            continue

        cfg_data, cfg_train, cfg = get_config_for(fusion_type, data_source)

        synth = "synth" in data_source

        # Only fills in the flag when the config does not define it.
        # NOTE(review): a config that already sets "synthetic" wins over the
        # data source of the model - confirm this is intended.
        if "synthetic" not in cfg["data"].keys():
            cfg["data"]["synthetic"] = synth
            cfg_data["synthetic"] = synth

        result_path = "out"

        engine = FixTensorFusion(cfg, result_path)
        engine.init_dataset(cfg_data)
        engine.init_models(cfg_train)

        # Load the best model weights.
        # NOTE(review): weights_only=False unpickles arbitrary objects; only
        # load checkpoints produced by this project.
        checkpoint = torch.load(subset_path, map_location=device,weights_only=False)

        # Restore every trained sub-model except "fusion"
        for m in engine.trained_models:
            if m != "fusion":
                getattr(engine, m).load_state_dict(checkpoint[m])

        engine.set_eval()

        # Dataloader over the validation set, in fixed order
        val_loader = torch.utils.data.DataLoader(
            engine.val_set,
            batch_size=cfg_data["val_batch_size"],
            shuffle=False
        )

        # Evaluate model performance under increasing noise levels
        results = evaluate_with_noise(engine, val_loader, noise_levels=noise_levels, apply_to=apply_noise_to)
        print(results)
        experiment_results[(fusion_type, data_source)] = results
    return experiment_results



# Run the robustness evaluation three times: noise on text, on image, on both.
t_data, i_data, b_data = [
    trial(apply_noise_to=modality) for modality in ("text", "image", "both")
]

In [None]:
plt.rcParams.update({
    'font.size': 14,
    'axes.labelsize': 14,
    'axes.titlesize': 16,
    'legend.fontsize': 12,
    'xtick.labelsize': 12,
    'ytick.labelsize': 12,
    'font.family': 'sans-serif',
    'font.sans-serif': ['DejaVu Sans'],
    'figure.figsize': (12, 6)
})

# One panel per noise setting, side by side
fig, ax_acc = plt.subplots(1,3)

# Define color for each fusion type
fusion_colors = {
    'prodconcat': 'green',
    'concat': 'blue',
    'tensorfusion': 'red'
}

# Define line style for each data source
source_styles = {
    'synth': '--',
    'basic': '-',
    'demo': '-'
}

all_res = [t_data, i_data, b_data]
# Named panel_titles (not `labels`) so the `labels` list of the accuracy
# section is not overwritten.
panel_titles = ["Textual Noise", "Image Noise", "Both Noised"]

for i, ax in enumerate(ax_acc.flatten()):
    # Plot accuracy
    results = all_res[i]
    for (fusion_type, data_source), noise_data in results.items():
        # Take the noise levels from the results themselves, so the plot
        # follows whatever levels the evaluation was actually run with.
        levels = sorted(noise_data)
        accuracies = [noise_data[n]['accuracy'] for n in levels]

        ax.plot(
            levels, accuracies,
            linestyle=source_styles[data_source],
            color=fusion_colors[fusion_type],
            linewidth=2,
            label=f"{fusion_type}-{data_source}".replace("-", " ").replace("basic", "Real").replace("prodconcat", "Product Concat").replace("tensorfusion", "tensor-fusion").title()
        )

    # Subplot title
    ax.set_title(panel_titles[i])

    # Only the first (left-most) panel shows y tick labels and the y label
    if i > 0:
        ax.tick_params(axis='y', which='both', labelleft=False)
    else:
        ax.set_ylabel("Accuracy")

    ax.set_xlabel("Noise Level")

    # Add a subtle grid
    ax.grid(True, linestyle=':', linewidth=0.7, alpha=0.8)
    ax.set_ylim(0, 1)

    # Add the legend only to the last (right-most) panel
    if i == 2:
        # Get current handles and labels from the plot
        handles, labels_legend = ax.get_legend_handles_labels()

        # Nothing was plotted: skip the legend instead of failing on unpacking
        if handles:
            # Sort by label
            sorted_pairs = sorted(zip(labels_legend, handles), key=lambda x: x[0])
            sorted_labels, sorted_handles = zip(*sorted_pairs)

            # Now create the legend with the sorted entries
            ax.legend(sorted_handles, sorted_labels, loc='best')

# Adjust layout for better spacing
plt.tight_layout()

# Save and show the figure
plt.savefig("model_performance_noise_accuracy_only.png", dpi=300)
plt.show()
