## Set up

In [1]:
from os import listdir
import pandas as pd

## Combine predictions from multiple models

In [2]:
# Build one summary table for the development set: the labelled tweets plus
# one prediction column per model, then write it to CSV.

# load dataset with tweets and labels for development set
tweet_df = pd.read_csv('../data/derived/tweets_supervised_dev.csv')

# set tweet ID as index for easy joining later on
tweet_df.set_index('tweet_id', inplace=True)

# get files in prediction directory
predictions_directory = '../data/derived/predictions'
predictions_files = listdir(predictions_directory)

# keep only CSV files related to the development dataset; requiring the
# '.csv' suffix prevents other entries whose name merely contains 'dev'
# from being parsed as CSV and from getting a mangled model name when the
# extension is stripped below
csv_suffix = '.csv'
dev_files = [
    file for file in predictions_files
    if 'dev' in file and file.endswith(csv_suffix)
]

# sort files by name so the prediction columns appear in a stable order
dev_files.sort()

# iterate over development prediction files
for file in dev_files:

    # get model name from filename (safe: every entry ends with '.csv')
    model_name = file[:-len(csv_suffix)]

    # load predicted labels from file, drop the label column (the development
    # dataframe is the one loaded "with tweets and labels" above), name the
    # prediction column after the model, and index by tweet ID for joining
    prediction_df = (
        pd.read_csv(f'{predictions_directory}/{file}')
        .rename(columns={'prediction': model_name})
        .drop(columns='label')
        .set_index('tweet_id')
    )

    # left join predictions onto development dataset; tweets without a
    # prediction from this model get a missing value in its column
    tweet_df = tweet_df.join(prediction_df, how='left')

# write summary to CSV (tweet ID index is kept as the first column)
filepath_out = f'{predictions_directory}/supervised_summary.csv'
tweet_df.to_csv(filepath_out, index=True)