## Train-Test Split
For models that require training, we need to perform certain splits

In [2]:
import pandas as pd
import os
import numpy as np

### Categorical Split

In [3]:
TRAIN_CATEGORIES = [
 'Sales and Market Trends',
 'Wildlife and Nature Observation',
 'Environmental and Climate Data',
 'Energy and Resource Consumption',
 'Technological and Digital Trends',
 'Recreational and Entertainment Trends',
 'Transport and Traffic Trends']

TEST_CATEGORIES = ['Health and Medical Data',
 'Agricultural and Food Production',
 'Educational and Public Services'
]

In [4]:
categories = pd.read_json("categorized_output.json",lines=True)[["uuid","category"]].drop_duplicates()
categories.rename(columns={"category":"scenario_category"}, inplace=True)
categories["scenario_category"] = categories["scenario_category"].str.split("Category:").str[-1].str.strip().str.removeprefix("'").str.removesuffix("'")

desciptions = pd.read_json("data/processed/ts2desc/v2.jsonl",lines=True)
desciptions = desciptions.join(categories.set_index("uuid"),on="uuid", how="left")

def train_test_split_categorical(df, out_path, val_size=1000, return_dfs=False, shuffle=False):
    if shuffle:
        df = df.sample(frac=1, random_state=42)
    df = df.join(categories.set_index("uuid"),on="uuid", how="left")
    train_df = df[df["scenario_category"].isin(TRAIN_CATEGORIES)].sample(frac=1, random_state=42)
    val_df = train_df.sample(n=val_size, random_state=42)
    train_df.drop(val_df.index, inplace=True)
    test_df = df[df["scenario_category"].isin(TEST_CATEGORIES)]

    if return_dfs:
        return train_df, val_df, test_df
    
    train_df.to_json(os.path.join(out_path,"train.json"),index=False, lines=True, orient="records")
    val_df.to_json(os.path.join(out_path,"val.json"), index=False, lines=True, orient="records")
    test_df.to_json(os.path.join(out_path,"test.json"),index=False, lines=True, orient="records")


In [None]:

# train_test_split_categorical(desciptions, "data/processed/ts2stats_mcq_mike")

### Description MCQs

In [2]:
!python src/data/make_mcq.py --input_file  data/processed/ts2desc/train.json  --output_file data/processed/ts2desc_mcq/train.json --num_total_options 4 --label_col="description"
!python src/data/make_mcq.py --input_file  data/processed/ts2desc/val.json  --output_file data/processed/ts2desc_mcq/val.json --num_total_options 4 --label_col="description"
!python src/data/make_mcq.py --input_file  data/processed/ts2desc/test.json  --output_file data/processed/ts2desc_mcq/test.json --num_total_options 4 --label_col="description"

['description', 'description_short', 'description_tiny', 'characteristics', 'generator', 'metadata', 'series', 'uuid', 'scenario_category', 'options']
['description', 'description_short', 'description_tiny', 'characteristics', 'generator', 'metadata', 'series', 'uuid', 'scenario_category', 'options']
['description', 'description_short', 'description_tiny', 'characteristics', 'generator', 'metadata', 'series', 'uuid', 'scenario_category', 'options']


### Statistical MCQs

In [17]:
! python src/data/make_stat_mcq.py data/processed/ts2desc/train.json data/processed/ts2stats_mcq_mike/train.json
! python src/data/make_stat_mcq.py data/processed/ts2desc/val.json data/processed/ts2stats_mcq_mike/val.json
! python src/data/make_stat_mcq.py data/processed/ts2desc/test.json data/processed/ts2stats_mcq_mike/test.json

  x = um.multiply(x, x, out=x)
  return a + (b - a) * self.random()


### Augmented QA
Try splitting up the questions GPT-4 got wrong

In [None]:
/gscratch/bdata/datasets/llms_and_timeseries/Delete/v2_MCQ.json

In [4]:
gpt_incorrect_qa = pd.read_json("/gscratch/bdata/datasets/llms_and_timeseries/Delete/v2_MCQ.json", lines=True)
question_categories = ["counterfactual", "explanation", "argumentation", "analogical", "fact"]
gpt_incorrect_qa = gpt_incorrect_qa[gpt_incorrect_qa["category"].isin(question_categories)]      

In [None]:
train_test_split_categorical(gpt_incorrect_qa, "data/processed/qa_gpt4_incorrect", val_size=100)

In [4]:
counterfactual_qa = pd.read_json("/gscratch/bdata/datasets/llms_and_timeseries/Counterfactual/CF_Feb_1.json", lines=True)

In [11]:
train_test_split_categorical(counterfactual_qa, "data/processed/counterfactual_qa_mcq", val_size=100, shuffle=True)

In [9]:
def make_llava_example(row):
    return {
        "id": f'{row["uuid"]}_{row["ts_qid"]}',
        "image": row["image_path"],
        "conversations": [
          {
            "from": "human",
            "value": f"<image>\n{row['description_tiny']}\n{row['metadata']}\n{row['question']}"
          },
          {
            "from": "gpt",
            "value": row["options"][row["answer_index"]]
          },
        ]
      }
    

def make_llava_example_mcq(row):
    options = row["options"]
    letters = ["A","B","C","D"]
    options_str = "\n".join([f"{letters[i]}) {option}" for i, option in enumerate(options)])
    answer_str = f"{letters[row['answer_index']]})"
    return {
        "id": f'{row["uuid"]}',
        "image": row["image_path"],
        "conversations": [
          {
            "from": "human",
            "value": f"<image>\nPlease answer this question by picking from the options:\n{row['description_tiny']}\n{row['metadata']}\n{row['question']}\n{options_str}"
          },
          {
            "from": "gpt",
            "value": answer_str
          },
        ]
      }


def make_llava_example_desc(row):
    options = row["options"]
    np.random.shuffle(options)
    letters = ["A","B","C","D"]
    options_str = "\n".join([f"{letters[i]}) {option}" for i, option in enumerate(options)])
    answer_index = options.index(row["description"])
    answer_str = f"{letters[answer_index]}){row['description']}"
    return {
        "id": f'{row["uuid"]}',
        "image": row["image_path"],
        "conversations": [
          {
            "from": "human",
            "value": f"<image>\nPlease pick the correct description\n{options_str}"
          },
          {
            "from": "gpt",
            "value": answer_str
          },
        ]
      }


def format_examples_for_llava(df, image_dir, out_path, formatter=make_llava_example):
    
    for i, row in df.iterrows():
        expected_image_path = os.path.join(image_dir, f"{row['uuid']}.png")
        if not os.path.exists(expected_image_path):
            continue
        df.at[i,"image_path"] = f"{row['uuid']}.png"
    print(len(df))
    df = df[df["image_path"]!="nan"]
    print((df["image_path"]=="nan").sum())
    df["llava_example"] = df.apply(formatter, axis=1)
    df["llava_example"].to_json(out_path, orient="records") 

In [7]:
IMAGE_DIR = "/gscratch/bdata/mikeam/TSandLanguage/data/processed/ts_as_img/all"

In [None]:

qpt_incorrect_qa_train = pd.read_json("data/processed/qa_gpt4_incorrect/train.json", lines=True)
qpt_incorrect_qa_val = pd.read_json("data/processed/qa_gpt4_incorrect/val.json", lines=True)
qpt_incorrect_qa_test = pd.read_json("data/processed/qa_gpt4_incorrect/test.json", lines=True)

format_examples_for_llava(qpt_incorrect_qa_train, IMAGE_DIR, "data/processed/qa_gpt4_incorrect/train_llava.json")
format_examples_for_llava(qpt_incorrect_qa_val, IMAGE_DIR, "data/processed/qa_gpt4_incorrect/val_llaval.json")
format_examples_for_llava(qpt_incorrect_qa_test, IMAGE_DIR, "data/processed/qa_gpt4_incorrect/test_llava.json")

In [7]:
stats_train = pd.read_json("data/processed/ts2stats_mcq_mike/train.json", lines=True)
stats_val = pd.read_json("data/processed/ts2stats_mcq_mike/val.json", lines=True)
stats_test = pd.read_json("data/processed/ts2stats_mcq_mike/test.json", lines=True)

IMAGE_DIR = "/gscratch/bdata/mikeam/TSandLanguage/data/processed/ts_as_img/all"
format_examples_for_llava(stats_train, IMAGE_DIR, "data/processed/ts2stats_mcq_mike/train_llava.json", formatter=make_llava_example_mcq)
format_examples_for_llava(stats_val, IMAGE_DIR, "data/processed/ts2stats_mcq_mike/val_llava.json", formatter=make_llava_example_mcq)
format_examples_for_llava(stats_test, IMAGE_DIR, "data/processed/ts2stats_mcq_mike/test_llava.json", formatter=make_llava_example_mcq)

0
0
0


In [21]:
counterfactual_qa_train = pd.read_json("data/processed/counterfactual_qa_mcq/train.json", lines=True)
counterfactual_qa_val = pd.read_json("data/processed/counterfactual_qa_mcq/val.json", lines=True)
counterfactual_qa_test = pd.read_json("data/processed/counterfactual_qa_mcq/test.json", lines=True)


format_examples_for_llava(counterfactual_qa_train, IMAGE_DIR, "data/processed/counterfactual_qa_mcq/train_llava.json", formatter=make_llava_example_mcq)
format_examples_for_llava(counterfactual_qa_val, IMAGE_DIR, "data/processed/counterfactual_qa_mcq/val_llava.json", formatter=make_llava_example_mcq)
format_examples_for_llava(counterfactual_qa_test, IMAGE_DIR, "data/processed/counterfactual_qa_mcq/test_llava.json", formatter=make_llava_example_mcq)

131157
0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["llava_example"] = df.apply(formatter, axis=1)


100
0
19690
0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["llava_example"] = df.apply(formatter, axis=1)


In [10]:
ts2desc_mcq_train = pd.read_json("data/processed/ts2desc_mcq/train.json", lines=True)
ts2desc_mcq_val = pd.read_json("data/processed/ts2desc_mcq/val.json", lines=True)
ts2desc_mcq_test = pd.read_json("data/processed/ts2desc_mcq/test.json", lines=True)


format_examples_for_llava(ts2desc_mcq_train, IMAGE_DIR, "data/processed/ts2desc_mcq/train_llava.json", formatter=make_llava_example_desc)
format_examples_for_llava(ts2desc_mcq_val, IMAGE_DIR, "data/processed/ts2desc_mcq/val_llava.json", formatter=make_llava_example_desc)
format_examples_for_llava(ts2desc_mcq_test, IMAGE_DIR, "data/processed/ts2desc_mcq/test_llava.json", formatter=make_llava_example_desc)

5779
0
1000
0
1036
0


5779

In [18]:
len(ts2desc_mcq_train)

5779