In [15]:
import pandas as pd
from pandas.errors import OutOfBoundsDatetime
from matplotlib import pyplot as plt

from src.utils import read_jsonl, write_jsonl
from src.visualization.visualize import paper_mpl_env, title
import textwrap
import numpy as np
import os

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## GPT-4 Scenarios

In [2]:
# Load the v2 records (JSON Lines: one record per line).
# Prompt asks for shorter summaries and simpler situations
v2_data = pd.read_json('data/processed/ts2desc/v2.jsonl',lines=True)

### Extract Categories

In [3]:
# Build the uuid -> scenario_category lookup from the categorised output.
# Each raw "category" string is reduced to the text after the last
# "Category:" marker, trimmed of surrounding whitespace and of one wrapping
# single quote on either side.
categories = (
    pd.read_json("categorized_output.json", lines=True)[["uuid", "category"]]
    .rename(columns={"category": "scenario_category"})
    .assign(
        scenario_category=lambda frame: (
            frame["scenario_category"]
            .str.split("Category:").str[-1]
            .str.strip()
            .str.removeprefix("'")
            .str.removesuffix("'")
        )
    )
)

# Keep only rows whose category is among the ten most frequent ones, then
# persist the filtered lookup as CSV.
top_ten = categories["scenario_category"].value_counts().head(10).index.to_list()
is_top_ten = categories["scenario_category"].isin(top_ten)
categories = categories[is_top_ten]
categories.to_csv("data/processed/scenario_categories.csv", index=False)

In [4]:
# Attach the scenario category to every v2 record that has one (inner join
# on uuid), then keep a single row per uuid.
categorised_data = (
    v2_data
    .join(categories.set_index("uuid"), on="uuid", how="inner")
    .drop_duplicates(subset=["uuid"])
)

In [5]:
# Get df with one of each category.
# groupby(...).first() returns the first *non-null value of each column*
# within a group, so a "representative" row could be stitched together from
# several different samples whenever a field is missing. Selecting the first
# actual row per category keeps series, description and metadata from the
# same sample.
# Rows stay ordered by category name (as groupby did); the columns keep the
# order of `categorised_data` rather than moving scenario_category first.
one_of_each = (
    categorised_data
    .dropna(subset=["scenario_category"])  # groupby also ignored missing keys
    .drop_duplicates(subset=["scenario_category"], keep="first")
    .sort_values("scenario_category")
    .reset_index(drop=True)
)

### Examples

In [21]:
# Plot one example series per scenario category on a 5x2 grid, with the
# category name and the wrapped description written to the right of each
# panel, and save the figure as a PDF.
with paper_mpl_env():
    fig, axes = plt.subplots(5, 2, figsize=(16, 10))
    axes = axes.flatten()
    n_plotted = 0
    # Iterate over the rows that actually exist. The previous
    # `while n_plotted < 10` loop indexed past the end of `one_of_each`
    # (IndexError) as soon as one sample was skipped or fewer than ten
    # categories were available.
    for i in range(len(one_of_each)):
        if n_plotted == len(axes):
            break

        sample = one_of_each.iloc[i]
        ts = sample['series']
        description = sample['description']
        description_tiny = title(sample['description_tiny'])
        metadata = sample['metadata']
        units = title(metadata['units'])
        category = sample['scenario_category']

        try:
            x = pd.date_range(start=metadata['start'], end=metadata['end'], periods=len(ts))
        except OutOfBoundsDatetime:
            # Dates outside the range pandas timestamps can represent:
            # skip this sample instead of plotting it.
            print("Warning: OutOfBoundsDatetime")
            continue

        ax = axes[n_plotted]
        ax.plot(x, ts)
        ax.set_title(description_tiny.strip())
        # Category name, in bold, to the right of the panel
        ax.text(1.2, 0.90, category, fontsize=10, transform=ax.transAxes, fontweight='bold', ha='left')
        # Bold "Description:" label placed above the description text
        ax.text(1.2, 0.80, "Description:", fontsize=10, transform=ax.transAxes, fontweight='bold', ha='left')
        # Description to the right of each subplot, wrapped at 50 characters
        ax.text(1.2, 0.75, textwrap.fill(description.strip(), 50), fontsize=10, transform=ax.transAxes, va='top')
        # Set xticks to just first and last
        ax.set_xticks([x[0], x[-1]])
        ax.set_ylabel(units)
        n_plotted += 1

    # Hide any panels that received no sample so the saved figure has no
    # empty axes.
    for unused_ax in axes[n_plotted:]:
        unused_ax.set_visible(False)

    plt.tight_layout()
    plt.savefig("reports/2024/one_of_each.pdf")


### Donut Plot

In [65]:

# Donut chart of the share of each scenario category in `categorised_data`,
# saved as a PDF.
with paper_mpl_env():
    fig, ax = plt.subplots(figsize=(4, 2))
    category_portions = categorised_data["scenario_category"].value_counts(normalize=True)
    ax.pie(
        category_portions,
        labels=category_portions.index,
        autopct='%1.1f%%',
        startangle=0,
        wedgeprops={'edgecolor': 'white', 'linewidth': 1},
        textprops={'fontsize': 8},
        pctdistance=0.85,
    )
    # A white disc over the centre turns the pie into a donut
    ax.add_artist(plt.Circle((0, 0), 0.7, color='white'))

    fig.tight_layout()
    ax.axis('equal')  # Equal aspect ratio ensures that the pie is drawn as a circle
    ax.set_title('Scenario Category Portion', fontsize=14, fontweight='bold')
    fig.savefig("reports/2024/category_portion.pdf", bbox_inches='tight')


### Split QA Pairs

In [67]:
# Print the absolute path that the relative QA data directory resolves to.
!realpath data/processed/QA

/mmfs1/gscratch/bdata/datasets/llms_and_timeseries/QA


In [29]:
# Load every multiple-choice QA record (JSON Lines: one record per line).
QA_df = pd.read_json("data/processed/QA/all_mcq.json", lines=True)

# One scenario category per uuid, so the join below cannot multiply QA rows
# if a uuid appears more than once in `categories`.
scenario_by_uuid = categories.drop_duplicates(subset=["uuid"]).set_index("uuid")

# Keep the joined frame under its own name (the result used to be thrown
# away) and preview only the first rows instead of rendering the whole frame.
QA_categorised = QA_df.join(scenario_by_uuid, on="uuid", how="inner")
QA_categorised.head()

KeyboardInterrupt: 

In [28]:
# Print the 101st-110th questions of the "description" category.
# The filter is computed once, and a plain loop replaces the list
# comprehension that was only used for its side effect (it also echoed a
# list of None values as the cell output). Slicing with iloc prints whatever
# rows exist in that range instead of raising when fewer than 110 match.
description_questions = QA_df.loc[QA_df["category"] == "description", "question"]
for question in description_questions.iloc[100:110]:
    print(question)

What average sealevel air pressure in hPa was used for initializing the values?
What deviation in hPa was hypothetically assumed for the pressure readings?
How many random storm days were included in the month?
How many readings were collected in total over the 1month period at a rate of 1 reading every 30 minutes?
How was the fluctuation for storm activity incorporated into the data?
What type of data is being recorded?
How frequently is the data sampled?
What is the impact of the cyclone event on the data?
What is the typical pattern of the data throughout the year?
Are there any days with zero rainfall in the data?


[None, None, None, None, None, None, None, None, None, None]

## Statistical MCQ

In [26]:
# Load the ts2stats_mcq train records with the project's read_jsonl helper.
# NOTE(review): this is an absolute, machine-specific path, while the cell
# that writes the labelled records uses a repository-relative path — confirm
# whether both resolve to the same file.
stat_mcqs = read_jsonl("/gscratch/bdata/datasets/llms_and_timeseries/ts2stats_mcq/train.json")

In [27]:
def _attach_label(mcq):
    """Store the option selected by ``answer_index`` under ``label``; return the same record."""
    mcq["label"] = mcq["options"][mcq["answer_index"]]
    return mcq


# Single pass over the loaded records (each one is updated in place), then
# write the labelled records out as JSON Lines.
results = [_attach_label(mcq) for mcq in stat_mcqs]
write_jsonl(results, "data/processed/ts2stats_mcq/train.json")

## Splitting Descriptions by Category

In [None]:
# TODO: 'path/to/file.jsonl' looks like a placeholder — point this at the
# real descriptions file (JSON Lines) before running this cell.
df = pd.read_json('path/to/file.jsonl', lines=True)
