In [1]:
import numpy as np
import pandas as pd
import pickle

In [2]:
def load_datasets(filename="dataset.pkl"):
    """Read a pickled benchmark file and return its domains and timestamps.

    The file is expected to unpickle to a mapping that contains the keys
    ``"datasets"`` and ``"time_points"``; any other entries are ignored.

    Args:
        filename: Path of the pickle file, opened in binary read mode.

    Returns:
        A ``(datasets, time_points)`` tuple taken from the loaded mapping.

    Note:
        ``pickle.load`` can execute arbitrary code while deserializing, so
        only use this on files that come from a trusted source.
    """
    with open(filename, "rb") as handle:
        payload = pickle.load(handle)

    domains = payload["datasets"]
    timestamps = payload["time_points"]
    return domains, timestamps

---

# Twitter

We collected streaming tweets in 50 windows, each starting at a randomly chosen timestamp and lasting 7 days, during the 2010-2014 flu seasons. We then counted the frequency of disease-related keywords in the collected tweets; these counts are used to infer the flu level in each state.  

- A sample in each domain represents the keyword frequencies of one state in the current 7-day window of tweets.  
- A domain represents the status of all states in the current 7-day time window.  
- The label indicates whether there has been a flu outbreak in that state.  
- The timestamp of a domain is the number of weeks between the current domain and the first domain.

In [3]:
# Load the Twitter benchmark: the per-domain data and the domain timestamps.
datasets, time_points = load_datasets(filename='./Twitter/dataset.pkl')

In [4]:
# Pull the two entries of the first domain and show their shapes.
domain_0_X = datasets[0][0]
domain_0_Y = datasets[0][1]
(domain_0_X.shape, domain_0_Y.shape)

((51, 526), (51,))

In [5]:
# Wrap the first Twitter domain's feature matrix in a DataFrame so that it
# can be inspected by column name.
domain_0 = pd.DataFrame(domain_0_X)
# Column names, in the same order as the columns of domain_0_X: 'state'
# followed by the tracked keywords. The list must supply exactly one name per
# column, otherwise the assignment raises.
domain_0.columns = ['state', 'flu', 'swine', 'stomach', 'symptoms', 'virus', 'bug', 'strep', 'season', 'influenza', 'fever', 'thera', 'poisoning', 'pneumonia', 'bird', 'infection', 'epidemic', 'week', 'bronchitis', 'sucks', 'immune', 'soon', 'colds', 'caught', 'medicine', 'soup', 'weekend', 'dying', 'recovering', 'weeks', 'meds', 'sick', 'outbreak', 'cough', 'sickness', 'strain', 'jab', 'kicking', 'catch', 'catching', 'bed', 'contagious', 'days', 'aches', 'yesterday', 'battling', 'cure', 'coming', 'doctor', 'rest', 'worst', 'tomorrow', 'case', 'illness', 'hours', 'due', 'worse', 'system', 'ini', 'weight', 'sinus', 'body', 'germs', 'morning', 'hangover', 'death', 'awful', 'hospital', 'fighting', '12s', 'away', 'feeling', 'sakit', 'horrible', 'miserable', 'since', 'ebola', 'lagi', 'die', 'chills', 'finals', 'vaccination', 'pounds', 'weather', 'nasty', 'played', 'hour', 'healthy', 'sicker', 'winter', 'finally', 'plague', 'spreading', 'food', 'avian', 'tablets', 'joke', 'pigs', 'severe', 'ridden', 'missed', 'terrible', 'puking', 'disease', 'ago', 'mono', 'coughing', 'tea', 'vaccine', 'office', 'ill', 'deaths', 'school', 'died', 'viruses', 'hoping', 'cold', 'home', 'fluids', 'throwing', 'spread', 'recover', 'cramps', 'games', 'rid', 'tested', 'summer', 'shot', 'house', 'mask', 'early', 'month', 'gave', 'vaccinated', 'gotten', 'slime', 'fun', 'felt', 'crap', 'suffering', 'boost', 'vitamin', 'haven', 'couple', 'starting', 'chicken', 'antibiotics', 'least', 'gym', 'killing', 'full', 'pandemic', 'test', 'doc', 'respiratory', 'sleep', 'pray', 'kill', 'mild', 'lots', 'hungover', 'whatever', 'missing', '102', 'kick', 'temp', 'hopefully', 'luck', 'couch', 'taking', 'glad', 'achy', 'diagnosed', 'recovery', 'working', 'poor', 'family', 'recovered', 'ache', 'health', 'mau', 'year', 'diet', 'sore', 'remedy', 'viral', 'drink', 'doctors', 'nurse', 'positive', 'tummy', 'kicked', 'officially', 'ready', 'weak', 'birthday', 'vomiting', 'nausea', 'course', 'fight', 'aku', 'helps', 
'avoid', 'kena', 'serious', 'years', 'quarantine', 'three', 'masks', 'clinic', 'months', 'cancelled', 'woke', 'cases', 'nih', 'rough', 'stay', 'passed', 'vacation', 'remedies', 'rather', 'infected', 'goal', 'past', 'spring', 'daughter', 'mist', '103', 'sanitizer', 'contracted', 'bit', 'energy', 'pain', 'news', 'wants', 'sort', 'etc', 'noodle', '100', 'later', 'zombie', 'class', 'hear', 'trip', 'twice', '000', 'infections', 'quarantined', 'entire', 'survived', 'common', 'possible', 'pills', 'fly', 'itu', 'lbs', 'aching', 'strains', 'pass', 'swag', 'puke', 'butt', 'slept', 'hrs', 'workout', 'infect', 'care', 'normal', 'thank', 'medication', 'able', 'hands', 'condom', 'headache', 'bring', 'praying', 'wear', 'ish', 'staying', 'eat', 'migraine', 'around', 'symptom', 'plus', 'vitamins', 'kind', 'thanks', 'break', 'playing', 'swear', 'nauseous', 'warm', 'dreaded', 'kids', 'sleeping', 'marathon', 'prevent', 'feelin', 'allergies', 'hot', 'beat', 'husband', 'miss', 'gettin', 'aids', 'needs', 'suck', 'half', 'official', 'wash', 'likely', 'deadly', 'bottle', 'sounds', 'infectious', 'nya', 'vai', 'far', 'visit', 'probably', 'wife', 'play', 'hubby', 'pig', 'juice', 'sister', 'shots', 'taken', 'plans', 'tired', 'already', 'lucky', 'jadi', 'prayers', 'couldn', 'lost', 'knocked', 'vaccines', 'brought', 'brother', 'next', 'survive', 'risk', 'household', 'threw', 'gak', 'awesome', 'stomache', 'literally', '1st', 'wake', 'ate', 'definitely', 'started', 'udah', 'almost', 'supposed', 'holiday', 'dehydrated', 'cramp', 'related', 'sama', 'super', 'appetite', 'jordan', 'heard', 'wish', 'lose', 'mend', 'snow', 'practice', 'afternoon', 'round', 'leave', 'slowly', 'outside', 'works', 'kills', 'wanted', 'spent', 'late', 'wasn', 'commercial', 'dose', '104', 'shitty', 'panas', 'killed', 'shift', 'cancer', 'yet', 'dealing', 'needed', 'throw', 'whole', 'pissed', 'awake', 'room', 'syrup', 'till', 'busy', 'struck', 'quickly', 'freaking', 'drinking', 'million', 'nap', 'water', 'classes', 'safe', 
'helping', 'fine', 'pregnant', 'juga', 'pox', '2014', '2009', 'less', 'birds', '2013', 'feels', 'pra', 'worked', 'throat', 'near', 'two', 'laying', 'confirmed', 'enough', 'ear', 'cancel', 'bisa', 'gini', 'activity', 'illnesses', 'fast', 'currently', 'pains', 'episode', 'goin', 'diarrhea', 'loss', 'beginning', 'tonsillitis', 'turned', 'killer', 'barely', '101', 'wine', 'lemon', 'pretty', 'type', 'sweating', 'welcome', 'either', 'came', 'dan', 'banget', 'cures', 'ginger', 'eaten', 'cup', 'onset', 'helped', 'effects', 'crappy', 'induced', 'managed', 'exhausted', 'kalo', 'stayed', 'injury', 'bro', 'holidays', 'immunity', 'dehydration', 'bgt', 'forward', 'stuff', 'dad', '24hr', 'four', 'hospitalized', 'players', 'giving', 'goodness', 'decided', 'quite', 'wouldn', 'stupid', 'everywhere', '2nd', 'boss', 'drop', 'waiting', 'healing', 'ive', 'rain', 'dizzy', 'free', 'yay', 'mate', 'thru', 'quick', 'emergency', 'apparently', 'excuse', 'tylenol', 'bugs', 'hasn', 'vomit', 'tonight', 'degree', 'careful', 'lovely', 'wonderful', 'combo', 'tests', 'thinks', 'ankle', 'work', 'hand', 'complications', 'children', 'start', 'aja']
# the first 'state' feature is a categorical feature
# Append the per-state labels as a final 'label' column, then display the
# frame as the cell output.
domain_0['label'] = domain_0_Y
domain_0 

Unnamed: 0,state,flu,swine,stomach,symptoms,virus,bug,strep,season,influenza,...,tests,thinks,ankle,work,hand,complications,children,start,aja,label
0,0,147,9,2,1,2,3,2,5,0,...,3,0,0,7,1,0,0,1,0,1
1,1,28,2,0,0,1,1,0,0,2,...,1,0,0,0,0,0,1,0,0,0
2,2,168,12,7,4,1,2,2,4,4,...,4,0,0,4,0,1,2,5,0,0
3,3,97,2,3,1,1,0,0,3,4,...,1,0,0,2,0,1,0,0,0,1
4,4,1202,103,75,49,24,15,3,50,31,...,17,1,0,33,1,2,14,10,0,0
5,5,146,3,11,3,1,2,1,6,12,...,2,0,0,4,0,0,2,0,0,0
6,6,77,9,0,0,0,0,0,1,1,...,1,0,0,2,0,0,2,0,0,0
7,7,27,1,0,0,1,0,0,0,2,...,0,0,0,0,0,0,0,0,0,0
8,8,185,13,7,1,6,2,2,10,16,...,3,0,0,2,0,1,2,1,0,0
9,9,546,123,13,7,2,2,1,24,17,...,5,0,0,8,1,0,11,5,0,0


In [6]:
time_points  # weeks elapsed since the first domain (fractional values: windows do not start on whole-week offsets)

array([  0.        ,   1.14285714,   2.        ,   2.71428571,
         4.28571429,  47.71428571,  55.71428571,  95.28571429,
        96.28571429,  98.        ,  98.42857143,  99.28571429,
       100.85714286, 101.57142857, 102.28571429, 103.        ,
       104.14285714, 104.85714286, 105.85714286, 106.85714286,
       108.71428571, 146.85714286, 148.71428571, 149.28571429,
       150.85714286, 151.57142857, 152.85714286, 153.85714286,
       154.14285714, 155.        , 155.85714286, 157.14285714,
       160.        , 197.57142857, 198.57142857, 200.42857143,
       200.85714286, 202.        , 203.        , 204.        ,
       205.        , 206.        , 206.85714286, 207.42857143,
       208.71428571, 209.28571429, 210.42857143, 211.        ,
       212.28571429, 212.85714286])

---

# YearBook

We randomly sampled 40 years of data from the 84-year YearBook (1930 to 2013) dataset, with each year representing a domain.  
(raw YearBook dataset: https://wild-time.github.io/#dataset)

- A sample in each domain represents a frontal-facing image with shape [32, 32, 1].  
- A domain represents all images collected in a given year.  
- The label is gender.  
- The timestamp of a domain is the number of years between the current domain and the first domain.

In [7]:
# Load the YearBook benchmark: the per-domain data and the domain timestamps.
datasets, time_points = load_datasets(filename='./YearBook/dataset.pkl')

In [8]:
# Pull the two entries of the first domain and show their shapes.
domain_0_X = datasets[0][0]
domain_0_Y = datasets[0][1]
(domain_0_X.shape, domain_0_Y.shape)

((154, 32, 32, 1), (154,))

In [9]:
time_points  # years elapsed since the first domain

array([ 0,  3,  5,  6,  7, 13, 15, 18, 20, 21, 24, 25, 26, 27, 28, 31, 33,
       34, 36, 38, 39, 42, 45, 48, 49, 50, 51, 53, 54, 55, 59, 60, 64, 65,
       68, 69, 70, 73, 74, 78])

---

# Cyclone

Each domain corresponds to a date on which cyclones occurred. We focused on cyclone data from the West Pacific region covering 2014 to 2016.  
(raw Cyclone dataset: https://www.csie.ntu.edu.tw/~htlin/program/TCIR/)  

- A sample in each domain represents a tropical cyclone image with shape [64, 64, 2].  
- A domain represents all cyclones that occurred on that date.  
- The label is wind intensity.  
- The timestamp of a domain indicates how far the date of the current domain is from the date of the first domain.

In [10]:
# Load the Cyclone benchmark: the per-domain data and the domain timestamps.
datasets, time_points = load_datasets(filename='./Cyclone/dataset.pkl')

In [11]:
# Pull the two entries of the first domain and show their shapes.
domain_0_X = datasets[0][0]
domain_0_Y = datasets[0][1]
(domain_0_X.shape, domain_0_Y.shape)

((61, 64, 64, 2), (61,))

In [12]:
time_points  # offset of each domain's date from the first domain (presumably in days; TODO confirm the unit)

array([  0,   5,  17,  26,  46,  58,  68,  73,  80,  81, 104, 111, 115,
       135, 159, 169, 171, 172, 173, 179, 186, 187, 191, 193, 195, 196,
       197, 199, 201, 205, 207, 209, 213, 214, 218, 220, 224, 232, 236,
       237, 241, 243, 244, 245, 248, 252, 254, 255, 258, 259, 261, 263,
       264, 266, 267, 273, 274, 275, 277, 281, 291, 295, 301, 303, 309,
       316, 317, 322, 333, 336, 343, 350])

---

# House

We collected streaming house price data in 40 windows, each starting at a randomly chosen timestamp and lasting 30 days, between 2013-2-1 and 2019-7-27. The data include the features 'postcode', 'propertyType' and 'bedrooms'. We one-hot encode 'postcode' and 'propertyType', and normalize the 'bedrooms' feature.  
(raw House dataset: https://github.com/BaiTheBest/DRAIN/blob/main/regression/data/HousePrice/raw_sales.csv)

- A sample in each domain represents a sale record.  
- A domain represents all sale records in the current 30-day time window.  
- The label is the house price (in units of $10,000).  
- The timestamp of a domain is the number of days between the current domain and the first domain.

In [13]:
# Load the House benchmark: the per-domain data and the domain timestamps.
datasets, time_points = load_datasets(filename='./House/dataset.pkl')

In [14]:
# Pull the two entries of the first domain and show their shapes.
domain_0_X = datasets[0][0]
domain_0_Y = datasets[0][1]
(domain_0_X.shape, domain_0_Y.shape)

((165, 30), (165,))

In [15]:
time_points  # days elapsed since the first domain

array([   0,   89,  253,  348,  381,  399,  529,  533,  744,  765,  772,
        811,  846,  852,  901,  936,  950, 1120, 1273, 1279, 1305, 1385,
       1424, 1475, 1529, 1560, 1650, 1667, 1877, 1901, 1941, 1944, 1961,
       2009, 2145, 2157, 2173, 2182, 2248, 2271])