# Readme

In this notebook, we repeat the predictions on the 15 test sets using the 'rnn 9' and 'rnn 9 - data a.' models (trained without and with data augmentation, respectively), with different time sequences:

- the observed time points
- dt = 2, 4, 6, 8, 10 (followed by interpolation to match the observed time points)

# Setup

In [1]:
import torch
import torch.nn as nn 
import matplotlib.pyplot as plt 
import pandas as pd
import numpy as np
import time
import random
import pickle
import json
import itertools
import importlib
import os

  from pandas.core import (


In [2]:
import sys
# Add the local 'functions' directory to the module search path so the
# project helpers below can be imported.
sys.path.append ('functions')

import functions as mf
# Re-import the helper module so that edits made to it are picked up
# without restarting the kernel.
importlib.reload(mf) 

# NOTE(review): rnn_module is presumably located in the 'functions' directory
# added to sys.path above -- confirm.
from rnn_module import AmmoniaRNN

In [3]:
# Run on the first CUDA GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = 'cuda:0'
else:
    DEVICE = 'cpu'
DEVICE

'cpu'

In [4]:
# Show every column when a DataFrame is displayed (no truncation of wide frames).
pd.set_option("display.max_columns", None)

In [5]:
# --- Settings forwarded to the AmmoniaRNN constructor ---

# Recurrent part
hidden_size = 512
num_layers = 1
bidirectional = True
nonlinearity = "relu"

# Optional components of the network
mlp = True
with_embeddings = True

# Embedding configuration: one entry per categorical predictor
# (presumably cardinality and embedding width of each one -- confirm in rnn_module).
cat_dims = [5, 3, 2]
embedding_dims = [10, 9, 8]

# Sizes of the model input and output
input_size, output_size = 13, 1

# Name of the response column (not referenced by the other cells visible in this notebook).
response = "delta_e.cum"

In [6]:
# Fix the torch RNG before the network is instantiated, then build the model
# from the settings defined above and move it to the selected device.
# The weights are replaced later by the saved state dicts.
torch.manual_seed(1)
model = AmmoniaRNN(
    input_size=input_size,
    hidden_size=hidden_size,
    output_size=output_size,
    num_layers=num_layers,
    nonlinearity=nonlinearity,
    bidirectional=bidirectional,
    mlp=mlp,
    with_embeddings=with_embeddings,
    cat_dims=cat_dims,
    embedding_dims=embedding_dims,
)
model = model.to(DEVICE)

# Data

In [7]:
# Read the prepared dataset and discard the index column that was written to the CSV.
data = (
    pd.read_csv("../00_data_preparation/processed_data/data_rnn_3.csv")
    .drop(columns=['Unnamed: 0'])
)

In [8]:
# Preview the first six rows of the loaded dataset.
data.head (6)

Unnamed: 0,e.cum,delta_e.cum,e.cum_shift,dt,dt_origin,inst,pmid,country,meas.tech,ct,air.temp,wind.2m,rain.rate,tan.app,app.mthd,app.rate,man.dm,man.ph,man.source,incorp,t.incorp,interpolation
0,3.574,3.574,0.0,2.0,4.0,104,182,DK,micro met,2.0,8.2,8.1,0.0,122.11,0,31.8,3.7,7.35,0,0,1000.0,yes
1,7.148,3.574,3.574,2.0,4.0,104,182,DK,micro met,4.0,8.2,8.1,0.0,122.11,0,31.8,3.7,7.35,0,0,1000.0,yes
2,7.2826,0.1346,7.148,2.0,17.0,104,182,DK,micro met,6.0,7.758824,7.615294,0.0,122.11,0,31.8,3.7,7.35,0,0,1000.0,yes
3,7.4172,0.1346,7.2826,2.0,17.0,104,182,DK,micro met,8.0,7.317647,7.130588,0.0,122.11,0,31.8,3.7,7.35,0,0,1000.0,yes
4,7.5518,0.1346,7.4172,2.0,17.0,104,182,DK,micro met,10.0,6.876471,6.645882,0.0,122.11,0,31.8,3.7,7.35,0,0,1000.0,yes
5,7.6864,0.1346,7.5518,2.0,17.0,104,182,DK,micro met,12.0,6.435294,6.161176,0.0,122.11,0,31.8,3.7,7.35,0,0,1000.0,yes


We use the dataset 'data_rnn_3.csv', which is the augmented dataset. We redefine the variable `pmid` so that it identifies each (trial, time sequence) pair; the original trial identifier is kept in `original_pmid`.

In [9]:
# Label each row with its time sequence: "real times" for the observed time
# points (interpolation == "no"), otherwise "dt = <step>".
# The label is built column-wise (a map over 'dt' plus a mask) instead of a
# row-wise DataFrame.apply, which gives the same strings much faster.
dt_labels = data['dt'].map(lambda dt: f"dt = {dt}")
data['seq'] = dt_labels.where(data['interpolation'] != "no", "real times")

# Keep the trial identifier before 'pmid' is redefined. The guard makes the
# cell safe to re-run: without it, a second execution would overwrite
# 'original_pmid' with the already-suffixed identifier.
if 'original_pmid' not in data.columns:
    data['original_pmid'] = data['pmid']

# 'pmid' now identifies a (trial, time sequence) pair. It is always derived
# from 'original_pmid' so that re-running never appends the suffix twice.
data['pmid'] = data['original_pmid'].astype(str) + " " + data['seq']

In [10]:
# Load the trial identifiers of the test subsets (indexed by subset number below).
with open("../00_data_preparation/processed_data/list_test_pmids.json", "r") as pmid_file:
    list_test_pmids = json.load(pmid_file)

# Predictions with 'rnn 9 - data a.'

In [11]:
# Number of test subsets: subset k uses list_test_pmids[k] and its own saved model.
p = 15

In [12]:
# For each test subset, load the model trained with data augmentation and
# predict every (trial, time sequence) pair of that subset.
# The result is one DataFrame per subset, collected in `predictions_list`.
predictions_list = []

for k in range (p):

    filename_model = f'results/05_3_model_comparison_with_data_augmentation/models/sample_{k}_model_1'
    pmids = list_test_pmids[k]

    # Explicit copy of the filtered rows: the column assignments below then
    # write to an independent frame (no SettingWithCopyWarning, `data` untouched).
    predictions = data[data['original_pmid'].isin(pmids)].copy()

    new_pmids = predictions['pmid'].unique()

    # Placeholders created up front so the column order of the saved CSV is stable.
    predictions['prediction_ecum'] = None
    predictions['prediction_delta_ecum'] = None

    model.load_state_dict(torch.load(filename_model + '.pth', weights_only = True, map_location=torch.device('cpu')))
    # Inference mode: disables dropout / batch-norm updates if the model has any,
    # so predictions are deterministic.
    model.eval()

    # One 1-D tensor of predicted increments per sequence; concatenated once
    # after the loop instead of growing a tensor at every iteration.
    delta_chunks = []

    with torch.no_grad():

        for i in new_pmids:

            x = mf.generate_tensors_predictors (predictions, i, with_embeddings, device = DEVICE)
            y = model(x)
            # reshape(-1) always gives a 1-D tensor; squeeze() would return a
            # 0-d tensor for a one-step sequence, which torch.cat rejects.
            delta_chunks.append (y.reshape(-1))

    if delta_chunks:
        predicted_deltas = torch.cat (delta_chunks, 0)
    else:
        predicted_deltas = torch.empty(0)

    # NOTE(review): positional assignment -- assumes the rows of each pmid are
    # contiguous in `predictions` and that the helper keeps their order; confirm.
    predictions['prediction_delta_ecum'] = predicted_deltas.cpu().numpy()

    # Cumulative prediction = running sum of the predicted increments within each sequence.
    predictions['prediction_ecum'] = predictions.groupby('pmid')['prediction_delta_ecum'].cumsum()
    predictions['test_subset'] = k
    predictions_list.append (predictions)

In [13]:
# Stack the per-subset prediction frames into a single frame with a fresh index.
all_predictions = pd.concat(predictions_list, ignore_index = True)

In [14]:
# Quick look at the first two rows of the combined predictions.
all_predictions.head (2)

Unnamed: 0,e.cum,delta_e.cum,e.cum_shift,dt,dt_origin,inst,pmid,country,meas.tech,ct,air.temp,wind.2m,rain.rate,tan.app,app.mthd,app.rate,man.dm,man.ph,man.source,incorp,t.incorp,interpolation,seq,original_pmid,prediction_ecum,prediction_delta_ecum,test_subset
0,3.676383,3.676383,0.0,2.0,4.7,104,195 dt = 2.0,DK,micro met,2.0,11.82,7.47,0.0,109.39,0,30.9,3.1,7.53,0,0,1000.0,yes,dt = 2.0,195,3.945697,3.945697,0
1,7.352766,3.676383,3.676383,2.0,4.7,104,195 dt = 2.0,DK,micro met,4.0,11.82,7.47,0.0,109.39,0,30.9,3.1,7.53,0,0,1000.0,yes,dt = 2.0,195,5.744017,1.79832,0


In [15]:
# Save the predictions of the model trained with data augmentation.
# The output folder is created first so the write cannot fail on a fresh
# checkout after the (slow) inference loop has already run.
output_path = "results/07_time_seq_effect/predictions_from_model_with_data_augmentation.csv"
os.makedirs (os.path.dirname(output_path), exist_ok = True)
all_predictions.to_csv (output_path, index = False)

# Predictions with 'rnn 9'

In [16]:
# Number of test subsets: subset k uses list_test_pmids[k] and its own saved model.
p = 15

In [17]:
# For each test subset, load the model trained without data augmentation
# ('rnn 9') and predict every (trial, time sequence) pair of that subset.
# The result is one DataFrame per subset, collected in `predictions_list`.
predictions_list = []

for k in range (p):

    filename_model = f'results/05_1_model_comparison/models/sample_{k}_model_9'
    pmids = list_test_pmids[k]

    # Explicit copy of the filtered rows: the column assignments below then
    # write to an independent frame (no SettingWithCopyWarning, `data` untouched).
    predictions = data[data['original_pmid'].isin(pmids)].copy()

    new_pmids = predictions['pmid'].unique()

    # Placeholders created up front so the column order of the saved CSV is stable.
    predictions['prediction_ecum'] = None
    predictions['prediction_delta_ecum'] = None

    model.load_state_dict(torch.load(filename_model + '.pth', weights_only = True, map_location=torch.device('cpu')))
    # Inference mode: disables dropout / batch-norm updates if the model has any,
    # so predictions are deterministic.
    model.eval()

    # One 1-D tensor of predicted increments per sequence; concatenated once
    # after the loop instead of growing a tensor at every iteration.
    delta_chunks = []

    with torch.no_grad():

        for i in new_pmids:

            x = mf.generate_tensors_predictors (predictions, i, with_embeddings, device = DEVICE)
            y = model(x)
            # reshape(-1) always gives a 1-D tensor; squeeze() would return a
            # 0-d tensor for a one-step sequence, which torch.cat rejects.
            delta_chunks.append (y.reshape(-1))

    if delta_chunks:
        predicted_deltas = torch.cat (delta_chunks, 0)
    else:
        predicted_deltas = torch.empty(0)

    # NOTE(review): positional assignment -- assumes the rows of each pmid are
    # contiguous in `predictions` and that the helper keeps their order; confirm.
    predictions['prediction_delta_ecum'] = predicted_deltas.cpu().numpy()

    # Cumulative prediction = running sum of the predicted increments within each sequence.
    predictions['prediction_ecum'] = predictions.groupby('pmid')['prediction_delta_ecum'].cumsum()
    predictions['test_subset'] = k
    predictions_list.append (predictions)

In [18]:
# Stack the per-subset prediction frames into a single frame with a fresh index.
all_predictions = pd.concat(predictions_list, ignore_index = True)

In [19]:
# Quick look at the first two rows of the combined predictions.
all_predictions.head (2)

Unnamed: 0,e.cum,delta_e.cum,e.cum_shift,dt,dt_origin,inst,pmid,country,meas.tech,ct,air.temp,wind.2m,rain.rate,tan.app,app.mthd,app.rate,man.dm,man.ph,man.source,incorp,t.incorp,interpolation,seq,original_pmid,prediction_ecum,prediction_delta_ecum,test_subset
0,3.676383,3.676383,0.0,2.0,4.7,104,195 dt = 2.0,DK,micro met,2.0,11.82,7.47,0.0,109.39,0,30.9,3.1,7.53,0,0,1000.0,yes,dt = 2.0,195,2.56203,2.56203,0
1,7.352766,3.676383,3.676383,2.0,4.7,104,195 dt = 2.0,DK,micro met,4.0,11.82,7.47,0.0,109.39,0,30.9,3.1,7.53,0,0,1000.0,yes,dt = 2.0,195,5.203802,2.641772,0


In [20]:
# Save the predictions of the model trained without data augmentation.
# The output folder is created first so the write cannot fail on a fresh
# checkout after the (slow) inference loop has already run.
output_path = "results/07_time_seq_effect/predictions_from_model_without_data_augmentation.csv"
os.makedirs (os.path.dirname(output_path), exist_ok = True)
all_predictions.to_csv (output_path, index = False)

# End