<a href="https://colab.research.google.com/github/antonpolishko/A_colab_collection/blob/master/Transfer_Effects.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Examining Transfer Effects

In [None]:
# Colab bootstrap: authenticate, fetch the flow-forecast sources, install them,
# and log in to Weights & Biases. The statements below are order-dependent.
import os
import pandas as pd
from google.colab import auth
from datetime import datetime
# Authenticate the Colab user; presumably required by the gcloud/gsutil commands that follow.
auth.authenticate_user()
# Clone the repository from Cloud Source Repositories (project gmap-997).
!gcloud source repos clone github_aistream-peelout_flow-forecast --project=gmap-997
# Work from inside the clone so later shell commands and relative paths resolve there.
os.chdir('/content/github_aistream-peelout_flow-forecast')
# NOTE(review): the saved output of this cell reports branch 'covid_fixes', not this
# branch name - confirm which branch the notebook is meant to run against.
!git checkout -t plot_confidence_interval_in_wandb
# Install the package in development mode, then its requirements.
!python setup.py develop
!pip install -r requirements.txt
!mkdir data
# Imported only after the install steps above, presumably so flood_forecast is importable.
from flood_forecast.trainer import train_function
!pip install git+https://github.com/CoronaWhy/task-geo.git
# Log in to Weights & Biases (used by the sweep cells further down).
!wandb login

$ git clone https://github.com/AIStream-Peelout/flow-forecast
Cloning into '/content/github_aistream-peelout_flow-forecast'...
remote: Total 3770 (delta 2433), reused 3770 (delta 2433)[K
Receiving objects: 100% (3770/3770), 2.68 MiB | 14.30 MiB/s, done.
Resolving deltas: 100% (2433/2433), done.
Project [gmap-997] repository [github_aistream-peelout_flow-forecast] was cloned to [/content/github_aistream-peelout_flow-forecast].
Branch 'covid_fixes' set up to track remote branch 'covid_fixes' from 'origin'.
Switched to a new branch 'covid_fixes'
running develop
running egg_info
creating flood_forecast.egg-info
writing flood_forecast.egg-info/PKG-INFO
writing dependency_links to flood_forecast.egg-info/dependency_links.txt
writing requirements to flood_forecast.egg-info/requires.txt
writing top-level names to flood_forecast.egg-info/top_level.txt
writing manifest file 'flood_forecast.egg-info/SOURCES.txt'
package init file 'flood_forecast/__init__.py' not found (or not a regular file)
pac

In [None]:
# Recursively copy the experiments folder (run configs as .json, checkpoints as
# _model.pth) from the coronaviruspublicdata bucket into the working directory.
!gsutil cp -r gs://coronaviruspublicdata/experiments .

Copying gs://coronaviruspublicdata/experiments/13_May_202004_35PM.json...
Copying gs://coronaviruspublicdata/experiments/13_May_202004_35PM_model.pth...
Copying gs://coronaviruspublicdata/experiments/13_May_202004_36PM.json...
Copying gs://coronaviruspublicdata/experiments/13_May_202004_36PM_model.pth...
- [4 files][  5.4 MiB/  5.4 MiB]                                                
==> NOTE: You are performing a sequence of gsutil operations that may
run significantly faster if you instead use gsutil -m cp ... Please
see the -m section under "gsutil help options" for further information
about when gsutil -m can be advantageous.

Copying gs://coronaviruspublicdata/experiments/13_May_202004_37PM.json...
Copying gs://coronaviruspublicdata/experiments/13_May_202004_37PM_model.pth...
Copying gs://coronaviruspublicdata/experiments/13_May_202004_38PM.json...
Copying gs://coronaviruspublicdata/experiments/13_May_202004_38PM_model.pth...
Copying gs://coronaviruspublicdata/experiments/13_May_2

In [None]:
def make_config_file(file_path, df_len, weight_path=None):
  """
  Start a W&B run and build the flow-forecast configuration for one region.

  The sweep-controlled values (forecast_history, out_seq_length, batch_size, lr)
  are read from wandb.config after wandb.init(); every other value is fixed here.

  Args:
    file_path: CSV path used for the training, validation, test and inference
      datasets alike.
    df_len: Number of rows in that CSV. train_end is int(0.7 * df_len),
      valid_start is int(0.7 * df_len + 1) and valid_end is int(0.9 * df_len).
    weight_path: Optional checkpoint path; when truthy it is stored under the
      "weight_path" key.

  Returns:
    The configuration dict, which is also pushed into wandb.config.
  """
  # The run handle itself is not needed; init() is called for its side effect
  # of populating wandb.config with the sweep parameters.
  wandb.init(project="covid-forecast")
  sweep_params = wandb.config
  forecast_history = sweep_params["forecast_history"]
  out_seq_length = sweep_params["out_seq_length"]
  batch_size = sweep_params["batch_size"]

  train_number = df_len * .7
  validation_number = df_len * .9

  # Shared by the training and inference dataset sections; each use gets its own
  # list so a consumer mutating one section cannot affect the other.
  target_col = ("new_cases",)
  relevant_cols = ("new_cases", "month", "weekday", "year")

  config_default = {
    "model_name": "MultiAttnHeadSimple",
    "model_type": "PyTorch",
    "model_params": {
      # Must match the number of entries in relevant_cols.
      "number_time_series": 4,
      "seq_len": forecast_history,
      "output_seq_len": out_seq_length,
      "forecast_length": out_seq_length
    },
    "weight_path_add": {
      "excluded_layers": ["last_layer.weight", "last_layer.bias"]
    },
    "dataset_params": {
      "class": "default",
      "training_path": file_path,
      "validation_path": file_path,
      "test_path": file_path,
      "batch_size": batch_size,
      "forecast_history": forecast_history,
      "forecast_length": out_seq_length,
      "train_end": int(train_number),
      "valid_start": int(train_number + 1),
      "valid_end": int(validation_number),
      "target_col": list(target_col),
      "relevant_cols": list(relevant_cols),
      "scaler": "StandardScaler",
      "interpolate": False
    },
    "training_params": {
      "criterion": "MSE",
      "optimizer": "Adam",
      "optim_params": {},
      "lr": sweep_params["lr"],
      "epochs": 10,
      "batch_size": batch_size
    },
    "GCS": False,
    "sweep": True,
    "wandb": False,
    "forward_params": {},
    "metrics": ["MSE"],
    "inference_params": {
      "datetime_start": "2020-04-21",
      "hours_to_forecast": 10,
      "test_csv_path": file_path,
      "decoder_params": {
        "decoder_function": "simple_decode",
        "unsqueeze_dim": 1
      },
      # NOTE(review): this section uses the keys "scaling"/"interpolate_param"
      # while the training section uses "scaler"/"interpolate"; kept as-is.
      "dataset_params": {
        "file_path": file_path,
        "forecast_history": forecast_history,
        "forecast_length": out_seq_length,
        "relevant_cols": list(relevant_cols),
        "target_col": list(target_col),
        "scaling": "StandardScaler",
        "interpolate_param": False
      }
    }
  }
  if weight_path:
    config_default["weight_path"] = weight_path
  wandb.config.update(config_default)
  return config_default

# Search space for the W&B sweep (method "grid") over the four tunable
# hyper-parameters that make_config_file reads back from wandb.config.
sweep_config = dict(
  name="Default sweep",
  method="grid",
  parameters=dict(
    batch_size=dict(values=[2, 3, 4, 5]),
    lr=dict(values=[0.001, 0.002, 0.004, 0.01]),
    forecast_history=dict(values=list(range(1, 11))),
    out_seq_length=dict(values=[1, 2, 3]),
  ),
)

In [None]:
def format_corona_data(region_df:pd.DataFrame, region_name:str):
  """
  Format data for a specific region into
  a format that can be used with flow forecast.

  The region's display name is taken from the first row: the 'full_county'
  column when region_name == 'county', otherwise the 'state' column. The
  formatted frame is written to "<display name>.csv" in the working directory.

  Args:
    region_df: Rows for a single region; needs 'date' and 'cases' columns plus
      the name column described above. It is not modified.
    region_name: 'county' to name the output after the county, anything else
      to name it after the state.

  Returns:
    (formatted DataFrame, number of rows, CSV file name)

  Raises:
    ValueError: if region_df has no rows.
  """
  if region_df.empty:
    raise ValueError("format_corona_data received an empty DataFrame; no rows matched the requested region")
  name_column = 'full_county' if region_name == 'county' else 'state'
  region_name = region_df[name_column].iloc[0]
  print(region_name)
  # Work on a private copy: the caller usually passes a slice of the full
  # frame, and writing into it triggers SettingWithCopyWarning.
  region_df = region_df.copy()
  region_df['datetime'] = region_df['date']
  region_df['precip'] = 0
  region_df['temp'] = 0
  region_df = region_df.fillna(0)
  # diff() leaves NaN in the first row; there is no previous day, so use 0.
  region_df['new_cases'] = region_df['cases'].diff().fillna(0)
  csv_path = region_name + ".csv"
  region_df.to_csv(csv_path)
  return region_df, len(region_df), csv_path

def loop_through_geo_codes(df, column='full_county', min_rows=60):
  """
  Split df into one DataFrame per distinct value of `column`, keeping only
  groups with more than `min_rows` rows.

  Side effect: a 'full_county' column ("<state>_<county>") is added to df
  itself, because other cells filter df on that column afterwards.

  Args:
    df: Frame with 'state' and 'county' columns.
    column: Column whose distinct values define the groups.
    min_rows: A group is kept only when it has strictly more rows than this.

  Returns:
    List of per-group DataFrames, in order of first appearance in df.
  """
  df['full_county'] = df['state'] + "_" + df['county']
  # One pass over the frame instead of one boolean-mask scan per distinct code.
  # sort=False keeps first-appearance order; rows whose key is NaN are dropped,
  # which matches the old mask comparison never matching NaN.
  return [
    group for _, group in df.groupby(column, sort=False)
    if len(group) > min_rows
  ]

def fetch_time_series(url: str = "https://coronadatascraper.com/timeseries.csv",
                      local_path: str = "timeseries.csv") -> pd.DataFrame:
    """Fetch raw time series data from coronadatascraper.com

    The file at `url` is downloaded to `local_path` (overwriting any existing
    file) and then parsed as CSV.

    Args:
        url: Location of the CSV to download.
        local_path: Where the downloaded file is stored.
    Returns:
        pd.DataFrame: raw timeseries data at county/sub-region level
    """
    # Imported here because a bare `import urllib` elsewhere does not guarantee
    # that the urllib.request submodule has been loaded.
    import urllib.request

    urllib.request.urlretrieve(url, local_path)
    return pd.read_csv(local_path)
# List the contents of the experiments directory.
!ls experiments

13_May_202004_35PM.json       13_May_202005_34PM_model.pth
13_May_202004_35PM_model.pth  13_May_202005_35PM.json
13_May_202004_36PM.json       13_May_202005_35PM_model.pth
13_May_202004_36PM_model.pth  13_May_202005_36PM.json
13_May_202004_37PM.json       13_May_202005_36PM_model.pth
13_May_202004_37PM_model.pth  13_May_202005_37PM.json
13_May_202004_38PM.json       13_May_202005_37PM_model.pth
13_May_202004_38PM_model.pth  13_May_202005_38PM.json
13_May_202004_39PM.json       13_May_202005_38PM_model.pth
13_May_202004_39PM_model.pth  13_May_202005_39PM.json
13_May_202004_40PM.json       13_May_202005_39PM_model.pth
13_May_202004_40PM_model.pth  13_May_202005_40PM.json
13_May_202004_41PM.json       13_May_202005_40PM_model.pth
13_May_202004_41PM_model.pth  13_May_202005_41PM.json
13_May_202004_42PM.json       13_May_202005_41PM_model.pth
13_May_202004_42PM_model.pth  13_May_202005_42PM.json
13_May_202004_43PM.json       13_May_202005_42PM_model.pth
13_May_202004_43PM_model.pth  13_May_

In [None]:
# fetch_time_series() calls urllib.request.urlretrieve; importing the submodule
# explicitly guarantees it is loaded (a bare `import urllib` does not).
import urllib.request

# Download the raw data and derive the calendar features the model config
# lists in relevant_cols ("month", "weekday", "year").
df = fetch_time_series()
date_values = pd.to_datetime(df['date'])  # parse once, reuse for every feature
df['month'] = date_values.dt.month
df['weekday'] = date_values.dt.weekday
# "year" is requested by make_config_file's relevant_cols but was never created.
df['year'] = date_values.dt.year

# Per-county frames; this call also adds the 'full_county' column to df.
df_list = loop_through_geo_codes(df)

# Checkpoint whose weights are passed to make_config_file as the starting point.
weight_path = "experiments/13_May_202006_32PM_model.pth"

# Counties to run the sweep on, written as "<state>_<county>" to match 'full_county'.
special_city_list1 = [
  "New York_New York County",
  "California_Los Angeles County",
  "Illinois_Cook County",
  "Arizona_Maricopa County",
  "Massachusetts_Middlesex County",
  "Texas_Dallas County",
  "Texas_Harris County",
  "Florida_Miami Dade County",
  "California_Riverside County",
  "Colorado_Denver County",
  "Ohio_Cuyahoga County",
  "New York_Queens County",
  "New York_Bronx County",
]

  if self.run_code(code, result):


In [None]:
import wandb

# Register the grid sweep once, then run an agent for each county in turn.
sweep_id = wandb.sweep(sweep_config, project="covid-forecast")
# NOTE(review): the original cell carried a note naming 10_May_202001_12PM_model.pth -
# presumably an alternative checkpoint for weight_path; confirm.
query = "full_county=='{}'"

for city in special_city_list1:
  county_rows = df.query(query.format(city))
  region_df, full_len, file_path = format_corona_data(county_rows, 'county')

  def run_trial():
    """Build the config for the current county and train one sweep trial."""
    trial_config = make_config_file(file_path, full_len, weight_path=weight_path)
    return train_function("PyTorch", trial_config)

  wandb.agent(sweep_id, run_trial)


Create sweep with ID: kj03slcw
Sweep URL: https://app.wandb.ai/igodfried/covid-forecast/sweeps/kj03slcw
New York_New York County


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable

wandb: Agent Starting Run: hup0o89s with config:
	batch_size: 2
	forecast_history: 1
	lr: 0.001
	out_seq_length: 1
wandb: Agent Started Run: hup0o89s


sucessfully deleted layers


Process Process-1:
Traceback (most recent call last):
  File "/usr/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/usr/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/usr/local/lib/python3.6/dist-packages/wandb/wandb_agent.py", line 64, in _start
    function()
  File "<ipython-input-16-554f24166fcd>", line 7, in <lambda>
    wandb.agent(sweep_id, lambda:train_function("PyTorch", make_config_file(file_path, full_len, weight_path=weight_path)))
  File "/content/github_aistream-peelout_flow-forecast/flood_forecast/trainer.py", line 27, in train_function
    trained_model = PyTorchForecast(params["model_name"], dataset_params["training_path"], dataset_params["validation_path"], dataset_params["test_path"], params)
  File "/content/github_aistream-peelout_flow-forecast/flood_forecast/time_model.py", line 87, in __init__
    super().__init__(model_base, training_data, validation_data

California_Los Angeles County


KeyboardInterrupt: ignored

In [None]:
import torch

# Load a saved checkpoint for inspection. map_location="cpu" keeps the tensors
# on the CPU so the cell also works on a runtime without a GPU.
# NOTE: torch.load unpickles the file, which can execute arbitrary code - only
# load checkpoints that come from a trusted source.
state_dict = torch.load("10_May_202004_35PM_model.pth", map_location="cpu")

In [None]:
# Display the loaded checkpoint (parameter names and their tensors); the full
# repr is long.
state_dict

OrderedDict([('dense_shape.weight',
              tensor([[ 0.2922,  0.5831,  0.2123,  0.2958],
                      [-0.1401,  0.2656,  0.0496,  0.1917],
                      [-0.1451, -0.5410,  0.3328,  0.2463],
                      [ 0.0234,  0.0324, -0.3318, -0.2799],
                      [ 0.2017,  0.5305,  0.0124, -0.2474],
                      [ 0.0903, -0.3962, -0.1219,  0.0283],
                      [-0.4816,  0.2876, -0.1178, -0.3142],
                      [-0.3137,  0.2560, -0.2838,  0.0824],
                      [ 0.1107,  0.1075, -0.1542,  0.2204],
                      [ 0.4234,  0.2333,  0.1399, -0.0660],
                      [ 0.3076,  0.3471, -0.0755, -0.2816],
                      [-0.3710,  0.4123,  0.3036, -0.3267],
                      [-0.2920,  0.1289,  0.0660, -0.0643],
                      [ 0.5747, -0.3106,  0.3993,  0.4106],
                      [-0.5778,  0.2996, -0.1030, -0.2161],
                      [ 0.3630,  0.0339, -0.0280, -0.3495],
    

In [None]:
# Copy the pretrained checkpoint 10_May_202001_12PM_model.pth from the
# coronaviruspublicdata bucket into the working directory.
!gsutil cp gs://coronaviruspublicdata/pretrained/10_May_202001_12PM_model.pth .

Copying gs://coronaviruspublicdata/pretrained/10_May_202001_12PM_model.pth...
/ [1 files][  2.7 MiB/  2.7 MiB]                                                
Operation completed over 1 objects/2.7 MiB.                                      


In [None]:
# List the working directory to see which files are present.
!ls

 10_May_202004_35PM_model.pth   setup.py
 data			        site_metadata.json
 flood_forecast		        tests
 flood_forecast.egg-info        timeseries.csv
 model_save		        wandb
 README.md		       'Washington, D.C._District of Columbia.csv'
 requirements.txt
