# Predicting world temperature with DeepAR
- [Source](https://julsimon.medium.com/predicting-world-temperature-with-time-series-and-deepar-on-amazon-sagemaker-e371cf94ddb5)  
- [Dataset - Daily Land](http://berkeleyearth.lbl.gov/auto/Global/Complete_TAVG_daily.txt)

In [75]:
# import data science and visualization libraries
%matplotlib inline
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
import sagemaker
import csv
import boto3
import json
from sagemaker import image_uris

print(sagemaker.__version__)

2.70.0


In [23]:
!wget -P ./data/ http://berkeleyearth.lbl.gov/auto/Global/Complete_TAVG_daily.txt

--2022-01-12 21:05:10--  http://berkeleyearth.lbl.gov/auto/Global/Complete_TAVG_daily.txt
Resolving berkeleyearth.lbl.gov (berkeleyearth.lbl.gov)... 128.3.29.26
Connecting to berkeleyearth.lbl.gov (berkeleyearth.lbl.gov)|128.3.29.26|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2737067 (2.6M) [text/plain]
Saving to: ‘./data/Complete_TAVG_daily.txt.1’


2022-01-12 21:05:10 (4.25 MB/s) - ‘./data/Complete_TAVG_daily.txt.1’ saved [2737067/2737067]



In [24]:
# Remove header lines (starting with a %), empty lines and lines with only spaces
!grep -v -e '^%\|^$\|^\ *$' ./data/Complete_TAVG_daily.txt > ./data/temps.txt
!head -10 ./data/temps.txt

  1880.001     1880     1     1        1      -0.815
  1880.004     1880     1     2        2       0.538
  1880.007     1880     1     3        3       0.157
  1880.010     1880     1     4        4       1.243
  1880.012     1880     1     5        5       0.396
  1880.015     1880     1     6        6       0.400
  1880.018     1880     1     7        7       0.547
  1880.021     1880     1     8        8       0.565
  1880.023     1880     1     9        9       0.338
  1880.026     1880     1    10       10       0.048


In [45]:
minYear  = 1880
maxYear  = 2021
avg_temp = 8.68

# Our model will predict temperature for the next 'prediction_length' days
prediction_length = 30

In [46]:
f = open('./data/temps.txt', 'r')
data = csv.reader(f,delimiter=' ')

In [48]:
dataset={}
x=[]
y=[]
count=1
prevYear=0

for row in data:
        # Remove empty strings caused by multiple spaces between columns
        row = list(filter(None, row))
        
        year=row[1]
        temp=float(row[5])+avg_temp
         
        # Data for plotting
        # x list=counter, y list=temperature
        x.append(count)
        y.append(float(temp))
        count += 1
        
        # Data for training
        # dictionary: key=year, value=list of ordered daily temperatures
        if (year != prevYear):
            dataset[year]=[]
            prevYear=year
        dataset[year].append(float(temp))

In [50]:
# Sometimes 'pythonic' rhymes with 'moronic' :D
nb_samples_per_year = list(map(lambda x: len(x), (dataset[str(year)] for year in range(minYear, maxYear+1))))
nb_samples_per_year = np.unique(nb_samples_per_year).tolist()

In [51]:
nb_samples_per_year

[128, 365, 366]

In [52]:
assert nb_samples_per_year == [128, 365, 366]

In [3]:
nbSamples=len(x)
print('Number of samples: %d' % nbSamples)

fig=plt.figure(figsize=(64, 16))
plt.plot(x,y)
plt.show()

NameError: name 'x' is not defined

In [54]:
trainingSet = dataset.copy()
trainingSet[year] = { year: dataset[year][:-prediction_length] for year in dataset.keys() }
testSet = dataset.copy()

In [60]:
train_key      = 'deepar_training.json'
test_key       = 'deepar_test.json'

def writeDataset(filename, data): 
    file=open(filename,'w')
    for year in data.keys():
        # One JSON sample per line
        line = "\"start\":\"{}-01-01 00:00:00\",\"target\":{}".format(year,data[year])
        file.write('{'+line+'}\n')

In [61]:
writeDataset(train_key, trainingSet)        
writeDataset(test_key, testSet)

In [None]:
!head -2 deepar_training.json

In [69]:
bucket = sagemaker.Session().default_bucket()
prefix = "deepar-daily-temperature"

train_prefix   = f'{prefix}/train'
test_prefix    = f'{prefix}/test'
output_prefix  = f'{prefix}/output'

In [70]:
sagemaker_session = sagemaker.Session()
role              = sagemaker.get_execution_role()
region            = boto3.Session().region_name

train_path  = sagemaker_session.upload_data(train_key, bucket=bucket, key_prefix=train_prefix)
test_path   = sagemaker_session.upload_data(test_key,  bucket=bucket, key_prefix=test_prefix)
output_path = f's3://bucket/output_prefix'

print(train_path)
print(test_path)
print(output_path)

s3://sagemaker-us-east-1-906545278380/deepar-daily-temperature/train/deepar_training.json
s3://sagemaker-us-east-1-906545278380/deepar-daily-temperature/test/deepar_test.json
s3://bucket/output_prefix


In [72]:
!aws s3 ls s3://{bucket}/{prefix} --recursive

2022-01-12 21:24:14     493808 deepar-daily-temperature/test/deepar_test.json
2022-01-12 21:24:14     940239 deepar-daily-temperature/train/deepar_training.json


In [87]:
container = image_uris.retrieve(framework='forecasting-deepar',region=region)
print(container)

522234722520.dkr.ecr.us-east-1.amazonaws.com/forecasting-deepar:1


In [90]:
estimator = sagemaker.estimator.Estimator(
    sagemaker_session=sagemaker_session,
    image_uri=container,
    role=role,
    instance_count=1,
    instance_type='ml.c4.8xlarge',
    base_job_name='daily-temperature',
    output_path=output_path
)

In [91]:
# https://docs.aws.amazon.com/sagemaker/latest/dg/deepar_hyperparameters.html

hyperparameters = {
    "time_freq": 'D', # daily series
    "context_length": prediction_length,
    "prediction_length": prediction_length, # number of data points to predict
    "num_cells": "40",
    "num_layers": "2",
    "likelihood": "gaussian",
    "epochs": "250",
    "mini_batch_size": "32",
    "learning_rate": "0.00001",
    "dropout_rate": "0.05",
    "early_stopping_patience": "10" # stop if loss hasn't improved in 10 epochs
}

In [92]:
estimator.set_hyperparameters(**hyperparameters)