In [17]:
import requests
import json
import os
from urllib.request import urlretrieve
import zipfile
import glob
import pandas as pd
import re
from memory_profiler import memory_usage
import pyarrow.dataset as ds
import pyarrow.feather as feather
import pyarrow.parquet as pq
import dask.dataframe as dd

In [18]:
# Load the rpy2 IPython extension.
# NOTE(review): presumably for R cells later in the notebook; none are visible in this section.
%load_ext rpy2.ipython
# Load the memory_profiler extension, which supplies the memit magic used by the cells below.
%load_ext memory_profiler

The rpy2.ipython extension is already loaded. To reload it, use:
  %reload_ext rpy2.ipython
The memory_profiler extension is already loaded. To reload it, use:
  %reload_ext memory_profiler


### Download and unzip the data

In [19]:
# Directory that the download, unzip and combine cells read from and write to.
output_directory = "figsharerainfall/"

# Unique identifier of the article on figshare, and the API endpoint built from it.
article_id = 14096681
url = f"https://api.figshare.com/v2/articles/{article_id}"

# Headers sent with the metadata request.
headers = {"Content-Type": "application/json"}

In [20]:
%%time
%memit
response = requests.request("GET", url, headers=headers)
data = json.loads(response.text)  # this contains all the articles data, feel free to check it out
files = data["files"]             # this is just the data about the files, which is what we want
files

peak memory: 111.89 MiB, increment: 0.22 MiB
CPU times: user 92.2 ms, sys: 126 ms, total: 218 ms
Wall time: 3.99 s


[{'is_link_only': False,
  'name': 'daily_rainfall_2014.png',
  'supplied_md5': 'fd32a2ffde300a31f8d63b1825d47e5e',
  'computed_md5': 'fd32a2ffde300a31f8d63b1825d47e5e',
  'id': 26579150,
  'download_url': 'https://ndownloader.figshare.com/files/26579150',
  'size': 58863},
 {'is_link_only': False,
  'name': 'environment.yml',
  'supplied_md5': '060b2020017eed93a1ee7dd8c65b2f34',
  'computed_md5': '060b2020017eed93a1ee7dd8c65b2f34',
  'id': 26579171,
  'download_url': 'https://ndownloader.figshare.com/files/26579171',
  'size': 192},
 {'is_link_only': False,
  'name': 'README.md',
  'supplied_md5': '61858c6cc0e6a6d6663a7e4c75bbd88c',
  'computed_md5': '61858c6cc0e6a6d6663a7e4c75bbd88c',
  'id': 26586554,
  'download_url': 'https://ndownloader.figshare.com/files/26586554',
  'size': 5422},
 {'is_link_only': False,
  'name': 'data.zip',
  'supplied_md5': 'b517383f76e77bd03755a63a8ff83ee9',
  'computed_md5': 'b517383f76e77bd03755a63a8ff83ee9',
  'id': 26766812,
  'download_url': 'https://

In [21]:
# internet speed ?
%%time
%memit
files_to_dl = ["data.zip"] 
for file in files:
    if file["name"] in files_to_dl:
        os.makedirs(output_directory, exist_ok=True)
        urlretrieve(file["download_url"], output_directory + file["name"])

peak memory: 113.46 MiB, increment: 0.04 MiB
CPU times: user 3.07 s, sys: 2.58 s, total: 5.64 s
Wall time: 1min 10s


In [22]:
%%time
%memit
with zipfile.ZipFile(os.path.join(output_directory, "data.zip"), 'r') as f:
    f.extractall(output_directory)

peak memory: 112.23 MiB, increment: 0.04 MiB
CPU times: user 16.2 s, sys: 3.05 s, total: 19.2 s
Wall time: 23 s


### Combining data CSVs

In [23]:
%%time
%memit

files = glob.glob('figsharerainfall/*.csv')

df = pd.concat((pd.read_csv(file, header=0, index_col=0)
                .assign(model=re.search(r'/(.*)_d', file)[1])
                for file in files))

df.to_csv("figsharerainfall/combined_data.csv")

peak memory: 110.33 MiB, increment: 0.07 MiB
CPU times: user 6min 15s, sys: 26.4 s, total: 6min 41s
Wall time: 6min 51s


In [26]:
%%sh
# Report the on-disk size of the combined CSV in human-readable units.
du -sh figsharerainfall/combined_data.csv

5.6G	figsharerainfall/combined_data.csv


### Load the combined CSV to memory

In [27]:
%%time
# Read the whole combined CSV back from disk into a pandas DataFrame.
# This rebinds `df`, which previously held the in-memory concatenation.
df = pd.read_csv("figsharerainfall/combined_data.csv")

CPU times: user 57.5 s, sys: 20.5 s, total: 1min 18s
Wall time: 1min 23s


### Changing `dtype` of data

In [28]:
%%time
%%memit

# Open the combined CSV as a pyarrow dataset, then read it into a pyarrow table.
# The feather and parquet cells below write this `table` to disk.
# NOTE(review): nothing in this cell sets or changes a dtype, despite the
# section heading; confirm the intent.
dataset = ds.dataset("figsharerainfall/combined_data.csv", format="csv")
table = dataset.to_table()

peak memory: 2474.00 MiB, increment: 420.98 MiB
CPU times: user 20.7 s, sys: 11.6 s, total: 32.3 s
Wall time: 29.2 s


#### Using feather:

In [29]:
%%time
# Write the pyarrow table to a feather file next to the combined CSV.
feather.write_feather(table, 'figsharerainfall/combined_data.feather')

CPU times: user 5.12 s, sys: 9.3 s, total: 14.4 s
Wall time: 7.3 s


In [30]:
%%sh
# Report the on-disk size of the feather file in human-readable units.
du -sh figsharerainfall/combined_data.feather

1.0G	figsharerainfall/combined_data.feather


#### Using parquet:

In [31]:
%%time 
# Write the same pyarrow table to a parquet file next to the combined CSV.
pq.write_table(table, 'figsharerainfall/combined_data.parquet')

CPU times: user 11.3 s, sys: 3.71 s, total: 15.1 s
Wall time: 16.1 s


In [32]:
%%sh
# Report the on-disk size of the parquet file in human-readable units.
du -sh figsharerainfall/combined_data.parquet

544M	figsharerainfall/combined_data.parquet


### EDA

In [33]:
# (rows, columns) of the DataFrame loaded from the combined CSV.
df.shape

(62513863, 7)

In [34]:
# Preview the first five rows to check the columns and the added `model` label.
df.head()

Unnamed: 0,time,lat_min,lat_max,lon_min,lon_max,rain (mm/day),model
0,1889-01-01 12:00:00,-35.439867,-33.574619,141.5625,143.4375,4.244226e-13,MPI-ESM-1-2-HAM
1,1889-01-02 12:00:00,-35.439867,-33.574619,141.5625,143.4375,4.217326e-13,MPI-ESM-1-2-HAM
2,1889-01-03 12:00:00,-35.439867,-33.574619,141.5625,143.4375,4.498125e-13,MPI-ESM-1-2-HAM
3,1889-01-04 12:00:00,-35.439867,-33.574619,141.5625,143.4375,4.251282e-13,MPI-ESM-1-2-HAM
4,1889-01-05 12:00:00,-35.439867,-33.574619,141.5625,143.4375,4.270161e-13,MPI-ESM-1-2-HAM
