# Milestone 1: Tackling big data on your laptop

## Authors: Neel Phaterpekar, Arash Shamseddini and Charles Suresh

In [1]:
import requests
import json
import os
from urllib.request import urlretrieve
import zipfile
import glob
import pandas as pd
import re
from memory_profiler import memory_usage
import pyarrow.dataset as ds
import pyarrow.feather as feather
import pyarrow.parquet as pq
import dask.dataframe as dd

In [3]:
# Load the IPython extensions used below: rpy2 (R interop) and
# memory_profiler, which provides the %memit / %%memit magics.
%load_ext rpy2.ipython
%load_ext memory_profiler

The rpy2.ipython extension is already loaded. To reload it, use:
  %reload_ext rpy2.ipython
The memory_profiler extension is already loaded. To reload it, use:
  %reload_ext memory_profiler


## 1. Downloading and unzipping the data

In [4]:
# Local folder that receives the downloaded archive and the extracted CSVs.
output_directory = "figsharerainfall/"

# figshare API request settings; article_id is the unique identifier of the
# article on figshare and is interpolated into the API endpoint.
article_id = 14096681
headers = {"Content-Type": "application/json"}
url = f"https://api.figshare.com/v2/articles/{article_id}"

In [9]:
%%time
%memit
response = requests.request("GET", url, headers=headers)
data = json.loads(response.text)  # this contains all the articles data, feel free to check it out
files = data["files"]             # this is just the data about the files, which is what we want
files

peak memory: 159.04 MiB, increment: 0.12 MiB
Wall time: 4.61 s


[{'is_link_only': False,
  'name': 'daily_rainfall_2014.png',
  'supplied_md5': 'fd32a2ffde300a31f8d63b1825d47e5e',
  'computed_md5': 'fd32a2ffde300a31f8d63b1825d47e5e',
  'id': 26579150,
  'download_url': 'https://ndownloader.figshare.com/files/26579150',
  'size': 58863},
 {'is_link_only': False,
  'name': 'environment.yml',
  'supplied_md5': '060b2020017eed93a1ee7dd8c65b2f34',
  'computed_md5': '060b2020017eed93a1ee7dd8c65b2f34',
  'id': 26579171,
  'download_url': 'https://ndownloader.figshare.com/files/26579171',
  'size': 192},
 {'is_link_only': False,
  'name': 'README.md',
  'supplied_md5': '61858c6cc0e6a6d6663a7e4c75bbd88c',
  'computed_md5': '61858c6cc0e6a6d6663a7e4c75bbd88c',
  'id': 26586554,
  'download_url': 'https://ndownloader.figshare.com/files/26586554',
  'size': 5422},
 {'is_link_only': False,
  'name': 'data.zip',
  'supplied_md5': 'b517383f76e77bd03755a63a8ff83ee9',
  'computed_md5': 'b517383f76e77bd03755a63a8ff83ee9',
  'id': 26766812,
  'download_url': 'https://

In [12]:
%%time
files_to_dl = ["data.zip"] # need only this zip file
for file in files:
    if file["name"] in files_to_dl:
        os.makedirs(output_directory, exist_ok=True) # create the folder if not exists
        urlretrieve(file["download_url"], output_directory + file["name"])

Wall time: 10min 12s


In [13]:
%%time
%memit
with zipfile.ZipFile(os.path.join(output_directory, "data.zip"), 'r') as f:
    f.extractall(output_directory)

peak memory: 158.46 MiB, increment: 0.00 MiB
Wall time: 26.6 s


### 1.1. Summary table


| Contributors | Downloading wall time  | Extracting wall time  | 
|:---:|:-----:|:--------:|
| Neel   |   1min 10s   |  23 s      | 
| Arash   | 10min 12s    | 26.6 s         |  
| Charles   |    |          |


### 1.2. Discussing observations

> 1- Surprisingly, Arash's download time for `data.zip` was significantly longer than those of Neel and Charles.

## 2. Combining data CSVs

In [47]:
%%time
%memit

files = glob.glob('figsharerainfall/*.csv')

df = pd.concat((pd.read_csv(file, header=0, index_col=0)
                .assign(model=file[file.rfind("\\")+1:file.index("_daily")]) # Discuss this with others
                for file in files))

df.to_csv("figsharerainfall/combined_data.csv")

peak memory: 225.88 MiB, increment: -2.32 MiB
Wall time: 6min 10s


In [48]:
%%sh
# Report the on-disk size of the combined CSV in human-readable units.
du -sh figsharerainfall/combined_data.csv

5.7G	figsharerainfall/combined_data.csv


### 2.1. Summary table

| Contributors | Combining data wall time  | combined_data.csv size on disk  | 
|:---:|:-----:|:--------:|
| Neel   |   6min 51s   |  5.6G      | 
| Arash   | 6min 10s    | 5.7G         |  
| Charles   |    |          |


### 2.2. Discussing observations

> 1- Both the run times and the size of the combined file on disk are similar across the different machines within the team.

> Time usage: 6-7 min

> File size on disk: 5.6-5.7G

## 3. Loading the combined CSV into memory and performing a simple EDA

In [49]:
%%time
# Read the entire combined CSV into one in-memory DataFrame (rebinds `df`).
df = pd.read_csv("figsharerainfall/combined_data.csv")

Wall time: 1min 10s


### 3.1. Changing dtype of data

In [59]:
# Compare the footprint of the three selected columns as stored versus
# down-cast to float32. Only the byte counts are kept, so no large
# intermediate frame stays referenced after the cell finishes.
numeric_cols = ['lat_min', 'lat_max', 'rain (mm/day)']
bytes_as_loaded = df[numeric_cols].memory_usage().sum()
bytes_as_float32 = df[numeric_cols].astype('float32', errors='ignore').memory_usage().sum()
print(f"Memory usage with float64: {bytes_as_loaded / 1e6:.2f} MB")
print(f"Memory usage with float32: {bytes_as_float32 / 1e6:.2f} MB")

Memory usage with float64: 1500.33 MB
Memory usage with float32: 750.17 MB


In [50]:
%%time
%%memit

# Open the combined CSV as a pyarrow dataset, then materialise it as an
# in-memory Arrow table (`table` is reused by the feather/parquet cells).
dataset = ds.dataset("figsharerainfall/combined_data.csv", format="csv")
table = dataset.to_table()

peak memory: 8126.89 MiB, increment: 2987.90 MiB
Wall time: 27.2 s


#### Using feather:

In [52]:
%%time
# Write the Arrow table to a feather file next to the combined CSV.
feather.write_feather(table, 'figsharerainfall/combined_data.feather')

Wall time: 3.72 s


In [53]:
%%sh
# Report the on-disk size of the feather file in human-readable units.
du -sh figsharerainfall/combined_data.feather

1.1G	figsharerainfall/combined_data.feather


#### Using parquet:

In [54]:
%%time 
# Write the Arrow table to a parquet file next to the combined CSV.
pq.write_table(table, 'figsharerainfall/combined_data.parquet')

Wall time: 10.8 s


In [55]:
%%sh
# Report the on-disk size of the parquet file in human-readable units.
du -sh figsharerainfall/combined_data.parquet

542M	figsharerainfall/combined_data.parquet


### 3.1. Loading data in chunks

In [58]:
%%time
%%memit
counts = pd.Series(dtype=int)
for chunk in pd.read_csv("figsharerainfall/combined_data.csv", chunksize=10_000_000):
    counts = counts.add(chunk["model"].value_counts(), fill_value=0)
print(counts.astype(int))

ACCESS-CM2          1932840
ACCESS-ESM1-5       1610700
AWI-ESM-1-1-LR       966420
BCC-CSM2-MR         3035340
BCC-ESM1             551880
CMCC-CM2-HR4        3541230
CMCC-CM2-SR5        3541230
CMCC-ESM2           3541230
CanESM5              551880
EC-Earth3-Veg-LR    3037320
FGOALS-f3-L         3219300
FGOALS-g3           1287720
GFDL-CM4            3219300
GFDL-ESM4           3219300
INM-CM4-8           1609650
INM-CM5-0           1609650
KIOST-ESM           1287720
MIROC6              2070900
MPI-ESM-1-2-HAM      966420
MPI-ESM1-2-HR       5154240
MPI-ESM1-2-LR        966420
MRI-ESM2-0          3037320
NESM3                966420
NorESM2-LM           919800
NorESM2-MM          3541230
SAM0-UNICON         3541153
TaiESM1             3541230
observed              46020
dtype: int32
peak memory: 9365.97 MiB, increment: 1352.49 MiB
Wall time: 1min 5s


### 3.1. Performing a simple EDA in Python

In [56]:
# (rows, columns) of the combined DataFrame.
df.shape

(62513863, 7)

In [57]:
# Preview the first five rows of the combined DataFrame.
df.head()

Unnamed: 0,time,lat_min,lat_max,lon_min,lon_max,rain (mm/day),model
0,1889-01-01 12:00:00,-36.25,-35.0,140.625,142.5,3.293256e-13,ACCESS-CM2
1,1889-01-02 12:00:00,-36.25,-35.0,140.625,142.5,0.0,ACCESS-CM2
2,1889-01-03 12:00:00,-36.25,-35.0,140.625,142.5,0.0,ACCESS-CM2
3,1889-01-04 12:00:00,-36.25,-35.0,140.625,142.5,0.0,ACCESS-CM2
4,1889-01-05 12:00:00,-36.25,-35.0,140.625,142.5,0.01047658,ACCESS-CM2


### 3.2. Discussing observations

> 1- Memory usage with float32 (750.17 MB) is almost half of memory usage with float64 (1500.33 MB).

> 2- Compared to the `csv` file, storing the data as `feather` or `parquet` significantly reduces both the wall time and the file size on disk (1.1G and 542M versus 5.7G).
  
> 3- Although writing `parquet` takes somewhat longer than writing `feather` (10.8 s versus 3.72 s), the resulting file is almost half the size (542M versus 1.1G).
  
> 4- Loading the data in chunks reduces the sys time, and therefore the overall wall time decreases (1min 5s versus 1min 10s for loading the whole file at once).