# Python Imports

Code and errors written by Thomas Martin 

In [1]:
import pandas as pd
import numpy as np
import random

from tqdm import tqdm
import glob

### File Paths:

In [2]:
import_path = '/glade/p/cisl/aiml/ai2es/winter_ptypes/mping_hourly/'

In [3]:
export_path = '/glade/p/cisl/aiml/ai2es/winter_ptypes/mping_hourly_pq/'

Getting a list of JSON files:

In [4]:
json_files = glob.glob(import_path + "*.json")

In [5]:
len(json_files)

83232

## Converting JSON files into dataframes

In [6]:
%%time

dfs = []

for file in tqdm(json_files):
    try:
        if not file.endswith('.json'):
            raise TypeError(f"File '{file}' is not a JSON file.")
        df = pd.read_json(file)
        dfs.append(df)
    except TypeError as te:
        print(te)
    except Exception as e:
        print(f"Error reading file '{file}': {str(e)}")

100%|██████████| 83232/83232 [03:13<00:00, 429.31it/s]

CPU times: user 3min 5s, sys: 6.22 s, total: 3min 11s
Wall time: 3min 13s





In [7]:
combined_df = pd.concat(dfs, ignore_index=True)

In [8]:
mem = combined_df.memory_usage()

In [9]:
type(mem)

pandas.core.series.Series

This math right?

In [10]:
print('Megabytes of dataframe', mem.sum() / 1000000 )

Megabytes of dataframe 109.83704


## Precip Description

In [11]:
combined_df.shape

(2288269, 6)

In [12]:
combined_df.description.unique()

array(['Snow and/or Graupel', 'Rain', 'Ice Pellets/Sleet',
       'Freezing Drizzle', 'Mixed Ice Pellets and Snow', 'Freezing Rain',
       'Drizzle', 'Mixed Freezing Rain and Ice Pellets',
       'Mixed Rain and Snow', 'Mixed Rain and Ice Pellets', 'Graupel'],
      dtype=object)

In [13]:
my_dict = {
    'value_to_match': ['Mixed Ice Pellets and Snow', 'Mixed Freezing Rain and Ice Pellets', 'Mixed Rain and Snow', 'Mixed Rain and Ice Pellets'],
    'new_value':      ['Snow', 'Ice Pellets', 'Snow', 'Ice Pellets'],
    'final_replace':   ['Ice Pellets', 'Freezing Rain', 'Rain', 'Rain']
}

column_to_match = 'description'

In [14]:
for num in range(len(my_dict['value_to_match'])):

    row_to_copy = combined_df.loc[combined_df[column_to_match] == my_dict['value_to_match'][num]].copy()
    row_to_copy[column_to_match] = my_dict['new_value'][num]
    row_to_copy.head(3)
    combined_df = pd.concat([combined_df, row_to_copy]) # this appends that dataframe to the original
    combined_df[column_to_match] = combined_df[column_to_match].replace(my_dict['value_to_match'][num], my_dict['final_replace'][num])
    

In [15]:
replacement_values = {'Drizzle': 'Rain', 
                      'Freezing Drizzle': 'Freezing Rain', 
                      'Graupel': 'Snow',
                      'Snow and/or Graupel':'Snow'}


combined_df[column_to_match].replace(replacement_values, inplace=True)

In [16]:
combined_df.description.unique()

array(['Snow', 'Rain', 'Ice Pellets/Sleet', 'Freezing Rain',
       'Ice Pellets'], dtype=object)

In [17]:
combined_df.shape

(2432000, 6)

This would be the place to adjust/fix your precip choices.

## Coordinate Extraction

In [18]:
combined_df['coordinates'] = combined_df['geom'].apply(lambda x: x['coordinates'])

In [19]:
combined_df

Unnamed: 0,id,obtime,category,description,description_id,geom,coordinates
0,1128052,2016-11-21T18:05:08Z,Rain/Snow,Snow,8,"{'type': 'Point', 'coordinates': [-72.53477752...","[-72.53477752710911, 41.78177793818802]"
1,1128055,2016-11-21T18:14:57Z,Rain/Snow,Snow,8,"{'type': 'Point', 'coordinates': [-74.0841645,...","[-74.0841645, 40.9787401]"
2,1128056,2016-11-21T18:18:31Z,Rain/Snow,Snow,8,"{'type': 'Point', 'coordinates': [-77.63962137...","[-77.63962137520093, 43.2546242745996]"
3,1128057,2016-11-21T18:25:43Z,Rain/Snow,Snow,8,"{'type': 'Point', 'coordinates': [-72.6836217,...","[-72.6836217, 41.7416863]"
4,1128058,2016-11-21T18:29:49Z,Rain/Snow,Snow,8,"{'type': 'Point', 'coordinates': [-73.9366576,...","[-73.9366576, 41.6538321]"
...,...,...,...,...,...,...,...
2287963,2284801,2020-02-04T20:38:06Z,Rain/Snow,Ice Pellets,11,"{'type': 'Point', 'coordinates': [-81.80157678...","[-81.80157678244151, 41.45023211277623]"
2287970,2284809,2020-02-04T20:41:17Z,Rain/Snow,Ice Pellets,11,"{'type': 'Point', 'coordinates': [-81.81254732...","[-81.81254732179195, 41.43437502772101]"
2287988,2284828,2020-02-04T20:53:43Z,Rain/Snow,Ice Pellets,11,"{'type': 'Point', 'coordinates': [-122.6487560...","[-122.64875600512225, 47.55681921738768]"
2288076,1851566,2019-01-28T13:57:41Z,Rain/Snow,Ice Pellets,11,"{'type': 'Point', 'coordinates': [-86.39447498...","[-86.39447498367167, 39.52435935741578]"


In [20]:
# Split coordinates into separate columns
combined_df['latitude'] = combined_df['coordinates'].str[1]
combined_df['longitude'] = combined_df['coordinates'].str[0]

remove three columns

In [21]:
combined_df = combined_df.drop(columns=['geom', 'coordinates', 'description_id' ])

## Round down time to nearest hour

In [22]:
combined_df['date'] = pd.to_datetime(combined_df['obtime' ])

In [23]:
combined_df['rounded_datetime'] = combined_df['date'].dt.floor('H')

#dropping obtime
combined_df = combined_df.drop(columns=['obtime'])

## Pandas EDA

In [24]:
print(combined_df.dtypes)

id                                int64
category                         object
description                      object
latitude                        float64
longitude                       float64
date                datetime64[ns, UTC]
rounded_datetime    datetime64[ns, UTC]
dtype: object


## Export Every day

In [25]:
# Group the dataframe by day
grouped = combined_df.groupby(combined_df['date'].dt.date)

In [26]:
# Iterate over the groups and export each day as a separate file
for date, group in tqdm(grouped, desc='Exporting data', unit='day'):
    filename = f'data_{date}.parquet'  # Define the filename for each day
    group.to_parquet(export_path + filename, index=False)  # Export the group to a Parquet file

Exporting data: 100%|██████████| 3427/3427 [00:20<00:00, 169.09day/s]


## Double check that each day is actually it's own day:

In [27]:
pq_files = glob.glob(export_path + "*.parquet")

In [28]:
random_files = random.sample(pq_files, 5)

In [29]:
for file in random_files:
    # Read the file using pd.read_parquet (for Parquet files)
    df = pd.read_parquet(file)
    # Display the first 3 rows of the DataFrame
    print(file)
    print(df.date.head(2))
    print(df.date.tail(2))

/glade/p/cisl/aiml/ai2es/winter_ptypes/mping_hourly_pq/data_2021-05-15.parquet
0   2021-05-15 13:11:14+00:00
1   2021-05-15 13:17:26+00:00
Name: date, dtype: datetime64[ns, UTC]
367   2021-05-15 06:49:54+00:00
368   2021-05-15 01:09:21+00:00
Name: date, dtype: datetime64[ns, UTC]
/glade/p/cisl/aiml/ai2es/winter_ptypes/mping_hourly_pq/data_2014-02-01.parquet
0   2014-02-01 03:01:00+00:00
1   2014-02-01 03:02:00+00:00
Name: date, dtype: datetime64[ns, UTC]
925   2014-02-01 14:18:00+00:00
926   2014-02-01 09:11:00+00:00
Name: date, dtype: datetime64[ns, UTC]
/glade/p/cisl/aiml/ai2es/winter_ptypes/mping_hourly_pq/data_2014-12-27.parquet
0   2014-12-27 05:01:00+00:00
1   2014-12-27 05:03:00+00:00
Name: date, dtype: datetime64[ns, UTC]
950   2014-12-27 20:40:00+00:00
951   2014-12-27 20:59:00+00:00
Name: date, dtype: datetime64[ns, UTC]
/glade/p/cisl/aiml/ai2es/winter_ptypes/mping_hourly_pq/data_2022-02-02.parquet
0   2022-02-02 07:00:32+00:00
1   2022-02-02 07:00:38+00:00
Name: date, dtype: