# Python Imports

Code and errors written by Thomas Martin 

In [1]:
import pandas as pd
import numpy as np
import random

from tqdm import tqdm
import glob

### File Paths:

In [2]:
import_path = '/glade/p/cisl/aiml/ai2es/winter_ptypes/mping_hourly/'

In [3]:
export_path = '/glade/p/cisl/aiml/ai2es/winter_ptypes/mping_hourly_pq/'

Getting a list of JSON files:

In [4]:
json_files = glob.glob(import_path + "*.json")

In [5]:
len(json_files)

83232

## Converting JSON files into dataframes

In [6]:
%%time

dfs = []

for file in tqdm(json_files):
    try:
        if not file.endswith('.json'):
            raise TypeError(f"File '{file}' is not a JSON file.")
        df = pd.read_json(file)
        dfs.append(df)
    except TypeError as te:
        print(te)
    except Exception as e:
        print(f"Error reading file '{file}': {str(e)}")

100%|██████████| 83232/83232 [03:04<00:00, 450.49it/s]

CPU times: user 2min 58s, sys: 6.32 s, total: 3min 5s
Wall time: 3min 4s





In [8]:
len(dfs)

83232

In [9]:
combined_df = pd.concat(dfs, ignore_index=True)
combined_df.shape

(2288269, 6)

In [10]:
mem = combined_df.memory_usage()
type(mem)

pandas.core.series.Series

This math right?

In [11]:
print('Megabytes of dataframe', mem.sum() / 1000000 )

Megabytes of dataframe 109.83704


## Precip Description

In [12]:
combined_df.shape

(2288269, 6)

In [13]:
combined_df.description.unique()

array(['Snow and/or Graupel', 'Rain', 'Ice Pellets/Sleet',
       'Freezing Drizzle', 'Mixed Ice Pellets and Snow', 'Freezing Rain',
       'Drizzle', 'Mixed Freezing Rain and Ice Pellets',
       'Mixed Rain and Snow', 'Mixed Rain and Ice Pellets', 'Graupel'],
      dtype=object)

In [14]:
my_dict = {
    'value_to_match': ['Freezing Drizzle', 'Ice Pellets', 'Mixed Ice Pellets and Snow', 'Mixed Freezing Rain and Ice Pellets', 'Mixed Rain and Snow', 'Mixed Rain and Ice Pellets', 'Drizzle', 'Graupel', 'Ice Pellets/Sleet', 'Snow and/or Graupel'],
    'new_value':      ['Freezing Rain', 'Sleet', 'Ice Pellets', 'Freezing Rain', 'Snow', 'Ice Pellets', 'Rain', 'Snow', 'Sleet', 'Snow']
}

column_to_match = 'description'

In [15]:
combined_df['duplicate'] = False

In [16]:
for num in range(len(my_dict['value_to_match'])):
    print(my_dict['value_to_match'][num])
    row_to_copy = combined_df.loc[combined_df[column_to_match] == my_dict['value_to_match'][num]].copy()
    print('dataframe shape:', row_to_copy.shape)
    print('changed to:')
    print(my_dict['new_value'][num])
    row_to_copy[column_to_match] = my_dict['new_value'][num]
    row_to_copy['duplicate'] = True
    
    combined_df = pd.concat([combined_df, row_to_copy]) # this appends that dataframe to the original
    print('   ')

Freezing Drizzle
dataframe shape: (27016, 7)
changed to:
Freezing Rain
   
Ice Pellets
dataframe shape: (0, 7)
changed to:
Sleet
   
Mixed Ice Pellets and Snow
dataframe shape: (35831, 7)
changed to:
Ice Pellets
   
Mixed Freezing Rain and Ice Pellets
dataframe shape: (9420, 7)
changed to:
Freezing Rain
   
Mixed Rain and Snow
dataframe shape: (66520, 7)
changed to:
Snow
   
Mixed Rain and Ice Pellets
dataframe shape: (31960, 7)
changed to:
Ice Pellets
   
Drizzle
dataframe shape: (245493, 7)
changed to:
Rain
   
Graupel
dataframe shape: (3959, 7)
changed to:
Snow
   
Ice Pellets/Sleet
dataframe shape: (101276, 7)
changed to:
Sleet
   
Snow and/or Graupel
dataframe shape: (624566, 7)
changed to:
Snow
   


In [17]:
combined_df[combined_df.duplicate == True].description.unique()

array(['Freezing Rain', 'Ice Pellets', 'Snow', 'Rain', 'Sleet'],
      dtype=object)

In [18]:
combined_df[combined_df.duplicate == False].description.unique()

array(['Snow and/or Graupel', 'Rain', 'Ice Pellets/Sleet',
       'Freezing Drizzle', 'Mixed Ice Pellets and Snow', 'Freezing Rain',
       'Drizzle', 'Mixed Freezing Rain and Ice Pellets',
       'Mixed Rain and Snow', 'Mixed Rain and Ice Pellets', 'Graupel'],
      dtype=object)

In [19]:
# combined_df.drop_duplicates()

In [20]:
combined_df.shape

(3434310, 7)

This would be the place to adjust/fix your precip choices.

## Coordinate Extraction

In [21]:
combined_df['coordinates'] = combined_df['geom'].apply(lambda x: x['coordinates'])

In [22]:
# Split coordinates into separate columns
combined_df['latitude'] = combined_df['coordinates'].str[1]
combined_df['longitude'] = combined_df['coordinates'].str[0]

remove three columns

In [23]:
combined_df = combined_df.drop(columns=['geom', 'coordinates', 'description_id' ])

In [24]:
combined_df.shape

(3434310, 7)

## Round down time to nearest hour

In [25]:
combined_df['date'] = pd.to_datetime(combined_df['obtime' ])

In [26]:
combined_df['rounded_datetime'] = combined_df['date'].dt.floor('H')

#dropping obtime
combined_df = combined_df.drop(columns=['obtime'])

## Pandas EDA

In [27]:
print(combined_df.dtypes)

id                                int64
category                         object
description                      object
duplicate                          bool
latitude                        float64
longitude                       float64
date                datetime64[ns, UTC]
rounded_datetime    datetime64[ns, UTC]
dtype: object


## Export Every day

In [28]:
# Group the dataframe by day
grouped = combined_df.groupby(combined_df['date'].dt.date)

In [29]:
# Iterate over the groups and export each day as a separate file
for date, group in tqdm(grouped, desc='Exporting data', unit='day'):
    filename = f'data_{date}.parquet'  # Define the filename for each day
    group.to_parquet(export_path + filename, index=False)  # Export the group to a Parquet file

Exporting data: 100%|██████████| 3427/3427 [00:20<00:00, 163.21day/s]


## Double check that each day is actually it's own day:

In [30]:
pq_files = glob.glob(export_path + "*.parquet")

In [31]:
random_files = random.sample(pq_files, 5)

In [32]:
for file in random_files:
    # Read the file using pd.read_parquet (for Parquet files)
    df = pd.read_parquet(file)
    # Display the first 3 rows of the DataFrame
    print(file)
    print(df.date.head(2))
    print(df.date.tail(2))

/glade/p/cisl/aiml/ai2es/winter_ptypes/mping_hourly_pq/data_2015-06-22.parquet
0   2015-06-22 13:09:00+00:00
1   2015-06-22 13:18:00+00:00
Name: date, dtype: datetime64[ns, UTC]
307   2015-06-22 19:46:00+00:00
308   2015-06-22 19:58:00+00:00
Name: date, dtype: datetime64[ns, UTC]
/glade/p/cisl/aiml/ai2es/winter_ptypes/mping_hourly_pq/data_2013-02-02.parquet
0   2013-02-02 14:01:00+00:00
1   2013-02-02 14:04:00+00:00
Name: date, dtype: datetime64[ns, UTC]
1478   2013-02-02 17:54:00+00:00
1479   2013-02-02 17:55:00+00:00
Name: date, dtype: datetime64[ns, UTC]
/glade/p/cisl/aiml/ai2es/winter_ptypes/mping_hourly_pq/data_2013-10-30.parquet
0   2013-10-30 03:04:00+00:00
1   2013-10-30 03:06:00+00:00
Name: date, dtype: datetime64[ns, UTC]
835   2013-10-30 11:48:00+00:00
836   2013-10-30 21:30:00+00:00
Name: date, dtype: datetime64[ns, UTC]
/glade/p/cisl/aiml/ai2es/winter_ptypes/mping_hourly_pq/data_2019-07-19.parquet
0   2019-07-19 06:01:21+00:00
1   2019-07-19 06:02:12+00:00
Name: date, dtyp