# Analyze NGA-West2 database

In [1]:
import os 
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd


## Load database flatfile

The original Excel file is from https://peer.berkeley.edu/research/databases/databases. Because the data are easier to load from Parquet format, this cell checks for a Parquet copy and uses it if one exists. Otherwise, it loads the original Excel file and saves a Parquet copy for future use. A CSV copy of the flatfile is also exported.

In [6]:
# Locations of the original Excel flatfile and the derived Parquet/CSV copies
excel_path = '../data/Updated_NGA_West2_Flatfile_RotD50_d050_public_version.xlsx'
parquet_path = '../data/nga_w2_flatfile.parquet'
csv_path = '../data/nga_w2_flatfile.csv'

# Prefer the Parquet copy when it exists; otherwise fall back to the Excel file
if os.path.exists(parquet_path):
    nga_w2_flatfile = pd.read_parquet(parquet_path)
else:
    print(f"Parquet file not found. Loading data from Excel file: {excel_path}")
    nga_w2_flatfile = pd.read_excel(excel_path)

    # Convert object columns to the pandas string dtype before saving
    # (presumably so mixed-type columns serialize cleanly to Parquet -- TODO confirm)
    object_columns = nga_w2_flatfile.select_dtypes(include=['object']).columns
    nga_w2_flatfile[object_columns] = nga_w2_flatfile[object_columns].astype('string')

    # Save to Parquet for future use
    nga_w2_flatfile.to_parquet(parquet_path)

# Export the CSV copy only when it is missing or older than the Parquet copy,
# so re-running this cell does not rewrite the whole file every time.
# The Parquet file is guaranteed to exist here (it was either read or just written).
csv_is_current = (
    os.path.exists(csv_path)
    and os.path.getmtime(csv_path) >= os.path.getmtime(parquet_path)
)
if not csv_is_current:
    nga_w2_flatfile.to_csv(csv_path, index=False)