In [1]:
import great_expectations as gx
import pandas as pd
import numpy as np
from sqlalchemy import create_engine
import connectorx as cx

from pandas.api.types import is_datetime64_any_dtype as is_datetime
from pandas.api.types import is_categorical_dtype

In [2]:
conn_str = "mysql+pymysql://student:Student%40123%21@localhost:3306/ashrae_db"
engine = create_engine(conn_str)

uri = "mysql://student:Student%40123%21@localhost:3306/ashrae_db"

# Fact table (ConnectorX)
query = "SELECT * FROM fact_energy_usage"
df_power_meter = cx.read_sql(uri, query)

# Dimension tables
df_weather = pd.read_sql("dim_weather", con=engine)
df_building = pd.read_sql("dim_building", con=engine)

In [3]:
type(df_power_meter)

pandas.core.frame.DataFrame

In [4]:
print("GE version:", gx.__version__)

context = gx.get_context() 
print(f"Context type: {type(context).__name__}")

GE version: 1.4.4
Context type: EphemeralDataContext


In [4]:
df_power_meter.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20216100 entries, 0 to 20216099
Data columns (total 4 columns):
 #   Column         Dtype         
---  ------         -----         
 0   building_id    Int64         
 1   meter          Int64         
 2   timestamp      datetime64[ns]
 3   meter_reading  float64       
dtypes: Int64(2), datetime64[ns](1), float64(1)
memory usage: 655.5 MB


In [5]:
df_weather.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 139773 entries, 0 to 139772
Data columns (total 13 columns):
 #   Column              Non-Null Count   Dtype         
---  ------              --------------   -----         
 0   site_id             139773 non-null  int64         
 1   timestamp           139773 non-null  datetime64[ns]
 2   air_temperature     139773 non-null  float64       
 3   cloud_coverage      114378 non-null  float64       
 4   dew_temperature     139773 non-null  float64       
 5   precip_depth_1_hr   99206 non-null   float64       
 6   sea_level_pressure  130881 non-null  float64       
 7   wind_direction      139773 non-null  float64       
 8   wind_speed          139773 non-null  float64       
 9   datetime            139773 non-null  datetime64[ns]
 10  day                 139773 non-null  int64         
 11  month               139773 non-null  int64         
 12  week                139773 non-null  int64         
dtypes: datetime64[ns](2), float64

In [6]:
df_building.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1449 entries, 0 to 1448
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   building_id  1449 non-null   int64 
 1   site_id      1449 non-null   int64 
 2   primary_use  1449 non-null   object
 3   square_feet  1449 non-null   int64 
 4   year_built   1449 non-null   int64 
dtypes: int64(4), object(1)
memory usage: 56.7+ KB


In [7]:
def reduce_mem_usage(df, use_float16=False):
    
    start_mem = df.memory_usage().sum() / 1024**2
    print("Memory usage of dataframe is {:.2f} MB".format(start_mem))
    
    for col in df.columns:
        if is_datetime(df[col]) or is_categorical_dtype(df[col]):
            continue
        col_type = df[col].dtype
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == "int":
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if use_float16 and c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
#        else:
#            df[col] = df[col].astype("category")

    end_mem = df.memory_usage().sum() / 1024**2
    print("Memory usage after optimization is: {:.2f} MB".format(end_mem))
    print("Decreased by {:.1f}%".format(100 * (start_mem - end_mem) / start_mem))
    
    return df

In [8]:
df_power_meter = reduce_mem_usage(df_power_meter)

Memory usage of dataframe is 655.51 MB


  if is_datetime(df[col]) or is_categorical_dtype(df[col]):
  if is_datetime(df[col]) or is_categorical_dtype(df[col]):
  if is_datetime(df[col]) or is_categorical_dtype(df[col]):


Memory usage after optimization is: 385.59 MB
Decreased by 41.2%


In [9]:
df_power_meter.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20216100 entries, 0 to 20216099
Data columns (total 4 columns):
 #   Column         Dtype         
---  ------         -----         
 0   building_id    float32       
 1   meter          float32       
 2   timestamp      datetime64[ns]
 3   meter_reading  float32       
dtypes: datetime64[ns](1), float32(3)
memory usage: 385.6 MB


In [10]:
df_train = df_power_meter.merge(df_building, left_on='building_id',right_on='building_id',how='left')

In [11]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20216100 entries, 0 to 20216099
Data columns (total 8 columns):
 #   Column         Dtype         
---  ------         -----         
 0   building_id    float32       
 1   meter          float32       
 2   timestamp      datetime64[ns]
 3   meter_reading  float32       
 4   site_id        int64         
 5   primary_use    object        
 6   square_feet    int64         
 7   year_built     int64         
dtypes: datetime64[ns](1), float32(3), int64(3), object(1)
memory usage: 1002.5+ MB


In [12]:
df_train = df_train.astype({
    'building_id': 'int16',
    'meter': 'int8',
    'site_id': 'int8',
    'square_feet': 'int32',
    'year_built': 'int16'
})

df_train['primary_use'] = df_train['primary_use'].astype('category')

In [13]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20216100 entries, 0 to 20216099
Data columns (total 8 columns):
 #   Column         Dtype         
---  ------         -----         
 0   building_id    int16         
 1   meter          int8          
 2   timestamp      datetime64[ns]
 3   meter_reading  float32       
 4   site_id        int8          
 5   primary_use    category      
 6   square_feet    int32         
 7   year_built     int16         
dtypes: category(1), datetime64[ns](1), float32(1), int16(2), int32(1), int8(2)
memory usage: 443.4 MB


In [14]:
chunks = []
for chunk in np.array_split(df_train, 500):
    merged = chunk.merge(
        df_weather,
        on=['site_id','timestamp'],
        how='left'
    )
    chunks.append(merged)

df_train = pd.concat(chunks, ignore_index=True)

  return bound(*args, **kwds)


In [15]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20216100 entries, 0 to 20216099
Data columns (total 19 columns):
 #   Column              Dtype         
---  ------              -----         
 0   building_id         int16         
 1   meter               int8          
 2   timestamp           datetime64[ns]
 3   meter_reading       float32       
 4   site_id             int8          
 5   primary_use         category      
 6   square_feet         int32         
 7   year_built          int16         
 8   air_temperature     float64       
 9   cloud_coverage      float64       
 10  dew_temperature     float64       
 11  precip_depth_1_hr   float64       
 12  sea_level_pressure  float64       
 13  wind_direction      float64       
 14  wind_speed          float64       
 15  datetime            datetime64[ns]
 16  day                 float64       
 17  month               float64       
 18  week                float64       
dtypes: category(1), datetime64[ns](2), float

In [16]:
df_train = reduce_mem_usage(df_train)

Memory usage of dataframe is 2140.03 MB


  if is_datetime(df[col]) or is_categorical_dtype(df[col]):
  if is_datetime(df[col]) or is_categorical_dtype(df[col]):
  if is_datetime(df[col]) or is_categorical_dtype(df[col]):
  if is_datetime(df[col]) or is_categorical_dtype(df[col]):
  if is_datetime(df[col]) or is_categorical_dtype(df[col]):
  if is_datetime(df[col]) or is_categorical_dtype(df[col]):
  if is_datetime(df[col]) or is_categorical_dtype(df[col]):
  if is_datetime(df[col]) or is_categorical_dtype(df[col]):
  if is_datetime(df[col]) or is_categorical_dtype(df[col]):
  if is_datetime(df[col]) or is_categorical_dtype(df[col]):
  if is_datetime(df[col]) or is_categorical_dtype(df[col]):
  if is_datetime(df[col]) or is_categorical_dtype(df[col]):
  if is_datetime(df[col]) or is_categorical_dtype(df[col]):
  if is_datetime(df[col]) or is_categorical_dtype(df[col]):
  if is_datetime(df[col]) or is_categorical_dtype(df[col]):


Memory usage after optimization is: 1368.85 MB
Decreased by 36.0%


  if is_datetime(df[col]) or is_categorical_dtype(df[col]):


In [17]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20216100 entries, 0 to 20216099
Data columns (total 19 columns):
 #   Column              Dtype         
---  ------              -----         
 0   building_id         int16         
 1   meter               int8          
 2   timestamp           datetime64[ns]
 3   meter_reading       float32       
 4   site_id             int8          
 5   primary_use         category      
 6   square_feet         int32         
 7   year_built          int16         
 8   air_temperature     float32       
 9   cloud_coverage      float32       
 10  dew_temperature     float32       
 11  precip_depth_1_hr   float32       
 12  sea_level_pressure  float32       
 13  wind_direction      float32       
 14  wind_speed          float32       
 15  datetime            datetime64[ns]
 16  day                 float32       
 17  month               float32       
 18  week                float32       
dtypes: category(1), datetime64[ns](2), float