In [2]:
import pandas as pd
import numpy as np

In [3]:
for type_ in ['f2', 'f4']:
    print (np.finfo(type_))
for type_ in ['i1', 'i2', 'i4']:
    print (np.iinfo(type_))

Machine parameters for float16
---------------------------------------------------------------
precision =   3   resolution = 1.00040e-03
machep =    -10   eps =        9.76562e-04
negep =     -11   epsneg =     4.88281e-04
minexp =    -14   tiny =       6.10352e-05
maxexp =     16   max =        6.55040e+04
nexp =        5   min =        -max
smallest_normal = 6.10352e-05   smallest_subnormal = 5.96046e-08
---------------------------------------------------------------

Machine parameters for float32
---------------------------------------------------------------
precision =   6   resolution = 1.0000000e-06
machep =    -23   eps =        1.1920929e-07
negep =     -24   epsneg =     5.9604645e-08
minexp =   -126   tiny =       1.1754944e-38
maxexp =    128   max =        3.4028235e+38
nexp =        8   min =        -max
smallest_normal = 1.1754944e-38   smallest_subnormal = 1.4012985e-45
---------------------------------------------------------------

Machine parameters for int8
------

In [4]:
buildings = pd.read_csv('http://video.ittensive.com/machine-learning/ashrae/building_metadata.csv.gz')
weather = pd.read_csv('http://video.ittensive.com/machine-learning/ashrae/weather_train.csv.gz')
energy = pd.read_csv('http://video.ittensive.com/machine-learning/ashrae/train.0.csv.gz')

In [5]:
print('Buildings', buildings.memory_usage().sum() / 1024**2, "Mb")
print('Weather', weather.memory_usage().sum() / 1024**2, "Mb")
print('Energy usage', energy.memory_usage().sum() / 1024**2, "Mb")

Buildings 0.06645584106445312 Mb
Weather 9.597576141357422 Mb
Energy usage 368.0698890686035 Mb


In [6]:
def reduce_memory_usage(df):
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if str(col_type)[:5] == 'float':
            c_min = df[col].min()
            c_max = df[col].max()
            if c_min > np.finfo('f2').min and c_max < np.finfo('f2').max:
                df[col] = df[col].astype(np.float16)
            elif c_min > np.finfo('f4').min and c_max < np.finfo('f4').max:
                df[col] = df[col].astype(np.float32)
            else:
                df[col] = df[col].astype(np.float64)
        elif str(col_type)[:3] == 'int':
            c_min = df[col].min()
            c_max = df[col].max()
            if c_min > np.iinfo('i1').min and c_max < np.iinfo('i1').max:
                df[col] = df[col].astype(np.int8)
            elif c_min > np.iinfo('i2').min and c_max < np.iinfo('i2').max:
                df[col] = df[col].astype(np.int16)
            elif c_min > np.iinfo('i4').min and c_max < np.iinfo('i4').max:
                df[col] = df[col].astype(np.int32)
            else:
                df[col] = df[col].astype(np.int64)
        elif col == 'timestamp':
            df[col] = pd.to_datetime(df[col])
        elif str(col_type)[:8] != 'datetime':
            df[col] = df[col].astype('category')
    end_mem = df.memory_usage().sum() / 1024**2
    print("Data usage is less about", round(start_mem - end_mem, 2), "Mb (-", round(100* (start_mem - end_mem) / start_mem, 1), "%)")
    return df

In [7]:
buildings = reduce_memory_usage(buildings)
print(buildings.info())

Data usage is less about 0.05 Mb (- 73.9 %)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1449 entries, 0 to 1448
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   site_id      1449 non-null   int8    
 1   building_id  1449 non-null   int16   
 2   primary_use  1449 non-null   category
 3   square_feet  1449 non-null   int32   
 4   year_built   675 non-null    float16 
 5   floor_count  355 non-null    float16 
dtypes: category(1), float16(2), int16(1), int32(1), int8(1)
memory usage: 17.8 KB
None


In [8]:
weather = reduce_memory_usage(weather)
print(weather.info())

Data usage is less about 6.53 Mb (- 68.1 %)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 139773 entries, 0 to 139772
Data columns (total 9 columns):
 #   Column              Non-Null Count   Dtype         
---  ------              --------------   -----         
 0   site_id             139773 non-null  int8          
 1   timestamp           139773 non-null  datetime64[ns]
 2   air_temperature     139718 non-null  float16       
 3   cloud_coverage      70600 non-null   float16       
 4   dew_temperature     139660 non-null  float16       
 5   precip_depth_1_hr   89484 non-null   float16       
 6   sea_level_pressure  129155 non-null  float16       
 7   wind_direction      133505 non-null  float16       
 8   wind_speed          139469 non-null  float16       
dtypes: datetime64[ns](1), float16(7), int8(1)
memory usage: 3.1 MB
None


In [8]:
energy = reduce_memory_usage(energy)
print(energy.info())

Data usage is less about 195.54 Mb (- 53.1 %)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12060910 entries, 0 to 12060909
Data columns (total 4 columns):
 #   Column         Dtype         
---  ------         -----         
 0   building_id    int16         
 1   meter          int8          
 2   timestamp      datetime64[ns]
 3   meter_reading  float32       
dtypes: datetime64[ns](1), float32(1), int16(1), int8(1)
memory usage: 172.5 MB
None


In [9]:
energy = pd.merge(left=energy, right=buildings, how='left',
                   left_on='building_id', right_on='building_id')
energy = pd.merge(left=energy.set_index(['timestamp', 'site_id']),
                  right=weather.set_index(['timestamp', 'site_id']),
                  how='left', left_index=True, right_index=True)
energy.reset_index(inplace=True)
energy = energy.drop(columns=['site_id', 'meter'], axis=1)
print (energy.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12060910 entries, 0 to 12060909
Data columns (total 14 columns):
 #   Column              Dtype   
---  ------              -----   
 0   timestamp           object  
 1   building_id         int64   
 2   meter_reading       float64 
 3   primary_use         category
 4   square_feet         int32   
 5   year_built          float16 
 6   floor_count         float16 
 7   air_temperature     float16 
 8   cloud_coverage      float16 
 9   dew_temperature     float16 
 10  precip_depth_1_hr   float16 
 11  sea_level_pressure  float16 
 12  wind_direction      float16 
 13  wind_speed          float16 
dtypes: category(1), float16(9), float64(1), int32(1), int64(1), object(1)
memory usage: 540.6+ MB
None


In [10]:
print ('wind speed:', sorted(energy['wind_speed'].unique()))
print ('cloud coverage:', sorted(energy['cloud_coverage'].unique()))
print ('precip depth:', sorted(energy['precip_depth_1_hr'].unique()))

wind speed: [np.float16(nan)]
cloud coverage: [np.float16(nan)]
precip depth: [np.float16(nan)]


In [11]:
def round_fillna(df, columns):
    for col in columns:
        type_ = 'int8'
        if col in ['wind_direction', 'year_built', 'precip_depth_1_hr']:
            type_ = 'int16'
        if col == 'precip_depth_1_hr':
            df[col] = df[col].apply(lambda x: 0 if x<0 else x)
        df[col] = np.round(df[col].fillna(value=0)).astype(type_)
    return df

In [12]:
energy = round_fillna(energy, ['wind_direction', 'year_built', 'precip_depth_1_hr', 'cloud_coverage', 'wind_speed', 'floor_count'])
print(energy.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12060910 entries, 0 to 12060909
Data columns (total 14 columns):
 #   Column              Dtype   
---  ------              -----   
 0   timestamp           object  
 1   building_id         int64   
 2   meter_reading       float64 
 3   primary_use         category
 4   square_feet         int32   
 5   year_built          int16   
 6   floor_count         int8    
 7   air_temperature     float16 
 8   cloud_coverage      int8    
 9   dew_temperature     float16 
 10  precip_depth_1_hr   int16   
 11  sea_level_pressure  float16 
 12  wind_direction      int16   
 13  wind_speed          int8    
dtypes: category(1), float16(3), float64(1), int16(3), int32(1), int64(1), int8(3), object(1)
memory usage: 506.1+ MB
None


In [13]:
del buildings
del weather
del energy