In [54]:
import json
import os
from pathlib import Path

import numpy as np
import pandas as pd
import plotly.express as px

In [55]:
# Root folder of the project. Set the WIDS_PROJECT_ROOT environment variable to
# run the notebook from a different checkout; when it is unset the original
# hard-coded location is used, so existing behaviour is unchanged.
PROJECT_ROOT = Path(
    os.environ.get('WIDS_PROJECT_ROOT', r'C:\Users\aaimr\Desktop\Dev\wids-2022')
)
# Folder that the input files are read from.
data_dir = PROJECT_ROOT / 'DATA'

In [52]:
# File names of the training set, the test set and the column-description JSON.
train_file, test_file, descr_file = 'train.csv', 'test.csv', 'columns.json'

In [5]:
# Load the training CSV from the data folder into a DataFrame.
tr_df = pd.read_csv(filepath_or_buffer=data_dir / train_file)

In [7]:
# Print the frame summary: index range, per-column non-null counts and dtypes.
tr_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 75757 entries, 0 to 75756
Data columns (total 64 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Year_Factor                75757 non-null  int64  
 1   State_Factor               75757 non-null  object 
 2   building_class             75757 non-null  object 
 3   facility_type              75757 non-null  object 
 4   floor_area                 75757 non-null  float64
 5   year_built                 73920 non-null  float64
 6   energy_star_rating         49048 non-null  float64
 7   ELEVATION                  75757 non-null  float64
 8   january_min_temp           75757 non-null  int64  
 9   january_avg_temp           75757 non-null  float64
 10  january_max_temp           75757 non-null  int64  
 11  february_min_temp          75757 non-null  int64  
 12  february_avg_temp          75757 non-null  float64
 13  february_max_temp          75757 non-null  int

In [48]:
# Row count of the training frame; also the denominator for the
# missing-value percentages computed in a later cell.
rows_total = len(tr_df)
print(f'Total Rows: {rows_total}')

Total Rows: 75757


In [50]:
# Per-column missing-value summary, built as a single chain:
#   'index' -> column name, 'n' -> count of NaNs, 'p' -> NaNs as a percentage of all rows.
# Only columns with at least one missing value are kept.
cols_missing = (
    tr_df.isna()
    .sum()
    .reset_index(name="n")
    .loc[lambda counts: counts['n'] > 0]
    .assign(p=lambda counts: counts['n'] / rows_total * 100)
)
print(f'No. of columns w/ missing values: {cols_missing.shape[0]}')

No. of columns w/ missing values: 6


In [42]:
# Bar chart of the missing-value percentage ('p') for each affected column ('index').
fig = px.bar(
    data_frame=cols_missing,
    x='index',
    y='p',
    title='Missing Value Percentage',
    labels={'index':'Column Names', 'p':'Percent'},
)
fig.show()

In [60]:
# Print the description of every column listed in the JSON description file.
print('COLUMN DESCRIPTIONS\n---------------')
# JSON text is UTF-8; state the encoding explicitly so the file is not decoded
# with the platform's locale encoding (e.g. a Windows code page), which would
# garble or reject any non-ASCII character in a description.
with open(data_dir / descr_file, 'r', encoding='utf-8') as js_in:
    cols_descr = json.load(js_in)
# One entry per column: upper-cased name, then its description on an indented line.
for col_nm, description in cols_descr.items():
    print(f'{col_nm.upper()}:\n\t{description}')

COLUMN DESCRIPTIONS
---------------
ID:
	building id
YEAR_FACTOR:
	anonymized year in which the weather and energy usage factors were observed
STATE_FACTOR:
	anonymized state in which the building is located
BUILDING_CLASS:
	building classification
FACILITY_TYPE:
	building usage type
FLOOR_AREA:
	floor area (in square feet) of the building
YEAR_BUILT:
	year in which the building was constructed
ENERGY_STAR_RATING:
	the energy star rating of the building
ELEVATION:
	elevation of the building location
JANUARY_MIN_TEMP:
	minimum temperature in January (in Fahrenheit) at the location of the building
JANUARY_AVG_TEMP:
	average temperature in January (in Fahrenheit) at the location of the building
JANUARY_MAX_TEMP:
	maximum temperature in January (in Fahrenheit) at the location of the building
COOLING_DEGREE_DAYS:
	cooling degree day for a given day is the number of degrees where the daily average temperature exceeds 65 degrees Fahrenheit. Each month is summed to produce an annual total at t