In [2]:
from os.path import join
import pandas as pd
import numpy as np
import seaborn as sns

In [5]:

def builds_dictionary(house_list_dir):
    """Read the house list CSV and return one metadata dict per row.

    Parameters
    ----------
    house_list_dir : str
        Directory that contains ``house_list_AC_area.csv``.

    Returns
    -------
    list of dict
        One dict per CSV row, in file order, with the keys ``dataid``,
        ``start_period``, ``end_period``, ``year_constructed``,
        ``total_area``, ``first_floor_area``, ``second_floor_area`` and
        ``third_floor_area``. ``dataid`` is cast to ``int``; every other
        value is passed through exactly as pandas read it.
    """
    # (output key, CSV column) pairs for the values copied through unchanged.
    passthrough_fields = (
        ("start_period", 'date_enrolled'),
        ("end_period", 'date_withdrawn'),
        ("year_constructed", 'house_construction_year'),
        ("total_area", 'total_square_footage'),
        ("first_floor_area", 'first_floor_square_footage'),
        ("second_floor_area", 'second_floor_square_footage'),
        ("third_floor_area", 'third_floor_square_footage'),
    )
    house_table = pd.read_csv(join(house_list_dir, 'house_list_AC_area.csv'))

    records = []
    for _, house in house_table.iterrows():
        record = {"dataid": int(house['dataid'])}
        for field, csv_column in passthrough_fields:
            record[field] = house[csv_column]
        records.append(record)
    return records


In [6]:
def find_between(s, first, last):
    """Return the part of ``s`` between two markers.

    The result starts right after the first occurrence of ``first`` and
    stops just before the next occurrence of ``last`` that follows it.
    An empty string is returned when either marker cannot be found.
    """
    marker_at = s.find(first)
    if marker_at < 0:
        return ""
    begin = marker_at + len(first)
    # Only look for the closing marker after the opening one.
    stop = s.find(last, begin)
    if stop < 0:
        return ""
    return s[begin:stop]

In [7]:
def load_weather_data(path,weather_file_name,column,index):
    """Load selected columns of a ';'-separated weather file, indexed by local time.

    Parameters
    ----------
    path : str
        Prefix of the file location. It is concatenated verbatim with
        ``weather_file_name``, so a directory must include its trailing
        separator (e.g. ``'data/'``).
    weather_file_name : str
        File name appended to ``path``.
    column : label or list of labels
        Column(s) to return; used as ``df[column]``.
    index : label or int
        Column holding the timestamps; it becomes the index.

    Returns
    -------
    pandas.Series or pandas.DataFrame
        ``df[column]`` with a timezone-naive ``DatetimeIndex``.

    Notes
    -----
    The timezone information is dropped by cutting each timestamp at its
    last ``'-'``, so every stamp is expected to end in a negative UTC offset
    such as ``2015-01-01 00:00:00-06``.
    NOTE(review): stamps with a positive offset or no offset would lose part
    of the date with this rule - confirm against the data files.
    """
    # The original wrapped this in a single-argument join(), which is a no-op.
    weather_file = path + weather_file_name
    df = pd.read_csv(weather_file, sep=';', index_col=index)
    # Strip the UTC offset after reading; read_csv's ``date_parser`` hook is
    # deprecated (pandas 2.0) and not accepted by newer releases.
    index_name = df.index.name
    local_stamps = [str(stamp).rpartition('-')[0] for stamp in df.index]
    df.index = pd.DatetimeIndex(pd.to_datetime(local_stamps), name=index_name)
    return df[column]

In [8]:
def load_irradiance_data(path,irradiance_file_name,column,index):
    """Load selected columns of a ';'-separated irradiance file with a datetime index.

    ``path`` and ``irradiance_file_name`` are concatenated verbatim, so a
    directory ``path`` must include its trailing separator. The column named
    by ``index`` becomes the index and is converted with ``pd.to_datetime``.
    The return value is ``df[column]``.
    """
    source_file = path + irradiance_file_name
    irradiance = pd.read_csv(source_file, sep=';', index_col=index)
    irradiance.index = pd.to_datetime(irradiance.index)
    return irradiance[column]

In [9]:
def normalize(df):
    """Min-max rescale ``df``: ``(df - min) / (max - min)``.

    ``min`` and ``max`` come from ``df.min()`` and ``df.max()``. Where the
    maximum equals the minimum the denominator is zero, so no finite value
    is produced there.
    """
    lowest = df.min()
    value_range = df.max() - lowest
    return (df - lowest) / value_range

In [1]:
def _week_of_year(index):
    """Return the ISO week number of every timestamp in a DatetimeIndex."""
    if hasattr(index, 'isocalendar'):
        # ``DatetimeIndex.week`` was removed in pandas 2.0; isocalendar() is
        # its replacement and returns a nullable unsigned integer column.
        weeks = index.isocalendar().week
        if index.hasnans:
            return weeks.to_numpy(dtype='float64', na_value=np.nan)
        return weeks.to_numpy(dtype='int64')
    # Older pandas without isocalendar(): the legacy accessor still exists.
    return index.week


def features_creation(df):
    """Add time-based feature columns derived from the index of ``df``.

    ``df`` is modified in place and also returned. Its index must expose
    ``hour``, ``dayofweek`` and ``month`` (e.g. a ``DatetimeIndex``).

    Columns added
    -------------
    sin_hour, cos_hour : sine / cosine of the hour mapped onto a 24 h circle
    hour               : hour of day, 0 to 23
    day_of_week        : Monday = 0 ... Sunday = 6
    weekend            : 1 for Saturday/Sunday, 0 otherwise
    month              : month number
    week_of_year       : ISO week number
    day_night          : 1 for hours 10 to 19 inclusive, 0 otherwise
    """
    hours = df.index.hour
    weekdays = df.index.dayofweek
    hour_angle = hours * 2 * np.pi / 24

    # sin/cos of the hour are an indirect, cyclic representation of time of day
    df['sin_hour'] = np.sin(hour_angle)
    df['cos_hour'] = np.cos(hour_angle)  # later try 24 vector binary format
    df['hour'] = hours
    df['day_of_week'] = weekdays
    df['weekend'] = (weekdays >= 5).astype('int64')
    df['month'] = df.index.month
    df['week_of_year'] = _week_of_year(df.index)
    df['day_night'] = ((hours > 9) & (hours < 20)).astype('int64')
    return df

In [3]:
def lag_column(df,column_names,lag_period=1):
    """Append lagged copies of the given columns to ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to extend; it is modified in place and also returned.
    column_names : iterable
        Names of the columns to lag. Each name is converted with ``str``
        before it is looked up.
    lag_period : int, default 1
        Number of lags. For every step ``i`` from 1 to ``lag_period`` a
        column ``<name>_<i>`` is added holding ``<name>`` shifted by ``i``
        rows, so each row also carries its past values. A value below 1
        adds no columns.
    """
    for column_name in column_names:
        source = str(column_name)
        for step in np.arange(1, lag_period + 1, 1):
            df[source + '_' + str(step)] = df[source].shift(step)
    return df

In [4]:
def plot_correlation(df,selected_columns=None,annot=False):
    '''Plot the correlation matrix of the dataframe's columns as a heatmap.

    Input:
        df: pandas DataFrame
        selected_columns: optional regular expression; when given, only the
            columns whose name matches it are correlated.
            Example: selected_columns="air" - selects columns air conditioner and such
            selected_columns="air|temp" - selects columns containing air conditioner and temperature
        annot: passed to seaborn.heatmap as its ``annot`` argument.

    The figure is shown with plt.show(); nothing is returned.
    '''
    # NOTE(review): ``plt`` (matplotlib.pyplot) is not imported in the visible
    # part of this notebook - confirm it is in scope before this is called.
    if selected_columns is None:
        df_correlation = df.corr()
    else:
        df_correlation = df.filter(regex=selected_columns).corr()

    # Create the axes explicitly: Figure.add_axes() without a rect is
    # rejected by current matplotlib and never produced an axes anyway.
    fig, ax = plt.subplots()
    sns.heatmap(df_correlation, annot=annot, ax=ax)
    ax.xaxis.tick_top()
    plt.yticks(rotation=0)
    plt.xticks(rotation=90)
    plt.show()

In [8]:
from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score
class accuracy_metrics:
    """Error metrics computed between two columns of a DataFrame.

    Every method takes the frame plus the names of the column holding the
    actual values and the column holding the predicted values.
    """

    def coeff_var(self,df,actual_col,predicted_col):
        """Return the root mean squared error divided by the mean of the actual column."""
        actual = df[actual_col]
        rmse = np.sqrt(mean_squared_error(actual, df[predicted_col]))
        return rmse/actual.mean()

    def mean_bias_err(self,df,actual_col,predicted_col):
        """Return the mean absolute error divided by the mean of the actual column.

        NOTE(review): despite the name, this uses the absolute error, so the
        result carries no sign - confirm whether a signed bias was intended.
        """
        actual = df[actual_col]
        abs_error = mean_absolute_error(actual, df[predicted_col])
        return abs_error/actual.mean()

    def r2_score(self,df,actual_col,predicted_col):
        """Return sklearn's ``r2_score`` for the actual and predicted columns."""
        actual = df[actual_col]
        predicted = df[predicted_col]
        # Inside a method the bare name resolves to the imported sklearn
        # function, not to this method.
        return r2_score(actual, predicted)

In [1]:
def get_metrics(df):
    """Compute, print and return the accuracy metrics for a two-column frame.

    The first column of ``df`` is used as the prediction and the second
    column as the actual values. ``cv`` and ``mbe`` are the results of
    ``accuracy_metrics.coeff_var`` and ``accuracy_metrics.mean_bias_err``
    multiplied by 100; ``r2`` is returned as computed.

    Returns
    -------
    tuple
        ``(cv, mbe, r2)``; the same values are printed via ``print_metrics``.
    """
    predicted_col = df.columns[0]
    actual_col = df.columns[1]
    scorer = accuracy_metrics()

    cv = scorer.coeff_var(df, actual_col, predicted_col)*100
    mbe = scorer.mean_bias_err(df, actual_col, predicted_col)*100
    r2 = scorer.r2_score(df, actual_col, predicted_col)

    print_metrics(cv, mbe, r2)
    return cv, mbe, r2

In [1]:
def print_metrics(cv,mbe,r2):
    """Print the three accuracy metrics, one per line.

    ``cv`` and ``mbe`` are printed with two decimals and ``r2`` with three.
    Nothing is returned.
    """
    # Single-argument print(...) produces the same output on Python 2 and 3;
    # the former print statements were a SyntaxError on Python 3.
    print("coefficient of variance = {:.2f}".format(cv))
    print("Mean bias error = {:.2f}".format(mbe))
    print("R Squared = {:.3f}".format(r2))