In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import glob


In [2]:
# Load the monthly CSV exports for chiller 1 and the weather station, then
# build one minute-resolution, duplicate-free table per sensor group.

# NOTE(review): machine-specific absolute path -- point this at your local copy
# of the data before running.
path=r"/Users/chandu/Desktop/IOT DATATHON/Chiller"


def read_monthly_csvs(pattern):
    """Read every CSV matching ``pattern`` into a list of frames indexed by ``ts``.

    Files are read in sorted path order so the result does not depend on the
    order in which ``glob`` happens to list the directory.

    Parameters
    ----------
    pattern : str
        Glob pattern selecting the CSV files of one sensor group.

    Returns
    -------
    list of pandas.DataFrame
        One frame per file, each using the ``ts`` column as its index.

    Raises
    ------
    FileNotFoundError
        If no file matches ``pattern`` (instead of the opaque
        "No objects to concatenate" error ``pd.concat`` would raise later).
    """
    files = sorted(glob.glob(pattern))
    if not files:
        raise FileNotFoundError(f"No CSV files match {pattern!r}")
    return [pd.read_csv(file, index_col="ts") for file in files]


def build_minute_table(frames, columns, names=None):
    """Combine per-file frames into one table with a unique per-minute index.

    Parameters
    ----------
    frames : list of pandas.DataFrame
        Frames indexed by ``ts``, as returned by ``read_monthly_csvs``.
    columns : list of str
        Columns to keep from the concatenated frames.
    names : list of str, optional
        Replacement column names, in the same order as ``columns``.

    Returns
    -------
    pandas.DataFrame
        Selected columns on a ``DatetimeIndex`` truncated to the minute, with
        only the first row of every minute kept.
    """
    table = pd.concat(frames)[columns]
    if names is not None:
        table.columns = names
    table.index = pd.DatetimeIndex(table.index)
    # Stable sort on the full timestamp: rows with equal timestamps keep their
    # file order, so keep="first" below is deterministic and retains the
    # earliest reading of each minute.
    table = table.sort_index(kind="mergesort")
    # Vectorised truncation to the minute (drops seconds and any finer part).
    table.index = table.index.floor("min")
    return table[~table.index.duplicated(keep="first")]


# Raw per-file frames for each sensor group.
#POWER
df_p_c1=read_monthly_csvs(path+ "/Chiller_1/P/*.csv")

#CHILLER TEMPERATURE PROBES
df_t_c1=read_monthly_csvs(path+ "/Chiller_1/T/*.csv")

#EVAPORATOR
df_e_c1=read_monthly_csvs(path+ "/Chiller_1/E/*.csv")

#CONDENSER
df_c_c1=read_monthly_csvs(path+ "/Chiller_1/C/*.csv")

#WEATHER TEMPERATURE
df_temp=read_monthly_csvs(path+ "/Weather/Temp/*.csv")

#WEATHER HUMIDITY
df_humid=read_monthly_csvs(path+ "/Weather/Humid/*.csv")

# One cleaned table per sensor group, keeping (and renaming) only the needed columns.
tab_p_c1=build_minute_table(df_p_c1,["ch1Watt","ch2Watt","ch3Watt","totalPositiveWattHour"])
tab_t_c1=build_minute_table(df_t_c1,["value1","value2","value3","value4"],
                            ["temp1","temp2","temp3","temp4"])
tab_e_c1=build_minute_table(df_e_c1,["flowRate"],["e.flowRate"])
tab_c_c1=build_minute_table(df_c_c1,["flowRate"],["c.flowRate"])
tab_temp=build_minute_table(df_temp,["value"],["w_temp"])
tab_humid=build_minute_table(df_humid,["value"],["w_humid"])

# All cleaned tables in one list. This now refers to the de-duplicated frames,
# i.e. the same objects as the tab_* names above.
tables=[tab_p_c1,tab_t_c1,tab_e_c1,tab_c_c1,tab_humid,tab_temp]

## Understand the data

In [3]:
# Inner-join the power, temperature, evaporator and condenser tables on their
# shared timestamp index, so only minutes present in all four remain.
c1=tab_p_c1
for other in (tab_t_c1,tab_e_c1,tab_c_c1):
    c1=c1.merge(other,how="inner",left_index=True,right_index=True)

In [9]:
# NOTE: the output recorded below was captured after the feature-engineering
# cell (In [4]) had already run -- it shows t3-t2, cooling_cap and
# Postive_Watt_hour. On a fresh top-to-bottom run this cell displays the raw
# merged columns instead.
c1.head()

Unnamed: 0_level_0,ch1Watt,ch2Watt,ch3Watt,t3-t2,t4-t1,cooling_cap,e.flowRate,c.flowRate,Postive_Watt_hour
ts,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2017-05-01 00:01:00,383,0,396,5.650331,6.472781,0.0,0.0,35.284023,0.0
2017-05-01 00:02:00,379,0,394,5.650331,6.387328,0.0,0.0,144.330154,0.0
2017-05-01 00:03:00,389,0,408,4.443362,5.319215,4.015153,10.812524,195.770782,0.0
2017-05-01 00:04:00,5748,4480,4942,2.883911,3.930666,21.157226,77.101921,214.631348,100.0
2017-05-01 00:07:00,34147,30316,31827,3.06549,3.492736,54.308153,222.726486,225.320724,2700.0


In [4]:
#feature engineering
# NOTE: this cell rebinds `c1` to a reduced set of columns, so it can only be
# run once per merge; re-run the merge cell before re-running this one.

# Constants of the cooling-capacity formula.
# NOTE(review): presumably water density in kg/m^3 and specific heat in
# kJ/(kg.K), with the flow rate in L/min (hence the *0.001/60) -- confirm the
# units against the sensor documentation.
WATER_DENSITY=999.68844162593
WATER_SPECIFIC_HEAT=4.19

# 1. energy consumed since the previous row of the merged table, from the
#    cumulative counter. Consecutive rows are not always one minute apart
#    (the inner join drops minutes missing from any sensor), so a value can
#    span several minutes.
c1["Postive_Watt_hour"]=c1["totalPositiveWattHour"].diff()

# 2. temperature differences
c1["t3-t2"]=c1["temp3"]-c1["temp2"]
c1["t4-t1"]=c1["temp4"]-c1["temp1"]

# 3. cooling capacity (operand order kept as-is so results are bit-identical)
c1["cooling_cap"]=WATER_DENSITY*c1['e.flowRate']*0.001/60*WATER_SPECIFIC_HEAT*c1["t4-t1"]

# keep only the required columns
cols=['ch1Watt', 'ch2Watt', 'ch3Watt', "t3-t2","t4-t1","cooling_cap",
       'e.flowRate', 'c.flowRate','Postive_Watt_hour']

# remove outliers: negative or implausibly large energy deltas and negative
# temperature differences. The first row (NaN delta) is dropped here as well.
valid=((c1['Postive_Watt_hour']>=0) & (c1['Postive_Watt_hour']<20000)
       & (c1["t3-t2"]>=0) & (c1["t4-t1"]>=0))
c1=c1.loc[valid,cols]

# min, max and mean of every feature over 5-minute windows
# ("5min" is the non-deprecated spelling of the old "5T" alias)
c1_5min=c1.resample("5min").agg(["min","max","mean"])

# flatten the (feature, statistic) column pairs to "<feature>.<statistic>",
# shortening the power channels to ch1/ch2/ch3
short_names={"ch1Watt":"ch1","ch2Watt":"ch2","ch3Watt":"ch3"}
c1_5min.columns=[f"{short_names.get(col,col)}.{stat}" for col,stat in c1_5min.columns]

# drop the 5-minute windows that contain no readings
c1_5min=c1_5min.dropna()

# Final feature table: the 5-minute aggregates without the min/max of the
# energy delta. (An earlier outer merge of c1 with c1_5min was overwritten on
# the very next line and never used, so it has been removed.)
df_c1=c1_5min.drop(columns=["Postive_Watt_hour.min","Postive_Watt_hour.max"])

# time features taken from the window start
df_c1["month"]=df_c1.index.month
df_c1["hour"]=df_c1.index.hour
df_c1["day"]=df_c1.index.day
df_c1["minute"]=df_c1.index.minute

In [5]:
# Index range, dtypes and non-null counts of the 5-minute feature table.
df_c1.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 57945 entries, 2017-05-01 00:00:00 to 2017-11-30 23:55:00
Data columns (total 29 columns):
ch1.min                   57945 non-null float64
ch1.max                   57945 non-null float64
ch1.mean                  57945 non-null float64
ch2.min                   57945 non-null float64
ch2.max                   57945 non-null float64
ch2.mean                  57945 non-null float64
ch3.min                   57945 non-null float64
ch3.max                   57945 non-null float64
ch3.mean                  57945 non-null float64
t3-t2.min                 57945 non-null float64
t3-t2.max                 57945 non-null float64
t3-t2.mean                57945 non-null float64
t4-t1.min                 57945 non-null float64
t4-t1.max                 57945 non-null float64
t4-t1.mean                57945 non-null float64
cooling_cap.min           57945 non-null float64
cooling_cap.max           57945 non-null float64
cooling_cap.mean       

In [6]:
# First rows of the 5-minute feature table (aggregates plus time features).
df_c1.head()

Unnamed: 0_level_0,ch1.min,ch1.max,ch1.mean,ch2.min,ch2.max,ch2.mean,ch3.min,ch3.max,ch3.mean,t3-t2.min,...,e.flowRate.max,e.flowRate.mean,c.flowRate.min,c.flowRate.max,c.flowRate.mean,Postive_Watt_hour.mean,month,hour,day,minute
ts,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2017-05-01 00:00:00,379.0,5748.0,1724.75,0.0,4480.0,1120.0,394.0,4942.0,1535.0,2.883911,...,77.101921,21.978611,35.284023,214.631348,147.504077,25.0,5,0,1,0
2017-05-01 00:05:00,34147.0,57287.0,45019.0,30316.0,53878.0,41457.333333,31827.0,54900.0,42731.0,3.06549,...,228.414627,226.089371,225.320724,226.439087,225.877004,2333.333333,5,0,1,5
2017-05-01 00:10:00,57993.0,63503.0,60164.5,54349.0,59886.0,56529.25,55423.0,61457.0,57804.25,7.060241,...,228.420212,228.255238,226.457489,228.977615,227.14957,3750.0,5,0,1,10
2017-05-01 00:15:00,65888.0,67088.0,66524.4,62304.0,63991.0,63179.6,63689.0,65116.0,64335.2,9.143066,...,228.868256,228.619876,227.044342,228.608566,227.806711,3180.0,5,0,1,15
2017-05-01 00:20:00,64688.0,67112.0,66087.0,60841.0,63530.0,62540.666667,62012.0,64741.0,63717.333333,11.311342,...,228.206741,227.667918,229.382843,231.392929,230.257395,5233.333333,5,0,1,20


In [10]:
# (rows, columns) of the 5-minute feature table.
df_c1.shape

(57945, 29)