In [1]:
import os
import time

import pandas as pd
import numpy as np
import tensorflow as tf

from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor

import matplotlib
import matplotlib.pyplot as plt
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Default size (width, height), in inches, for every figure created afterwards.
matplotlib.rcParams['figure.figsize'] = (20, 16)

# Simple marker that the import/setup cell ran to completion.
print("Package is ready.")

  from ._conv import register_converters as _register_converters


Package is ready.


In [2]:
# Directory for generated artefacts (note the trailing slash).
RESULT_PATH = './results/'

In [3]:
# Regular 10-minute timestamp grid that the raw sensor table is aligned to.
DATA_RANGE = pd.date_range(
    start='2016-10-02 00:00',
    end='2018-05-31 23:50',
    freq='10min',
)

In [4]:
# Load the merged sensor export, using the 'N1 날짜' column as the row index.
raw_data = pd.read_csv('./data/env_merged.csv', index_col='N1 날짜', low_memory=False)
# Single-space cells mark missing readings; turn them into real NaNs.
raw_data = raw_data.replace(" ", np.nan)
# astype() returns a new frame -- the result has to be kept, otherwise the
# columns stay object-typed and the numeric conversion never takes effect.
raw_data = raw_data.astype("float")
# This column is not used by any later step.
raw_data = raw_data.drop('기상센서\n강우', axis=1)

In [5]:
# Put the table on the regular DATA_RANGE grid. reindex() inserts all-NaN rows
# for timestamps that are absent from the file; `.loc[DATA_RANGE]` raises
# KeyError on current pandas as soon as a single grid label is missing.
raw_data.index = pd.DatetimeIndex(raw_data.index)
raw_data = raw_data.reindex(DATA_RANGE)
# Keep a "YYYY-MM-DD HH:MM" text copy of the timestamp for the later
# year/month/day/time split. It is derived from the grid itself so that the
# inserted gap rows carry a timestamp too instead of NaN.
raw_data['N1 날짜'] = raw_data.index.strftime('%Y-%m-%d %H:%M')

In [6]:
# Bridge short gaps: at most 5 consecutive missing samples are filled linearly.
raw_data = raw_data.interpolate('linear', limit=5)
# Forward linear interpolation cannot fill a gap at the very first row, so
# borrow the second row's values -- but only for cells that are still missing.
# Copying the whole row would overwrite valid first-row readings and replace
# the first row's timestamp text with the second row's.
raw_data.iloc[0, :] = raw_data.iloc[0, :].fillna(raw_data.iloc[1, :])

In [7]:
# Column layout of raw_data as half-open [start, stop) positions.
# Sensors with root-zone probes occupy 9 columns, sensors without occupy 6,
# and everything from column 72 onwards belongs to the weather block.
ROOT_ZONE_SLICES = [(0, 9), (15, 24), (30, 39), (39, 48), (48, 57), (63, 72)]
NO_ROOT_ZONE_SLICES = [(9, 15), (24, 30), (57, 63)]
WEATHER_START = 72

# Explicit copies: the following cells add columns to these frames, which on a
# plain iloc slice of raw_data triggers SettingWithCopy warnings.
DATA_wR = [raw_data.iloc[:, lo:hi].copy() for lo, hi in ROOT_ZONE_SLICES]  # sensors with root zone
DATA_woR = [raw_data.iloc[:, lo:hi].copy() for lo, hi in NO_ROOT_ZONE_SLICES]  # sensors without root zone
DATA_wR_prc = []  # processed counterparts, filled by a later cell
DATA_woR_prc = []
DATA_weather = raw_data.iloc[:, WEATHER_START:].copy()

In [8]:
WR_COLUMNS = ['year', 'month', 'day', 'time', 'temp', 'rh', 's_temp', 's_rh', 'ec', 'co2', 'atm', 'uv', 'rad']
WOR_COLUMNS = ['year', 'month', 'day', 'time', 'temp', 'rh', 'co2', 'atm', 'uv', 'rad']
WEATHER_COLUMNS = ['year', 'month', 'day', 'time', 'temp', 'rh', 'uv', 'wd', 'wv', 'atm']


def split_timestamp(frame, timestamps, columns):
    """Return `frame` prefixed with year / month / day / time columns.

    Parameters
    ----------
    frame : DataFrame
        Sensor readings. A 'N1 날짜' column, if present, is discarded.
    timestamps : Series of str
        "<date> <time>" text on the same index as `frame`; the date part is
        expected to be "-"-separated (year-month-day).
    columns : list of str
        Final column names: the four calendar fields followed by one name per
        reading column.

    Returns
    -------
    DataFrame
        New frame; `frame` itself is left untouched. 'year', 'month' and 'day'
        keep the text tokens obtained from the split, 'time' is converted to
        minutes since midnight.
    """
    date_and_time = timestamps.str.split(" ", expand=True)
    year_month_day = date_and_time[0].str.split("-", expand=True)
    readings = frame.drop('N1 날짜', axis=1, errors='ignore')
    expanded = pd.concat([year_month_day, date_and_time.iloc[:, 1:], readings], axis=1)
    expanded.columns = columns
    clock = pd.to_datetime(expanded['time'])
    expanded['time'] = clock.dt.hour*60 + clock.dt.minute + clock.dt.second/60
    return expanded


# The lists are rebuilt rather than appended to, so re-running this cell does
# not accumulate duplicate frames.
DATA_wR_prc = [split_timestamp(df, raw_data['N1 날짜'], WR_COLUMNS) for df in DATA_wR]
DATA_woR_prc = [split_timestamp(df, raw_data['N1 날짜'], WOR_COLUMNS) for df in DATA_woR]
DATA_weather = split_timestamp(DATA_weather, raw_data['N1 날짜'], WEATHER_COLUMNS)

# Kept for backward compatibility: the original cell left COLUMNS bound to the
# weather column names.
COLUMNS = WEATHER_COLUMNS

In [9]:
# Every regressor sees these calendar fields in addition to its sensor inputs.
_CALENDAR_FEATURES = ['month', 'day', 'time']


def _features(*sensor_inputs):
    """Return a fresh feature list: calendar fields followed by `sensor_inputs`."""
    return _CALENDAR_FEATURES + list(sensor_inputs)


# Target column -> predictor columns, for sensors that have root-zone probes.
FEATURE_MAP_wR = {
    'temp': _features('rh', 'atm', 'rad', 'co2'),
    'rh': _features('temp', 's_temp', 's_rh', 'rad'),
    's_temp': _features('temp', 'rh', 's_rh', 'rad'),
    's_rh': _features('temp', 'rh', 's_temp'),
    'ec': _features('s_temp', 's_rh', 'rad', 'co2'),
    'atm': _features('temp', 'rh'),
    'rad': _features('atm', 'temp', 'rh', 'co2'),
    'co2': _features('temp', 'rh', 'atm', 'rad'),
}

# Target column -> predictor columns, for sensors without root-zone probes.
FEATURE_MAP_woR = {
    'temp': _features('rh', 'atm', 'rad', 'co2'),
    'rh': _features('temp', 'rad'),
    'atm': _features('temp', 'rh'),
    'rad': _features('atm', 'temp', 'rh', 'co2'),
    'co2': _features('temp', 'rh', 'atm', 'rad'),
}

In [10]:
RF_SEED = 42  # fixed seed so that repeated runs impute the same values


def impute_short_losses(frames, feature_map, banner):
    """Fill partially missing rows of every frame with random-forest estimates.

    For each frame, the rows that have at least one reading (columns from
    position 4 onwards) but also at least one missing cell are collected. For
    every reading column except 'uv' (which has no entry in the feature maps),
    a forest is trained on the fully populated rows of the frame and used to
    predict the missing cells whose predictor columns are all present.
    Columns are processed in order, so values imputed for an earlier column
    can serve as predictors for a later one.

    Parameters
    ----------
    frames : list of DataFrame
        Frames to repair; each one is modified in place.
    feature_map : dict
        Maps a target column name to the list of predictor column names.
    banner : str
        printf-style header with one %d slot for the 1-based sensor number.
    """
    for sensor_no, df in enumerate(frames, start=1):
        start = time.time()
        print(banner % sensor_no)

        # Rows with some readings present but not all cells filled.
        no_readings_at_all = df.iloc[:, 4:].isnull().all(axis=1)
        has_missing_cell = df.isnull().any(axis=1)
        # Explicit copy: predictions are written into this working frame.
        short_loss = df[~no_readings_at_all & has_missing_cell].copy()

        # df is only written back after the column loop, so the training rows
        # are the same for every column and are computed once.
        complete_rows = df.dropna()

        for column in df.columns[4:]:  # wo/ time, select each factor
            if column == 'uv':
                continue
            features = feature_map[column]

            # Missing targets whose predictors are all available.
            lost = short_loss[column].isnull()
            target_x = short_loss.loc[lost, features].dropna()
            if len(target_x) == 0:
                print("%s has no loss." % column)
                continue

            # Fitting on zero samples raises inside scikit-learn; report and
            # leave the gap for the later interpolation steps instead.
            if len(complete_rows) == 0:
                print("%s skipped: no complete rows to train on." % column)
                continue

            regressor = RandomForestRegressor(n_estimators=100, random_state=RF_SEED)
            regressor.fit(complete_rows[features], complete_rows[column])
            short_loss.loc[target_x.index, column] = regressor.predict(target_x)
            print("%s done." % column)

        run_time = time.time() - start
        print("Run time: %.2f sec." % run_time)
        df.loc[short_loss.index] = short_loss


impute_short_losses(DATA_wR_prc, FEATURE_MAP_wR, "----sensor %d w/R data----")
print("")
impute_short_losses(DATA_woR_prc, FEATURE_MAP_woR, "----sensor %d wo/R data----")
print("All interpolations are done.")

----sensor 1 w/R data----
temp has no loss.
rh done.
s_temp done.
s_rh done.
ec done.
co2 done.
atm done.
rad has no loss.
Run time: 85.22 sec.
----sensor 2 w/R data----
temp done.
rh done.
s_temp done.
s_rh done.
ec done.
co2 done.
atm done.
rad has no loss.
Run time: 98.18 sec.
----sensor 3 w/R data----
temp has no loss.
rh done.
s_temp done.
s_rh done.
ec done.
co2 done.
atm done.
rad has no loss.
Run time: 87.08 sec.
----sensor 4 w/R data----
temp done.
rh done.
s_temp done.
s_rh done.
ec done.
co2 done.
atm done.
rad has no loss.
Run time: 113.73 sec.
----sensor 5 w/R data----
temp done.
rh done.
s_temp done.
s_rh done.
ec done.
co2 done.
atm done.
rad has no loss.
Run time: 95.77 sec.
----sensor 6 w/R data----
temp done.
rh has no loss.
s_temp done.
s_rh done.
ec done.
co2 done.
atm done.
rad has no loss.
Run time: 97.94 sec.

----sensor 1 wo/R data----
temp done.
rh done.
co2 done.
atm done.
rad has no loss.
Run time: 65.82 sec.
----sensor 2 wo/R data----
temp done.
rh has no lo

In [11]:
# Second pass of linear interpolation on the per-sensor frames, limited to
# runs of at most 3 consecutive missing samples.
for df in DATA_wR_prc:
    df.interpolate('linear', limit=3, inplace=True)

for df in DATA_woR_prc:
    df.interpolate('linear', limit=3, inplace=True)

# The weather frame is interpolated without a run-length limit.
DATA_weather.interpolate('linear', inplace=True)
# The weather frame rides along as the last entry of DATA_woR_prc. Append it
# only once so that re-running this cell does not add it a second time.
if not any(frame is DATA_weather for frame in DATA_woR_prc):
    DATA_woR_prc.append(DATA_weather)

In [12]:
# Side-by-side merge of all processed frames. Each layout entry names the
# source list and the position inside it, in the order the frames are joined.
frame_pools = {'wR': DATA_wR_prc, 'woR': DATA_woR_prc}
merge_layout = [
    ('wR', 0), ('woR', 0),
    ('wR', 1), ('woR', 1),
    ('wR', 2), ('wR', 3),
    ('wR', 4), ('woR', 2),
    ('wR', 5), ('woR', 3),
]
processed_data = pd.concat(
    [frame_pools[pool][position] for pool, position in merge_layout],
    axis=1,
)

In [13]:
# Show the merged column labels in two chunks: the first 50, then the rest.
for chunk in (slice(None, 50), slice(50, None)):
    print(processed_data.columns[chunk])

Index(['year', 'month', 'day', 'time', 'temp', 'rh', 's_temp', 's_rh', 'ec',
       'co2', 'atm', 'uv', 'rad', 'year', 'month', 'day', 'time', 'temp', 'rh',
       'co2', 'atm', 'uv', 'rad', 'year', 'month', 'day', 'time', 'temp', 'rh',
       's_temp', 's_rh', 'ec', 'co2', 'atm', 'uv', 'rad', 'year', 'month',
       'day', 'time', 'temp', 'rh', 'co2', 'atm', 'uv', 'rad', 'year', 'month',
       'day', 'time'],
      dtype='object')
Index(['temp', 'rh', 's_temp', 's_rh', 'ec', 'co2', 'atm', 'uv', 'rad', 'year',
       'month', 'day', 'time', 'temp', 'rh', 's_temp', 's_rh', 'ec', 'co2',
       'atm', 'uv', 'rad', 'year', 'month', 'day', 'time', 'temp', 'rh',
       's_temp', 's_rh', 'ec', 'co2', 'atm', 'uv', 'rad', 'year', 'month',
       'day', 'time', 'temp', 'rh', 'co2', 'atm', 'uv', 'rad', 'year', 'month',
       'day', 'time', 'temp', 'rh', 's_temp', 's_rh', 'ec', 'co2', 'atm', 'uv',
       'rad', 'year', 'month', 'day', 'time', 'temp', 'rh', 'uv', 'wd', 'wv',
       'atm'],
      

In [14]:
# Every merged frame contributed its own calendar fields; drop all of them.
calendar_columns = ['year', 'month', 'day', 'time']
processed_data = processed_data.drop(labels=calendar_columns, axis=1)

In [15]:
# Unique labels of the form "<field>.<sensor>": sensors 1-9 in merge order,
# followed by the weather block with suffix "w".
root_zone_fields = ['temp', 'rh', 's_temp', 's_rh', 'ec', 'co2', 'atm', 'uv', 'rad']
air_only_fields = ['temp', 'rh', 'co2', 'atm', 'uv', 'rad']
weather_fields = ['temp', 'rh', 'uv', 'wd', 'wv', 'atm']

suffix_layout = [
    ('1', root_zone_fields),
    ('2', air_only_fields),
    ('3', root_zone_fields),
    ('4', air_only_fields),
    ('5', root_zone_fields),
    ('6', root_zone_fields),
    ('7', root_zone_fields),
    ('8', air_only_fields),
    ('9', root_zone_fields),
    ('w', weather_fields),
]
processed_data.columns = [
    field + '.' + suffix
    for suffix, fields in suffix_layout
    for field in fields
]

In [16]:
# Independent snapshot taken before the forced fill below. `iloc[:, :]` does
# not guarantee a separate copy, so later in-place writes to processed_data
# could show up in the backup as well.
backup = processed_data.copy()

In [17]:
# Forced fill of whatever is still missing: each missing cell takes the value
# recorded at the same clock time 1 day earlier; cells that are still missing
# retry with 6, 11, 16, ... days earlier.
for column in processed_data.columns:
    print("%s column is in processing..." % column)
    selected_column = processed_data[column]
    interpolation = selected_column[selected_column.isnull()].copy()
    interpolation.index = pd.DatetimeIndex(interpolation.index)
    first_stamp = selected_column.index.min()
    delta = pd.Timedelta('1day')
    while interpolation.isnull().any():
        pending = interpolation.index[interpolation.isnull()]
        donors = pending - delta
        # Once every look-up lands before the first sample, a larger shift can
        # never succeed; stop instead of looping forever.
        if (donors < first_stamp).all():
            print("%s: %d value(s) have no earlier donor and stay missing." % (column, len(pending)))
            break
        # reindex() yields NaN for timestamps outside the table, whereas .loc
        # raises KeyError on current pandas. `.values` makes the assignment
        # positional: the donor rows carry shifted labels that must not be
        # aligned against the pending labels.
        interpolation.loc[pending] = selected_column.reindex(donors).values
        delta += pd.Timedelta('5day')
    print(delta)
    processed_data.loc[interpolation.index, column] = interpolation
    print('done.')

temp.1 column is in processing...
31 days 00:00:00
done.
rh.1 column is in processing...
31 days 00:00:00
done.
s_temp.1 column is in processing...
31 days 00:00:00
done.
s_rh.1 column is in processing...
31 days 00:00:00
done.
ec.1 column is in processing...
31 days 00:00:00
done.
co2.1 column is in processing...
31 days 00:00:00
done.
atm.1 column is in processing...
31 days 00:00:00
done.
uv.1 column is in processing...
31 days 00:00:00
done.
rad.1 column is in processing...
31 days 00:00:00
done.
temp.2 column is in processing...
31 days 00:00:00
done.
rh.2 column is in processing...
31 days 00:00:00
done.
co2.2 column is in processing...
31 days 00:00:00
done.
atm.2 column is in processing...
31 days 00:00:00
done.
uv.2 column is in processing...
31 days 00:00:00
done.
rad.2 column is in processing...
31 days 00:00:00
done.
temp.3 column is in processing...
31 days 00:00:00
done.
rh.3 column is in processing...
31 days 00:00:00
done.
s_temp.3 column is in processing...
31 days 00:

In [18]:
processed_data.index = pd.DatetimeIndex(processed_data.index)
# Write into the configured results directory (same location as the previous
# hard-coded './results/forced_env.csv') and create it when it is missing, so
# the export does not fail on a fresh checkout.
os.makedirs(RESULT_PATH, exist_ok=True)
processed_data.to_csv(os.path.join(RESULT_PATH, 'forced_env.csv'))