# Getting data (read csv), clean and import into DB/export to csv

### Step 01
In order to **get data** and store it into a Pandas dataframe the following code needs to be provided with:
* `file` to be imported
* `room` name of the room, e.g. `BA` for bath (Bad)
* `val` value to be imported, e.g. `Temp` for temperature

Data **cleaning** is done via:
* removing duplicates (first occurrence is kept)
* filling gaps by method 'fill forward'

If the dataframe shall be saved to QuestDB, the following parameter is also important: \
`step_db` must be set to `True` (Python boolean, capital `T`)

> Communication with QuestDB uses the (preset) parameters \
> `HOST` and \
> `PORT` to reach the db

Otherwise the cleaned dataframe is **saved to a csv**.

In [12]:
from questdb.ingress import Sender, IngressError
import sys
import csv
import pandas as pd
import numpy as np
from datetime import date

# Global preview length: number of rows shown when the pipeline prints a dataframe via .tail()
g_len_list = 10

# logic for importing wind or temperatures
class Import:
    """
    Import one Loxone csv export, clean it, map it onto a reference list of
    timestamps, fill gaps and either send the result to QuestDB or save it as csv.

    Instantiating the class runs the whole pipeline (see __init__).
    The class attributes below hold the preset inputs of the pipeline.
    """

    # csv exported from Loxone (semicolon separated) that is imported
    SOURCE_CSV = (
        'S:/Projekt_Traumhaus/09_Programmierung/Loxone/_Statistiken/'
        'csv-Exporte aus Loxone/tw201611-20230916/'
        '5_0_171 - 170_Bad.csv'
    )
    # csv with the reference timestamps (column "UTC") the data is merged onto
    UTC_GRID_CSV = 'C:/Users/andre/Nextcloud/WS_2023/IKT/10_DataRaw/UTC_every10mins_hc.csv'
    # folder the cleaned csv is written to
    EXPORT_DIR = 'C:/Users/andre/Nextcloud/WS_2023/IKT/20_Data/'
    # name of the room, e.g. 'BA' for bath (also used as QuestDB table name)
    ROOM = 'BA'
    # value to be imported, e.g. 'Temp' for temperature
    VAL = 'Temp'

    def __init__(self):
        """
        Run the pipeline right away with the QuestDB import switched off.
        @param self: class
        """
        step_db = False  # set to True (first CAPITAL letter!) to import into QuestDB instead of csv
        self.process(step_db)

    def _read_csv(self, path):
        """
        Read a semicolon separated csv into a dataframe and close the file again.
        @param self: class
        @param path: path of the csv file
        @return: dataframe with one column per csv header (all values are strings)
        """
        # newline='' is what the csv module expects for files it parses itself
        with open(path, newline='') as handle:
            return pd.DataFrame(list(csv.DictReader(handle, delimiter=';')))

    def process(self, step_db):
        """
        Import the csv, drop duplicate timestamps, normalize to the reference
        timestamps, flag and fill gaps, then export.
        @param self: class
        @param step_db: True sends the result to QuestDB, False writes a csv
        """
        room, val = self.ROOM, self.VAL
        value_col = f'{room}_{val}'

        # 01 import data from csv out of Loxone
        df = self._read_csv(self.SOURCE_CSV)

        # create the two columns to import: parsed timestamp 'UTC' and numeric value
        if 'Datum' in df.columns:
            print('> Column "Datum" or "Zeit" in csv found, applying appropriate pre-import steps')
            df['UTC'] = pd.to_datetime(df['Datum'] + ' ' + df['Zeit'])
            df[value_col] = pd.to_numeric(df.iloc[:, 2])  # third csv column holds the value
        else:
            print('> Column "Datum" or "Zeit" in csv NOT found, applying appropriate pre-import steps')
            df['UTC'] = pd.to_datetime(df.iloc[:, 0])  # first csv column holds the timestamp
            df[value_col] = pd.to_numeric(df.iloc[:, 1])  # second csv column holds the value

        print(f'> {len(df.index)} lines into DF imported')

        # 02 check for duplicates, list and clean them
        duplicates = df[df['UTC'].duplicated(keep=False)]
        print(duplicates.sort_values('UTC').tail(g_len_list))

        # the same timestamp might carry different values, so duplicates are searched
        # in UTC only and the first row is taken as the 'right' one
        df = df.drop_duplicates(subset=['UTC'])

        # only use the new columns; selecting by name works for both csv layouts
        df_short = df[['UTC', value_col]]
        print(f'> {len(df_short.index)} lines in DF ok')
        print(df_short.head(5))

        # 03 add data to the reference timestamp list
        df_norm = self.time_normalize(df_short)

        # 04 create flag column and fill gaps
        # 100 = original and ok, 500 = value is missing and gets added automatically
        flag_col = f'{value_col}_FLAG'
        df_norm[flag_col] = np.where(df_norm[value_col].isna(), 500, 100)

        df_fill = self.gaps_fill(df_norm)

        # 05 if selected, send to DB - else export to csv
        if step_db:
            self.db_import(df_fill, room, val)
        else:
            self.csv_export(df_fill, room, val)

    def fill_gap_in_col(self, col, method):
        """
        Fill gaps between the first and the last valid index of a series or dataframe.
        Entries before the first / after the last valid index are left untouched.
        @param self: class
        @param col: series or dataframe to fill (is not modified)
        @param method: 'ffill' for forward fill, anything else means backward fill
        @return: filled copy of col
        """
        filled = col.copy()
        first_idx = filled.first_valid_index()
        last_idx = filled.last_valid_index()
        inner = filled.loc[first_idx:last_idx]
        if method == 'ffill':
            filled.loc[first_idx:last_idx] = inner.ffill()
        else:
            filled.loc[first_idx:last_idx] = inner.bfill()
        return filled

    def gaps_fill(self, obj):
        """
        Function to fill gaps in df by forward fill and print the tail of the result
        @param self: class
        @param obj: dataframe
        @return: filled dataframe (rows before the first measurement have no
                 predecessor and therefore stay NaN)
        """
        # NOTE(review): the whole dataframe is passed on, so the valid range is
        # determined over all columns, not only the value column - confirm intended
        met = 'ffill'
        df = self.fill_gap_in_col(obj, met)

        # print result
        print(df.tail(g_len_list))
        print(f'> "NaN" gaps filled with method \'{met}\'')
        return df

    def time_normalize(self, obj):
        """
        Function to map data of a df to a pre-specified list of timestamps
        @param self: class
        @param obj: dataframe with a datetime column 'UTC'
        @return: dataframe merged on timestamps (left join, every reference
                 timestamp is kept, missing data becomes NaN)
        """
        # read UTC:
        df = self._read_csv(self.UTC_GRID_CSV)
        ####### take care whether timestamp is UTC-defined or not!
        df['UTC'] = pd.to_datetime(df['UTC'])
        print(f'> {len(df.index)} lines in UTC')

        # merge on UTC:
        df_norm = pd.merge(df, obj, on='UTC', how='left')

        print(df_norm.tail(g_len_list))
        print(f'>{len(df_norm.index)} lines time-normalized via \'left join\'')
        return df_norm

    def csv_export(self, obj, ROOM, VAL):
        """
        Function to save df as comma separated csv; the file name contains
        room, value and today's date
        @param self: class
        @param obj: dataframe
        @param ROOM: name of the room
        @param VAL: name of the value
        """
        filename = f'{self.EXPORT_DIR}{ROOM}_{VAL}_cleaned_and_filled_at_{date.today()}.csv'
        obj.to_csv(filename, sep=',', index=False, encoding='utf-8')
        print(f"> Export to: '{filename}' successful")

    def db_import(self, obj, ROOM=None, VAL=None):
        """
        Function to save df into table of QuestDB
        @param self: class
        @param obj: dataframe with the designated timestamp in column 'UTC'
        @param ROOM: table name to insert into, defaults to the class preset
        @param VAL: name of the value (only used in the message), defaults to the class preset
        """
        room = self.ROOM if ROOM is None else ROOM
        val = self.VAL if VAL is None else VAL

        print(f'> Trying to import {len(obj.index)} lines into QuestDB')
        HOST = 'localhost'
        PORT = 9009

        try:
            with Sender(HOST, PORT) as sender:
                sender.dataframe(
                    obj,
                    table_name=room,  # Table name to insert into.  e.g. 'BA8'
                    at='UTC')  # Column containing the designated timestamps.
            print(f'> Import of {room}_{val}_cleaned_and_filled into DB successful')
        except IngressError as e:
            sys.stderr.write(f'>Got error: {e}\n')
    
if __name__ == "__main__":
    Import()  # instantiating Import starts the pipeline: __init__ directly calls process()

> Column "Datum" or "Zeit" in csv found, applying appropriate pre-import steps
> 176160 lines into DF imported
           Datum      Zeit Temp bei Tür        UTC  BA_Temp
169505  5/1/2023  00:00:00       22.242 2023-05-01   22.242
169506  5/1/2023  00:00:00       22.242 2023-05-01   22.242
170993  6/1/2023  00:00:00       23.100 2023-06-01   23.100
170994  6/1/2023  00:00:00       23.100 2023-06-01   23.100
172433  7/1/2023  00:00:00       23.700 2023-07-01   23.700
172434  7/1/2023  00:00:00       23.700 2023-07-01   23.700
173922  8/1/2023  00:00:00       23.700 2023-08-01   23.700
173923  8/1/2023  00:00:00       23.700 2023-08-01   23.700
175404  9/1/2023  00:00:00       23.400 2023-09-01   23.400
175405  9/1/2023  00:00:00       23.400 2023-09-01   23.400
> 176067 lines in DF ok
                  UTC  BA_Temp
0 2016-11-14 15:10:00     21.5
1 2016-11-14 15:20:00     21.5
2 2016-11-14 15:30:00     21.5
3 2016-11-14 15:40:00     21.5
4 2016-11-14 15:50:00     21.5
> 359580 lines in U

In [7]:
import pandas as pd
# Another easy way to create the time stamps: date_range directly returns a
# DatetimeIndex with one entry every 10 minutes.
# "10min" is used instead of the alias "10T", which recent pandas versions deprecate.
index = pd.date_range("2016-11-14 15:10:00", periods=359580, freq="10min")
index

DatetimeIndex(['2016-11-14 15:10:00', '2016-11-14 15:20:00',
               '2016-11-14 15:30:00', '2016-11-14 15:40:00',
               '2016-11-14 15:50:00', '2016-11-14 16:00:00',
               '2016-11-14 16:10:00', '2016-11-14 16:20:00',
               '2016-11-14 16:30:00', '2016-11-14 16:40:00',
               ...
               '2023-09-16 15:30:00', '2023-09-16 15:40:00',
               '2023-09-16 15:50:00', '2023-09-16 16:00:00',
               '2023-09-16 16:10:00', '2023-09-16 16:20:00',
               '2023-09-16 16:30:00', '2023-09-16 16:40:00',
               '2023-09-16 16:50:00', '2023-09-16 17:00:00'],
              dtype='datetime64[ns]', length=359580, freq='10T')