In [2]:
# importing necessary modules

import pandas as pd
import numpy as np

import os
# Walk the 'dataset' directory tree and print the path of every file found.
for root, _subdirs, files in os.walk('dataset'):
    for name in files:
        file_path = os.path.join(root, name)
        print(file_path)

dataset\csv1.csv
dataset\csv2.csv


In [3]:
def merge_csv(path_to_csv1: str, path_to_csv2: str) -> pd.DataFrame:
    """Read two CSV files and combine their columns into one DataFrame.

    Every column of the first file is kept; columns that exist only in the
    second file are appended. When a column name appears in both files, the
    values from the first file are used. The result is then reordered so that
    'DY', 'MO', 'YEAR', 'LAT', 'LON' come first, followed by the remaining
    columns in the order they were encountered (first file, then second file).

    Parameters
    ----------
    path_to_csv1 : str
        Path of the first CSV file.
    path_to_csv2 : str
        Path of the second CSV file.

    Returns
    -------
    pd.DataFrame
        The merged frame with the leading columns described above.

    Raises
    ------
    ValueError
        If any of 'DY', 'MO', 'YEAR', 'LAT', 'LON' is absent from both files.
    """
    data1 = pd.read_csv(path_to_csv1)
    data2 = pd.read_csv(path_to_csv2)

    # Ordered union of the column labels. The `|` operator on an Index is no
    # longer a set union in pandas >= 2.0 (it is an element-wise logical OR),
    # and going through `set` made the column order vary between runs.
    columns = list(data1.columns) + [
        col for col in data2.columns if col not in data1.columns
    ]

    # Prefer the first file's values whenever both files carry the column.
    # Building the frame from Series aligns the rows on their index.
    d = {col: data1[col] if col in data1.columns else data2[col] for col in columns}
    merged_data = pd.DataFrame(data=d)

    # Rearranging columns: date/location keys first, everything else after.
    new_order = ['DY', 'MO', 'YEAR', 'LAT', 'LON']
    missing = [col for col in new_order if col not in merged_data.columns]
    if missing:
        raise ValueError(f"merged data is missing required column(s): {missing}")

    remaining = [col for col in merged_data.columns if col not in new_order]
    return merged_data[new_order + remaining]

def organize_data(merged_data : pd.DataFrame):
    """Return a cleaned copy of ``merged_data`` with missing values as NaN.

    The input frame is not modified. If a 'KT' (Insolation Clearness Index)
    column is present it is converted to float; the padded text marker used
    for missing entries in that column is first mapped to '-999' so that it
    is treated like every other missing value. Finally, every -999.0 in the
    frame (the marker for data that could not be computed or is out of the
    model's availability range) is replaced with ``np.nan`` so that pandas'
    built-in missing-data functions can be used.
    """
    cleaned = merged_data.copy()

    if 'KT' in merged_data:
        # 'KT' can mark missing values in two ways: a padded 'nan' string and
        # -999.0. Normalise the former to the latter before the float cast.
        kt_normalised = cleaned['KT'].replace('               nan', '-999')
        cleaned['KT'] = kt_normalised.astype(float)

    # Turn the -999.0 sentinel into a real NaN across all columns.
    return cleaned.replace(-999.0, np.nan)

In [5]:
# Build the paths with os.path.join instead of a hard-coded backslash:
# "dataset\csv1.csv" contains the invalid escape sequence "\c" (a warning on
# current Python versions) and only resolves on Windows.
csv1_path = os.path.join("dataset", "csv1.csv")
csv2_path = os.path.join("dataset", "csv2.csv")

# Merge both files, then convert the missing-value markers to NaN.
df = organize_data(merge_csv(csv1_path, csv2_path))
df.head()

Unnamed: 0,DY,MO,YEAR,LAT,LON,WS50M_MAX,T2M_MAX,WS50M,PRECTOT,T2M_MIN,...,WS10M,TS,RH2M,WS50M_MIN,CLRSKY_SFC_SW_DWN,WS10M_MAX,T2MDEW,T2MWET,ALLSKY_SFC_SW_DWN,ALLSKY_SFC_LW_DWN
0,1,1,2015,39.75,32.75,6.27,4.72,3.52,10.26,-1.51,...,2.56,2.09,93.11,2.14,,3.57,0.83,0.84,1.55,7.06
1,2,1,2015,39.75,32.75,5.81,2.93,4.07,0.32,-4.29,...,2.85,-1.7,85.43,2.51,,4.11,-3.52,-3.51,1.21,6.97
2,3,1,2015,39.75,32.75,4.69,3.41,2.88,0.01,-4.57,...,2.08,-2.49,88.15,0.57,,3.94,-3.57,-3.56,2.05,6.45
3,4,1,2015,39.75,32.75,8.46,2.5,5.77,0.73,-4.27,...,4.16,-1.54,81.68,3.01,,5.63,-4.2,-4.18,2.34,6.46
4,5,1,2015,39.75,32.75,8.06,2.51,4.76,1.78,-3.45,...,3.46,-0.85,80.07,2.93,,6.46,-4.08,-4.06,1.42,6.81


In [9]:
# 1 when any precipitation was recorded that day, 0 otherwise.
# NOTE(review): rows where PRECTOT is NaN compare as False and therefore
# become 0 here — confirm that treating missing data as "no rain" is intended.
df["PRECTOT_TODAY"] = (df["PRECTOT"] > 0).astype(int)

# `prectot_today` was not assigned in any visible cell, so this cell raised a
# NameError on a fresh kernel. Binding it to the column created above is an
# assumption about the original intent — TODO confirm.
prectot_today = df["PRECTOT_TODAY"].to_numpy()

# Integer array of zeros with the same shape as today's flags.
prectot_tomorrow = np.zeros(prectot_today.shape, dtype=int)