# Preprocess Weather data


In [24]:
# Imports
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import numpy as np
import pandas as pd
import seaborn as sns
import glob
import os
import time
from zipfile import ZipFile
%matplotlib inline
from pandas.api.types import CategoricalDtype
import warnings
warnings.filterwarnings("ignore")


## Step 1: Import data

In [26]:
# Unpack each downloaded weather archive into its own folder under ../raw_data.
# Pairs are (zip archive, extraction directory); they are processed in order:
# rainfall, solar, temperature.
raw_archives = (
    ('../raw_data/rainfall-all-years.zip', '../raw_data/rainfall'),
    ('../raw_data/solar-all-years.zip', '../raw_data/solar'),
    ('../raw_data/temperature-all-years.zip', '../raw_data/temperature'),
)
for archive_path, target_dir in raw_archives:
    # The context manager closes the archive even if extraction fails.
    with ZipFile(archive_path, 'r') as archive:
        archive.extractall(target_dir)

In [27]:
# Read the three extracted CSV files into DataFrames, in the order
# rainfall, solar, temperature.
rainfall_raw, solar_raw, temp_raw = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../raw_data/rainfall/IDCJAC0009_086338_1800_Data.csv",
        "../raw_data/solar/IDCJAC0016_086338_1800_Data.csv",
        "../raw_data/temperature/IDCJAC0010_086338_1800_Data.csv",
    )
)

## Step 2: Dimension Reduction 

In [28]:
# Inclusive date windows (ISO date strings) used to split each weather table.
start_date_2021 = '2021-01-01'
end_date_2021 = '2021-12-31'
start_date_2022 = '2022-01-01'
end_date_2022 = '2022-05-31'


def date_filter(df):
    """Split a table with Year/Month/Day columns into 2021 and 2022 subsets.

    The three date-part columns are combined into a single datetime ``Date``
    column (appended after the remaining columns) and then removed. The
    input frame itself is not modified, so the function can safely be called
    again on the same data (e.g. when a notebook cell is re-run).

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing ``Year``, ``Month`` and ``Day`` columns.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(df_21, df_22)``: rows dated within ``start_date_2021`` ..
        ``end_date_2021`` and within ``start_date_2022`` .. ``end_date_2022``
        respectively, both bounds inclusive. Original row index is kept.

    Raises
    ------
    KeyError
        If any of the ``Year``, ``Month`` or ``Day`` columns is missing.
    """
    date_parts = ['Year', 'Month', 'Day']
    # Assemble the dates directly from the three columns; background:
    # https://stackoverflow.com/questions/58072683/combine-year-month-and-day-in-python-to-create-a-date
    dates = pd.to_datetime(df[date_parts])
    # drop() returns a new frame, which keeps the caller's table untouched.
    dated = df.drop(columns=date_parts)
    dated['Date'] = dates

    # find all 2021 data
    mask_21 = (dated['Date'] >= start_date_2021) & (dated['Date'] <= end_date_2021)
    df_21 = dated.loc[mask_21]
    # find all 2022 data
    mask_22 = (dated['Date'] >= start_date_2022) & (dated['Date'] <= end_date_2022)
    df_22 = dated.loc[mask_22]
    return df_21, df_22

In [29]:
# Split every raw table into its 2021 and 2022 parts; the tables are
# processed in the order rainfall, solar, temperature.
(rain_21, rain_22), (solar_21, solar_22), (temp_21, temp_22) = (
    date_filter(raw_table)
    for raw_table in (rainfall_raw, solar_raw, temp_raw)
)

## Step 3: Save to disk

In [30]:
# Make sure the output folder exists first: to_csv does not create missing
# directories and would otherwise fail on a fresh checkout.
os.makedirs('../processed_data', exist_ok=True)

# Write each yearly subset to its own CSV file. header=True writes the column
# names; the row index is written as well (to_csv default).
processed_outputs = (
    (rain_21, '../processed_data/rain_21.csv'),
    (rain_22, '../processed_data/rain_22.csv'),
    (solar_21, '../processed_data/solar_21.csv'),
    (solar_22, '../processed_data/solar_22.csv'),
    (temp_21, '../processed_data/temp_21.csv'),
    (temp_22, '../processed_data/temp_22.csv'),
)
for frame, csv_path in processed_outputs:
    frame.to_csv(csv_path, header=True)