### Goal:

Automate data cleaning for remaining four years of period data

In [6]:
import gzip
import os
import io
from pathlib import Path
import pandas as pd
import numpy as np
from datetime import datetime, timezone, timedelta

Cleaning portion involves
- selecting only UTC and period columns
- renaming those columns
- getting rid of half hour data

In [2]:
def unzipPeriod(gz, txt):
    """Decompress a gzip-compressed text file into a plain text file.

    Parameters
    ----------
    gz : str or path-like
        Path of the gzip file to read. Its contents are decoded as UTF-8.
    txt : str or path-like
        Path of the text file to write. An existing file is overwritten.

    Returns
    -------
    None
    """
    # Text mode ('rt') makes gzip do the UTF-8 decoding itself, so no manual
    # TextIOWrapper is needed around the binary stream.
    with gzip.open(gz, 'rt', encoding='utf-8') as source:
        content = source.read()

    # Write with the same encoding that was used for decoding; without an
    # explicit encoding the platform default is used, which can fail or
    # garble non-ASCII characters on systems whose default is not UTF-8.
    with open(txt, 'w', encoding='utf-8') as target:
        target.write(content)

    return

In [3]:
def cleanPeriod(txt, csv, skiprows=0):
    """Clean one whitespace-delimited period text file and save it as a CSV.

    The first and third columns of the input are kept and renamed to
    'UTC' and 'Period'; every second row is then dropped so that only
    hourly records remain.

    Parameters
    ----------
    txt : str or path-like
        Path of the whitespace-delimited text file to read (no header row).
    csv : str or path-like
        Path of the CSV file to write. An existing file is overwritten.
    skiprows : int or list-like, optional
        Passed to ``pd.read_csv``; leading lines to skip before parsing.
        Defaults to 0 (skip nothing), which matches the previous behaviour.
        Use it to skip header lines instead of deleting them by hand.

    Returns
    -------
    None
    """
    # Raw string: '\s' is not a valid Python escape sequence, so the
    # non-raw form triggers an invalid-escape warning on current Python.
    df = pd.read_csv(txt, sep=r'\s+', header=None, skiprows=skiprows)

    # Keep only the timestamp (column 0) and period (column 2) columns.
    df = df.iloc[:, [0, 2]].rename({0: 'UTC', 2: 'Period'}, axis='columns')

    # Only hourly granularity is needed, so keep every second row starting
    # with the first.
    # NOTE(review): this is positional, so it assumes the records strictly
    # alternate between top-of-hour and half-hour with no gaps; confirm
    # against the raw data.
    df = df.iloc[::2]

    df.to_csv(csv, index=False)

    return
    

In [8]:
# Project root: two directory levels above the current working directory.
working_dir = Path.cwd()
root_folder = working_dir.parents[1]


In [5]:
# Decompress the raw archive of every year from 2015 through 2021 into the
# interim folder as a plain text file.
raw_dir = root_folder / 'data' / 'raw'
interim_dir = root_folder / 'data' / 'interim'
for year in range(2015, 2022):
    unzipPeriod(raw_dir / f'pcdip{year}.gz', interim_dir / f'period{year}.txt')

### Had to manually delete top three lines of each txt file to avoid a ```read_csv``` parsing error

In [7]:
# Clean the text file of every year from 2015 through 2021 and store the
# result next to it as a CSV.
interim_dir = root_folder / 'data' / 'interim'
for year in range(2015, 2022):
    cleanPeriod(interim_dir / f'period{year}.txt', interim_dir / f'00-period{year}.csv')

Success!

## New Goal:
append all years together for one dataset

### CSV files cannot store datetime objects, so the UTC column needs to be converted to datetime while reading each file into a dataframe.

In [9]:
def _read_period_csv(path):
    """Read one cleaned yearly CSV and return it with 'UTC' parsed to datetime.

    The ``date_parser`` argument of ``pd.read_csv`` is deprecated, so the
    column is read as plain strings and converted explicitly with
    ``pd.to_datetime``, the same function that was previously passed as
    the parser.
    """
    frame = pd.read_csv(path, dtype={'UTC': str})
    frame['UTC'] = pd.to_datetime(frame['UTC'])
    return frame


# One dataframe per year, 2015 through 2021, in chronological order.
interim_dir = root_folder / 'data' / 'interim'
p15, p16, p17, p18, p19, p20, p21 = (
    _read_period_csv(interim_dir / f'00-period{year}.csv')
    for year in range(2015, 2022)
)

In [10]:
# Stack the yearly frames in chronological order and renumber the rows so the
# combined frame has a fresh 0..n-1 index.
yearly_frames = [p15, p16, p17, p18, p19, p20, p21]
bigp = pd.concat(yearly_frames, ignore_index=True)
bigp.shape

(59099, 2)

definitely missing some hours in April/May of 2016

805 unaccounted for hours

In [11]:
# Check that there are no duplicate timestamps: every distinct UTC value must
# occur exactly once, so the number of such values must equal the row count.
occurrences = bigp['UTC'].value_counts()
seen_once = (occurrences == 1).sum()
seen_once == bigp.shape[0]

True

In [12]:
# Save the combined multi-year data, without the row index.
combined_csv = root_folder / 'data' / 'interim' / '00-period.csv'
bigp.to_csv(combined_csv, index=False)

## Half Hour to Top of Hour
Ran into some merging trouble. It turned out that almost all of the 2021 data was recorded at the half hour instead of at the top of the hour.

In [15]:
# Move every timestamp back to the top of its hour by subtracting its own
# minute component. This replaces the earlier approach, which looked up rows
# with `.iloc` using index labels (only correct on a default 0..n-1 index) and
# hard-coded the two offsets 30 and 23.
# NOTE(review): shifting timestamps can create duplicates if a top-of-hour
# record already exists for the same hour; consider re-running the uniqueness
# check after this step.
minutes_past_hour = pd.to_timedelta(bigp['UTC'].dt.minute, unit='min')
bigp['UTC'] = bigp['UTC'] - minutes_past_hour

# Every row should now report minute 0.
bigp.UTC.dt.minute.value_counts()

0    59099
Name: UTC, dtype: int64

In [16]:
# Save the data with top-of-hour timestamps, without the row index.
hourly_csv = root_folder / 'data' / 'interim' / '01-period.csv'
bigp.to_csv(hourly_csv, index=False)