In [2]:
import gzip
import os
import io
import pandas as pd

In [7]:
file = 'data/pcdip2017.gz'
txt = 'data/period2017.txt'

# Decompress the gzipped download into a plain-text file that pandas can read.
# Text mode ('rt') lets gzip do the UTF-8 decoding itself, and the output file
# names UTF-8 as well, so the copy no longer depends on the platform's default
# encoding.
with gzip.open(file, 'rt', encoding='utf-8') as ip:
    content = ip.read()
with open(txt, 'w', encoding='utf-8') as f:
    f.write(content)

In [3]:
txt = 'data/period2017.txt'
# Read the whitespace-delimited text file. There is no header row, so the
# columns get integer labels. The raw string keeps `\s` as a regex token
# instead of an invalid string escape, which newer Python versions warn about.
df = pd.read_csv(txt, sep=r'\s+', header=None)


There was some trouble parsing the automatically created .txt file, I think because it contained dashes or other characters the parser couldn't handle. So I manually removed the header rows; I will have to refer to the data preview online to restore the column names.

In [4]:
# Keep only the columns labelled 0 and 2 (renamed to UTC and Period further
# down). The file was read with header=None, so labels and positions coincide;
# selecting by label gives the same result as the positional form but does not
# raise an out-of-bounds error if this cell is run a second time.
df = df[[0, 2]]

In [5]:
# Preview the first rows of the trimmed frame.
df.head()

Unnamed: 0,0,2
0,201701010000,11
1,201701010030,11
2,201701010100,11
3,201701010130,11
4,201701010200,11


In [6]:
# Replace the integer column labels with descriptive names.
df = df.rename(columns={0: 'UTC', 2: 'Period'})

In [8]:
# Save the cleaned 2017 frame; index=False leaves the row numbers out of the file.
df.to_csv('data/period2017.csv', index=False)

### Cleaned all five years of period data. Next, check that the number of entries is correct. Since this comes from the same buoy as the swell data, June of 2019 will likely also have some missing entries, but we shall see.

In [12]:
# Load the cleaned yearly period files, one dataframe per year (2017-2021).
period_csvs = (
    'data/period2017.csv',
    'data/period2018.csv',
    'data/period2019.csv',
    'data/period2020.csv',
    'data/period2021.csv',
)
p17, p18, p19, p20, p21 = map(pd.read_csv, period_csvs)

In [13]:
# (rows, columns) of the 2017 frame; 365 * 24 = 8760 rows would be one per hour.
p17.shape

(8760, 2)

In [14]:
# (rows, columns) of the 2018 frame; 365 * 24 = 8760 rows would be one per hour.
p18.shape

(8759, 2)

In [15]:
# (rows, columns) of the 2019 frame; 365 * 24 = 8760 rows would be one per hour.
p19.shape

(8704, 2)

In [16]:
# (rows, columns) of the 2020 frame; a leap year, so 366 * 24 = 8784 rows would be one per hour.
p20.shape

(8784, 2)

In [17]:
# (rows, columns) of the 2021 frame; 365 * 24 = 8760 rows would be one per hour.
p21.shape

(8760, 2)

In [20]:
# Look at the first 2018 rows to see how the UTC values are stored.
p18.head()

Unnamed: 0,UTC,Period
0,201801010000,11
1,201801010100,11
2,201801010200,11
3,201801010300,9
4,201801010400,11


Gonna mess around with ```datetime```: try to convert the UTC column, then check out what's going on in 2018 and 2019. The eventual goal is to get comfy with ```datetime``` and convert all of my time information to this type.

In [22]:
import datetime

In [24]:
# Scratch check that the datetime module works: prints the current local time.
print(datetime.datetime.now())

2022-11-15 09:29:35.339574


In [25]:
# Check which dtype the UTC column has before attempting any conversion.
p18['UTC'].dtype

dtype('int64')

Since the UTC column here is just int64, it first needs to be converted to a string so that it can be parsed, and then put into a format that ```datetime``` understands. See this [explanation](https://stackoverflow.com/questions/52730806/how-to-convert-utc-timestamp-string-to-pandas-datetime).

In [31]:
# The raw UTC values are integers shaped like YYYYMMDDHHMM (e.g. 201801010000),
# so go through a string and parse with an explicit format instead of relying
# on format inference. The dtype guard keeps the cell safe to re-run once the
# column has already been converted.
if not pd.api.types.is_datetime64_any_dtype(p18['UTC']):
    p18['UTC'] = p18['UTC'].astype('string')
    p18['UTC'] = pd.to_datetime(p18['UTC'], format='%Y%m%d%H%M', utc=True)

p18['UTC'].dtype

In [40]:
# Confirm that the UTC column now displays as timezone-aware timestamps.
p18.head()

Unnamed: 0,UTC,Period
0,2018-01-01 00:00:00+00:00,11
1,2018-01-01 01:00:00+00:00,11
2,2018-01-01 02:00:00+00:00,11
3,2018-01-01 03:00:00+00:00,9
4,2018-01-01 04:00:00+00:00,11


Whoooo date time conversion actually so easy

In [48]:
# Move the timestamps into the index so rows can be selected by date.
p18 = p18.set_index(keys='UTC')

This is so that I can use ```.loc``` to slice dataframe by date

In [62]:
# Partial-string indexing selects every row in October 2018; a complete month
# with one row per hour would have 31 * 24 = 744 rows.
p18.loc['2018-10'].shape

(743, 1)

October is missing an hour!

In [67]:
# Narrow the search to a single day; a complete day with one row per hour has 24 rows.
p18.loc['2018-10-4'].shape

(23, 1)

In [68]:
# List that day's rows to spot which timestamp is absent.
p18.loc['2018-10-4']

Unnamed: 0_level_0,Period
UTC,Unnamed: 1_level_1
2018-10-04 00:00:00+00:00,4
2018-10-04 01:00:00+00:00,11
2018-10-04 02:00:00+00:00,11
2018-10-04 03:00:00+00:00,11
2018-10-04 04:00:00+00:00,11
2018-10-04 05:00:00+00:00,11
2018-10-04 06:00:00+00:00,11
2018-10-04 07:00:00+00:00,9
2018-10-04 08:00:00+00:00,11
2018-10-04 09:00:00+00:00,9


No data for 7pm on October 4th.

In [70]:
# Write the 2018 frame to its own file; the UTC index is written as the first
# column because the index is not suppressed here.
p18.to_csv('data/p18.csv')

## New Goal:
append all years together for one dataset

### CSV files store only text, so the UTC column has to be converted back to a datetime type while reading each file into a dataframe.

In [88]:
# Alias kept so that any cell still referring to `date_parser` keeps working;
# it is no longer passed to read_csv.
date_parser = pd.to_datetime


def _read_period(path):
    """Load one yearly period CSV with its UTC column parsed as datetimes.

    Only `parse_dates` is used: the `date_parser` keyword of `read_csv` is
    deprecated as of pandas 2.0, so pandas' built-in date parsing is relied on
    instead of handing `pd.to_datetime` in explicitly.

    Parameters
    ----------
    path : str
        Location of the CSV file; it must contain a `UTC` column.

    Returns
    -------
    pandas.DataFrame
        The file contents with `UTC` parsed as dates.
    """
    return pd.read_csv(path, parse_dates=['UTC'])


p17 = _read_period('data/period2017.csv')
p18 = _read_period('data/period2018.csv')
p19 = _read_period('data/period2019.csv')
p20 = _read_period('data/period2020.csv')
p21 = _read_period('data/period2021.csv')

In [91]:
# Stack the five yearly frames into one, renumbering the rows from 0.
yearly_frames = [p17, p18, p19, p20, p21]
bigp = pd.concat(yearly_frames, ignore_index=True)
bigp.shape

(43767, 2)

Definitely missing some hours compared with the expected total computed below, but it should be alright.

In [79]:
# Expected row count assuming one row per hour for 2017-2021: five 365-day
# years, plus 24 extra hours for the leap day in 2020.
24*365*5+24

43824

In [93]:
# Count how often each timestamp occurs; a count above 1 would mean a
# timestamp appears more than once in the combined frame.
bigp['UTC'].value_counts()

2020-09-21 18:30:00+00:00    1
2018-12-17 02:00:00+00:00    1
2018-09-19 17:00:00+00:00    1
2021-12-14 07:00:00+00:00    1
2021-10-30 21:00:00+00:00    1
                            ..
2017-09-01 20:00:00+00:00    1
2020-04-01 13:30:00+00:00    1
2019-09-10 13:00:00+00:00    1
2017-05-29 13:00:00+00:00    1
2019-01-15 12:00:00+00:00    1
Name: UTC, Length: 43767, dtype: int64