# Preliminary Exploratory Data Analysis 2: Weather Data

In [144]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## Preprocessing

### Load Data

In [145]:
# Load the station metadata table from the local CSV
# (presumably the GHCN-Daily station inventory, judging by the file name — confirm).
stations = pd.read_csv('data/ghcnd-stations.csv')

In [146]:
# Show the stations located in California whose name begins with
# "LOS ANGELES" (str.match anchors the pattern at the start of the string).
stations.loc[
    (stations['LOC_ID'] == 'CA')
    & (stations['NAME'].str.match('LOS ANGELES'))
]

Unnamed: 0,ID,LATITUDE,LONGITUDE,VAL1,LOC_ID,NAME,TYPE,VAL2
56601,US1CALA0030,34.1712,-118.4353,205.7,CA,LOS ANGELES 4.3 NNW,,
56621,US1CALA0064,34.0809,-118.2729,123.4,CA,LOS ANGELES 2.6 NW,,
96356,USC00045111,34.05,-118.25,125.0,CA,LOS ANGELES 6TH MAIN,,
96357,USC00045112,34.0667,-118.2333,85.0,CA,LOS ANGELES TERMINAL A,,
120337,USW00023174,33.9381,-118.3889,29.6,CA,LOS ANGELES INTL AP,,72295.0
120927,USW00093134,34.0511,-118.2353,70.1,CA,LOS ANGELES DWTN USC CAMPUS,,


Best single weather station for Los Angeles (ID `USW00093134`), used for the rest of this notebook:

| ID | LATITUDE | LONGITUDE | VAL1 | LOC_ID | NAME | TYPE | VAL2 |
|----|----------|-----------|------|--------|------|------|------|
| USW00093134 | 34.0511 | -118.2353 | 70.1 | CA | LOS ANGELES DWTN USC CAMPUS | | |


In [147]:
# Load the observation file for station USW00093134 (the station picked above).
# NOTE(review): engine='python' selects pandas' pure-Python parser; why it is
# needed for this file is not evident from the notebook — confirm or drop.
weather = pd.read_csv('data/USW00093134.csv', engine='python')

In [148]:
def get_date(row):
    """Reformat a row's ``DATE`` value as a dash-separated date string.

    The value is converted to ``str`` and split positionally: the first four
    characters, the next two, and whatever remains are joined with ``'-'``.
    For a compact ``YYYYMMDD`` value this yields ``YYYY-MM-DD``.

    Parameters
    ----------
    row : mapping-like
        Anything supporting ``row['DATE']`` (e.g. a DataFrame row or a dict).

    Returns
    -------
    str
        The three slices joined by dashes.
    """
    raw = str(row['DATE'])
    return '-'.join((raw[:4], raw[4:6], raw[6:]))

# Parse the compact YYYYMMDD values in DATE into real timestamps in one
# vectorized call. An explicit format avoids per-row Python string slicing
# (DataFrame.apply with axis=1 is slow on long frames) and format guessing,
# and no temporary S_DATE column is needed.
weather['DATE'] = pd.to_datetime(weather['DATE'].astype(str), format='%Y%m%d')

# Keep only the columns used in the rest of the analysis.
weather = weather[['DATE', 'ELEMENT', 'DATA_VALUE']]

In [149]:
# List the distinct ELEMENT codes present in the file to see which
# measurements are available for this station.
weather['ELEMENT'].unique()

array(['TMAX', 'TMIN', 'PRCP', 'WT16', 'WT14', 'SNOW', 'SNWD', 'WT05',
       'WT08', 'WT18', 'WT01', 'WT07', 'WT03', 'TOBS', 'WSFG', 'WT04',
       'PGTM', 'WDFG', 'ACSH', 'PSUN', 'TSUN', 'WDFM', 'WSFM', 'WESD',
       'TAVG', 'AWND', 'FMTM', 'WDF2', 'WDF5', 'WSF2', 'WSF5', 'WT13',
       'WT02', 'WT19', 'ADPT', 'ASLP', 'ASTP', 'AWBT', 'RHAV', 'RHMN',
       'RHMX', 'WT09'], dtype=object)

In [150]:
# Select the three elements of interest: TMAX, TMIN and PRCP.
mask = (
    (weather['ELEMENT'] == 'TMAX')
    | (weather['ELEMENT'] == 'TMIN')
    | (weather['ELEMENT'] == 'PRCP')
)

# Reshape from long (one row per date/element) to wide (one row per date,
# one column per element), drop dates missing any of the three values, and
# turn DATE back into an ordinary column.
pivot = (
    weather[mask]
    .pivot_table(index='DATE', columns='ELEMENT', values='DATA_VALUE')
    .dropna()
    .reset_index()
)

In [151]:
# Quick sanity check: first rows of the pivoted table, as plain text.
print(pivot.head())

ELEMENT       DATE   PRCP   TMAX   TMIN
0       1906-04-04   76.0  178.0  111.0
1       1906-04-05  152.0  189.0  100.0
2       1906-04-23   13.0  189.0  128.0
3       1906-04-27   64.0  161.0  100.0
4       1906-05-25  114.0  161.0  128.0


In [152]:
# Persist the daily PRCP/TMAX/TMIN table; index=False keeps the positional
# row index out of the file.
pivot.to_csv('data/la_weather.csv', index=False)

In [153]:
# Reload the cleaned table written above. parse_dates restores DATE as
# datetime64 — a CSV round-trip otherwise brings it back as plain strings,
# which matplotlib would plot as thousands of categorical tick labels
# instead of a time axis.
weather = pd.read_csv('data/la_weather.csv', parse_dates=['DATE'])

In [154]:
# Display the reloaded frame (pandas abbreviates long frames when rendering).
weather

Unnamed: 0,DATE,PRCP,TMAX,TMIN
0,1906-04-04,76.0,178.0,111.0
1,1906-04-05,152.0,189.0,100.0
2,1906-04-23,13.0,189.0,128.0
3,1906-04-27,64.0,161.0,100.0
4,1906-05-25,114.0,161.0,128.0
...,...,...,...,...
38999,2022-08-08,0.0,317.0,194.0
39000,2022-08-09,0.0,333.0,217.0
39001,2022-08-10,0.0,333.0,206.0
39002,2022-08-11,0.0,311.0,194.0


In [157]:
# Boolean mask selecting observations dated 2000-01-01 or later.
# If DATE is a string column (as it is when the CSV is read without date
# parsing), this is a lexicographic comparison, which agrees with
# chronological order for ISO YYYY-MM-DD strings.
mask = weather['DATE'] >= '2000-01-01'

In [159]:
# Keep only the rows selected by `mask` (computed in the previous cell).
# Note: this overwrites `weather`, so the full history is only recoverable
# by re-reading the CSV.
weather = weather[mask]

In [None]:
# Plot each series on its own figure.
#
# The figure is created inside the loop so that every plot gets the 12x8
# size: a single plt.figure() before the loop only applies to the first
# plot, because plt.show() finishes that figure and later plt.plot() calls
# fall back to a new default-sized one.
#
# pd.to_datetime guarantees a real time axis even if DATE is still a string
# column (it is a no-op when DATE is already datetime64).
dates = pd.to_datetime(weather['DATE'])

for col in ['TMAX', 'TMIN', 'PRCP']:
    fig, ax = plt.subplots(figsize=(12, 8))
    ax.plot(dates, weather[col], label=col)
    ax.set_title('Weather in Los Angeles: {}'.format(col))
    ax.set_xlabel('Date')
    # Values are plotted exactly as stored in the file.
    # NOTE(review): the magnitudes (e.g. TMAX of 178-333) suggest tenths of
    # a unit — confirm against the data source before adding unit labels.
    ax.set_ylabel(col)
    ax.legend()
    plt.show()