# Pandas unstack and merge

In [None]:
%matplotlib inline
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import mplleaflet


In [None]:
# Shell escape: list every path under the parent directory and keep those whose
# name contains the data file's identifier (case-insensitive match).
!find .. | grep -i fb441e62df2d58994928907a91895ec62c2c42e6cd075c2700843b89

In [None]:
# Relative path of the CSV file loaded by the import cell below.
file = "../_data/fb441e62df2d58994928907a91895ec62c2c42e6cd075c2700843b89.csv"

### Import data

In [None]:
# Load the raw observations from the CSV path held in `file`.
df_GHCN = pd.read_csv(filepath_or_buffer=file)
# Display five randomly chosen rows as a quick look at the data.
df_GHCN.sample(n=5)

### Method 1: Convert long DF to wide DF with groupby/unstack; TMIN and TMAX to columns

In [None]:
# Collapse repeated (Element, ID, Date) readings to a single value per group.
# A dict literal cannot hold two entries for 'Data_Value' (the later entry
# silently replaces the earlier one), so compute both extremes explicitly and
# keep the maximum for TMAX groups and the minimum for every other element.
extremes = df_GHCN.groupby(['Element', 'ID', 'Date'])['Data_Value'].agg(['min', 'max'])
is_tmax = extremes.index.get_level_values('Element') == 'TMAX'
df = pd.DataFrame(
    {'Data_Value': np.where(is_tmax, extremes['max'], extremes['min'])},
    index=extremes.index,
)
# Move the Element index level into the columns (long -> wide), then turn the
# remaining ID/Date index levels back into ordinary columns.
df_unstack = df.unstack(level=0).reset_index()
df_unstack.info()

In [None]:
# Replace the column labels with four plain names, in positional order.
df_unstack.columns = 'ID Date TMAX TMIN'.split()

In [None]:
# Print the column/dtype/non-null summary, then display five random rows.
df_unstack.info()
df_unstack.sample(n=5)

### Method 2: Split by Element and merge

#### Create separate dataframes for each Element

In [None]:
# One frame per measurement type, selected by the value in the Element column.
df_max = df_GHCN.loc[df_GHCN['Element'].eq('TMAX')]
df_min = df_GHCN.loc[df_GHCN['Element'].eq('TMIN')]

#### Merge separate dataframes

In [None]:
# Keep only the join keys and the reading from each frame, and name the reading
# after its element before merging. Renaming by column name (rather than
# overwriting every label positionally after the merge) keeps TMIN and TMAX
# correctly labelled whatever the column order of the source frame is.
merge_keys = ['ID', 'Date']
tmin_values = df_min[merge_keys + ['Data_Value']].rename(columns={'Data_Value': 'TMIN'})
tmax_values = df_max[merge_keys + ['Data_Value']].rename(columns={'Data_Value': 'TMAX'})
# Outer join: ID/Date pairs that have only one of the two readings are kept,
# with the missing reading left as NaN.
df_merge = pd.merge(tmin_values, tmax_values, how='outer', on=merge_keys)
df_merge.sample(10)

### Check similarity

In [None]:
# Compare the two wide frames. The checks are collected in one dict so that all
# of them are shown: a notebook cell only displays its last expression.
station_id = 'USW00014833'
similarity_checks = {
    'same shape': df_unstack.shape == df_merge.shape,
    # Compare the labels as sets; `Index in ndarray` would only report whether
    # any position happens to match.
    'same column names': set(df_unstack.columns) == set(df_merge.columns),
    # Vectorised count of the rows whose ID equals the chosen station id.
    'same row count for ' + station_id: bool(
        (df_unstack['ID'] == station_id).sum() == (df_merge['ID'] == station_id).sum()
    ),
}
similarity_checks

In [None]:
# Rebind df_GHCN to the wide frame. This is plain name assignment, so df_GHCN
# and df_unstack refer to the same object (no copy is made).
df_GHCN = df_unstack

### Convert to datetime

In [None]:
# Parse the Date column into datetime values. Assign with `df[col] = ...` so the
# column is replaced outright: since pandas 2.0, `df.loc[:, col] = ...` writes
# into the existing column and keeps its current dtype (presumably object, as
# read from the CSV), which leaves the `.dt` accessor unusable afterwards.
df_GHCN['Date'] = pd.to_datetime(df_GHCN['Date'])

In [None]:
# Day-of-year, used to line the same calendar day up across years. In a leap
# year every date after Feb 29 is numbered one higher than in a common year
# (Mar 1 is 61 instead of 60), and dropping the Feb 29 rows afterwards does not
# undo that shift. Subtract one from those dates so all years share a 1-365
# scale. Feb 29 itself keeps the value 60; those rows are removed by the
# leap-day filter that follows.
date_parts = df_GHCN['Date'].dt
after_leap_day = date_parts.is_leap_year & (date_parts.month > 2)
df_GHCN['Day_of_year'] = date_parts.dayofyear - after_leap_day.astype(int)

### Remove leap days

In [None]:
# Flag every row dated Feb 29, then keep all the rows that are not flagged.
date_fields = df_GHCN['Date'].dt
leap = (date_fields.month == 2) & (date_fields.day == 29)
df_GHCN = df_GHCN.loc[~leap]

### Split years

In [None]:
# Partition on calendar year: rows dated before 2015 go to df_2005,
# rows dated in 2015 go to df_2015.
obs_year = df_GHCN['Date'].dt.year
df_2005 = df_GHCN[obs_year.lt(2015)]
df_2015 = df_GHCN[obs_year.eq(2015)]

In [None]:
# Display the first five rows of the pre-2015 frame (five is head()'s default).
df_2005.head()

### Daily records (< 2015)

In [None]:
# Per day-of-year extremes over the pre-2015 rows: lowest TMIN and highest TMAX.
# The aggregations are given by name; passing the Python builtins `min`/`max`
# is deprecated in recent pandas and emits a FutureWarning.
daily_records = df_2005.groupby(['Day_of_year']).agg({'TMIN': 'min', 'TMAX': 'max'})
daily_records.head()

### Merge daily records with 2015

In [None]:
# Give the record columns their final names before merging, so there is no
# TMIN/TMAX name clash (and no _x/_y suffixes) and no need to overwrite all
# seven labels by position afterwards.
record_columns = daily_records.rename(columns={'TMIN': 'Record Min', 'TMAX': 'Record Max'})
# Left join keeps every 2015 row and attaches the records for its day-of-year
# (daily_records is indexed by Day_of_year).
# NOTE: the result is stored under the existing name df_2005, replacing the
# pre-2015 frame; from here on df_2005 holds the 2015 rows plus the records.
df_2005 = pd.merge(df_2015, record_columns, how='left', left_on='Day_of_year', right_index=True)
df_2005.head()