## Global Historical Climatology Network Dataset
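Variables are stored in both rows and columns: the days of the month are spread across the columns `d1`–`d31`, while the `element` column holds the names of two variables (`tmax` and `tmin`) as row values.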
This dataset represents the daily weather records for a weather station (MX17004) in Mexico for five months in 2010.

In [1]:
import pandas as pd;
from datetime import datetime;

In [2]:
# Read the raw weather CSV into a dataframe and preview its first rows
raw_csv_path = './data/weather-raw.csv'
weather_raw = pd.read_csv(raw_csv_path)
weather_raw.head()

Unnamed: 0,id,year,month,element,d1,d2,d3,d4,d5,d6,...,d22,d23,d24,d25,d26,d27,d28,d29,d30,d31
0,MX17004,2010,1,tmax,,,,,,,...,,,,,,,,,27.8,
1,MX17004,2010,1,tmin,,,,,,,...,,,,,,,,,14.5,
2,MX17004,2010,2,tmax,,27.3,24.1,,,,...,,29.9,,,,,,,,
3,MX17004,2010,2,tmin,,14.4,14.4,,,,...,,10.7,,,,,,,,
4,MX17004,2010,3,tmax,,,,,32.1,,...,,,,,,,,,,


In [3]:
# Relabel the day columns: keep the first four column names as they are and
# replace the remaining 31 labels with the day numbers '1'..'31' (as strings)
days = list(map(str, range(1, 32)))
leading_names = list(weather_raw.columns[:4])
weather_raw.columns = leading_names + days
weather_raw.head()

Unnamed: 0,id,year,month,element,1,2,3,4,5,6,...,22,23,24,25,26,27,28,29,30,31
0,MX17004,2010,1,tmax,,,,,,,...,,,,,,,,,27.8,
1,MX17004,2010,1,tmin,,,,,,,...,,,,,,,,,14.5,
2,MX17004,2010,2,tmax,,27.3,24.1,,,,...,,29.9,,,,,,,,
3,MX17004,2010,2,tmin,,14.4,14.4,,,,...,,10.7,,,,,,,,
4,MX17004,2010,3,tmax,,,,,32.1,,...,,,,,,,,,,


In [4]:
# Unpivot the day columns ('1'..'31') into two columns:
# 'day' holds the former column label, 'degrees' holds the cell value
id_columns = ['id', 'year', 'month', 'element']
weather_melt = pd.melt(
    weather_raw,
    id_vars=id_columns,
    var_name='day',
    value_name="degrees",
)
weather_melt.head()

Unnamed: 0,id,year,month,element,day,degrees
0,MX17004,2010,1,tmax,1,
1,MX17004,2010,1,tmin,1,
2,MX17004,2010,2,tmax,1,
3,MX17004,2010,2,tmin,1,
4,MX17004,2010,3,tmax,1,


In [5]:
# Count the missing (NaN) values in every column
weather_melt.isna().sum()

id           0
year         0
month        0
element      0
day          0
degrees    616
dtype: int64

In [6]:
# Rows with a missing value carry no temperature reading, so discard them,
# renumber the index from 0, and confirm that no NaN values remain
weather_melt = weather_melt.dropna().reset_index(drop=True)
weather_melt.isnull().sum()

id         0
year       0
month      0
element    0
day        0
degrees    0
dtype: int64

In [7]:
# Check the column data types.
# As the output below shows, 'year' and 'month' are int64 while 'day' is object:
# its values are the former column labels produced by the melt step, which are strings.
weather_melt.dtypes

id          object
year         int64
month        int64
element     object
day         object
degrees    float64
dtype: object

In [8]:
# Cast 'year' and 'month' to strings so they can be combined with 'day'
# when building the datetime 'date' column
weather_melt = weather_melt.astype({'year': str, 'month': str})

In [9]:
# Create new column 'date' and drop columns 'year', 'month' and 'day'
# pd.to_datetime assembles the dates from the 'year'/'month'/'day' columns in a single
# vectorized call, replacing the slower row-by-row datetime.strptime.
# The components are cast to int first so this works whether they are stored as
# strings or as integers.
date_parts = weather_melt[['year', 'month', 'day']].astype(int)
weather_melt['date'] = pd.to_datetime(date_parts)
weather_melt = weather_melt.drop(columns=['year', 'month', 'day'])
weather_melt.head()

Unnamed: 0,id,element,degrees,date
0,MX17004,tmax,29.9,2010-12-01
1,MX17004,tmin,13.8,2010-12-01
2,MX17004,tmax,27.3,2010-02-02
3,MX17004,tmin,14.4,2010-02-02
4,MX17004,tmax,31.3,2010-11-02


In [10]:
# Pivot the dataframe to turn the 'element' column into 2 columns (tmax, tmin)
# Passing values='degrees' keeps the resulting columns flat, so every new column is
# labelled by its own element value rather than by a hard-coded list whose order
# has to match the (alphabetical) order produced by the pivot.
# Note: pivot_table aggregates with the mean by default, so duplicate
# (id, date, element) readings would be averaged into one value.
weather_df = weather_melt.pivot_table(index=['id', 'date'], columns='element', values='degrees')
weather_df = weather_df.reset_index()
weather_df.columns.name = None  # remove the leftover 'element' label from the column axis

In [11]:
# Display the cleaned and sorted dataframe
# sort_values returns a new dataframe instead of sorting in place, so the result
# must be assigned back; the index is renumbered to follow the sorted order.
weather_df = weather_df.sort_values(by='date').reset_index(drop=True)
weather_df.head()

Unnamed: 0,id,date,tmax,tmin
0,MX17004,2010-01-30,27.8,14.5
1,MX17004,2010-02-02,27.3,14.4
2,MX17004,2010-02-03,24.1,14.4
3,MX17004,2010-02-11,29.7,13.4
4,MX17004,2010-02-23,29.9,10.7
