#### Cleaning Weather Dataset

In this notebook, we will clean the weather dataset.

In [1]:
# import libraries

# maths
import numpy as np
import pandas as pd

# others
import os
import re
import time
import datetime as datetime

### Functions

Similar to the train and test dataset, we split the data feature into year, month and day columns. Based on the first few rows of the data, it appears that there are some missing values in the dataset, labelled as M, - and T. Hence, we also created functions to summarise these 'null' values.

In [2]:
# split dates

def create_yr(x): 
    return x.split('-')[0] 

def create_mth(x): 
    return x.split('-')[1] 

def create_day(x): 
    return x.split('-')[2] 

def rename_columns (columns):
    return [column.lower() for column in columns]

def clean_date(df): 
    df.columns = rename_columns(df.columns)
    df['year'] = df.date.apply(create_yr)
    df['month'] = df.date.apply(create_mth)
    df['day'] = df.date.apply(create_day)
    df.drop('date', axis=1, inplace = True)
    return df

In [3]:
def count_t(x): 
    if x == '  T':
        return 1
    else:
        return 0
    
def count_m(x): 
    if x == 'M':
        return 1
    else:
        return 0

def count_dash(x): 
    if x == '-':
        return 1
    else:
        return 0

In [4]:
# count total number of M - T in df

def print_summary(df):    

    cols = ['column','M','-','T']
    df_summary = pd.DataFrame(columns=cols)
    idx = 0

    for col in df.columns:

        if df[col].dtype == 'object':

            total_m = df[col].apply(count_m).sum()
            total_dash = df[col].apply(count_dash).sum()
            total_t = df[col].apply(count_t).sum()

            df_summary.at[idx,cols[0]] = col
            df_summary.at[idx,cols[1]] = total_m
            df_summary.at[idx,cols[2]] = total_dash
            df_summary.at[idx,cols[3]] = total_t

        idx += 1
    
    return df_summary

#### Import WeatherDataset

In [5]:
weather = pd.read_csv('./datasets/weather.csv')

### Inspect Data

The weather dataset has 2944 rows and 22 features. Some of the columns' datatypes are listed as object as they contain missing values. We will need to rectify those later.

In [6]:
weather.dtypes

Station          int64
Date            object
Tmax             int64
Tmin             int64
Tavg            object
Depart          object
DewPoint         int64
WetBulb         object
Heat            object
Cool            object
Sunrise         object
Sunset          object
CodeSum         object
Depth           object
Water1          object
SnowFall        object
PrecipTotal     object
StnPressure     object
SeaLevel        object
ResultSpeed    float64
ResultDir        int64
AvgSpeed        object
dtype: object

In [7]:
weather.isnull().sum()

Station        0
Date           0
Tmax           0
Tmin           0
Tavg           0
Depart         0
DewPoint       0
WetBulb        0
Heat           0
Cool           0
Sunrise        0
Sunset         0
CodeSum        0
Depth          0
Water1         0
SnowFall       0
PrecipTotal    0
StnPressure    0
SeaLevel       0
ResultSpeed    0
ResultDir      0
AvgSpeed       0
dtype: int64

In [8]:
# print first 5 records

print(weather.shape)
weather

(2944, 22)


Unnamed: 0,Station,Date,Tmax,Tmin,Tavg,Depart,DewPoint,WetBulb,Heat,Cool,...,CodeSum,Depth,Water1,SnowFall,PrecipTotal,StnPressure,SeaLevel,ResultSpeed,ResultDir,AvgSpeed
0,1,2007-05-01,83,50,67,14,51,56,0,2,...,,0,M,0.0,0.00,29.10,29.82,1.7,27,9.2
1,2,2007-05-01,84,52,68,M,51,57,0,3,...,,M,M,M,0.00,29.18,29.82,2.7,25,9.6
2,1,2007-05-02,59,42,51,-3,42,47,14,0,...,BR,0,M,0.0,0.00,29.38,30.09,13.0,4,13.4
3,2,2007-05-02,60,43,52,M,42,47,13,0,...,BR HZ,M,M,M,0.00,29.44,30.08,13.3,2,13.4
4,1,2007-05-03,66,46,56,2,40,48,9,0,...,,0,M,0.0,0.00,29.39,30.12,11.7,7,11.9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2939,2,2014-10-29,49,40,45,M,34,42,20,0,...,,M,M,M,0.00,29.42,30.07,8.5,29,9.0
2940,1,2014-10-30,51,32,42,-4,34,40,23,0,...,,0,M,0.0,0.00,29.34,30.09,5.1,24,5.5
2941,2,2014-10-30,53,37,45,M,35,42,20,0,...,RA,M,M,M,T,29.41,30.10,5.9,23,6.5
2942,1,2014-10-31,47,33,40,-6,25,33,25,0,...,RA SN,0,M,0.1,0.03,29.49,30.20,22.6,34,22.9


In [9]:
# list all columns

print(weather.columns)

Index(['Station', 'Date', 'Tmax', 'Tmin', 'Tavg', 'Depart', 'DewPoint',
       'WetBulb', 'Heat', 'Cool', 'Sunrise', 'Sunset', 'CodeSum', 'Depth',
       'Water1', 'SnowFall', 'PrecipTotal', 'StnPressure', 'SeaLevel',
       'ResultSpeed', 'ResultDir', 'AvgSpeed'],
      dtype='object')


In [10]:
#There are some unrelevant values happend in weather dataset - l.e. M, T

In [11]:
# weather summary

weather.describe()

Unnamed: 0,Station,Tmax,Tmin,DewPoint,ResultSpeed,ResultDir
count,2944.0,2944.0,2944.0,2944.0,2944.0,2944.0
mean,1.5,76.166101,57.810462,53.45788,6.960666,17.494905
std,0.500085,11.46197,10.381939,10.675181,3.587527,10.063609
min,1.0,41.0,29.0,22.0,0.1,1.0
25%,1.0,69.0,50.0,46.0,4.3,7.0
50%,1.5,78.0,59.0,54.0,6.4,19.0
75%,2.0,85.0,66.0,62.0,9.2,25.0
max,2.0,104.0,83.0,75.0,24.1,36.0


In [13]:
#weather = clean_date(weather)
weather

Unnamed: 0,station,tmax,tmin,tavg,depart,dewpoint,wetbulb,heat,cool,sunrise,...,snowfall,preciptotal,stnpressure,sealevel,resultspeed,resultdir,avgspeed,year,month,day
0,1,83,50,67,14,51,56,0,2,0448,...,0.0,0.00,29.10,29.82,1.7,27,9.2,2007,05,01
1,2,84,52,68,M,51,57,0,3,-,...,M,0.00,29.18,29.82,2.7,25,9.6,2007,05,01
2,1,59,42,51,-3,42,47,14,0,0447,...,0.0,0.00,29.38,30.09,13.0,4,13.4,2007,05,02
3,2,60,43,52,M,42,47,13,0,-,...,M,0.00,29.44,30.08,13.3,2,13.4,2007,05,02
4,1,66,46,56,2,40,48,9,0,0446,...,0.0,0.00,29.39,30.12,11.7,7,11.9,2007,05,03
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2939,2,49,40,45,M,34,42,20,0,-,...,M,0.00,29.42,30.07,8.5,29,9.0,2014,10,29
2940,1,51,32,42,-4,34,40,23,0,0622,...,0.0,0.00,29.34,30.09,5.1,24,5.5,2014,10,30
2941,2,53,37,45,M,35,42,20,0,-,...,M,T,29.41,30.10,5.9,23,6.5,2014,10,30
2942,1,47,33,40,-6,25,33,25,0,0623,...,0.1,0.03,29.49,30.20,22.6,34,22.9,2014,10,31


In [14]:
weather.head()

Unnamed: 0,station,tmax,tmin,tavg,depart,dewpoint,wetbulb,heat,cool,sunrise,...,snowfall,preciptotal,stnpressure,sealevel,resultspeed,resultdir,avgspeed,year,month,day
0,1,83,50,67,14,51,56,0,2,0448,...,0.0,0.0,29.1,29.82,1.7,27,9.2,2007,5,1
1,2,84,52,68,M,51,57,0,3,-,...,M,0.0,29.18,29.82,2.7,25,9.6,2007,5,1
2,1,59,42,51,-3,42,47,14,0,0447,...,0.0,0.0,29.38,30.09,13.0,4,13.4,2007,5,2
3,2,60,43,52,M,42,47,13,0,-,...,M,0.0,29.44,30.08,13.3,2,13.4,2007,5,2
4,1,66,46,56,2,40,48,9,0,0446,...,0.0,0.0,29.39,30.12,11.7,7,11.9,2007,5,3


### Data Cleaning Process

As indicated in the summary below, half of the entries (1472 out of 2944) in depart, sunrise and sunset columns have missing values. For sunrise and sunset columns, we did some desktop research and understand that this was because station 2 does not collect data for these columns. We have thus decided to impute these missing values with values from station 1.

For column water1, all of its entries are missing values M, hence we should drop this column. We have also decided to drop codesum, snowfall and depth columns as their entries are either 0 or missing.

There are T values in snowfall and precitotal. Based on the data documentation, this means that there are trace precipitate for that entry. Hence, we decided to round these values to 0. For the missing values in preciptotal, we decided to impute it with median values. Likewise for avgspeed, sealevel and stnpressure.

In [15]:
# count total number of M - T in df

print('before cleaning:')
df_summary = print_summary(weather)
df_summary

before cleaning:


Unnamed: 0,column,M,-,T
3,tavg,11,0,0
4,depart,1472,0,0
6,wetbulb,4,0,0
7,heat,11,0,0
8,cool,11,0,0
9,sunrise,0,1472,0
10,sunset,0,1472,0
11,codesum,0,0,0
12,depth,1472,0,0
13,water1,2944,0,0


In [17]:
weather.snowfall.unique()

array(['0.0', 'M', '  T', '0.1'], dtype=object)

In [18]:
weather.depth.unique()

array(['0', 'M'], dtype=object)

In [19]:
weather.drop(columns = ['codesum','water1','snowfall','depth'], inplace = True)

In [23]:
# For sunset/sunrise and depart with missing (M) values:
# its because station 2 does not collect these values. 
# so we can impute these missing values with values from station 1

i = 0
while i < weather.shape[0]:
    weather.iloc[i+1, 4] = weather.iloc[i, 4]
    weather.iloc[i+1, 9] = weather.iloc[i, 9]
    weather.iloc[i+1, 10] = weather.iloc[i, 10]
    i+=2

In [24]:
def impute_missing_tavg(row):
    if row['tavg'] == 'M': 
        row['tavg'] = (row['tmax'] - row['tmin']) * 0.5 + row['tmin']
    return row

weather = weather.apply(impute_missing_tavg, axis = 1)
weather.tavg = weather.tavg.astype('int64')

In [25]:
def impute_missing_wetbulb(row): 
    if row['wetbulb'] == 'M':
        row['wetbulb'] = row['tavg']-((row['tavg']-row['dewpoint'])/3)
    return row

weather = weather.apply(impute_missing_wetbulb, axis = 1)

In [26]:
def impute_missing_rest(row): 
    if row['heat'] == 'M':
        if row['tavg'] >= 65: 
            row['heat'] = 0
            row['cool'] = row['tavg'] - 65
        else: 
            row['heat'] = 65 - row['tavg']
            row['cool'] = 0

    if row['preciptotal'] == '  T':
        row['preciptotal'] = 0
    if row['preciptotal'] == 'M':
        row['preciptotal'] = weather[weather.preciptotal!='M'][weather.preciptotal!='  T'].preciptotal.median()       
    if row['stnpressure'] == 'M':
        row['stnpressure'] = weather[weather.stnpressure!='M'].stnpressure.median()
    if row['sealevel'] == 'M':
        row['sealevel'] = weather[weather.sealevel!='M'].sealevel.median()
    if row['avgspeed'] == 'M':
        row['avgspeed'] = weather[weather.avgspeed!='M'].avgspeed.median()    
    return row

weather = weather.apply(impute_missing_rest, axis = 1)

  row['preciptotal'] = weather[weather.preciptotal!='M'][weather.preciptotal!='  T'].preciptotal.median()


In [27]:
print('after cleaning:')    
df_summary = print_summary(weather)
df_summary

after cleaning:


Unnamed: 0,column,M,-,T
4,depart,0,0,0
6,wetbulb,0,0,0
7,heat,0,0,0
8,cool,0,0,0
9,sunrise,0,0,0
10,sunset,0,0,0
11,preciptotal,0,0,0
12,stnpressure,0,0,0
13,sealevel,0,0,0
16,avgspeed,0,0,0


### Output Data

We checked and confirmed that there are no more missing values before saving our processed dataset.

In [28]:
weather.head()

Unnamed: 0,station,tmax,tmin,tavg,depart,dewpoint,wetbulb,heat,cool,sunrise,sunset,preciptotal,stnpressure,sealevel,resultspeed,resultdir,avgspeed,year,month,day
0,1,83,50,67,14,51,56,0,2,448,1849,0.0,29.1,29.82,1.7,27,9.2,2007,5,1
1,2,84,52,68,14,51,57,0,3,448,1849,0.0,29.18,29.82,2.7,25,9.6,2007,5,1
2,1,59,42,51,-3,42,47,14,0,447,1850,0.0,29.38,30.09,13.0,4,13.4,2007,5,2
3,2,60,43,52,-3,42,47,13,0,447,1850,0.0,29.44,30.08,13.3,2,13.4,2007,5,2
4,1,66,46,56,2,40,48,9,0,446,1851,0.0,29.39,30.12,11.7,7,11.9,2007,5,3


In [30]:
# output clean data

weather.to_csv('./clean data/weather_clean.csv',index=False)