# Intro

## Initial settings

In [1]:
# To support both python 2 and python 3
from __future__ import division, print_function, unicode_literals

# Common imports
import numpy as np
import os,random, math, psutil, pickle 
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=DeprecationWarning)
import pandas as pd
import seaborn as sns
import gc
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.patches as patches

sns.set()
%matplotlib inline

# to make this notebook's output stable across runs
np.random.seed(42)

from plotly import tools, subplots
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.express as px
pd.set_option('max_columns', 150)

py.init_notebook_mode(connected=True)
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected=True)
import plotly.graph_objs as go

## Reducing Memory Size Function

In [2]:
## Function to reduce the DF size
def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns of ``df`` to the smallest dtype covering their range.

    Only columns whose dtype is int16/int32/int64/float16/float32/float64 are
    inspected; every other column (object, datetime, int8, unsigned, ...) is
    left untouched. Integer columns are cast to the narrowest of
    int8/int16/int32/int64 whose limits include the column's min and max
    (limits are inclusive). Float columns are cast to float16 or float32 when
    their range fits, otherwise to float64.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. Its columns are reassigned in place.
    verbose : bool, default True
        When true, print the final memory footprint and the percentage saved.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.

    Notes
    -----
    Float targets are chosen by value range only, so casting to float16 or
    float32 can round the stored values.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        # Empty or all-NaN columns have no range to test; leave their dtype
        # alone instead of comparing NaN against the limits (which would fall
        # through to float64 and could even widen the column).
        if np.isnan(c_min) or np.isnan(c_max):
            continue

        if str(col_type)[:3] == 'int':
            # Inclusive bounds: a value equal to a type's limit still fits it.
            target = next(
                (t for t in int_candidates
                 if np.iinfo(t).min <= c_min and c_max <= np.iinfo(t).max),
                np.int64,
            )
        else:
            # +/-inf never fits float16/float32 limits, so it ends up float64.
            target = next(
                (t for t in float_candidates
                 if np.finfo(t).min <= c_min and c_max <= np.finfo(t).max),
                np.float64,
            )
        df[col] = df[col].astype(target)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the percentage against a zero starting footprint.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

## Loading data

### Mounting Google Drive (Colab only)

In [3]:
#from google.colab import drive
#drive.mount('/content/drive')

### Reading the CSV file

In [4]:
# Local Jupyter: read the weather CSV from the current working directory.
weatherdata = pd.read_csv('weather20092019.csv')

# Google Colab alternative (needs the Drive mount cell above to be enabled).
# NOTE(review): the line below loads the holidays CSV into `holidays`, not the
# weather file into `weatherdata` -- looks like a leftover; update before enabling.
#holidays = pd.read_csv('/content/drive/My Drive/phd project/UKholidays.csv')

In [5]:
# Downcast the numeric columns to smaller dtypes; non-numeric (object) columns are left as they are.
weatherdata = reduce_mem_usage(weatherdata)

Mem. usage decreased to  2.40 Mb (31.7% reduction)



invalid value encountered in less


invalid value encountered in less



## Data description

In [6]:
# Per-column non-null counts and dtypes after the downcast.
weatherdata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30709 entries, 0 to 30708
Data columns (total 23 columns):
Unnamed: 0           30709 non-null int16
ob_end_time          30709 non-null object
 id_type             30709 non-null object
 id                  30709 non-null int16
 ob_hour_count       30709 non-null int8
 version_num         30709 non-null int8
 met_domain_name     30709 non-null object
 src_id              30709 non-null int16
 rec_st_ind          30709 non-null int16
 max_air_temp        30709 non-null object
 min_air_temp        30709 non-null object
 min_grss_temp       30709 non-null object
 min_conc_temp       30709 non-null object
 max_air_temp_q      30709 non-null object
 min_air_temp_q      30709 non-null object
 min_grss_temp_q     30709 non-null object
 min_conc_temp_q     30709 non-null object
 meto_stmp_time      30709 non-null object
 midas_stmp_etime    30709 non-null object
 max_air_temp_j      30709 non-null object
 min_air_temp_j      30709 non-null obj

In [7]:
# The CSV header carries spaces inside the column names (e.g. ' id_type',
# 'Unnamed: 0'); remove every space so columns can be addressed by clean names.
weatherdata.columns = [name.replace(' ', '') for name in weatherdata.columns]

In [8]:
# Columns not needed for the analysis: the saved CSV index, the grass/concrete
# minimum temperatures, the *_q / *_j companion columns and the two timestamp fields.
drop_cols = [
    'Unnamed:0',
    'min_grss_temp',
    'min_conc_temp',
    'min_grss_temp_q',
    'min_conc_temp_q',
    'meto_stmp_time',
    'midas_stmp_etime',
    'min_grss_temp_j',
    'min_conc_temp_j',
    'max_air_temp_q',
    'min_air_temp_q',
    'max_air_temp_j',
    'min_air_temp_j',
]
weatherdata = weatherdata.drop(drop_cols, axis=1)

In [9]:
# Check which columns remain after the drop.
weatherdata.columns

Index(['ob_end_time', 'id_type', 'id', 'ob_hour_count', 'version_num',
       'met_domain_name', 'src_id', 'rec_st_ind', 'max_air_temp',
       'min_air_temp'],
      dtype='object')

In [10]:
# Target dtypes for the identifier/metadata columns: labels become str, codes become int.
convert_dict = dict.fromkeys(['id_type', 'met_domain_name'], str)
convert_dict.update(
    dict.fromkeys(['id', 'ob_hour_count', 'version_num', 'src_id', 'rec_st_ind'], int)
)

weatherdata = (
    weatherdata
    # Parse the observation timestamp first, then apply the explicit casts.
    .assign(ob_end_time=lambda frame: pd.to_datetime(frame['ob_end_time']))
    .astype(convert_dict)
    # errors='coerce' turns unparseable temperature entries into NaN instead of raising.
    .assign(
        max_air_temp=lambda frame: pd.to_numeric(frame['max_air_temp'], errors='coerce'),
        min_air_temp=lambda frame: pd.to_numeric(frame['min_air_temp'], errors='coerce')
    )
)

# How many columns ended up with each dtype.
weatherdata.dtypes.value_counts()

int32             5
object            2
float64           2
datetime64[ns]    1
dtype: int64

In [11]:
# Keep only the rows whose src_id equals 132.
is_132 = weatherdata['src_id'].eq(132)
weatherdata = weatherdata.loc[is_132]

In [12]:
# Drop the identifier/metadata columns, leaving the timestamp and the two temperature columns.
drop_cols = [
    'id_type',
    'id',
    'ob_hour_count',
    'version_num',
    'met_domain_name',
    'src_id',
    'rec_st_ind',
]

weatherdata = weatherdata.drop(drop_cols, axis='columns')



In [13]:
# Report (rows, columns) of the filtered frame.
# NOTE(review): the label says 'train_df' but the frame printed is weatherdata --
# looks like a copy-paste leftover; consider renaming the label.
print('Size of train_df data', weatherdata.shape)

Size of train_df data (11095, 3)


In [14]:
# Summary statistics for the numeric (temperature) columns.
weatherdata.describe()

Unnamed: 0,max_air_temp,min_air_temp
count,11090.0,11090.0
mean,11.208206,6.439423
std,5.482274,5.254589
min,-8.5,-16.0
25%,7.0,2.6
50%,11.4,6.6
75%,15.175,10.6
max,31.0,21.5


In [29]:
# Preview the first rows.
# NOTE(review): this cell has execution count 29 but sits before the
# "adding mean column" cell (count 27), which is why the saved output already
# shows mean_temp; on a fresh top-to-bottom run that column does not exist yet here.
weatherdata.head()

Unnamed: 0,ob_end_time,max_air_temp,min_air_temp,mean_temp
0,2009-01-01 09:00:00,0.4,-0.8,-0.2
1,2009-01-01 21:00:00,1.9,-0.5,0.7
2,2009-01-02 09:00:00,0.6,-3.8,-1.6
3,2009-01-02 21:00:00,1.7,-5.3,-1.8
4,2009-01-03 09:00:00,-1.1,-4.9,-3.0


In [27]:
# adding mean column
# Average only the two temperature readings, selected by name. A bare
# weatherdata.mean(axis=1) spans every column: it fails on the datetime column
# in recent pandas and would fold mean_temp into itself if this cell is re-run.
weatherdata['mean_temp'] = weatherdata[['max_air_temp', 'min_air_temp']].mean(axis=1)

# Renumber the rows 0..n-1 after the earlier filtering; drop=True discards the
# old index instead of materialising it as an 'index' column.
weatherdata = weatherdata.reset_index(drop=True)

### Saving the file

In [30]:
# Persist the prepared frame so it can be reloaded without repeating the cleaning steps.
weatherdata.to_pickle('weatherdata.pkl')

# Release the in-memory frame and force a garbage-collection pass.
del weatherdata

# Last expression of the cell: its return value is shown as the cell output.
gc.collect()

134

In [31]:
# Loading the pickle file
# Reload the frame written by to_pickle in the previous cell and preview it
# to confirm it round-trips with the mean_temp column.

weatherdata = pd.read_pickle('weatherdata.pkl')
weatherdata.head()

Unnamed: 0,ob_end_time,max_air_temp,min_air_temp,mean_temp
0,2009-01-01 09:00:00,0.4,-0.8,-0.2
1,2009-01-01 21:00:00,1.9,-0.5,0.7
2,2009-01-02 09:00:00,0.6,-3.8,-1.6
3,2009-01-02 21:00:00,1.7,-5.3,-1.8
4,2009-01-03 09:00:00,-1.1,-4.9,-3.0
