# 1. Setup Notebook

## 1.1. Import Libraries

In [5]:
import os
import glob
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
%matplotlib inline

from datetime import datetime

from sklearn.model_selection import train_test_split

## 1.2. Import Data

In [2]:
# Load the train and test splits from CSV files in the relative data/ folder.
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')

## 1.3. Setup Function / Modules

In [6]:
def extract_string(column_input):
    """Pull the first number out of a raw cell value and return it as a float.

    The raw columns mix bare numbers with unit-suffixed text such as
    ``'24.75 Celcius'``, ``'338.0°'`` or ``'0.96 m/s'``. Word placeholders
    such as ``'zero'`` or ``'nol'`` contain no digits and therefore map to
    ``None``, not ``0.0``.

    Parameters
    ----------
    column_input : str, number or missing value
        A single cell value.

    Returns
    -------
    float or None
        The first integer or decimal found (sign included), or ``None`` when
        the value is missing or holds no digits.
    """
    if pd.isna(column_input):
        return None
    if not isinstance(column_input, str):
        # Value is already numeric (mixed-type column, or data that was
        # converted earlier); the regex below only accepts text.
        return float(column_input)
    # A leading +/- is treated as a sign only when it is not glued to a
    # preceding word character: '-5.2 C' -> -5.2, but '1979-01-01' -> 1979.
    match = re.search(r'(?:(?<!\w)[-+])?\d+(?:\.\d+)?', column_input)
    return float(match.group()) if match else None
    
def remove_outliers_iqr(df, columns_to_check=None, factor=1.5):
    """Drop rows that hold an IQR outlier in any of the checked columns.

    A value is an outlier when it lies below ``Q1 - factor * IQR`` or above
    ``Q3 + factor * IQR`` of its own column. Missing values never compare as
    outliers, so rows containing NaN in a checked column are kept.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    columns_to_check : str, list-like of str, or None
        Column(s) to test. ``None`` means every numeric column of ``df``.
    factor : float, default 1.5
        Multiplier applied to the IQR to set the fence width.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` (all columns) with no outlier in the checked
        columns.
    """
    if columns_to_check is None:
        # Quantiles are only defined for numeric data, so skip text columns.
        columns_to_check = df.select_dtypes(include='number').columns
    elif isinstance(columns_to_check, str):
        # A bare name would select a Series, which has no axis=1 to reduce.
        columns_to_check = [columns_to_check]
    columns_to_check = list(columns_to_check)

    if not columns_to_check:
        # Nothing to test against: every row is kept.
        return df.copy()

    checked = df[columns_to_check]
    Q1 = checked.quantile(0.25)
    Q3 = checked.quantile(0.75)
    IQR = Q3 - Q1

    lower_fence = Q1 - factor * IQR
    upper_fence = Q3 + factor * IQR

    # True for each row that has at least one value outside the fences.
    has_outlier = ((checked < lower_fence) | (checked > upper_fence)).any(axis=1)

    df_filtered = df[~has_outlier]

    return df_filtered

# 2. Data Pre-Processing

## 2.1. View Data

In [3]:
# Preview the first five raw rows of the training data.
train.head(5)

Unnamed: 0,datetime,datetime_iso,time-zone,temp,visibility,d_point,feels,min_temp,max_temp,prssr,sea_level,grnd_level,hum,wind_spd,wind_deg,rain_1h,rain_3h,snow_1h,snow_3h,clouds
0,283996800,1979-01-01 00:00:00+00:00,28800,24.75 Celcius,,23.89 C,25.76 C,24.28,25.22°C,1012,undetermined,,95,0.82,320.0 °,zero,0,,,100
1,284000400,1979-01-01 01:00:00+00:00,28800,24.58 C,,23.73 C,25.57 C,23.99 C,25.26 C,1012,,,95,0.96 m/s,338.0°,0,0,0.0,0.0,100
2,284004000,1979-01-01 02:00:00+00:00,28800,26.6 Celcius,unidentified,24.06 C,26.6 C,26.1 C,27.39,1012,,undetermined,86,1.22 m/s,339.0°,0,volume:zero,,,99
3,284007600,1979-01-01 03:00:00+00:00,28800,27.31 Celcius,,24.37 C,30.9 C,26.59,28.36 C,1012,,undetermined,84,1.08 m/s,342,0.13,nol,0.0,,94
4,284011200,1979-01-01 04:00:00+00:00,28800,27.41,,25.05 C,31.54 C,26.58 C,28.31 °C,1011,,undetermined,87,0.86,336.0°,0.34,nol,,0.0,100


In [4]:
# Summarise each column's dtype and non-null count.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 341880 entries, 0 to 341879
Data columns (total 20 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   datetime      341880 non-null  int64 
 1   datetime_iso  341880 non-null  object
 2   time-zone     341880 non-null  int64 
 3   temp          341880 non-null  object
 4   visibility    51112 non-null   object
 5   d_point       341880 non-null  object
 6   feels         341880 non-null  object
 7   min_temp      341880 non-null  object
 8   max_temp      341880 non-null  object
 9   prssr         341880 non-null  object
 10  sea_level     192964 non-null  object
 11  grnd_level    192919 non-null  object
 12  hum           341880 non-null  object
 13  wind_spd      341880 non-null  object
 14  wind_deg      341880 non-null  object
 15  rain_1h       341880 non-null  object
 16  rain_3h       192329 non-null  object
 17  snow_1h       192696 non-null  object
 18  snow_3h       192699 non

## 2.2. Extract Numeric Values from Strings

In [7]:
# Columns whose raw values are numbers embedded in text (units, symbols, words).
columns_to_extract = ['temp', 'min_temp', 'max_temp',
                      'prssr', 'hum', 'wind_spd',
                      'rain_1h', 'rain_3h', 'snow_1h',
                      'snow_3h', 'clouds', 'd_point',
                      'feels', 'wind_deg']

for column in columns_to_extract:
    # `train` is overwritten in place, so skip columns that are already
    # numeric; this makes re-running the cell a harmless no-op.
    if pd.api.types.is_numeric_dtype(train[column]):
        continue
    train[column] = train[column].apply(extract_string)


In [8]:
# Preview the first rows again to inspect the result of the numeric extraction.
train.head()

Unnamed: 0,datetime,datetime_iso,time-zone,temp,visibility,d_point,feels,min_temp,max_temp,prssr,sea_level,grnd_level,hum,wind_spd,wind_deg,rain_1h,rain_3h,snow_1h,snow_3h,clouds
0,283996800,1979-01-01 00:00:00+00:00,28800,24.75,,23.89,25.76,24.28,25.22,1012.0,undetermined,,95.0,0.82,320.0,,0.0,,,100.0
1,284000400,1979-01-01 01:00:00+00:00,28800,24.58,,23.73,25.57,23.99,25.26,1012.0,,,95.0,0.96,338.0,0.0,0.0,0.0,0.0,100.0
2,284004000,1979-01-01 02:00:00+00:00,28800,26.6,unidentified,24.06,26.6,26.1,27.39,1012.0,,undetermined,86.0,1.22,339.0,0.0,,,,99.0
3,284007600,1979-01-01 03:00:00+00:00,28800,27.31,,24.37,30.9,26.59,28.36,1012.0,,undetermined,84.0,1.08,342.0,0.13,,0.0,,94.0
4,284011200,1979-01-01 04:00:00+00:00,28800,27.41,,25.05,31.54,26.58,28.31,1011.0,,undetermined,87.0,0.86,336.0,0.34,,,0.0,100.0


## 2.3. Drop Unnecessary Column