## **1. Load Libraries**

In [1]:
import os
import pandas as pd
import src.util as utils

from tqdm import tqdm
from copy import deepcopy

from sklearn.model_selection import train_test_split

In [2]:
def read_raw_data(config_data: dict) -> pd.DataFrame:
    """Read the raw dataset from the configured directory.

    Every file ending in ``.csv`` inside ``config_data["raw_data_set_dir"]``
    is parsed, in sorted filename order. When several CSV files are present,
    the one read last (the last in sorted order) is returned.

    Parameters
    ----------
    config_data : dict
        The loaded configuration; must contain the key
        ``"raw_data_set_dir"`` holding the dataset directory path
        (with or without a trailing path separator).

    Return
    ------
    raw_dataset : pd.DataFrame
        the loaded dataframe in pandas DataFrame

    Raises
    ------
    FileNotFoundError
        If the directory contains no ``.csv`` file.
    """

    # Define dataset directory
    raw_dataset_dir = config_data["raw_data_set_dir"]

    # Sentinel so an empty directory yields a clear error instead of an
    # unbound local variable at the return statement
    raw_dataset = None

    # Sorting makes the "last CSV wins" rule independent of the
    # arbitrary order in which os.listdir reports entries
    for flname in tqdm(sorted(os.listdir(raw_dataset_dir))):
        if flname.endswith(".csv"):
            # os.path.join works whether or not the directory ends with a separator
            raw_dataset = pd.read_csv(os.path.join(raw_dataset_dir, flname))

    if raw_dataset is None:
        raise FileNotFoundError(
            f"No CSV file found in directory: {raw_dataset_dir!r}"
        )

    return raw_dataset

In [3]:
def basic_checking(data) -> None:
    """Report whether the dataset has missing values or duplicated rows.

    Two lines are printed: the first about missing values, the second
    about duplicated rows. Nothing is returned.

    Parameters
    ----------
    data : pd.DataFrame
        The dataset we want to check
    """

    # Number of columns that hold at least one null entry
    columns_with_null = data.isnull().any().sum()
    print(
        "There are missing values, Please check it"
        if columns_with_null > 0
        else "There are no missing values"
    )

    # True as soon as one row repeats an earlier row
    has_duplicates = data.duplicated().any()
    print(
        "There are duplicates, Please check it"
        if has_duplicates
        else "There are no duplicates"
    )

In [4]:
def remove_features(features: list, data: pd.DataFrame) -> pd.DataFrame:
    """Removing various features on the dataset

    The columns are dropped from ``data`` itself (in place) and the same
    object is returned, so the function works both for callers that ignore
    the return value and for callers that assign it.

    Parameters
    ----------
    features : list
        the list of feature that should be deleted
    data : pd.DataFrame
        The dataset of which the feature resides

    Return
    ------
    data : pd.DataFrame
        The dataset without the unnecessary features (the same object
        that was passed in)

    Raises
    ------
    TypeError
        If ``features`` is not a list.
    KeyError
        Raised by pandas when a listed feature is not a column of ``data``.
    """

    if not isinstance(features, list):
        # A plain string cannot be raised; wrap the message in an exception
        raise TypeError("Please Enter a list")

    # drop(..., inplace=True) returns None, so its result must not be
    # assigned back to ``data`` - mutate first, then return the frame.
    data.drop(columns=features, inplace=True)
    return data

In [5]:
def reformat_feature(feature, data):
    """Procedure to reformat the data point in the dataset

    Currently the function is only supporting data formatting
    in a string date form such as yyyy-mm-dd: the text after the last
    "-" is converted to an integer and stored back in the column.

    The column is overwritten on ``data`` itself; nothing is returned.

    Parameters
    ----------
    feature : str
        The name of feature we want to format
    data : pd.DataFrame
        The dataset of which the feature resides

    Raises
    ------
    ValueError
        If the part after the last "-" of any value is not an integer.
    """

    # Build the lookup once per distinct value instead of once per row
    day_by_date = {}
    for raw_value in data[feature].unique().tolist():
        try:
            day_by_date[raw_value] = int(raw_value.split("-")[-1].lstrip("0"))
        except ValueError as err:
            # Raise instead of calling exit(), which would shut the kernel down
            raise ValueError(f"Data is not integer: {raw_value!r}") from err

    # Assign the mapped column back explicitly: an inplace method called on
    # data[feature] acts on an intermediate object and is not guaranteed to
    # update the frame (it does not under pandas Copy-on-Write).
    data[feature] = data[feature].map(day_by_date)

In [6]:
def validate_data(data, config_data):
    """Validate every feature of the dataset against the configuration.

    The label column (``config_data["label"]``) is skipped. For each other
    column:

    - if it is listed in ``config_data["categorical_columns"]``, all of its
      values must be members of ``config_data["range_<feature>"]``;
    - otherwise all of its values must lie between
      ``config_data["range_<feature>"][0]`` and
      ``config_data["range_<feature>"][1]`` (both ends inclusive).

    When a configuration key needed for a column is missing, the message
    "Variable Has Been Dropped" is printed and the column is skipped.

    Parameters
    ----------
    data : pd.DataFrame
        The dataset to validate
    config_data : dict
        The loaded configuration

    Raises
    ------
    AssertionError
        If a column holds a value outside its configured range.
    """

    for feature in data:
        try:
            if feature == config_data["label"]:
                continue

            if feature in config_data["categorical_columns"]:
                allowed_values = set(config_data[f"range_{feature}"])
                unexpected = set(data[feature]) - allowed_values
                # Raised explicitly (not via ``assert``) so the check also
                # runs when Python is started with -O
                if unexpected:
                    raise AssertionError(
                        f"error occurs: feature '{feature}' has values "
                        f"outside the configured set: {unexpected!r}"
                    )
            else:
                lower = config_data[f"range_{feature}"][0]
                upper = config_data[f"range_{feature}"][1]
                if not data[feature].between(lower, upper).all():
                    raise AssertionError(
                        f"error occurs: feature '{feature}' has values "
                        f"outside the configured range [{lower}, {upper}]"
                    )
        except KeyError:
            print("Variable Has Been Dropped")

## **2. Load Configuration File**

In [7]:
# Load the configuration through the project's src.util helper module
config_data = utils.load_config()

## **3. Load Dataset**

In [8]:
# Read the raw CSV from the configured directory and preview the first rows
raw_dataset = read_raw_data(config_data=config_data)
raw_dataset.head()

100%|████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 29.41it/s]


Unnamed: 0,instant,dteday,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
0,1,2011-01-01,1,0,1,0,0,6,0,1,0.24,0.2879,0.81,0.0,3,13,16
1,2,2011-01-01,1,0,1,1,0,6,0,1,0.22,0.2727,0.8,0.0,8,32,40
2,3,2011-01-01,1,0,1,2,0,6,0,1,0.22,0.2727,0.8,0.0,5,27,32
3,4,2011-01-01,1,0,1,3,0,6,0,1,0.24,0.2879,0.75,0.0,3,10,13
4,5,2011-01-01,1,0,1,4,0,6,0,1,0.24,0.2879,0.75,0.0,0,1,1


## **4. Data Definition**

### **Dataset Description**
- **instant: record index**
- **dteday : date**
- season : season (1:winter, 2:spring, 3:summer, 4:fall)
- yr : year (0: 2011, 1:2012)
- mnth : month ( 1 to 12)
- hr : hour (0 to 23)
- holiday : whether the day is a holiday or not (extracted from [Web Link])
- weekday : day of the week
- workingday : if day is neither weekend nor holiday is 1, otherwise is 0.
- weathersit : 1-Clear; 2-Mist; 3-Light Snow; 4-Heavy Rain
- temp : Normalized temperature in Celsius.
- atemp: Normalized feeling temperature in Celsius.
- hum: Normalized humidity
- windspeed: Normalized wind speed
- **casual: count of casual users**
- **registered: count of registered users**
- cnt: count of total rental bikes including both casual and registered (target)

From the dataset description there are 3 important notes (bolded) in our dataset:

1. From the dataset description the "cnt" variable is the count of total rental bikes including both casual and registered. Hence, we could drop the casual and registered variable.

2. The 'instant' variable is the record index of the dataset, i.e. the order in which the rows were generated. Hence, this column will be dropped.

3. The dteday variable is in date format; however, we also have 'yr' (year) and 'mnth' (month). Hence, we want to validate whether dteday agrees with the year and month variables before deciding what to do with it.

## **5. Data Validation**

In [9]:
# Simple checking: print whether the raw dataset has missing values or duplicated rows
basic_checking(data=raw_dataset)

There are no missing values
There are no duplicates


### **5.1 Dropping Unnecessary Data: casual, registered and instant features**

- Our target variable is 'cnt', which is the total number of rental bikes including both the casual and registered counts; hence the casual and registered columns can be dropped.
- As per dataset description the instant variable can be dropped

In [10]:
# Columns to drop, as listed under "unnecessary_predictors" in the configuration
feature_list = config_data["unnecessary_predictors"]

In [11]:
# Drop the listed columns; the drop is done in place, so raw_dataset itself is modified
remove_features(features=feature_list, data=raw_dataset)

### **5.2 Formatting 'dteday' variable**

As mentioned, the dteday variable needs to be validated against the 'yr' and 'mnth' variables.

In [13]:
# Work on an explicit copy of dteday, yr and mnth so the original dataset is not affected
temp_df = raw_dataset[["dteday", "yr", "mnth"]].copy()
temp_df.head(2)

Unnamed: 0,dteday,yr,mnth
0,2011-01-01,0,1
1,2011-01-01,0,1


In [14]:
# 'yr' = 0 is for 2011, and 'yr' = 1 is for 2012
# mnth is for 1-12 month (January, February, ... December)

# Inspect rows 30-49 (by position) of the subset where yr = 0 and mnth = 1
temp_df[(temp_df["yr"] == 0) & (temp_df["mnth"] == 1)][30:50]

Unnamed: 0,dteday,yr,mnth
30,2011-01-02,0,1
31,2011-01-02,0,1
32,2011-01-02,0,1
33,2011-01-02,0,1
34,2011-01-02,0,1
35,2011-01-02,0,1
36,2011-01-02,0,1
37,2011-01-02,0,1
38,2011-01-02,0,1
39,2011-01-02,0,1


In [15]:
# Inspect rows 30-49 (by position) of the subset where yr = 0 and mnth = 2
temp_df[(temp_df["yr"] == 0) & (temp_df["mnth"] == 2)][30:50]

Unnamed: 0,dteday,yr,mnth
718,2011-02-02,0,2
719,2011-02-02,0,2
720,2011-02-02,0,2
721,2011-02-02,0,2
722,2011-02-02,0,2
723,2011-02-02,0,2
724,2011-02-02,0,2
725,2011-02-02,0,2
726,2011-02-02,0,2
727,2011-02-02,0,2


In [16]:
# Inspect rows 30-49 (by position) of the subset where yr = 1 and mnth = 7
temp_df[(temp_df["yr"] == 1) & (temp_df["mnth"] == 7)][30:50]

Unnamed: 0,dteday,yr,mnth
13033,2012-07-02,1,7
13034,2012-07-02,1,7
13035,2012-07-02,1,7
13036,2012-07-02,1,7
13037,2012-07-02,1,7
13038,2012-07-02,1,7
13039,2012-07-02,1,7
13040,2012-07-02,1,7
13041,2012-07-02,1,7
13042,2012-07-02,1,7


In [17]:
# Inspect rows 30-49 (by position) of the subset where yr = 1 and mnth = 12
temp_df[(temp_df["yr"] == 1) & (temp_df["mnth"] == 12)][30:50]

Unnamed: 0,dteday,yr,mnth
16667,2012-12-02,1,12
16668,2012-12-02,1,12
16669,2012-12-02,1,12
16670,2012-12-02,1,12
16671,2012-12-02,1,12
16672,2012-12-02,1,12
16673,2012-12-02,1,12
16674,2012-12-02,1,12
16675,2012-12-02,1,12
16676,2012-12-02,1,12


From the samples above, 'yr' and 'mnth' match the year and month in the dteday variable. Hence we can safely assume that the only additional information in dteday is the day on which the data was recorded, so we can format dteday into an integer from 1 to 31 indicating that day.

#### **Formatting dteday variable into day (1-31/30)**
If we take a look at the dteday data points, each entry is in **yyyy-mm-dd format**; hence, all we have to do is **keep the dd** part and **drop the yyyy-mm-** part through string formatting.

In [18]:
# Independent copy of the dteday/yr/mnth frame
# NOTE(review): test_df is not referenced by any later cell visible here - confirm it is still needed
test_df = temp_df.copy()

In [19]:
# Convert dteday from a date string to its day number; the function works on
# raw_dataset directly and returns nothing
reformat_feature(feature="dteday", data=raw_dataset)

In [20]:
# Verify reformatting: list the distinct values now stored in dteday
raw_dataset["dteday"].unique()

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31],
      dtype=int64)

In [21]:
# Confirm the dtype of dteday after reformatting
raw_dataset["dteday"].dtype

dtype('int64')

**Notes:** Data is successfully formatted into integer

In [22]:
# Final check of the dataset: remaining columns, non-null counts and dtypes
raw_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17379 entries, 0 to 17378
Data columns (total 14 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   dteday      17379 non-null  int64  
 1   season      17379 non-null  int64  
 2   yr          17379 non-null  int64  
 3   mnth        17379 non-null  int64  
 4   hr          17379 non-null  int64  
 5   holiday     17379 non-null  int64  
 6   weekday     17379 non-null  int64  
 7   workingday  17379 non-null  int64  
 8   weathersit  17379 non-null  int64  
 9   temp        17379 non-null  float64
 10  atemp       17379 non-null  float64
 11  hum         17379 non-null  float64
 12  windspeed   17379 non-null  float64
 13  cnt         17379 non-null  int64  
dtypes: float64(4), int64(10)
memory usage: 1.9 MB


## **6. Validate Data**

In [23]:
# Check each column against the ranges declared in the configuration;
# one "Variable Has Been Dropped" line is printed per column whose config key is missing
validate_data(data=raw_dataset, config_data=config_data)

Variable Has Been Dropped
Variable Has Been Dropped


## **7. Data Splitting**

In [24]:
# Separate predictors (x) and label (y) using the column names from the configuration;
# .copy() keeps both independent of raw_dataset
x = raw_dataset[config_data["predictors"]].copy()
y = raw_dataset[config_data["label"]].copy()

In [25]:
# Preview the predictors
x.head()

Unnamed: 0,dteday,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,hum,atemp,windspeed
0,1,1,0,1,0,0,6,0,1,0.24,0.81,0.2879,0.0
1,1,1,0,1,1,0,6,0,1,0.22,0.8,0.2727,0.0
2,1,1,0,1,2,0,6,0,1,0.22,0.8,0.2727,0.0
3,1,1,0,1,3,0,6,0,1,0.24,0.75,0.2879,0.0
4,1,1,0,1,4,0,6,0,1,0.24,0.75,0.2879,0.0


In [26]:
# Preview the label
y.head()

0    16
1    40
2    32
3    13
4     1
Name: cnt, dtype: int64

In [27]:
# First split: 70% train, 30% held out (the held-out part is split again in the next cell).
# random_state is fixed so the split is reproducible.
x_train, x_test, y_train, y_test = train_test_split(x, y, 
                                                    test_size=0.3, 
                                                    random_state=123)

In [28]:
# Second split: halve the held-out 30% into validation and test (15% of the data each).
# NOTE(review): x_test / y_test are overwritten here, so re-running this cell alone
# splits the already-halved test set again - re-run the previous cell first.
x_valid, x_test, y_valid, y_test = train_test_split(x_test, y_test,
                                                test_size=0.5,
                                                random_state=123)

In [29]:
# Persist each split with the project's pickle helper. For every *_set_path entry
# in the configuration, index 0 is used for the predictors and index 1 for the label.

# Dump train dataset to pickle
utils.pickle_dump(x_train, config_data["train_set_path"][0])
utils.pickle_dump(y_train, config_data["train_set_path"][1])

# Dump validation data
utils.pickle_dump(x_valid, config_data["valid_set_path"][0])
utils.pickle_dump(y_valid, config_data["valid_set_path"][1])

# Dump test data
utils.pickle_dump(x_test, config_data["test_set_path"][0])
utils.pickle_dump(y_test, config_data["test_set_path"][1])