# 00_Data exploration

In [2]:
import numpy as np
import pandas as pd
# Show every column when a DataFrame is rendered (None removes pandas' column limit)
pd.set_option('display.max_columns', None)
from sklearn.model_selection import train_test_split

## Classification Problem

### Explore the dataset - Risk Factors for Cervical Cancer
#### Features
- Age: 年齡（歲）
- Number of sexual partners: 性伴侶數量
- First sexual intercourse: 首次性行為（歲）
- Num of pregnancies: 懷孕次數
- Smokes: 吸菸與否
- Smokes (years): 煙齡（年）
- Hormonal Contraceptives: 是否服用激素避孕藥
- Hormonal Contraceptives (years): 服用激素避孕藥的時間（年） 
- IUD: 是否有宮內節育器 
- IUD (years): 使用宮內節育器 (IUD) 的年數
- STDs: 是否患有性傳播疾病
- STDs (number): 患有幾個性傳播疾病
- STDs: Number of diagnosis: 性病診斷次數
- STDs: Time since first diagnosis: 第一次性病診斷後到現在的時間
- STDs: Time since last diagnosis: 上次性病診斷到現在的時間

#### Labels
- Biopsy: 健康與否

In [3]:
# Load the cervical cancer risk-factor data and show it as the cell output
cervical_data = pd.read_csv('data/cervical.csv')
cervical_data

Unnamed: 0,Age,Number.of.sexual.partners,First.sexual.intercourse,Num.of.pregnancies,Smokes,Smokes..years.,Hormonal.Contraceptives,Hormonal.Contraceptives..years.,IUD,IUD..years.,STDs,STDs..number.,STDs..Number.of.diagnosis,STDs..Time.since.first.diagnosis,STDs..Time.since.last.diagnosis,Biopsy
0,18,4,15,1,0,0.0,0,0.00,0,0.0,0,0,0,1,1,Healthy
1,15,1,14,1,0,0.0,0,0.00,0,0.0,0,0,0,1,1,Healthy
2,34,1,15,1,0,0.0,0,0.00,0,0.0,0,0,0,1,1,Healthy
3,52,5,16,4,1,37.0,1,3.00,0,0.0,0,0,0,1,1,Healthy
4,46,3,21,4,0,0.0,1,15.00,0,0.0,0,0,0,1,1,Healthy
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
853,34,3,18,0,0,0.0,0,0.00,0,0.0,0,0,0,1,1,Healthy
854,32,2,19,1,0,0.0,1,8.00,0,0.0,0,0,0,1,1,Healthy
855,25,2,17,0,0,0.0,1,0.08,0,0.0,0,0,0,1,1,Healthy
856,33,2,24,2,0,0.0,1,0.08,0,0.0,0,0,0,1,1,Healthy


In [4]:
# Drop "STDs: Time since first diagnosis" and "STDs: Time since last diagnosis":
# per the original author, too much of their data is missing.
# The frame is reassigned instead of edited with inplace=True, and errors='ignore'
# turns a second execution of this cell into a no-op instead of a KeyError.
cervical_data = cervical_data.drop(
    columns=['STDs..Time.since.first.diagnosis', 'STDs..Time.since.last.diagnosis'],
    errors='ignore',
)

In [5]:
# Check for missing values: list the non-null count and dtype of every column
cervical_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 858 entries, 0 to 857
Data columns (total 14 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   Age                              858 non-null    int64  
 1   Number.of.sexual.partners        858 non-null    int64  
 2   First.sexual.intercourse         858 non-null    int64  
 3   Num.of.pregnancies               858 non-null    int64  
 4   Smokes                           858 non-null    int64  
 5   Smokes..years.                   858 non-null    float64
 6   Hormonal.Contraceptives          858 non-null    int64  
 7   Hormonal.Contraceptives..years.  858 non-null    float64
 8   IUD                              858 non-null    int64  
 9   IUD..years.                      858 non-null    float64
 10  STDs                             858 non-null    int64  
 11  STDs..number.                    858 non-null    int64  
 12  STDs..Number.of.diagno

In [6]:
# Encode the label as 0 (Healthy) / 1 (Cancer) and separate it from the features.
# cervical_data itself is left untouched, so re-running this cell neither raises a
# KeyError (label already popped) nor maps already-encoded labels to NaN.
y = cervical_data['Biopsy'].map({'Healthy': 0, 'Cancer': 1})
X = cervical_data.drop(columns='Biopsy')

In [7]:
# Feature names grouped by kind: numeric columns first, categorical columns second
num_cols = [
    'Age',
    'Number.of.sexual.partners',
    'First.sexual.intercourse',
    'Num.of.pregnancies',
    'Smokes..years.',
    'Hormonal.Contraceptives..years.',
    'IUD..years.',
    'STDs..number.',
    'STDs..Number.of.diagnosis',
]
cat_cols = [
    'Smokes',
    'Hormonal.Contraceptives',
    'IUD',
    'STDs',
]

In [8]:
# One-hot encode the categorical columns, then show the encoded feature frame
X = pd.get_dummies(data=X, columns=cat_cols)
X

Unnamed: 0,Age,Number.of.sexual.partners,First.sexual.intercourse,Num.of.pregnancies,Smokes..years.,Hormonal.Contraceptives..years.,IUD..years.,STDs..number.,STDs..Number.of.diagnosis,Smokes_0,Smokes_1,Hormonal.Contraceptives_0,Hormonal.Contraceptives_1,IUD_0,IUD_1,STDs_0,STDs_1
0,18,4,15,1,0.0,0.00,0.0,0,0,1,0,1,0,1,0,1,0
1,15,1,14,1,0.0,0.00,0.0,0,0,1,0,1,0,1,0,1,0
2,34,1,15,1,0.0,0.00,0.0,0,0,1,0,1,0,1,0,1,0
3,52,5,16,4,37.0,3.00,0.0,0,0,0,1,0,1,1,0,1,0
4,46,3,21,4,0.0,15.00,0.0,0,0,1,0,0,1,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
853,34,3,18,0,0.0,0.00,0.0,0,0,1,0,1,0,1,0,1,0
854,32,2,19,1,0.0,8.00,0.0,0,0,1,0,0,1,1,0,1,0
855,25,2,17,0,0.0,0.08,0.0,0,0,1,0,0,1,1,0,1,0
856,33,2,24,2,0.0,0.08,0.0,0,0,1,0,0,1,1,0,1,0


In [9]:
class Cervical_DataLoader():
    """Load, preprocess and split the cervical cancer risk-factor dataset.

    Intended call order: ``load_dataset()`` -> ``preprocess_data()`` ->
    ``get_data_split()``.
    """

    def __init__(self):
        # Current DataFrame; stays None until load_dataset() has been called
        self.data = None

    def load_dataset(self, path="data/cervical.csv"):
        """Read the CSV file at ``path`` into ``self.data``."""
        self.data = pd.read_csv(path)

    def preprocess_data(self):
        """Drop sparse columns, encode the label and one-hot encode categoricals.

        The result replaces ``self.data``. A new frame is built rather than
        editing the loaded one in place, so an outside reference to the raw
        frame (e.g. ``raw = loader.data`` taken before this call) keeps its
        original columns and label strings.
        """
        # Dropped because, per the original author, too much of their data is missing
        sparse_cols = ['STDs..Time.since.first.diagnosis', 'STDs..Time.since.last.diagnosis']
        # Columns to one-hot encode; the remaining feature columns are numeric and kept as-is
        cat_cols = ['Smokes', 'Hormonal.Contraceptives', 'IUD', 'STDs']

        self.data = (
            self.data
            .drop(columns=sparse_cols)
            # Map the label to 0 (Healthy) / 1 (Cancer); the column keeps its position
            .assign(Biopsy=lambda frame: frame['Biopsy'].map({'Healthy': 0, 'Cancer': 1}))
            .pipe(pd.get_dummies, columns=cat_cols)
        )

    def get_data_split(self):
        """Split ``self.data`` into train and test parts.

        Returns whatever ``train_test_split(X, y, test_size=0.20,
        random_state=2022)`` returns, where ``y`` is the ``Biopsy`` column and
        ``X`` is every other column.

        ``self.data`` is not modified (the label is selected, not popped), so
        this method can be called repeatedly.
        """
        y = self.data['Biopsy']
        X = self.data.drop(columns='Biopsy')
        return train_test_split(X, y, test_size=0.20, random_state=2022)

In [10]:
# Load data
data_loader = Cervical_DataLoader()
data_loader.load_dataset()
data = data_loader.data

# Show shape and head.
# Only the last expression of a cell is rendered automatically, so the head has to
# be handed to display() explicitly; a bare `data.head()` here would be discarded.
print(data.shape)
display(data.head())

# Show column dtypes and non-null counts
data.info()

# Show preprocessed dataframe
data_loader.preprocess_data()
data_loader.data.head()

(858, 16)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 858 entries, 0 to 857
Data columns (total 16 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   Age                               858 non-null    int64  
 1   Number.of.sexual.partners         858 non-null    int64  
 2   First.sexual.intercourse          858 non-null    int64  
 3   Num.of.pregnancies                858 non-null    int64  
 4   Smokes                            858 non-null    int64  
 5   Smokes..years.                    858 non-null    float64
 6   Hormonal.Contraceptives           858 non-null    int64  
 7   Hormonal.Contraceptives..years.   858 non-null    float64
 8   IUD                               858 non-null    int64  
 9   IUD..years.                       858 non-null    float64
 10  STDs                              858 non-null    int64  
 11  STDs..number.                     858 non-null    int64  
 12

Unnamed: 0,Age,Number.of.sexual.partners,First.sexual.intercourse,Num.of.pregnancies,Smokes..years.,Hormonal.Contraceptives..years.,IUD..years.,STDs..number.,STDs..Number.of.diagnosis,Biopsy,Smokes_0,Smokes_1,Hormonal.Contraceptives_0,Hormonal.Contraceptives_1,IUD_0,IUD_1,STDs_0,STDs_1
0,18,4,15,1,0.0,0.0,0.0,0,0,0,1,0,1,0,1,0,1,0
1,15,1,14,1,0.0,0.0,0.0,0,0,0,1,0,1,0,1,0,1,0
2,34,1,15,1,0.0,0.0,0.0,0,0,0,1,0,1,0,1,0,1,0
3,52,5,16,4,37.0,3.0,0.0,0,0,0,0,1,0,1,1,0,1,0
4,46,3,21,4,0.0,15.0,0.0,0,0,0,1,0,0,1,1,0,1,0


## Regression Problem

### Explore the dataset - Bike Rentals
#### Features
- season: 季節，包含春、夏、秋、冬
- yr: 年份，2011 年或 2012 年
- mnth: 月份，1~12 月
- holiday: 當天是否為假期
- weekday: 星期幾
- workingday: 當天是否為工作日
- weathersit: 那天的天氣情況是下面幾種狀況的其中一個
    - GOOD: 晴朗
    - MISTY: 有霧、有雲
    - RAIN/SNOW/STORM: 下雨、下雪、暴風
- temp: 溫度（攝氏度）
- hum: 相對濕度百分比（0~100%）
- windspeed: 風速(km/h)
- days_since_2011: 自 2011 年 1 月 1 日（數據集中的第一天）起的天數。引入此特性是為了考慮隨時間變化的趨勢。

#### Labels
- cnt: 自行車租賃的數量，也就是此回歸任務的預測目標

In [10]:
# Load the bike rental data and show it as the cell output
bike_data = pd.read_csv('data/bike.csv')
bike_data

Unnamed: 0,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,hum,windspeed,cnt,days_since_2011
0,WINTER,2011,JAN,NO HOLIDAY,SAT,NO WORKING DAY,MISTY,8.175849,80.5833,10.749882,985,0
1,WINTER,2011,JAN,NO HOLIDAY,SUN,NO WORKING DAY,MISTY,9.083466,69.6087,16.652113,801,1
2,WINTER,2011,JAN,NO HOLIDAY,MON,WORKING DAY,GOOD,1.229108,43.7273,16.636703,1349,2
3,WINTER,2011,JAN,NO HOLIDAY,TUE,WORKING DAY,GOOD,1.400000,59.0435,10.739832,1562,3
4,WINTER,2011,JAN,NO HOLIDAY,WED,WORKING DAY,GOOD,2.666979,43.6957,12.522300,1600,4
...,...,...,...,...,...,...,...,...,...,...,...,...
726,WINTER,2012,DEC,NO HOLIDAY,THU,WORKING DAY,MISTY,3.945849,65.2917,23.458911,2114,726
727,WINTER,2012,DEC,NO HOLIDAY,FRI,WORKING DAY,MISTY,3.906651,59.0000,10.416557,3095,727
728,WINTER,2012,DEC,NO HOLIDAY,SAT,NO WORKING DAY,MISTY,3.906651,75.2917,8.333661,1341,728
729,WINTER,2012,DEC,NO HOLIDAY,SUN,NO WORKING DAY,GOOD,4.024151,48.3333,23.500518,1796,729


In [11]:
# Check for missing values: list the non-null count and dtype of every column
bike_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 731 entries, 0 to 730
Data columns (total 12 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   season           731 non-null    object 
 1   yr               731 non-null    int64  
 2   mnth             731 non-null    object 
 3   holiday          731 non-null    object 
 4   weekday          731 non-null    object 
 5   workingday       731 non-null    object 
 6   weathersit       731 non-null    object 
 7   temp             731 non-null    float64
 8   hum              731 non-null    float64
 9   windspeed        731 non-null    float64
 10  cnt              731 non-null    int64  
 11  days_since_2011  731 non-null    int64  
dtypes: float64(3), int64(3), object(6)
memory usage: 68.7+ KB


In [12]:
# Separate the label (cnt) from the features.
# The label is selected rather than popped, so bike_data keeps all of its columns
# and this cell can be re-run without a KeyError.
y = bike_data['cnt'].copy()
X = bike_data.drop(columns='cnt')

In [13]:
# Feature names grouped by kind: numeric columns first, categorical columns second
num_cols = [
    'temp',
    'hum',
    'windspeed',
    'days_since_2011',
]
cat_cols = [
    'season',
    'yr',
    'mnth',
    'holiday',
    'weekday',
    'workingday',
    'weathersit',
]

In [14]:
# One-hot encode the categorical columns, then show the encoded feature frame
X = pd.get_dummies(data=X, columns=cat_cols)
X

Unnamed: 0,temp,hum,windspeed,days_since_2011,season_FALL,season_SPRING,season_SUMMER,season_WINTER,yr_2011,yr_2012,mnth_APR,mnth_AUG,mnth_DEC,mnth_FEB,mnth_JAN,mnth_JUL,mnth_JUN,mnth_MAR,mnth_MAY,mnth_NOV,mnth_OCT,mnth_SEP,holiday_HOLIDAY,holiday_NO HOLIDAY,weekday_FRI,weekday_MON,weekday_SAT,weekday_SUN,weekday_THU,weekday_TUE,weekday_WED,workingday_NO WORKING DAY,workingday_WORKING DAY,weathersit_GOOD,weathersit_MISTY,weathersit_RAIN/SNOW/STORM
0,8.175849,80.5833,10.749882,0,0,0,0,1,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,1,0,0,1,0
1,9.083466,69.6087,16.652113,1,0,0,0,1,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0,1,0
2,1.229108,43.7273,16.636703,2,0,0,0,1,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,1,1,0,0
3,1.400000,59.0435,10.739832,3,0,0,0,1,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,1,1,0,0
4,2.666979,43.6957,12.522300,4,0,0,0,1,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,1,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
726,3.945849,65.2917,23.458911,726,0,0,0,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,1,0
727,3.906651,59.0000,10.416557,727,0,0,0,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,1,0,1,0
728,3.906651,75.2917,8.333661,728,0,0,0,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,1,0,0,1,0
729,4.024151,48.3333,23.500518,729,0,0,0,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,1,0,1,0,0


In [15]:
class Bike_DataLoader():
    """Load, preprocess and split the bike rental dataset.

    Intended call order: ``load_dataset()`` -> ``preprocess_data()`` ->
    ``get_data_split()``.
    """

    def __init__(self):
        # Current DataFrame; stays None until load_dataset() has been called
        self.data = None

    def load_dataset(self, path="data/bike.csv"):
        """Read the CSV file at ``path`` into ``self.data``."""
        self.data = pd.read_csv(path)

    def preprocess_data(self):
        """One-hot encode the categorical columns and store the result in ``self.data``."""
        # Columns to one-hot encode; the remaining columns are numeric and kept as-is
        cat_cols = ['season', 'yr', 'mnth', 'holiday', 'weekday', 'workingday', 'weathersit']

        self.data = pd.get_dummies(data=self.data, columns=cat_cols)

    def get_data_split(self):
        """Split ``self.data`` into train and test parts.

        Returns whatever ``train_test_split(X, y, test_size=0.20,
        random_state=2022)`` returns, where ``y`` is the ``cnt`` column and
        ``X`` is every other column.

        ``self.data`` is not modified (the label is selected, not popped), so
        this method can be called repeatedly.
        """
        y = self.data['cnt']
        X = self.data.drop(columns='cnt')
        return train_test_split(X, y, test_size=0.20, random_state=2022)

In [16]:
# Load data
data_loader = Bike_DataLoader()
data_loader.load_dataset()
data = data_loader.data

# Show shape and head.
# Only the last expression of a cell is rendered automatically, so the head has to
# be handed to display() explicitly; a bare `data.head()` here would be discarded.
print(data.shape)
display(data.head())

# Show column dtypes and non-null counts
data.info()

# Show preprocessed dataframe
data_loader.preprocess_data()
data_loader.data.head()

(731, 12)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 731 entries, 0 to 730
Data columns (total 12 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   season           731 non-null    object 
 1   yr               731 non-null    int64  
 2   mnth             731 non-null    object 
 3   holiday          731 non-null    object 
 4   weekday          731 non-null    object 
 5   workingday       731 non-null    object 
 6   weathersit       731 non-null    object 
 7   temp             731 non-null    float64
 8   hum              731 non-null    float64
 9   windspeed        731 non-null    float64
 10  cnt              731 non-null    int64  
 11  days_since_2011  731 non-null    int64  
dtypes: float64(3), int64(3), object(6)
memory usage: 68.7+ KB


Unnamed: 0,temp,hum,windspeed,cnt,days_since_2011,season_FALL,season_SPRING,season_SUMMER,season_WINTER,yr_2011,yr_2012,mnth_APR,mnth_AUG,mnth_DEC,mnth_FEB,mnth_JAN,mnth_JUL,mnth_JUN,mnth_MAR,mnth_MAY,mnth_NOV,mnth_OCT,mnth_SEP,holiday_HOLIDAY,holiday_NO HOLIDAY,weekday_FRI,weekday_MON,weekday_SAT,weekday_SUN,weekday_THU,weekday_TUE,weekday_WED,workingday_NO WORKING DAY,workingday_WORKING DAY,weathersit_GOOD,weathersit_MISTY,weathersit_RAIN/SNOW/STORM
0,8.175849,80.5833,10.749882,985,0,0,0,0,1,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,1,0,0,1,0
1,9.083466,69.6087,16.652113,801,1,0,0,0,1,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0,1,0
2,1.229108,43.7273,16.636703,1349,2,0,0,0,1,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,1,1,0,0
3,1.4,59.0435,10.739832,1562,3,0,0,0,1,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,1,1,0,0
4,2.666979,43.6957,12.5223,1600,4,0,0,0,1,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,1,1,0,0
