In [2]:
import pandas as pd
import numpy as np
# import matplotlib.pyplot as plt
# import seaborn as sns
import math
import scipy as sp
import os

import warnings
warnings.filterwarnings("ignore")

In [28]:
def load(file_name) -> pd.DataFrame:
    """Read a CSV file from disk into a DataFrame
    @param file_name: path of the CSV file to read
    @return: DataFrame produced by pandas.read_csv
    @raise ValueError: when nothing exists at file_name
    """
    # Guard clause: fail early with the notebook's own message instead of
    # letting read_csv raise its own error.
    if not os.path.exists(file_name):
        raise ValueError("File not found")
    return pd.read_csv(file_name)

def copy_df(df) -> pd.DataFrame:
    """Produce a deep copy of a DataFrame
    @param df: DataFrame to duplicate
    @return: new DataFrame holding its own copy of the data and index
    """
    duplicate = df.copy(deep=True)
    return duplicate

def normalize_missing_values(dataset) -> pd.DataFrame:
    """Fill the missing values of every column in the dataset

    Numeric columns get their missing entries replaced by the column mean.
    Every other column is back-filled (next valid value) and then
    forward-filled, so a missing value at the very end of a column is
    covered as well.

    The frame is modified in place and also returned.

    @param dataset: user input dataset
    @return: the same dataset object with missing values filled
    """
    for col in dataset.columns:
        column = dataset[col]
        if pd.api.types.is_numeric_dtype(column):
            dataset[col] = column.fillna(column.mean())
        else:
            # Series.fillna(method=...) is deprecated in pandas 2.x; bfill()
            # is the supported spelling. The trailing ffill() handles rows
            # that have no later value to copy from.
            dataset[col] = column.bfill().ffill()
    return dataset

# change column order to get label at the end
def change_column_order(data, first_column_name, second_column_name) -> pd.DataFrame:
    """Move two columns to the end of the frame

    All other columns keep their relative order; second_column_name becomes
    the second-to-last column and first_column_name the last one. The input
    frame is not modified.

    @param data: user input dataset
    @param first_column_name: column that ends up in the last position
    @param second_column_name: column that ends up in the second-to-last position
    @return: new DataFrame with the reordered columns
    @raise KeyError: if a column is missing or both names are identical
    """
    if first_column_name == second_column_name:
        raise KeyError(f"{first_column_name!r} was given for both columns")
    missing = [
        name
        for name in (first_column_name, second_column_name)
        if name not in data.columns
    ]
    if missing:
        raise KeyError(f"{missing} not found in axis")

    # Validate before touching anything so a bad name can never leave the
    # caller's frame half-modified.
    trailing = (second_column_name, first_column_name)
    leading = [name for name in data.columns if name not in trailing]
    return data[[*leading, *trailing]].copy()

def print_dataset(dataset) -> None:
    """Print an overview of the dataset

    Prints, in order: head(), info(), describe(), the column index and the
    value counts of the second-to-last column.

    @param dataset: user input dataset (needs at least two columns)
    @return: None
    """
    print(dataset.head())
    # info() writes its report to stdout itself and returns None, so wrapping
    # it in print() would add a stray "None" line to the output.
    dataset.info()
    print(dataset.describe())
    print(dataset.columns)
    # Second-to-last column: presumably the categorical feature that
    # change_column_order places right before the label - verify against caller.
    print(dataset.iloc[:, -2].value_counts())

def dummy_variable_indicator(data, column_name) -> pd.DataFrame:
    """Replace one column with its dummy (indicator) columns
    @param data: user input data
    @param column_name: name of the column to encode
    @return: result of pandas.get_dummies applied to that single column
    @raise ValueError: when column_name is not a column of data
    """
    if column_name in data.columns:
        return pd.get_dummies(data, columns=[column_name])
    raise ValueError("Column not found")

def remove_column(data, column_name) -> pd.DataFrame:
    """Return the data without one column
    @param data: user input data
    @param column_name: name of the column to drop
    @return: new DataFrame lacking that column
    @raise ValueError: when column_name is not a column of data
    """
    if column_name in data.columns:
        return data.drop(columns=column_name)
    raise ValueError("Column not found")

In [29]:
# Load the CSV, fill missing values, then move ocean_proximity and
# median_house_value to the last two column positions (in that order).
dataset = (
    load("housing.csv")
    .pipe(normalize_missing_values)
    .pipe(change_column_order, "median_house_value", "ocean_proximity")
)
print_dataset(dataset)

   longitude  latitude  housing_median_age  total_rooms  total_bedrooms  \
0    -122.23     37.88                41.0        880.0           129.0   
1    -122.22     37.86                21.0       7099.0          1106.0   
2    -122.24     37.85                52.0       1467.0           190.0   
3    -122.25     37.85                52.0       1274.0           235.0   
4    -122.25     37.85                52.0       1627.0           280.0   

   population  households  median_income ocean_proximity  median_house_value  
0       322.0       126.0         8.3252        NEAR BAY            452600.0  
1      2401.0      1138.0         8.3014        NEAR BAY            358500.0  
2       496.0       177.0         7.2574        NEAR BAY            352100.0  
3       558.0       219.0         5.6431        NEAR BAY            341300.0  
4       565.0       259.0         3.8462        NEAR BAY            342200.0  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639

In [30]:
# Replace ocean_proximity with indicator columns, then build `data` without
# the INLAND indicator.
# NOTE: this cell rebinds `dataset`, so running it a second time raises
# ValueError("Column not found") - ocean_proximity is gone after the first run.
dataset = dataset.pipe(dummy_variable_indicator, "ocean_proximity")
data = dataset.pipe(remove_column, "ocean_proximity_INLAND")
print(data.head())

   longitude  latitude  housing_median_age  total_rooms  total_bedrooms  \
0    -122.23     37.88                41.0        880.0           129.0   
1    -122.22     37.86                21.0       7099.0          1106.0   
2    -122.24     37.85                52.0       1467.0           190.0   
3    -122.25     37.85                52.0       1274.0           235.0   
4    -122.25     37.85                52.0       1627.0           280.0   

   population  households  median_income  median_house_value  \
0       322.0       126.0         8.3252            452600.0   
1      2401.0      1138.0         8.3014            358500.0   
2       496.0       177.0         7.2574            352100.0   
3       558.0       219.0         5.6431            341300.0   
4       565.0       259.0         3.8462            342200.0   

   ocean_proximity_<1H OCEAN  ocean_proximity_ISLAND  \
0                          0                       0   
1                          0                       0

In [None]:
# Split dataset into train, validation and test with sklearn:
# 60% train, 20% validation and 20% test (40% is held out first, then halved).

from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import train_test_split

# NOTE: a later cell defines a notebook-level function that is also called
# `train_test_split` but takes different arguments; after that cell has run,
# the name no longer refers to sklearn's function.

temp_data = copy_df(dataset)
X = temp_data.drop('median_house_value', axis=1)
y = temp_data[['median_house_value']]
feature_columns = list(X.columns)

# Disabled alternative (kept for reference): stratified 60/20/20 split.
# split = StratifiedShuffleSplit(n_splits=1, test_size=0.4, random_state=42)
# for train_index, test_valid_index in split.split(dataset, dataset.iloc[:, -1:]):
#     train_set = dataset.iloc[train_index]
#     test_valid_set = dataset.iloc[test_valid_index]

# split2 = StratifiedShuffleSplit(n_splits=1, test_size=0.5, random_state=42)
# for test_index, valid_index in split2.split(test_valid_set, test_valid_set.iloc[:, -1:]):
#     test_set = test_valid_set.iloc[test_index]
#     valid_set = test_valid_set.iloc[valid_index]

SEED = 2000
# First cut: 60% train / 40% held out.
x_train, x_validation_and_test, y_train, y_validation_and_test = train_test_split(X, y, test_size=.4, random_state=SEED)
# Second cut: the held-out 40% is halved into validation and test.
x_validation, x_test, y_validation, y_test = train_test_split(x_validation_and_test, y_validation_and_test, test_size=.5, random_state=SEED)

# print(x_train.head(), y_train.head())
# print(x_validation.head(), y_validation.head())

In [11]:
def train_test_split(data, train_size=None, test_size=None, random_state=None, shuffle=None,
                     label_column='median_house_value') -> tuple:
    """Split a dataset into train and test features/labels

    The first int(len(data) * train_size) rows of the (optionally shuffled)
    row order form the train part; all remaining rows form the test part.

    @param data: user input dataset (features and label in one DataFrame)
    @param train_size: fraction of rows for the train part, 0 < train_size < 1
    @param test_size: fraction of rows for the test part, 0 < test_size < 1;
        when both sizes are given they must add up to 1
    @param random_state: optional seed for the shuffle; None uses numpy's
        global random state
    @param shuffle: shuffle rows before splitting; None (default) behaves
        like True, a falsy value keeps the original row order
    @param label_column: name of the label column
    @return: x_train, y_train, x_test, y_test (all DataFrames, index reset)
    @raise ValueError: on missing or out-of-range sizes
    @raise KeyError: when label_column is not a column of data
    """
    if train_size is None and test_size is None:
        raise ValueError("train_size and test_size can not be both None")
    if train_size is not None and test_size is not None:
        total = train_size + test_size
        # isclose() instead of ==: sums of binary floats can miss 1.0 by one
        # ulp even when the decimal fractions add up exactly.
        if not math.isclose(total, 1.0):
            if total > 1:
                raise ValueError("train_size and test_size sum must be equal to 1")
            raise ValueError("train_size and test_size sum is not equal to one")
    for size_name, fraction in (("train_size", train_size), ("test_size", test_size)):
        if fraction is None:
            continue
        if fraction <= 0:
            raise ValueError(f"{size_name} must be greater than 0")
        if fraction >= 1:
            raise ValueError(f"{size_name} must be less than 1")
    if train_size is None:
        train_size = 1 - test_size

    if label_column not in data.columns:
        raise KeyError(f"{label_column!r} not found in data columns")

    n_rows = len(data)
    n_train = int(n_rows * train_size)
    if shuffle is None or shuffle:
        # Seeded generator when random_state is given; otherwise fall back to
        # numpy's global state so a prior np.random.seed() still applies.
        rng = np.random if random_state is None else np.random.RandomState(random_state)
        order = rng.permutation(n_rows)
    else:
        order = np.arange(n_rows)
    train_index, test_index = order[:n_train], order[n_train:]

    # Select features and labels by name so both always refer to the same
    # label column, wherever it sits in the frame.
    features = data.drop(columns=label_column)
    labels = data[[label_column]]
    x_train = features.iloc[train_index].reset_index(drop=True)
    y_train = labels.iloc[train_index].reset_index(drop=True)
    x_test = features.iloc[test_index].reset_index(drop=True)
    y_test = labels.iloc[test_index].reset_index(drop=True)
    return x_train, y_train, x_test, y_test

def concat_data(x_data, y_data) -> pd.DataFrame:
    """Join feature and label data side by side
    @param x_data: feature data
    @param y_data: label data
    @return: DataFrame with the columns of x_data followed by those of y_data
    """
    # axis=1 lines rows up by index label, so both inputs are expected to
    # share the same index.
    return pd.concat([x_data, y_data], axis=1)

def k_fold(data, k_fold_number, random_state=None):
    """Build k_fold_number random train/validation/test splits of the data

    NOTE: despite the name this is repeated random sub-sampling, not classic
    K-fold cross-validation: every round asks train_test_split for a fresh
    shuffled split, so the sets of different rounds are not kept disjoint.

    Each round first separates 60% of the rows as train data, then halves the
    remaining rows into validation and test.

    @param data: user input dataset
    @param k_fold_number: number of rounds (splits) to generate
    @param random_state: optional int seed; every round derives two seeds from
        it and hands them to train_test_split. Reproducibility therefore
        depends on train_test_split honouring its random_state argument.
    @return: six lists with one entry per round, in the order x_train,
        y_train, x_test, y_test, x_validation, y_validation
    """
    x_train_list, y_train_list = [], []
    x_test_list, y_test_list = [], []
    x_validation_list, y_validation_list = [], []
    for round_index in range(k_fold_number):
        if random_state is None:
            first_seed = second_seed = None
        else:
            # Distinct seeds per round and per cut so rounds differ from each
            # other while the whole sequence stays repeatable.
            first_seed = random_state + 2 * round_index
            second_seed = first_seed + 1
        x_train, y_train, x_rest, y_rest = train_test_split(
            copy_df(data), train_size=.6, random_state=first_seed, shuffle=True)
        x_validation, y_validation, x_test, y_test = train_test_split(
            concat_data(x_rest, y_rest), test_size=.5, random_state=second_seed, shuffle=True)
        x_train_list.append(x_train)
        y_train_list.append(y_train)
        x_test_list.append(x_test)
        y_test_list.append(y_test)
        x_validation_list.append(x_validation)
        y_validation_list.append(y_validation)
    return x_train_list, y_train_list, x_test_list, y_test_list, x_validation_list, y_validation_list

# Three rounds of a 60% train / 20% validation / 20% test split.
k_fold_number = 3
(
    x_train_list,
    y_train_list,
    x_test_list,
    y_test_list,
    x_validation_list,
    y_validation_list,
) = k_fold(copy_df(dataset), k_fold_number)

# Report the shape of every split per round; the three row counts should add
# up to the size of the full dataset.
for index in range(k_fold_number):
    print(f"x_train_list[ {index} ]:  {x_train_list[index].shape} y_train_list[ {index} ]:  {y_train_list[index].shape}")
    print(f"x_validation_list[ {index} ]:  {x_validation_list[index].shape} y_validation_list[ {index} ]:  {y_validation_list[index].shape}")
    print(f"x_test_list[ {index} ]:  {x_test_list[index].shape} y_test_list[ {index} ]:  {y_test_list[index].shape}")
    print("sum:", x_train_list[index].shape[0] + x_validation_list[index].shape[0] + x_test_list[index].shape[0])

x_train_list[ 0 ]:  (12384, 9) y_train_list[ 0 ]:  (12384, 1)
x_validation_list[ 0 ]:  (4128, 9) y_validation_list[ 0 ]:  (4128, 1)
x_test_list[ 0 ]:  (4128, 9) y_test_list[ 0 ]:  (4128, 1)
sum: 20640
x_train_list[ 1 ]:  (12384, 9) y_train_list[ 1 ]:  (12384, 1)
x_validation_list[ 1 ]:  (4128, 9) y_validation_list[ 1 ]:  (4128, 1)
x_test_list[ 1 ]:  (4128, 9) y_test_list[ 1 ]:  (4128, 1)
sum: 20640
x_train_list[ 2 ]:  (12384, 9) y_train_list[ 2 ]:  (12384, 1)
x_validation_list[ 2 ]:  (4128, 9) y_validation_list[ 2 ]:  (4128, 1)
x_test_list[ 2 ]:  (4128, 9) y_test_list[ 2 ]:  (4128, 1)
sum: 20640


In [106]:
# Show the first rows of the features and labels of every split in round 0.
print(f"x_train_list[0]:  {x_train_list[0].head()} \ny_train_list[0]:  {y_train_list[0].head()}")
print(f"x_validation_list[0]:  {x_validation_list[0].head()} \ny_validation_list[0]:  {y_validation_list[0].head()}")
print(f"x_test_list[0]:  {x_test_list[0].head()} \ny_test_list[0]:  {y_test_list[0].head()}")

x_train_list[0]:     longitude  latitude  housing_median_age  total_rooms  total_bedrooms  \
0    -118.28     34.05                31.0       1525.0           730.0   
1    -118.10     33.84                36.0        690.0           109.0   
2    -117.90     34.12                33.0       1788.0           456.0   
3    -116.92     32.82                34.0       1765.0           284.0   
4    -117.87     33.63                 9.0       6163.0          1004.0   

   population  households  median_income ocean_proximity  
0      2510.0       652.0         1.6355       <1H OCEAN  
1       316.0       104.0         3.7813       <1H OCEAN  
2      1787.0       361.0         2.6629       <1H OCEAN  
3       772.0       282.0         5.0118       <1H OCEAN  
4      1912.0       903.0        10.8289       <1H OCEAN   
y_train_list[0]:     median_house_value
0            162500.0
1            209100.0
2            124100.0
3            165300.0
4            500001.0
x_validation_list[0]:     