In [19]:
import pandas as pd
import numpy as np
# import matplotlib.pyplot as plt
# import seaborn as sns
import math
import scipy as sp
import os

import warnings
warnings.filterwarnings("ignore")

In [59]:
def load(file_name) -> pd.DataFrame:
    """Read a CSV file into a DataFrame.
    @param file_name: path of the CSV file to read
    @return: DataFrame produced by pandas.read_csv for that path
    @raise ValueError: if nothing exists at file_name
    """
    # Guard clause: fail early with the notebook's own error instead of the
    # lower-level one pandas would raise for a missing path.
    if not os.path.exists(file_name):
        raise ValueError("File not found")
    return pd.read_csv(file_name)

def copy_df(df) -> pd.DataFrame:
    """Return a deep copy of a DataFrame.
    @param df: DataFrame to duplicate
    @return: new DataFrame object created with df.copy(deep=True)
    """
    duplicate = df.copy(deep=True)
    return duplicate

def normalize_missing_values(dataset) -> pd.DataFrame:
    """Fill missing values column by column.

    Numeric columns have their missing entries replaced by the column mean.
    Every other column is back-filled (a gap takes the next valid value below
    it) and then forward-filled, so gaps at the very end of a column are
    covered as well. A column with no valid value at all is left unchanged.

    The frame is modified in place and the same object is returned.
    @param dataset: DataFrame to clean (mutated)
    @return: the same DataFrame object with the gaps filled
    """
    for col in dataset.columns:
        column = dataset[col]
        if pd.api.types.is_numeric_dtype(column):
            dataset[col] = column.fillna(column.mean())
        else:
            # .bfill() replaces the deprecated fillna(method='bfill'); the
            # trailing .ffill() handles gaps that have no later value to copy.
            dataset[col] = column.bfill().ffill()
    return dataset

# change column order to get label at the end
def change_column_order(data, first_column_name, second_column_name) -> pd.DataFrame:
    """Move two columns to the end of the frame.

    The result keeps every other column in its original order, followed by
    second_column_name and then first_column_name, so first_column_name ends
    up as the last column. The input frame is not modified.
    @param data: DataFrame to reorder (left untouched)
    @param first_column_name: column that becomes the last column
    @param second_column_name: column that becomes the second-to-last column
    @return: new DataFrame with the reordered columns
    @raise KeyError: if either name is not a column of data, or both names are equal
    """
    if first_column_name == second_column_name:
        raise KeyError(f"column {first_column_name!r} given twice")
    trailing = [second_column_name, first_column_name]
    missing = [name for name in trailing if name not in data.columns]
    if missing:
        raise KeyError(f"{missing} not found in axis")
    leading = [name for name in data.columns if name not in trailing]
    # Select from the input instead of dropping in place, so the caller's
    # frame keeps all of its columns; .copy() makes the result independent.
    return data[leading + trailing].copy()

def print_dataset(dataset) -> None:
    """Print a quick textual overview of a dataset.

    Prints, in order: head(), info(), describe(), the column index and the
    value counts of the last column.
    @param dataset: DataFrame to summarise
    @return: None
    """
    print(dataset.head())
    # info() writes its report to stdout itself and returns None; wrapping it
    # in print() would append a stray "None" line to the output.
    dataset.info()
    print(dataset.describe())
    print(dataset.columns)
    print(dataset.iloc[:, -1:].value_counts())

In [64]:
# Load the CSV, fill its gaps, then move the categorical column and the
# label to the end (label last) before printing the overview.
dataset = (
    load("housing.csv")
    .pipe(normalize_missing_values)
    .pipe(change_column_order, "median_house_value", "ocean_proximity")
)
print_dataset(dataset)

   longitude  latitude  housing_median_age  total_rooms  total_bedrooms  \
0    -122.23     37.88                41.0        880.0           129.0   
1    -122.22     37.86                21.0       7099.0          1106.0   
2    -122.24     37.85                52.0       1467.0           190.0   
3    -122.25     37.85                52.0       1274.0           235.0   
4    -122.25     37.85                52.0       1627.0           280.0   

   population  households  median_income ocean_proximity  median_house_value  
0       322.0       126.0         8.3252        NEAR BAY            452600.0  
1      2401.0      1138.0         8.3014        NEAR BAY            358500.0  
2       496.0       177.0         7.2574        NEAR BAY            352100.0  
3       558.0       219.0         5.6431        NEAR BAY            341300.0  
4       565.0       259.0         3.8462        NEAR BAY            342200.0  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639

In [None]:
# Split the dataset into train / validation / test sets with sklearn:
# 60% train, 20% validation and 20% test (40% is held out first, then that
# hold-out is cut in half).

from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import train_test_split
# NOTE(review): a later cell defines its own `train_test_split` with a
# different signature and return order; whichever ran last owns the name, so
# run the cells top to bottom.

temp_data = copy_df(dataset)
X = temp_data.drop('median_house_value', axis=1)
y = temp_data[['median_house_value']]
feature_columns = list(X.columns)

# Disabled alternative: stratified 60/20/20 split keyed on the last column.
# split = StratifiedShuffleSplit(n_splits=1, test_size=0.4, random_state=42)
# for train_index, test_valid_index in split.split(dataset, dataset.iloc[:, -1:]):
#     train_set = dataset.iloc[train_index]
#     test_valid_set = dataset.iloc[test_valid_index]

# split2 = StratifiedShuffleSplit(n_splits=1, test_size=0.5, random_state=42)
# for test_index, valid_index in split2.split(test_valid_set, test_valid_set.iloc[:, -1:]):
#     test_set = test_valid_set.iloc[test_index]
#     valid_set = test_valid_set.iloc[valid_index]

SEED = 2000
# First cut: 60% train vs. 40% hold-out.
x_train, x_validation_and_test, y_train, y_validation_and_test = train_test_split(X, y, test_size=.4, random_state=SEED)
# Second cut: halve the hold-out into validation and test (20% each overall).
x_validation, x_test, y_validation, y_test = train_test_split(x_validation_and_test, y_validation_and_test, test_size=.5, random_state=SEED)

# print(x_train.head(), y_train.head())
# print(x_validation.head(), y_validation.head())

In [98]:
# 60% train, 20% validation and 20% test

def train_test_split(data, train_size=None, test_size=None, random_state=None, shuffle=None,
                     target_column="median_house_value") -> tuple:
    """Randomly split a dataset into train and test features/labels.

    Train rows are drawn at random without replacement; the test set is every
    remaining row, in the order of the (possibly shuffled) frame. The input
    frame is not modified.
    @param data: user input dataset, must contain target_column
    @param train_size: fraction of rows for training, strictly between 0 and 1
    @param test_size: fraction of rows for testing, strictly between 0 and 1;
        when both sizes are given they must sum to 1
    @param random_state: seed for the row selection (and the shuffle); None
        keeps using numpy's global random state
    @param shuffle: when True, rows are shuffled before the train rows are drawn
    @param target_column: name of the label column
    @return: (x_train, y_train, x_test, y_test) -- note that this order differs
        from sklearn's; the y parts are single-column DataFrames
    @raise ValueError: if both sizes are None, a size is outside (0, 1), or the
        two sizes do not sum to 1
    @raise KeyError: if target_column is not a column of data
    """
    if train_size is None and test_size is None:
        raise ValueError("train_size and test_size can not be both None")
    # Compare with a tolerance: e.g. 0.7 + 0.3 is 0.9999999999999999 in floats.
    if (train_size is not None and test_size is not None
            and not math.isclose(train_size + test_size, 1.0)):
        raise ValueError("train_size and test_size sum must be equal to 1")
    for label, fraction in (("train_size", train_size), ("test_size", test_size)):
        if fraction is None:
            continue
        if fraction <= 0:
            raise ValueError(f"{label} must be greater than 0")
        if fraction >= 1:
            raise ValueError(f"{label} must be less than 1")
    if train_size is None:
        train_size = 1 - test_size

    # A dedicated generator makes the split reproducible for a given seed;
    # without a seed the global numpy state is used, as before.
    rng = np.random.RandomState(random_state) if random_state is not None else None
    if shuffle is True:
        data = data.sample(frac=1, random_state=rng).reset_index(drop=True)

    n_rows = len(data)
    n_train = int(n_rows * train_size)
    chooser = rng if rng is not None else np.random
    train_index = chooser.choice(n_rows, n_train, replace=False)
    # The test set is the complement of the train rows, so no row is lost to
    # rounding of the requested fractions.
    test_index = np.setdiff1d(np.arange(n_rows), train_index)

    # Features and labels are both addressed by column name so they cannot
    # disagree when the label is not the last column.
    features = data.drop(target_column, axis=1)
    labels = data[[target_column]]
    x_train = features.iloc[train_index].reset_index(drop=True)
    y_train = labels.iloc[train_index].reset_index(drop=True)
    x_test = features.iloc[test_index].reset_index(drop=True)
    y_test = labels.iloc[test_index].reset_index(drop=True)
    return x_train, y_train, x_test, y_test

def k_fold_with_train_test_split(data, k_fold_number):
    """Draw k_fold_number random 60/40 train/test splits of a dataset.

    Each round calls train_test_split(data, train_size=.6, shuffle=True), so
    the rounds are repeated random hold-out splits; they are not the disjoint
    partitions of a classical k-fold scheme.
    @param data: user input dataset
    @param k_fold_number: number of splits to draw, at least 1
    @return: four lists (x_train, x_test, y_train, y_test) with one entry per split
    @raise ValueError: if k_fold_number is smaller than 1
    """
    if k_fold_number < 1:
        raise ValueError("k_fold_number must be at least 1")
    x_train, x_test, y_train, y_test = [], [], [], []
    for _ in range(k_fold_number):
        # The notebook's own train_test_split returns
        # (x_train, y_train, x_test, y_test) -- mind the order when unpacking.
        fold_x_train, fold_y_train, fold_x_test, fold_y_test = train_test_split(
            data, train_size=.6, shuffle=True
        )
        x_train.append(fold_x_train)
        x_test.append(fold_x_test)
        y_train.append(fold_y_train)
        y_test.append(fold_y_test)
    return x_train, x_test, y_train, y_test

train_size:  12384 test_size:  8256 sum:  20640
   longitude  latitude  housing_median_age  total_rooms  total_bedrooms  \
0    -122.23     38.11                47.0       3007.0           524.0   
1    -119.84     36.83                17.0       3012.0           408.0   
2    -118.12     34.06                25.0       1526.0           388.0   
3    -121.26     37.97                41.0       2398.0           448.0   
4    -122.37     37.93                45.0       3150.0           756.0   

   population  households  median_income ocean_proximity  
0      1152.0       486.0         4.0000        NEAR BAY  
1       987.0       362.0         7.4201          INLAND  
2      1304.0       378.0         3.1892       <1H OCEAN  
3      1143.0       444.0         3.0352          INLAND  
4      1798.0       749.0         1.7500        NEAR BAY      median_house_value
0            141500.0
1            229700.0
2            214700.0
3             69800.0
4             37900.0


In [67]:
# NOTE(review): `k_fold_train_test_split` is not defined in the visible cells
# (the helper defined earlier is named `k_fold_with_train_test_split`) --
# confirm which function this cell is meant to call.
x_train, x_test, y_train, y_test = k_fold_train_test_split(dataset, 5)

# Show each fold's test features next to that fold's own test labels.
for index, (x_fold, y_fold) in enumerate(zip(x_test, y_test)):
    print(f" x_test[{index}] : {x_fold}\ny_test[{index}] : {y_fold}")

 x_test[0] :       longitude  latitude  housing_median_age  total_rooms  total_bedrooms  \
4128    -122.46     37.78                52.0       3088.0           727.0   
4129    -117.56     34.12                 4.0       5351.0          1210.0   
4130    -119.81     36.83                19.0       6789.0          1200.0   
4131    -118.30     34.17                37.0        350.0           115.0   
4132    -117.40     33.90                32.0       1263.0           178.0   
...         ...       ...                 ...          ...             ...   
8251    -122.02     37.53                21.0       4280.0           673.0   
8252    -122.81     38.08                19.0       1615.0           366.0   
8253    -121.21     38.65                14.0       3443.0           510.0   
8254    -121.91     36.62                52.0       1220.0           267.0   
8255    -121.46     38.51                32.0       2437.0           592.0   

      population  households  median_income ocean_