In [111]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import SGDRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor

import matplotlib.pyplot as plt

### <font color="orange">  **Imports, Data Loading and Preprocessing** </font>

In [139]:
df = pd.read_csv('health_insurance_train.csv')
df_autograder = pd.read_csv('health_insurance_autograde.csv')
pd.options.display.float_format = '{:.2f}'.format

# Define a function to convert education ranges to numerical values
def convert_education(education):

    education = education.replace('years', '')

    if '>16' in education:
        return 20
    
    if '<9' in education:
        return 4.5

    education = education.split('-')
    education = (float(education[-1])+float(education[0]))/2

    return education

# Define a function to convert yes/no values to binary values
def yes_no_True_False_to_binary(value):
    if value == 'yes' or value == 'no':
        if value == 'yes':
            return 1
        else:
            return -1
        
    if value == True or value == False:
        if value == True:
            return 1
        else:
            return 0

    return value

# Define a function to convert NaN values to the mean of the column
def nan_to_mean(value, mean):
    if pd.isna(value):
        return mean
    return value

# Define a function apply all conversion functions to the dataframe
def preprocess_data(df):
    # Apply the conversion functions to the education column
    df['education'] = df['education'].apply(convert_education)

    # Apply one-hot encoding to the race and region column

    ''''
    One hot encoding is a process by which categorical variables are converted into
    binary True/False. ML algorithm will read True as 1 and False as 0. So gradients
    can be made for each column separetely.

    '''
    df = pd.get_dummies(df, columns=['race'], prefix='race')
    df = pd.get_dummies(df, columns=['region'],prefix='reg')
    df = pd.get_dummies(df, columns=['hispanic'],prefix='hisp')


    # Apply binary conversion to all remaining columns that contain yes/no or True/False values
    for key in df.keys():
        df[key] = df[key].apply(yes_no_True_False_to_binary)

    # Apply the mean conversion to all columns that contain NaN values
    ''''
    Now that all columns are already converted to numerical values, we can apply the
    mean conversion to all columns that contain NaN values.

    '''

    for key in df:
        mean = df[key].mean()
        df[key] = df[key].apply(lambda x: nan_to_mean(x, mean))

    return df

df = preprocess_data(df)
df_autograder = preprocess_data(df_autograder)

X = df.iloc[:,1:]
Y = df.iloc[:,0]
X_autog = df_autograder

mean = X.mean()
std = X.std()
print(std)

def specific_normalization(X, mean, std, columns_index = [3,4,5,7]):
    for i in range(len(X.columns)):
        if i in columns_index:
            X.iloc[:,i] = (X.iloc[:,i] - mean[i])/std[i]
    return X

X = specific_normalization(X, mean, std, columns_index = [3,4,5,7])

print(X)

hhi                 1.00
whi                 0.97
hhi2                0.98
education           3.13
experience         11.56
kidslt6             0.62
kids618             0.94
husby              23.73
race_black          0.22
race_other          0.09
race_white          0.37
reg_northcentral    0.43
reg_other           0.42
reg_south           0.46
reg_west            0.41
hisp_no             0.26
hisp_yes            0.26
dtype: float64


AttributeError: 'list' object has no attribute 'any'

In [120]:
########### Pipeline Configuration ##############

filter_outliers = True
scale_data = True


#Pipeline 1 --> True,False
#Pipeline 2 --> True,True
#################################################




if filter_outliers:
    from scipy.stats import chi2

    X_numeric = X.select_dtypes(include=[np.number]) # Select only numerical columns

    # Calculate the mean vector and covariance matrix
    mean_vector = X_numeric.mean(axis=0)
    cov_matrix = np.cov(X_numeric.values.T)

    #--------- Compute the Mahalanobis distance for each observation
    def mahalanobis_distance(row, mean_vector, cov_matrix):

        '''
        The Mahalanobis distance assumes that the data follows a multivariate normal distribution. 

        1. Calculate the difference vector (diff) between the observation and the mean.
        2. Transpose the difference vector (diff.T).
        3. Multiply the transposed difference vector by the inverse covariance matrix (diff.T @ inv_cov_matrix).
        4. Multiply the result by the original difference vector (diff.T @ inv_cov_matrix @ diff).
        5. Take the square root of the result to get the Mahalanobis distance (md).
        '''

        diff = row - mean_vector
        inv_cov_matrix = np.linalg.inv(cov_matrix)
        md = np.sqrt(diff.T @ inv_cov_matrix @ diff) 
        return md

    # Apply the Mahalanobis distance function to each row
    # lamda makes row is a anonymous function that takes a single argument row, it is necessary to pass row as an argument to .apply
    # What apply sees is a function that takes a single argument, so it passes each row to this function
    X['mahalanobis'] = X_numeric.apply(lambda row: mahalanobis_distance(row, mean_vector, cov_matrix), axis=1) 

    # Determine the threshold for identifying outliers
    '''
    chi-square distribution (also chi-squared or χ2-distribution) is usually used
    coupled with Mahalanobis. This is because Mahalanobis is inherently a sum
    of squares of diff values and the covariance matrix. If each k column of the data
    is independly and follows a normal distribution, then the Mahalanobis distance follows
    a chi-square with k -1 degrees of freedom. The -1 is because the mean vector is calculated from
    the data, so it is automaticaly dependent.

    '''
    # Set the percentile for the threshold
    percentile = 98
    threshold = np.percentile(X['mahalanobis'], percentile)
    print(f"Threshold (at {percentile}th percentile): {threshold}")

    # Plot the distribution of Mahalanobis distances
    plt.hist(X['mahalanobis'], bins=30, edgecolor='k', alpha=0.7, density=True, label='Mahalanobis Distance')

    # Add the threshold line
    plt.axvline(threshold, color='r', linestyle='dashed', linewidth=1, label=f'Threshold (at {percentile}th percentile)')

    # Add titles and labels
    plt.title('Distribution of Mahalanobis Distances')
    plt.xlabel('Mahalanobis Distance')
    plt.ylabel('Density')
    plt.legend()
    plt.show()

    # Identify outliers
    outliers = X[X['mahalanobis'] > threshold]
    
    if not outliers.empty:
        print("Outliers found")
        print(f"Number of outliers: {len(outliers)}")
    else:
        print("No outliers found")
    
    # Filter out the rows that contain outliers
    X = X[X['mahalanobis'] <= threshold]

    # Drop the 'mahalanobis' column as it's no longer needed
    X.drop(columns=['mahalanobis'], inplace=True)




       hhi   whi  hhi2  education  experience  kidslt6  kids618  husby  \
0    -0.99  1.28  0.80       0.28       -0.51    -0.55     0.31  -0.22   
1    -0.99  1.28  0.80       0.28       -1.63     1.07    -0.75  -0.52   
2     1.01 -0.78  0.80       0.92       -0.16    -0.55     0.31   3.06   
3    -0.99 -0.78  0.80       0.28       -0.08     0.00    -0.00   1.38   
4    -0.99  1.28 -1.25      -0.36       -0.68    -0.55     1.37  -1.15   
...    ...   ...   ...        ...         ...      ...      ...    ...   
4995  1.01 -0.78  0.80      -0.36        1.05    -0.55    -0.75  -1.15   
4996  1.01 -0.78  0.80       0.28       -1.29     2.70    -0.75   3.06   
4997 -0.99  1.28 -1.25       0.28       -1.03     1.07    -0.75  -0.23   
4998  1.01 -0.78  0.80       0.92       -1.03    -0.55     1.37   0.03   
4999 -0.99  1.28 -1.25       0.28        1.39    -0.55    -0.75  -1.15   

      race_black  race_other  race_white  reg_northcentral  reg_other  \
0          -0.23       -0.09        0.

1      -0.99
2       1.01
3      -0.99
4      -0.99
        ... 
4995    1.01
4996    1.01
4997   -0.99
4998    1.01
4999   -0.99
Name: hhi, Length: 5000, dtype: float64' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  X.iloc[:,:] = (X - mean) / std
1       1.28
2      -0.78
3      -0.78
4       1.28
        ... 
4995   -0.78
4996   -0.78
4997    1.28
4998   -0.78
4999    1.28
Name: whi, Length: 5000, dtype: float64' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  X.iloc[:,:] = (X - mean) / std
1       0.80
2       0.80
3       0.80
4      -1.25
        ... 
4995    0.80
4996    0.80
4997   -1.25
4998    0.80
4999   -1.25
Name: hhi2, Length: 5000, dtype: float64' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  X.iloc[:,:] = (X - mean) / std
1      -0.23
2      -0.23
3      -0.23
4      -0.23
        ... 
4995   -0.23
4996   -0.23
4997   -0.23
4998   -0.23
4999   -0

LinAlgError: Singular matrix