# Nagyadat 2022 hf kiindulás

## Initialziation

### Imports
Import necessary libraries

In [1]:
import pandas as pd
import numpy as np
import scipy.stats as sc
import io
import requests
import warnings
from sklearn.linear_model import LinearRegression

### Data
Load the dataset from a repository

In [2]:
other_train_url = "https://raw.githubusercontent.com/ZachLevente/Nagyadat2022/main/data/other_train.csv"
prsnl_train_url = "https://raw.githubusercontent.com/ZachLevente/Nagyadat2022/main/data/personal_train.csv"
other_valid_url = "https://raw.githubusercontent.com/ZachLevente/Nagyadat2022/main/data/other_valid.csv"
prsnl_valid_url = "https://raw.githubusercontent.com/ZachLevente/Nagyadat2022/main/data/personal_valid.csv"

db1 = requests.get(other_train_url).content
db2 = requests.get(prsnl_train_url).content
db3 = requests.get(other_valid_url).content
db4 = requests.get(prsnl_valid_url).content

personal_train = pd.read_csv(io.StringIO(db1.decode('utf-8')))
other_train    = pd.read_csv(io.StringIO(db2.decode('utf-8')))
personal_valid = pd.read_csv(io.StringIO(db3.decode('utf-8')))
other_valid    = pd.read_csv(io.StringIO(db4.decode('utf-8')))

## Preprocessing

### Sanitization
We have to normalize incoming data, given it's incosistent nature. This can mean missing values from certain entries, diffenet notation for the same concept or simply a too wide value set.
In this segment, we define functions that can be later used for transforming data into a consistent, more easily usable form.

#### General value types


In [3]:
def sanitize_boolean(boolean):
    try:
        if boolean.strip() in ['f','F','FALSE','false','False']:
            return 0
        elif boolean.strip() in ['t','T','TRUE','true','True']:
            return 1
        else:
            return np.nan
    except AttributeError:
        return np.nan

def sanitize_number(number):
    try:
        sanitized = int(pd.to_numeric(number, errors="coerce"))
        return sanitized if sanitized > 0 else np.nan
    except AttributeError:
        return np.nan
    except ValueError:
        return np.nan

def sanitize_string(string):
    try:
        string = string.strip()
        if string in ['None', 'nan', '??', '?']:
            return np.nan
        return string
    except AttributeError:
        return np.nan
    
def sanitize_date(date):
    date = str(date).replace('/', '-')
    date = date[:10]
    date = date.split("-")
    
    if date[0] != 'nan':
        if len(date[0]) != 4:
            if len(date[2]) == 2 and int(date[0]) > 31:
                new_date = "19"+ date[0] +"-"+date[1]+"-"+date[2] 
                
            elif ((len(date[2]) == 2) and (int(date[0]) < 31) and (int(date[2]) > 31)):
                new_date = "19"+date[2]+"-"+date[1]+"-"+date[0] 
                
            elif ((len(date[2]) == 2) and (int(date[0]) < 31) and (int(date[2]) < 31)):
                new_date = "20"+ date[2] + "-" +date[1]+"-" + date[0] 
            else:
                new_date = date[2]+"-"+date[1]+"-"+date[0] 
            return new_date        
    return '-'.join(date)

#### Project specific types

In [4]:
def sanitize_sex(sex):
    """Sex to 1 (male) or 0 (female)"""
    return 1 if sex.strip() == 'Male' else 0

def sanitize_pregnancy(data):
    """Set males to 'not pregnant'"""
    data.loc[(data.sex == 1),'pregnant'] = 0
    return data

def sanitize_age(data):
    """Unknown values to np.nan"""
    data.loc[(data.age == '-1'),'age'] = np.nan
    data.loc[(data.age == '??'),'age'] = np.nan
    return data

def sanitize_income(income):
    """Income to 0 if under 50k, to 1 if over 50k, to np.nan, if unknown"""
    if str(income).strip() == '<=50K':
        return 0
    elif str(income).strip() == '>50K':
        return 1
    else:
        return np.nan
        
def sanitize_relationship(relation):
    """Narrow value set to 'Married', 'Not-Married', 'Divorced/Widowed'"""
    if relation in ('Married', 'Husband', 'Wife'):
        return 'Married'
    if relation in ('Not-Married', 'Not-in-family', 'Unmarried', 'Own-child', 'Other-relative'):
        return 'Not-Married'
    if relation == 'Divorced/Widowed':
        return relation
    return np.nan

def sanitize_marital_status(married):
    """Narrow value set to 'Married', 'Not-Married', 'Divorced/Widowed'"""
    if married in ('Married', 'Married-civ-spouse', 'Married-spouse-absent', 'Married-AF-spouse'):
        return 'Married'
    if married in ('Not-Married', 'Never-married'):
        return 'Not-Married'
    if married in ('Separated', 'Divorced', 'Widowed'):
        return 'Divorced/Widowed'
    return married

def sanitize_work(string):
    """Change word separator to '_'"""
    try:
        return string.replace('-','_').capitalize()
    except AttributeError:
        return np.nan
    
# --------------------------------------------------------------------
        
def fill_null_age(age,date):
    if (date):
        return (measure_year - int(date.split('-')[0]))[0]
    return np.nan
    
def get_education_num(education, data):
    temp = 10.0
    for i in data['education-num'].loc[data.education == education]:
        if (i > 0) & (i < temp):
            temp = i
    return temp

def clear_nan(data, column):
    if data[column].isnull().sum() > 0:
            data.dropna(subset=[column], inplace=True)
    return data

from sklearn.impute import KNNImputer
def replace_null_values_with_KNN_imputer(dataframe, no_of_neighbours):
    numpy_column = dataframe[['kurtosis_glucose', 'mean_oxygen', 'age']].to_numpy()
    numpy_column = np.reshape(numpy_column, (-1,3))

    class_column = dataframe['class'].to_numpy()
    class_column = np.reshape(numpy_column, (-1,1))

    
    imputer = KNNImputer(n_neighbors=no_of_neighbours, weights="uniform")
    numpy_column = imputer.fit_transform(numpy_column, class_column)
    return numpy_column

def replace_null_values_with_KNN_imputer2(dataframe, column, no_of_neighbours):
    numpy_column = dataframe[column].to_numpy()
    numpy_column = np.reshape(numpy_column, (-1,1))
    class_column = dataframe['income'].to_numpy()
    class_column = np.reshape(numpy_column, (-1,1))    
    imputer = KNNImputer(n_neighbors=no_of_neighbours, weights="uniform")
    numpy_column = imputer.fit_transform(numpy_column, class_column)
    return numpy_column

def replace_with_quantiles(df, column):
    new_df = df.copy(deep = True)
    skew_val = sc.skew(new_df[column]) 
    
    if ((skew_val < -2) or (skew_val > 2)):  
        minimum = new_df[column].min()
        minimum = minimum + (-minimum - minimum)
        new_df[column] = np.log(new_df[column]+minimum)
    
    perc_95 = new_df[column].quantile(.95)   
    perc_05 =  new_df[column].quantile(.05)
    new_df.loc[new_df[column] < perc_05, column] = perc_05
    new_df.loc[new_df[column] > perc_95, column] = perc_95
    return new_df

### Data reparation
Using the above defined functions, we can handle all kinds of oddities in the imported data.

In [5]:
def repair_sanitazition(data):
    # Pregnancy
    data.pregnant = data.pregnant.map(sanitize_boolean)
    data = sanitize_pregnancy(data)
    
    # Sex
    data.sex = data.sex.map(lambda sex: sanitize_sex(sex))
    
    # Income
    data.income = data.income.map(lambda income: sanitize_income(income))    
    
    # Age
    data = sanitize_age(data)
    data.age = data.age.map(lambda age: sanitize_number(age))
    data.date_of_birth = data.date_of_birth.map(sanitize_date)
    
    # Split medical info object into separate data columns
    split_object = data['medical_info'].str.replace('{','').str.replace('\'','').str.replace('}', '').str.split(',', expand=True)
    i = 0
    while i < len(split_object.columns):
        info_pair = split_object[i].str.split(':', expand=True)
        data[info_pair[0][0]] = info_pair[1].astype(float)
        i += 1
    data.drop('medical_info', axis='columns', inplace=True)

    # Sanitize string values
    data['race'] = data['race'].map(sanitize_string)
    data.race = data.race.map(lambda race: 'Other' if race in ('Asian-Pac-Islander', 'Amer-Indian-Eskimo') else race)
    data['marital-status'] = data['marital-status'].map(sanitize_string)
    data['relationship'] = data['relationship'].map(sanitize_string)
    data['relationship'] = data['marital-status'].map(sanitize_marital_status)
    data['relationship'] = data['relationship'].map(sanitize_relationship)
    data.drop('marital-status', axis='columns', inplace=True)
    data['occupation'] = data['occupation'].map(sanitize_string)
    data['workclass'] = data['workclass'].map(sanitize_string)
    data['occupation'] = data['occupation'].map(sanitize_work)
    data['workclass'] = data['workclass'].map(sanitize_work)
    data['native-country'] = data['native-country'].map(sanitize_string)
    
    return data

def repair_fillinmissing(data):
    # calculation of age column values
    years = [];
    def find_measure_year(age, date_of_birth):
        if (pd.notnull(age)):
            years.append(int(date_of_birth.split('-')[0]) + int(age))
    data.apply(lambda x: find_measure_year(x.age,x.date_of_birth), axis = 1)
    measure_year = pd.Series(years).mode();

    # Fill in education numbers
    map = {}
    for i in data.education.unique():
        map[i] = (get_education_num(i, data))
    data.loc[:,'education-num'] = data.education.map(map)
    
    # Add hours_per_week
    data['hours-per-week'].fillna(round(data['hours-per-week'].mean(), 0), inplace=True)    

    # Add income
    regression = LinearRegression()
    df = data[['hours-per-week','income']]
    df2 = data[['hours-per-week','income']]
    df.dropna(axis=0, how='any', inplace=True)
    X = df['hours-per-week'].values.reshape(-1,1)
    regression.fit(X, df['income'])
    data_null = data['hours-per-week'].loc[(data['income'].isna()) & (~data['hours-per-week'].isna())]
    temp = regression.predict(np.array(data_null).reshape(-1,1))
    temp = [np.round(x, 0) for x in temp]
    data['income'].loc[(data['income'].isna()) & (~data['hours-per-week'].isna())] = temp

    # Add mean_oxygen and kurtosis_glucose
    data = clear_nan(data, 'class')
    df = data[['kurtosis_glucose', 'mean_oxygen', 'age', 'class']]
    numpy_array = replace_null_values_with_KNN_imputer(df, 5)
    data['kurtosis_glucose'] = numpy_array[:,[0]]
    data['mean_oxygen'] = numpy_array[:,[1]]
    data['age'] = numpy_array[:,[2]]

    # Add mean_glucose
    regression = LinearRegression()
    df = data[['kurtosis_glucose','mean_glucose']]
    df2 = data[['kurtosis_glucose','mean_glucose']]
    df.dropna(axis=0, how='any', inplace=True)
    X = df['kurtosis_glucose'].values.reshape(-1,1)
    regression.fit(X, df['mean_glucose'])
    data_null = data['kurtosis_glucose'].loc[(data['mean_glucose'].isna()) & (~data['kurtosis_glucose'].isna())]
    temp = regression.predict(np.array(data_null).reshape(-1,1))
    data['mean_glucose'].loc[(data['mean_glucose'].isna()) & (~data['kurtosis_glucose'].isna())] = temp
    
    # Add kurtosis_oxygen
    regression = LinearRegression()
    df = data[['mean_oxygen','kurtosis_oxygen']]
    df2 = data[['mean_oxygen','kurtosis_oxygen']]
    df.dropna(axis=0, how='any', inplace=True)
    X = df['mean_oxygen'].values.reshape(-1,1)
    regression.fit(X, df['kurtosis_oxygen'])
    data_null = data['mean_oxygen'].loc[(data['kurtosis_oxygen'].isna()) & (~data['mean_oxygen'].isna())]
    temp = regression.predict(np.array(data_null).reshape(-1,1))
    data['kurtosis_oxygen'].loc[(data['kurtosis_oxygen'].isna()) & (~data['mean_oxygen'].isna())] = temp

    # Add skewness_glucose, std_glucose, skewness_oxygen, std_oxygen
    regression = LinearRegression()
    df = data[['kurtosis_glucose','skewness_glucose']]
    df2 = data[['kurtosis_glucose','skewness_glucose']]
    df.dropna(axis=0, how='any', inplace=True)
    X = df['kurtosis_glucose'].values.reshape(-1,1)
    regression.fit(X, df['skewness_glucose'])
    data_null = data['kurtosis_glucose'].loc[(data['skewness_glucose'].isna()) & (~data['kurtosis_glucose'].isna())]
    temp = regression.predict(np.array(data_null).reshape(-1,1))
    data['skewness_glucose'].loc[(data['skewness_glucose'].isna()) & (~data['kurtosis_glucose'].isna())] = temp
    data = clear_nan(data, 'std_glucose')

    # Add capital_loss
    df = data[['capital-loss', 'income']]
    numpy_array = replace_null_values_with_KNN_imputer2(df, 'capital-loss', 5)
    data['capital-loss'] = numpy_array[:,[0]]
    data['capital-loss'] = np.round(data['capital-loss'], 0)
    
    # Add capital_gain
    df = data[['capital-gain', 'income']]
    numpy_array = replace_null_values_with_KNN_imputer2(df, 'capital-gain', 5)
    data['capital-gain'] = numpy_array[:,[0]]
    data['capital-gain'] = np.round(data['capital-gain'], 0)

    # Add zostavajucich atributov
    data.race = data.race.fillna(data.race.mode()[0])
    data.pregnant = data.pregnant.fillna(data.pregnant.mode()[0])
    data.relationship = data.relationship.fillna(data.relationship.mode()[0])
    data.education = data.education.fillna(data.education.mode()[0])
    data['occupation'] = data['occupation'].fillna(data.occupation.mode()[0])
    data['native-country'] = data['native-country'].fillna(data['native-country'].mode()[0])
    data['workclass'] = data['workclass'].fillna(data.workclass.mode()[0])
    
    return data    
    
def repair_dropvalues(data):
    # Remove erroneous values
    median = data[(data.age > 0)].groupby('sex', as_index=False).age.mean()
    data.loc[(data.age < 0), 'age'] = data[data.age < 0].age.map(lambda a: round(median.loc[0, 'age'], 0))

    # Drop unnecessary columns
    data.drop('Unnamed: 0_x', axis='columns', inplace=True)
    data.drop('Unnamed: 0_y', axis='columns', inplace=True)
    data.drop('fnlwgt', axis='columns', inplace=True)
    
    # Drop duplicates
    data = data.drop_duplicates(['name','address','date_of_birth'], keep="last")
    
    return data

def repair_removeoutliers(data):
    data = replace_with_quantiles(data, 'mean_glucose')
    data = replace_with_quantiles(data, 'kurtosis_glucose')
    data = replace_with_quantiles(data, 'mean_oxygen')
    data = replace_with_quantiles(data, 'kurtosis_oxygen')
    data = replace_with_quantiles(data, 'skewness_oxygen')
    data = replace_with_quantiles(data, 'std_oxygen')
    data = replace_with_quantiles(data, 'skewness_glucose')
    data = replace_with_quantiles(data, 'std_glucose')
    
    return data


In [6]:
def repair_data(data):
    data = repair_sanitazition(data)
    data = repair_dropvalues(data)
    data = repair_fillinmissing(data)
    data = repair_removeoutliers(data)
    
    return data

### Apply

We merge both the validation and training datasets and run the preprocessing function on them. 
Preprocessing should be successful in both cases.

In [7]:
warnings.filterwarnings('ignore')
train = pd.merge(personal_train,other_train,on=['name','address'], how = 'outer')
valid = pd.merge(personal_valid,other_valid,on=['name','address'], how = 'outer')
data_train = repair_data(train)
data_valid = repair_data(valid)

#### We can get a peek into the results:

In [8]:
data_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3914 entries, 0 to 3982
Data columns (total 26 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   name              3914 non-null   object 
 1   address           3914 non-null   object 
 2   race              3914 non-null   object 
 3   occupation        3914 non-null   object 
 4   pregnant          3914 non-null   float64
 5   education-num     3914 non-null   float64
 6   relationship      3914 non-null   object 
 7   capital-gain      3914 non-null   float64
 8   education         3914 non-null   object 
 9   class             3914 non-null   float64
 10  income            3914 non-null   float64
 11  native-country    3914 non-null   object 
 12  hours-per-week    3914 non-null   float64
 13  capital-loss      3914 non-null   float64
 14  workclass         3914 non-null   object 
 15  age               3914 non-null   float64
 16  sex               3914 non-null   int64  


In [9]:
data_valid.head(5)

Unnamed: 0,name,address,race,occupation,pregnant,education-num,relationship,capital-gain,education,class,...,sex,date_of_birth,mean_glucose,std_glucose,kurtosis_glucose,skewness_glucose,mean_oxygen,std_oxygen,kurtosis_oxygen,skewness_oxygen
0,Steven Sao,"83139 Erica Lights Apt. 701\nEast Billy, IN 37907",White,Machine_op_inspct,0.0,9.0,Not-Married,0.0,HS-grad,0.0,...,1,1960-01-02,109.796875,52.34954,0.366554,0.506528,0.683485,13.008583,10.187234,4.999245
1,Paul Le,"4644 Sims Pines Suite 561\nBrandonport, MN 78993",White,Other_service,1.0,9.0,Divorced/Widowed,0.0,HS-grad,0.0,...,0,1976-07-26,104.070312,39.286047,0.515515,0.986709,1.156808,20.992858,7.445504,4.139537
2,Richard Huey,"533 Lee Plains\nPittsberg, NV 72286",White,Craft_repair,0.0,7.0,Married,0.0,11th,0.0,...,1,1955-03-31,108.453125,45.116665,0.621189,0.933901,1.318406,21.438332,6.699747,3.95496
3,Michael Wright,"PSC 3426, Box 4890\nAPO AA 62246",White,Exec_managerial,0.0,9.0,Married,0.0,HS-grad,0.0,...,0,1976-11-04,122.40625,49.823036,0.312326,0.592041,0.822384,14.796695,8.620707,4.612307
4,Thomas Grace,"3106 Robin Knolls\nBrookeborough, RI 89626",Black,Prof_specialty,0.0,10.0,Married,0.0,Bachelors,1.0,...,1,1944-05-20,54.625,34.039499,2.202762,2.513813,3.905581,80.697436,1.304086,0.716079


In [10]:
data_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3914 entries, 0 to 3982
Data columns (total 26 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   name              3914 non-null   object 
 1   address           3914 non-null   object 
 2   race              3914 non-null   object 
 3   occupation        3914 non-null   object 
 4   pregnant          3914 non-null   float64
 5   education-num     3914 non-null   float64
 6   relationship      3914 non-null   object 
 7   capital-gain      3914 non-null   float64
 8   education         3914 non-null   object 
 9   class             3914 non-null   float64
 10  income            3914 non-null   float64
 11  native-country    3914 non-null   object 
 12  hours-per-week    3914 non-null   float64
 13  capital-loss      3914 non-null   float64
 14  workclass         3914 non-null   object 
 15  age               3914 non-null   float64
 16  sex               3914 non-null   int64  


In [11]:
data_valid.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1298 entries, 0 to 1360
Data columns (total 26 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   name              1298 non-null   object 
 1   address           1298 non-null   object 
 2   race              1298 non-null   object 
 3   occupation        1298 non-null   object 
 4   pregnant          1298 non-null   float64
 5   education-num     1298 non-null   float64
 6   relationship      1298 non-null   object 
 7   capital-gain      1298 non-null   float64
 8   education         1298 non-null   object 
 9   class             1298 non-null   float64
 10  income            1298 non-null   float64
 11  native-country    1298 non-null   object 
 12  hours-per-week    1298 non-null   float64
 13  capital-loss      1298 non-null   float64
 14  workclass         1298 non-null   object 
 15  age               1298 non-null   float64
 16  sex               1298 non-null   int64  


# Implementation - Adaboost