### We first train and tune several ML algorithms to predict the survival of individuals, using all available and meaningful data. Then, guided by the data analysis, we search for the best algorithm for predicting the survival of males only, following this scheme:
- if female:
    - if the whole family died:
        - predict died
    - else predict survived
- if male:
    - if the whole family survived:
        - predict survived
    - else predict survival using ML on the features IsYoung, Pclass, and Embarked

In [39]:
using Pkg
Pkg.add(["CSV", "DataFrames", "Statistics", "MLJ"])

[32m[1m   Resolving[22m[39m package versions...
[32m[1m  No Changes[22m[39m to `~/.julia/environments/v1.6/Project.toml`
[32m[1m  No Changes[22m[39m to `~/.julia/environments/v1.6/Manifest.toml`


In [40]:
using CSV, DataFrames, Statistics, MLJ

In [41]:
# Read the training CSV and materialize it as a DataFrame.
data = DataFrame(CSV.File("data/train.csv"));

In [42]:
# Encode an embarkation port as an integer:
# - 0 for "S" and for a missing port
# - 2 for "C"
# - 1 for every other value
function port_to_numerical(port)
    # `missing` must be checked first: comparing it with `==` yields `missing`,
    # which cannot be used as a condition.
    ismissing(port) && return 0
    port == "S" && return 0
    return port == "C" ? 2 : 1
end

port_to_numerical (generic function with 2 methods)

In [43]:
# Regex capturing the family name: everything before the first comma of a name
# written as "Surname, Title. Given names". Matching up to the comma (rather
# than the first run of word characters) keeps surnames that contain
# apostrophes or spaces intact, e.g. "O'Brien" is not cut down to "O" and
# "Vander Planke" is not cut down to "Vander".
family_name_regex = r"^([^,]+)"

# Group the `Survived` values of `data` by family name.
#
# `data` must expose `Name` and `Survived` columns through property access
# (`data.Name`, `data.Survived`), as a DataFrame does.
#
# Returns a `Dict` mapping each family name to the vector of `Survived` values
# of all rows carrying that family name, in row order. Rows whose name yields no
# family name (the regex does not match) are skipped.
function get_family_dict(data)
    survived = data.Survived
    T = eltype(survived)
    family_dict = Dict{String,Vector{T}}()

    for (full_name, outcome) in zip(data.Name, survived)
        m = match(family_name_regex, full_name)
        # `match` returns `nothing` when there is no match; such a row cannot be
        # assigned to any family.
        m === nothing && continue

        # Fetch the family's list, creating it on first sight, in one lookup.
        outcomes = get!(() -> T[], family_dict, String(m.match))
        push!(outcomes, outcome)
    end

    return family_dict
end

get_family_dict (generic function with 1 method)

In [44]:
# Encode how the family of `example` fared, using `family_dict` (family name =>
# list of `Survived` values, as built by `get_family_dict`):
# - 0 if every recorded member of the family died
# - 1 if the family name occurs at most once in `family_dict` (travelling alone
#   or unknown family), if no family name can be extracted from the name, or if
#   the family has mixed survival
# - 2 if every recorded member of the family survived
#
# NOTE(review): when `family_dict` is built from the same rows that are being
# encoded, the passenger's own `Survived` value is part of the list, so a result
# of 0 or 2 reveals that passenger's label — confirm this is intended.
function family_survived(example, family_dict)
    neutral = 1

    m = match(family_name_regex, example.Name)
    # `match` returns `nothing` when the name has no extractable family name.
    m === nothing && return neutral

    # Single lookup; `nothing` marks a family name absent from the dictionary.
    outcomes = get(family_dict, m.match, nothing)
    (outcomes === nothing || length(outcomes) <= 1) && return neutral

    all(isone, outcomes) && return 2   # all family survived
    all(iszero, outcomes) && return 0  # all family died
    return neutral                     # mixed survival
end

family_survived (generic function with 1 method)

In [45]:
# Turn the raw passenger table into model-ready features. `data` is modified in
# place and the value of the final `coerce!` call is returned.
#
# Steps, in order:
# - Sex becomes 0 for "male" and 1 for any other value
# - Age is replaced by IsYoung: 1 when the age is present and below 14, else 0
# - Embarked is encoded with `port_to_numerical`
# - FamilySurvived is added via `get_family_dict` and `family_survived`
# - PassengerId, Name, SibSp, Parch, Fare, Ticket and Cabin are dropped
# - Survived is coerced to OrderedFactor (only when `train` is true) and every
#   Count column is coerced to Continuous
#
# NOTE(review): `get_family_dict` reads the Survived column regardless of
# `train`, so a table without that column presumably fails even with
# `train=false` — confirm against the callers.
function pre_process(data, train=true)
    dropped_columns = [:PassengerId, :Name, :SibSp, :Parch, :Fare, :Ticket, :Cabin]

    # Sex to numerical
    data.Sex = map(data.Sex) do sex
        sex == "male" ? 0 : 1
    end

    # IsYoung / Age
    # TODO (left open in the original, no details given)
    data.Age = map(data.Age) do age
        if ismissing(age)
            0
        else
            age < 14 ? 1 : 0
        end
    end
    rename!(data, :Age => :IsYoung)

    # Embarked to numerical
    data.Embarked = map(port_to_numerical, data.Embarked)

    # FamilySurvived (must run before Name is dropped below)
    outcomes_by_family = get_family_dict(data)
    data.FamilySurvived = map(eachrow(data)) do passenger
        family_survived(passenger, outcomes_by_family)
    end

    # drop cols
    select!(data, Not(dropped_columns))

    # scitype
    train && coerce!(data, :Survived => OrderedFactor)
    return coerce!(data, Count => Continuous)
end

pre_process (generic function with 2 methods)

In [46]:
# Pre-process the training table in place (default `train=true`); the returned
# table is displayed as the cell output.
pre_process(data)

Unnamed: 0_level_0,Survived,Pclass,Sex,IsYoung,Embarked,FamilySurvived
Unnamed: 0_level_1,Cat…,Float64,Float64,Float64,Float64,Float64
1,0,3.0,0.0,0.0,0.0,0.0
2,1,1.0,1.0,0.0,2.0,1.0
3,1,3.0,1.0,0.0,0.0,1.0
4,1,1.0,1.0,0.0,0.0,1.0
5,0,3.0,0.0,0.0,0.0,1.0
6,0,3.0,0.0,0.0,1.0,1.0
7,0,1.0,0.0,0.0,0.0,1.0
8,0,3.0,0.0,1.0,0.0,0.0
9,1,3.0,1.0,0.0,0.0,1.0
10,1,2.0,1.0,0.0,2.0,1.0
