In [None]:
# IMPORTANT - *** fields are placeholders for your code. If you are not sure what they should contain, check the full solution.

In [None]:
# First make sure to install all required packages.
# You can do it by running the following command:

In [None]:
# ]add Arrow CSV DataFrames Plots FreqTables StatsBase Chain

In [None]:
# If you launched Jupyter in a directory containing the attached Project.toml and Manifest.toml,
# use the command below to install the required packages with fixed versions.
# Check the Project introduction for more information.

In [None]:
# ] instantiate

In [None]:
# Import required libraries
import Downloads
import SHA
using Arrow
using Chain
using CSV
using DataFrames
using Plots
using FreqTables
using Statistics
using StatsBase

# Adult

In [None]:
# Where the Adult training data is fetched from, and the local file name
# it is stored under
const ADULT_TRAIN = string("https://archive.ics.uci.edu/ml/",
                           "machine-learning-databases/adult/adult.data")
const ADULT_TRAIN_NAME = "adult_train.txt"

# Expected SHA1 digest of the file as raw bytes, decoded from its hex form
const ADULT_TRAIN_SHA1 = hex2bytes("ee86bbe556578f709ae0fd002ac58ac93726482f");

In [None]:
# Download the Adult data if it does not exist locally yet.
# The `***` condition decides whether ADULT_TRAIN_NAME is already present;
# the `***` in the else branch is where the fetch from the source goes.
if ***
    @info "$ADULT_TRAIN_NAME found. Skipping download."
else
    @info "$ADULT_TRAIN_NAME not found. Fetching from source."
    ***
end

In [None]:
# Check SHA1 of Adult file.
# The expected digest ADULT_TRAIN_SHA1 is compared with the value produced
# by `***`; a mismatch raises an error instead of only logging a message.
if ADULT_TRAIN_SHA1 == ***
    @info "SHA1 check of $ADULT_TRAIN_NAME passed."
else
    error("$ADULT_TRAIN_NAME file has an invalid SHA1. Aborting!")
end

In [None]:
# Column names handed to the CSV.read function as the header
const COL_NAMES = Symbol[
    :age, :workclass, :fnlwgt,
    :education, :education_num, :marital_status,
    :occupation, :relationship, :race,
    :sex, :capital_gain, :capital_loss,
    :hours_per_week, :native_country, :target,
];

In [None]:
# Read Adult CSV into a DataFrame.
# header=COL_NAMES supplies the column names, fields are separated by ", "
# (a comma followed by a space) and "?" entries are passed as the missing marker.
# NOTE(review): tasks=1 presumably forces single-task parsing; confirm the
# keyword against the CSV.jl version pinned in the Manifest.
adult_train = ***(ADULT_TRAIN_NAME, DataFrame, header=COL_NAMES,
                       delim=", ", missingstring="?", tasks=1)

In [None]:
# Adjust Jupyter's columns output to 200
# (done by setting the COLUMNS environment variable for this session)
ENV["COLUMNS"] = 200

In [None]:
# Adjust Jupyter's rows output to 20
# (done by setting the LINES environment variable for this session)
ENV["LINES"] = 20

In [None]:
# Remove the irrelevant fnlwgt feature.
# `select!` is the mutating form (note the `!`), so `adult_train` itself is
# changed and later cells see the reduced set of columns.
select!(adult_train, ***)

In [None]:
# Summarize the dataset with basic per-column statistics
adult_train |> describe

In [None]:
# To 'catch' all nominal variables we specify a Union of the String and Missing types.
# `nominal` is reused below to plot and tabulate each of these columns.
nominal = names(adult_train, ***)

In [None]:
# This helper function aggregates `adult_train` by each category of the column
# `column_name` and calculates the per-category counts.
# The aggregated data is passed to the `bar` function to create a bar plot.
# Note: it reads the global `adult_train` rather than taking the data as an argument.
function bar_helper(column_name)
    agg = combine(***(adult_train, column_name, sort=true), ***)
    # Category values are converted with `string` to serve as x-axis labels
    # (presumably so that `missing` can be shown as a label too);
    # `agg.nrow` holds the counts used as bar heights.
    bar(string.(agg[!, column_name]), agg.nrow, xlabel=column_name,
        legend=false, xrotation = 45)
end

In [None]:
# Draw and show one bar plot for every nominal variable
for name in nominal
    display(bar_helper(name))
end

In [None]:
# Inspect distribution of nominal variables, this time as printed text:
# for each column name, print a blank line followed by the sorted result of `***`.
foreach(name -> println("\n", *** |> sort), nominal)

In [None]:
# Collapse categories in the 'native_country' column:
# every entry NOT selected by `***` is overwritten in place with "not-United-States".
adult_train.native_country[Not(***)] .= "not-United-States";

In [None]:
# Count the categories of 'native_country' after collapsing them
adult_train.native_country |> freqtable

In [None]:
# Produce a two-way table to learn more about the relation between the
# 'marital_status' and 'relationship' features
@chain adult_train begin
    groupby([***, ***])
    combine(***)
    # Reshape to wide form: rows keyed by :marital_status, one column per
    # :relationship value, cells filled from :nrow
    unstack(:marital_status, :relationship, :nrow)
end

In [None]:
# Extract the names of the continuous features.
# `continuous` is reused below to draw one histogram per feature.
continuous = names(adult_train, ***)

In [None]:
# Define a helper function for drawing a single histogram for the column
# named `column_name` (the arguments to `histogram` are left as `***`)
histogram_helper(column_name) =
    histogram(***)

# Show one histogram per continuous feature, each displayed as its own plot
for name in continuous
    display(histogram_helper(name))
end

In [None]:
# Check how many 'capital_gain' observations are equal to 0
# NOTE(review): the grouped cell further down refers to this as "the same mean",
# so `***` is presumably meant to yield the share of zeros, not a raw count — confirm.
***(adult_train.capital_gain .== 0)

In [None]:
# Check how many 'capital_loss' observations are equal to 0
# NOTE(review): the grouped cell further down refers to this as "the same mean",
# so `***` is presumably meant to yield the share of zeros, not a raw count — confirm.
***(adult_train.capital_loss .== 0)

In [None]:
# Check the same mean as above, but grouped by `target`
# It seems that wealthy people more often gain money on capital market, but also lose money more often (which seems reasonable)
@chain adult_train begin
    groupby(***)
    # Broadcasted pairs: each of the two source columns is paired with the same
    # anonymous function and with its own output column name
    combine([:capital_gain, :capital_loss] .=>
            (x -> ***) .=>
            [:freq0_capital_gain, :freq0_capital_loss])
end

In [None]:
# Save the clean dataset as an Arrow file.
# "adult_train.arrow" is a relative path, so the file lands in the current working directory.
***("adult_train.arrow", adult_train)