In [1]:
import IJulia
import Base64

# The julia kernel has built in support for Revise.jl, so this is the 
# recommended approach for long-running sessions:
# https://github.com/JuliaLang/IJulia.jl/blob/9b10fa9b879574bbf720f5285029e07758e50a5e/src/kernel.jl#L46-L51

# Users should enable revise within .julia/config/startup_ijulia.jl:
# https://timholy.github.io/Revise.jl/stable/config/#Using-Revise-automatically-within-Jupyter/IJulia-1

# Clear the console history recorded by IJulia
IJulia.clear_history()

# Default figure size (in inches), output format and resolution
fig_width = 5
fig_height = 4
fig_format = :png
fig_dpi = 96

# Normalize the requested format: PDF output needs its MIME type
# registered with IJulia, and there is no retina format type, so svg is
# used instead for high quality type/marks.
if fig_format === :pdf
  fig_dpi = 96
  # Enable PDF support for IJulia
  IJulia.register_mime(MIME("application/pdf"))
elseif fig_format === :retina
  fig_format = :svg
end

# Convert the figure size from inches to pixels
fig_width *= fig_dpi
fig_height *= fig_dpi

# Initialize Plots with the default figure width/height.
# Plots is optional: if it cannot be imported or configured, the failure
# is only recorded at debug level and the rest of the setup continues.
try
  import Plots

  # Plots.jl doesn't support PDF output for versions < 1.28.1, so png is
  # used there instead. The format is tested first and `&&` short-circuits,
  # so the private `Plots._current_plots_version` constant is only read
  # when PDF output was actually requested.
  if fig_format == :pdf && Plots._current_plots_version < v"1.28.1"
    Plots.gr(size=(fig_width, fig_height), fmt = :png, dpi = fig_dpi)
  else
    Plots.gr(size=(fig_width, fig_height), fmt = fig_format, dpi = fig_dpi)
  end
catch e
  @debug "Plots init" exception=(e, catch_backtrace())
end

# Initialize CairoMakie with the default figure width/height
try
  import CairoMakie

  # CairoMakie's display() in PDF format opens an interactive window
  # instead of saving to the ipynb file, so PDF requests are rendered
  # as png instead.
  # https://github.com/quarto-dev/quarto-cli/issues/7548
  makie_type = fig_format == :pdf ? "png" : string(fig_format)
  CairoMakie.activate!(type = makie_type)
  # NOTE(review): newer Makie releases appear to have renamed the
  # `resolution` theme key to `size` - confirm against the installed version.
  CairoMakie.update_theme!(resolution=(fig_width, fig_height))
catch e
  # CairoMakie is optional, so any failure here is deliberately ignored
end
  
# Change into the run directory when one was specified
try
  # Base64-encoded directory path; an empty string leaves the working
  # directory untouched
  encoded_path = "L3J1bi9tZWRpYS9hbGYvZGF0b3MvbWlzcmVwb3NpdG9yaW9zL2RvY2VuY2lhL2FwcmVuZGl6YWplLWF1dG9tYXRpY28tcHJhY3RpY2FzLWp1bGlh"
  isempty(encoded_path) || cd(String(Base64.base64decode(encoded_path)))
catch e
  @warn "Run path init:" exception=(e, catch_backtrace())
end


# Emulate the old Pkg.installed behavior, see
# https://discourse.julialang.org/t/how-to-use-pkg-dependencies-instead-of-pkg-installed/36416/9
import Pkg

"""
    isinstalled(pkg::AbstractString) -> Bool

Return `true` when one of the entries reported by `Pkg.dependencies()` is
named `pkg` and is flagged as a direct dependency, `false` otherwise.
"""
function isinstalled(pkg::AbstractString)
  return any(dep -> dep.name == pkg && dep.is_direct_dep, values(Pkg.dependencies()))
end

# ojs_define: serialize the given keyword arguments to JSON and display
# them through IJulia inside a <script type='ojs-define'> HTML tag.
# Which variant gets defined depends on the directly installed packages.
if isinstalled("JSON") && isinstalled("DataFrames")
  import JSON, DataFrames
  global function ojs_define(; kwargs...)
    # Data frames are replaced by their row iterator before serialization;
    # every other value is passed through unchanged. `Tables` is reached
    # through DataFrames because this cell never imports it directly.
    serializable(x) = x
    serializable(x::DataFrames.AbstractDataFrame) = DataFrames.Tables.rows(x)
    content = Dict("contents" => [Dict("name" => k, "value" => serializable(v)) for (k, v) in kwargs])
    tag = "<script type='ojs-define'>$(JSON.json(content))</script>"
    IJulia.display(MIME("text/html"), tag)
  end
elseif isinstalled("JSON")
  import JSON
  global function ojs_define(; kwargs...)
    # Without DataFrames every value is handed to JSON as-is
    content = Dict("contents" => [Dict("name" => k, "value" => v) for (k, v) in kwargs])
    tag = "<script type='ojs-define'>$(JSON.json(content))</script>"
    IJulia.display(MIME("text/html"), tag)
  end
else
  # Without JSON nothing can be serialized, so the function only warns
  global function ojs_define(; kwargs...)
    @warn "JSON package not available. Please install the JSON.jl package to use ojs_define."
  end
end


# Don't return kernel dependencies (b/c Revise should take care of dependencies);
# `nothing` is the last expression of the cell, so the cell has no result value.
nothing


In [2]:
using DataFrames
# Income, expenses and taxes, one row per month
df = DataFrame(
    "Mes" => ["Enero", "Febrero", "Marzo", "Abril"],
    "Ingresos" => [45000, 41500, 51200, 49700],
    "Gastos" => [33400, 35400, 35600, 36300],
    "Impuestos" => [6450, 6300, 7100, 6850],
)

Row,Mes,Ingresos,Gastos,Impuestos
Unnamed: 0_level_1,String,Int64,Int64,Int64
1,Enero,45000,33400,6450
2,Febrero,41500,35400,6300
3,Marzo,51200,35600,7100
4,Abril,49700,36300,6850


In [3]:
# Profit per month: income minus expenses minus taxes
df[!, :Beneficios] = df.Ingresos .- df.Gastos .- df.Impuestos
df

Row,Mes,Ingresos,Gastos,Impuestos,Beneficios
Unnamed: 0_level_1,String,Int64,Int64,Int64,Int64
1,Enero,45000,33400,6450,5150
2,Febrero,41500,35400,6300,-200
3,Marzo,51200,35600,7100,8500
4,Abril,49700,36300,6850,6550


In [4]:
# Label each month by the sign of its profit (a profit of zero counts as "negativo")
df[!, :Balance] = [b > 0 ? "positivo" : "negativo" for b in df.Beneficios]
df

Row,Mes,Ingresos,Gastos,Impuestos,Beneficios,Balance
Unnamed: 0_level_1,String,Int64,Int64,Int64,Int64,String
1,Enero,45000,33400,6450,5150,positivo
2,Febrero,41500,35400,6300,-200,negativo
3,Marzo,51200,35600,7100,8500,positivo
4,Abril,49700,36300,6850,6550,positivo


In [5]:
# Month and profit of the months whose balance is "positivo"
filter(:Balance => ==("positivo"), df)[:, [:Mes, :Beneficios]]

Row,Mes,Beneficios
Unnamed: 0_level_1,String,Int64
1,Enero,5150
2,Marzo,8500
3,Abril,6550


In [6]:
using CSV
import Downloads
# Download the cholesterol data set to a temporary file and parse it into a
# data frame. `Downloads.download` is used because `Base.download` is
# documented as a deprecated thin wrapper around it.
df = CSV.read(Downloads.download("https://aprendeconalf.es/aprendizaje-automatico-practicas-julia/datos/colesterol.csv"), DataFrame)

Row,nombre,edad,sexo,peso,altura,colesterol
Unnamed: 0_level_1,String,Int64,String1,Float64?,Float64,Float64?
1,José Luis Martínez Izquierdo,18,H,85.0,1.79,182.0
2,Rosa Díaz Díaz,32,M,65.0,1.73,232.0
3,Javier García Sánchez,24,H,missing,1.81,191.0
4,Carmen López Pinzón,35,M,65.0,1.7,200.0
5,Marisa López Collado,46,M,51.0,1.58,148.0
6,Antonio Ruiz Cruz,68,H,66.0,1.74,249.0
7,Antonio Fernández Ocaña,51,H,62.0,1.72,276.0
8,Pilar Martín González,22,M,60.0,1.66,missing
9,Pedro Gálvez Tenorio,35,H,90.0,1.94,241.0
10,Santiago Reillo Manzano,46,H,75.0,1.85,280.0


In [7]:
# Body mass index: weight divided by squared height (a missing weight gives a missing index)
df[!, :imc] = @. df.peso / df.altura^2
df

Row,nombre,edad,sexo,peso,altura,colesterol,imc
Unnamed: 0_level_1,String,Int64,String1,Float64?,Float64,Float64?,Float64?
1,José Luis Martínez Izquierdo,18,H,85.0,1.79,182.0,26.5285
2,Rosa Díaz Díaz,32,M,65.0,1.73,232.0,21.7181
3,Javier García Sánchez,24,H,missing,1.81,191.0,missing
4,Carmen López Pinzón,35,M,65.0,1.7,200.0,22.4913
5,Marisa López Collado,46,M,51.0,1.58,148.0,20.4294
6,Antonio Ruiz Cruz,68,H,66.0,1.74,249.0,21.7994
7,Antonio Fernández Ocaña,51,H,62.0,1.72,276.0,20.9573
8,Pilar Martín González,22,M,60.0,1.66,missing,21.7738
9,Pedro Gálvez Tenorio,35,H,90.0,1.94,241.0,23.9133
10,Santiago Reillo Manzano,46,H,75.0,1.85,280.0,21.9138


In [8]:
using CategoricalArrays
# Bin the body mass index into four labelled categories
df[!, :obesidad] = cut(
    df.imc,
    [0, 18.5, 24.5, 30, Inf];
    labels=["Bajo peso", "Saludable", "Sobrepeso", "Obeso"],
    extend=true,
)
df

Row,nombre,edad,sexo,peso,altura,colesterol,imc,obesidad
Unnamed: 0_level_1,String,Int64,String1,Float64?,Float64,Float64?,Float64?,Cat…?
1,José Luis Martínez Izquierdo,18,H,85.0,1.79,182.0,26.5285,Sobrepeso
2,Rosa Díaz Díaz,32,M,65.0,1.73,232.0,21.7181,Saludable
3,Javier García Sánchez,24,H,missing,1.81,191.0,missing,missing
4,Carmen López Pinzón,35,M,65.0,1.7,200.0,22.4913,Saludable
5,Marisa López Collado,46,M,51.0,1.58,148.0,20.4294,Saludable
6,Antonio Ruiz Cruz,68,H,66.0,1.74,249.0,21.7994,Saludable
7,Antonio Fernández Ocaña,51,H,62.0,1.72,276.0,20.9573,Saludable
8,Pilar Martín González,22,M,60.0,1.66,missing,21.7738,Saludable
9,Pedro Gálvez Tenorio,35,H,90.0,1.94,241.0,23.9133,Saludable
10,Santiago Reillo Manzano,46,H,75.0,1.85,280.0,21.9138,Saludable


In [9]:
# Keep only the name, sex and age columns, in that order
select(df, :nombre, :sexo, :edad)

Row,nombre,sexo,edad
Unnamed: 0_level_1,String,String1,Int64
1,José Luis Martínez Izquierdo,H,18
2,Rosa Díaz Díaz,M,32
3,Javier García Sánchez,H,24
4,Carmen López Pinzón,M,35
5,Marisa López Collado,M,46
6,Antonio Ruiz Cruz,H,68
7,Antonio Fernández Ocaña,H,51
8,Pilar Martín González,M,22
9,Pedro Gálvez Tenorio,H,35
10,Santiago Reillo Manzano,H,46


In [10]:
# Every column except the name
df[:, Not(:nombre)]

Row,edad,sexo,peso,altura,colesterol,imc,obesidad
Unnamed: 0_level_1,Int64,String1,Float64?,Float64,Float64?,Float64?,Cat…?
1,18,H,85.0,1.79,182.0,26.5285,Sobrepeso
2,32,M,65.0,1.73,232.0,21.7181,Saludable
3,24,H,missing,1.81,191.0,missing,missing
4,35,M,65.0,1.7,200.0,22.4913,Saludable
5,46,M,51.0,1.58,148.0,20.4294,Saludable
6,68,H,66.0,1.74,249.0,21.7994,Saludable
7,51,H,62.0,1.72,276.0,20.9573,Saludable
8,22,M,60.0,1.66,missing,21.7738,Saludable
9,35,H,90.0,1.94,241.0,23.9133,Saludable
10,46,H,75.0,1.85,280.0,21.9138,Saludable


In [11]:
# Move sex and age to the front, followed by the remaining columns
select(df, Cols(:sexo, :edad, :))

Row,sexo,edad,nombre,peso,altura,colesterol,imc,obesidad
Unnamed: 0_level_1,String1,Int64,String,Float64?,Float64,Float64?,Float64?,Cat…?
1,H,18,José Luis Martínez Izquierdo,85.0,1.79,182.0,26.5285,Sobrepeso
2,M,32,Rosa Díaz Díaz,65.0,1.73,232.0,21.7181,Saludable
3,H,24,Javier García Sánchez,missing,1.81,191.0,missing,missing
4,M,35,Carmen López Pinzón,65.0,1.7,200.0,22.4913,Saludable
5,M,46,Marisa López Collado,51.0,1.58,148.0,20.4294,Saludable
6,H,68,Antonio Ruiz Cruz,66.0,1.74,249.0,21.7994,Saludable
7,H,51,Antonio Fernández Ocaña,62.0,1.72,276.0,20.9573,Saludable
8,M,22,Pilar Martín González,60.0,1.66,missing,21.7738,Saludable
9,H,35,Pedro Gálvez Tenorio,90.0,1.94,241.0,23.9133,Saludable
10,H,46,Santiago Reillo Manzano,75.0,1.85,280.0,21.9138,Saludable


In [12]:
# Rows whose sex is "M"
filter(:sexo => ==("M"), df)

Row,nombre,edad,sexo,peso,altura,colesterol,imc,obesidad
Unnamed: 0_level_1,String,Int64,String1,Float64?,Float64,Float64?,Float64?,Cat…?
1,Rosa Díaz Díaz,32,M,65.0,1.73,232.0,21.7181,Saludable
2,Carmen López Pinzón,35,M,65.0,1.7,200.0,22.4913,Saludable
3,Marisa López Collado,46,M,51.0,1.58,148.0,20.4294,Saludable
4,Pilar Martín González,22,M,60.0,1.66,missing,21.7738,Saludable
5,Macarena Álvarez Luna,53,M,55.0,1.62,262.0,20.9572,Saludable
6,Carolina Rubio Moreno,20,M,61.0,1.77,194.0,19.4708,Saludable


In [13]:
# Rows whose sex is "H" and whose age is above 30
filter([:sexo, :edad] => (sexo, edad) -> sexo == "H" && edad > 30, df)

Row,nombre,edad,sexo,peso,altura,colesterol,imc,obesidad
Unnamed: 0_level_1,String,Int64,String1,Float64?,Float64,Float64?,Float64?,Cat…?
1,Antonio Ruiz Cruz,68,H,66.0,1.74,249.0,21.7994,Saludable
2,Antonio Fernández Ocaña,51,H,62.0,1.72,276.0,20.9573,Saludable
3,Pedro Gálvez Tenorio,35,H,90.0,1.94,241.0,23.9133,Saludable
4,Santiago Reillo Manzano,46,H,75.0,1.85,280.0,21.9138,Saludable
5,José María de la Guía Sanz,58,H,78.0,1.87,198.0,22.3055,Saludable


In [14]:
# Keep only the rows that have no missing value in any column
dropmissing(df)

Row,nombre,edad,sexo,peso,altura,colesterol,imc,obesidad
Unnamed: 0_level_1,String,Int64,String1,Float64,Float64,Float64,Float64,Cat…
1,José Luis Martínez Izquierdo,18,H,85.0,1.79,182.0,26.5285,Sobrepeso
2,Rosa Díaz Díaz,32,M,65.0,1.73,232.0,21.7181,Saludable
3,Carmen López Pinzón,35,M,65.0,1.7,200.0,22.4913,Saludable
4,Marisa López Collado,46,M,51.0,1.58,148.0,20.4294,Saludable
5,Antonio Ruiz Cruz,68,H,66.0,1.74,249.0,21.7994,Saludable
6,Antonio Fernández Ocaña,51,H,62.0,1.72,276.0,20.9573,Saludable
7,Pedro Gálvez Tenorio,35,H,90.0,1.94,241.0,23.9133,Saludable
8,Santiago Reillo Manzano,46,H,75.0,1.85,280.0,21.9138,Saludable
9,Macarena Álvarez Luna,53,M,55.0,1.62,262.0,20.9572,Saludable
10,José María de la Guía Sanz,58,H,78.0,1.87,198.0,22.3055,Saludable


In [15]:
# Keep only the rows whose `colesterol` value is present; other columns may still hold missing values
dropmissing(df, :colesterol)    

Row,nombre,edad,sexo,peso,altura,colesterol,imc,obesidad
Unnamed: 0_level_1,String,Int64,String1,Float64?,Float64,Float64,Float64?,Cat…?
1,José Luis Martínez Izquierdo,18,H,85.0,1.79,182.0,26.5285,Sobrepeso
2,Rosa Díaz Díaz,32,M,65.0,1.73,232.0,21.7181,Saludable
3,Javier García Sánchez,24,H,missing,1.81,191.0,missing,missing
4,Carmen López Pinzón,35,M,65.0,1.7,200.0,22.4913,Saludable
5,Marisa López Collado,46,M,51.0,1.58,148.0,20.4294,Saludable
6,Antonio Ruiz Cruz,68,H,66.0,1.74,249.0,21.7994,Saludable
7,Antonio Fernández Ocaña,51,H,62.0,1.72,276.0,20.9573,Saludable
8,Pedro Gálvez Tenorio,35,H,90.0,1.94,241.0,23.9133,Saludable
9,Santiago Reillo Manzano,46,H,75.0,1.85,280.0,21.9138,Saludable
10,Macarena Álvarez Luna,53,M,55.0,1.62,262.0,20.9572,Saludable


In [16]:
using Statistics
# Replace missing cholesterol values with the mean of the observed ones
media_colesterol = df.colesterol |> skipmissing |> mean
df[!, :colesterol] = coalesce.(df.colesterol, media_colesterol)
df

Row,nombre,edad,sexo,peso,altura,colesterol,imc,obesidad
Unnamed: 0_level_1,String,Int64,String1,Float64?,Float64,Float64,Float64?,Cat…?
1,José Luis Martínez Izquierdo,18,H,85.0,1.79,182.0,26.5285,Sobrepeso
2,Rosa Díaz Díaz,32,M,65.0,1.73,232.0,21.7181,Saludable
3,Javier García Sánchez,24,H,missing,1.81,191.0,missing,missing
4,Carmen López Pinzón,35,M,65.0,1.7,200.0,22.4913,Saludable
5,Marisa López Collado,46,M,51.0,1.58,148.0,20.4294,Saludable
6,Antonio Ruiz Cruz,68,H,66.0,1.74,249.0,21.7994,Saludable
7,Antonio Fernández Ocaña,51,H,62.0,1.72,276.0,20.9573,Saludable
8,Pilar Martín González,22,M,60.0,1.66,220.231,21.7738,Saludable
9,Pedro Gálvez Tenorio,35,H,90.0,1.94,241.0,23.9133,Saludable
10,Santiago Reillo Manzano,46,H,75.0,1.85,280.0,21.9138,Saludable


In [17]:
# Return a copy of df with its rows sorted by `nombre` in ascending order
sort(df, :nombre)

Row,nombre,edad,sexo,peso,altura,colesterol,imc,obesidad
Unnamed: 0_level_1,String,Int64,String1,Float64?,Float64,Float64,Float64?,Cat…?
1,Antonio Fernández Ocaña,51,H,62.0,1.72,276.0,20.9573,Saludable
2,Antonio Ruiz Cruz,68,H,66.0,1.74,249.0,21.7994,Saludable
3,Carmen López Pinzón,35,M,65.0,1.7,200.0,22.4913,Saludable
4,Carolina Rubio Moreno,20,M,61.0,1.77,194.0,19.4708,Saludable
5,Javier García Sánchez,24,H,missing,1.81,191.0,missing,missing
6,José Luis Martínez Izquierdo,18,H,85.0,1.79,182.0,26.5285,Sobrepeso
7,José María de la Guía Sanz,58,H,78.0,1.87,198.0,22.3055,Saludable
8,Macarena Álvarez Luna,53,M,55.0,1.62,262.0,20.9572,Saludable
9,Marisa López Collado,46,M,51.0,1.58,148.0,20.4294,Saludable
10,Miguel Angel Cuadrado Gutiérrez,27,H,109.0,1.98,210.0,27.8033,Sobrepeso


In [18]:
# Order by sex ascending, then by age descending within each sex
sort(df, [order(:sexo), order(:edad, rev=true)])

Row,nombre,edad,sexo,peso,altura,colesterol,imc,obesidad
Unnamed: 0_level_1,String,Int64,String1,Float64?,Float64,Float64,Float64?,Cat…?
1,Antonio Ruiz Cruz,68,H,66.0,1.74,249.0,21.7994,Saludable
2,José María de la Guía Sanz,58,H,78.0,1.87,198.0,22.3055,Saludable
3,Antonio Fernández Ocaña,51,H,62.0,1.72,276.0,20.9573,Saludable
4,Santiago Reillo Manzano,46,H,75.0,1.85,280.0,21.9138,Saludable
5,Pedro Gálvez Tenorio,35,H,90.0,1.94,241.0,23.9133,Saludable
6,Miguel Angel Cuadrado Gutiérrez,27,H,109.0,1.98,210.0,27.8033,Sobrepeso
7,Javier García Sánchez,24,H,missing,1.81,191.0,missing,missing
8,José Luis Martínez Izquierdo,18,H,85.0,1.79,182.0,26.5285,Sobrepeso
9,Macarena Álvarez Luna,53,M,55.0,1.62,262.0,20.9572,Saludable
10,Marisa López Collado,46,M,51.0,1.58,148.0,20.4294,Saludable


In [19]:
using CSV, DataFrames
import Downloads
# Download the grades data set to a temporary file and parse it into a data
# frame, reading "NA" entries as missing. `Downloads.download` is used
# because `Base.download` is documented as a deprecated thin wrapper around it.
df = CSV.read(Downloads.download("https://aprendeconalf.es/aprendizaje-automatico-practicas-julia/datos/notas-curso2.csv"), DataFrame; missingstring="NA")

Row,sexo,turno,grupo,trabaja,notaA,notaB,notaC,notaD,notaE
Unnamed: 0_level_1,String7,String7,String1,String1,Float64,Float64?,Float64?,Float64?,Float64?
1,Mujer,Tarde,C,N,5.2,6.3,3.4,2.3,2.0
2,Hombre,Mañana,A,N,5.7,5.7,4.2,3.5,2.7
3,Hombre,Mañana,B,N,8.3,8.8,8.8,8.0,5.5
4,Hombre,Mañana,B,N,6.1,6.8,4.0,3.5,2.2
5,Hombre,Mañana,A,N,6.2,9.0,5.0,4.4,3.7
6,Hombre,Mañana,A,S,8.6,8.9,9.5,8.4,3.9
7,Mujer,Mañana,A,N,6.7,7.9,5.6,4.8,4.2
8,Mujer,Tarde,C,S,4.1,5.2,1.7,0.3,1.0
9,Hombre,Tarde,C,N,5.0,5.0,3.3,2.7,6.0
10,Hombre,Tarde,C,N,5.3,6.3,4.8,3.6,2.3


In [20]:
# Number of missing values per column
describe(df, :nmissing)

Row,variable,nmissing
Unnamed: 0_level_1,Symbol,Int64
1,sexo,0
2,turno,0
3,grupo,0
4,trabaja,0
5,notaA,0
6,notaB,5
7,notaC,1
8,notaD,2
9,notaE,2


In [21]:
using OneHotArrays
# One-hot encode `grupo` against its distinct values, then swap the two
# dimensions of the result with `permutedims`.
# NOTE(review): presumably this puts one row per observation so it lines up with df - confirm.
codificacion = permutedims(onehotbatch(df.grupo, unique(df.grupo)))
# Append the indicator columns to df; `:auto` generates their names (x1, x2, ...)
hcat(df, DataFrame(codificacion, :auto))

Row,sexo,turno,grupo,trabaja,notaA,notaB,notaC,notaD,notaE,x1,x2,x3
Unnamed: 0_level_1,String7,String7,String1,String1,Float64,Float64?,Float64?,Float64?,Float64?,Bool,Bool,Bool
1,Mujer,Tarde,C,N,5.2,6.3,3.4,2.3,2.0,true,false,false
2,Hombre,Mañana,A,N,5.7,5.7,4.2,3.5,2.7,false,true,false
3,Hombre,Mañana,B,N,8.3,8.8,8.8,8.0,5.5,false,false,true
4,Hombre,Mañana,B,N,6.1,6.8,4.0,3.5,2.2,false,false,true
5,Hombre,Mañana,A,N,6.2,9.0,5.0,4.4,3.7,false,true,false
6,Hombre,Mañana,A,S,8.6,8.9,9.5,8.4,3.9,false,true,false
7,Mujer,Mañana,A,N,6.7,7.9,5.6,4.8,4.2,false,true,false
8,Mujer,Tarde,C,S,4.1,5.2,1.7,0.3,1.0,true,false,false
9,Hombre,Tarde,C,N,5.0,5.0,3.3,2.7,6.0,true,false,false
10,Hombre,Tarde,C,N,5.3,6.3,4.8,3.6,2.3,true,false,false
