# Manipulando e Visualizando Dados

In [None]:
using CSV, DataFrames 
using StatsBase, Gadfly
using Distributions, Random
using DataFramesMeta, Chain
using Colors, Dates, RDatasets
# Set the COLUMNS environment variable for this session.
# NOTE(review): presumably so that wide tables are printed without column truncation — confirm.
ENV["COLUMNS"] = 900

In [None]:
# List the entries of the current working directory; the CSV files read below
# are opened by relative path, so they must show up here.
readdir()

## DataFrames.jl

In [None]:
# Load iris.csv from the working directory into a DataFrame bound to `df`.
df = CSV.read("iris.csv", DataFrame);

In [None]:
# Short one-line description of the object.
summary(df)

In [None]:
# Per-column summary table.
describe(df)

In [None]:
# Rows 1 and 2, restricted to the Id and Species columns.
df[[1,2], [:Id, :Species]]

In [None]:
# First row, all columns.
df[1,:]

### select() e select!()

In [None]:
# Keep every column whose name matches the regex "Cm".
select(df, r"Cm")

In [None]:
# Keep every column whose name matches the regex "Petal".
select(df, r"Petal")

In [None]:
# Keep Id and Species, renaming them to :id and :species.
select(df, :Id => :id, :Species => :species)

In [None]:
# Same selection, but the Id column is squared element-wise before being stored as :id.
select(df, :Id => (x -> x.^2) => :id, :Species => :species)

In [None]:
# `prop` is the result of selecting the columns of `df` whose names match "Cm".
prop = select(df, r"Cm")

In [None]:
# Replace the first column of `prop` with its values multiplied by 8.
prop[!, 1] = prop[!, 1] * 8

In [None]:
# Display the modified table.
prop

### transform() e transform!()

In [None]:
# Select the "Cm" columns, then add a column produced by applying `+` to all of them
# (an element-wise sum across the columns).
# NOTE(review): a `Not(...)` selector could be used in place of `All()` to leave
# columns out of the sum — presumably what the original "Not()" remark hinted at.
soma = transform(select(df, r"Cm"), All() => +)

In [None]:
# Same idea, summing only the sepal length and sepal width columns.
transform(select(df, r"Cm"), [:SepalLengthCm, :SepalWidthCm] => +)

## DataFramesMeta

Macros 
- @orderby
- @where
- @with
- @eachrow
- @select
- @combine
- @linq

[Dplyr e LINQ vs DataFramesMeta](https://juliadata.github.io/DataFramesMeta.jl/stable/#Comparison-with-dplyr-and-LINQ)

In [None]:
# Pipeline written with anonymous functions: keep the rows where Species is
# "Iris-setosa", keep the columns matching "Cm", then order by SepalLengthCm.
# NOTE(review): newer DataFramesMeta releases reportedly deprecate `@where` in
# favour of `@subset` — confirm against the installed version before changing.
df |> x ->
    @where(x, :Species .== "Iris-setosa") |> x -> 
    select(x, r"Cm") |> x ->
    @orderby(x, :SepalLengthCm)

In [None]:
# The same three steps written with @chain, which threads `df` through each call
# so the data argument does not have to be repeated.
@chain df begin
    @where(:Species .== "Iris-setosa") 
    select(r"Cm")
    @orderby(:SepalLengthCm)
end

In [None]:
# @linq variant: filter to "Iris-setosa", order by SepalLengthCm, then keep only
# the SepalWidthCm column.
@linq df |>
    where(:Species .== "Iris-setosa") |>
    orderby(:SepalLengthCm) |>
    select(:SepalWidthCm) 

In [None]:
# Row-wise loop with @eachrow, accumulating into variables local to the `let` block.
# For every row whose SepalLengthCm * SepalWidthCm exceeds 5, x grows by 1 and y by 2;
# the cell evaluates to the tuple (x, y).
let x = 0.0, y = 0.0
    @eachrow df begin
        if :SepalLengthCm * :SepalWidthCm > 5
            x += 1
            y += 2
        end
    end
    x, y
end

In [None]:
# Build a new column `colX` (declared as Vector{Float64}) row by row: the product
# SepalWidthCm * SepalLengthCm for rows whose Species is "Iris-setosa", 0 otherwise.
# The result of the @eachrow block is bound to `teste`.
teste = @eachrow df begin
    @newcol colX::Vector{Float64}
    :colX = :Species == "Iris-setosa" ? :SepalWidthCm * :SepalLengthCm : 0
end

In [None]:
# Show the last 10 rows of the result.
last(teste, 10)

## Algumas Estatísticas

In [None]:
# Arithmetic mean of the third column (columns are picked by position here).
# NOTE(review): which measurement sits at position 3/4 depends on the CSV column order — confirm.
mean(df[!,3])

In [None]:
# Coefficient of variation (StatsBase `variation`) of the fourth column.
variation(df[!,4])

In [None]:
# Spearman rank correlation between the third and fourth columns.
corspearman(df[!,3], df[!,4])

# Missing Values

In [None]:
# Rebind `df` to the Titanic data set read from titanic.csv; from here on `df`
# no longer refers to the iris table.
df = CSV.read("titanic.csv", DataFrame);

In [None]:
# Display the table.
df

In [None]:
# First 5 rows.
first(df,5)

In [None]:
# Per-column summary table.
describe(df)

In [None]:
# Remove the Cabin column from `df` in place.
select!(df, Not(:Cabin))

In [None]:
# Summary again, now without Cabin.
describe(df)

In [None]:
# Fill the missing entries of Age with the mean of the observed ages.
# `skipmissing(df.Age)` ignores only the entries that are missing in Age itself.
# Running the whole table through `dropmissing` first would also throw away rows
# that are missing in any *other* column, so the mean would be computed on a
# smaller, different sample and would depend on which columns are still present.
df.Age[ismissing.(df.Age)] .= mean(skipmissing(df.Age));
# Alternatives that fill with a fixed value (35) instead of the mean:
# @transform(df, Age=replace(:Age, missing =>35))
# df.Age = coalesce.(df.Age, 35)

In [None]:
# Summary after the Age imputation.
describe(df)

In [None]:
# Drop, in place, every row that still contains a missing value in any column.
dropmissing!(df)

In [None]:
# Summary of the cleaned table.
describe(df)

# Gadfly

In [None]:
# layer
    # Data
    # Aesthetics
    # Geometry
    # Colors
# Title and labels
# Scale, colors, Guides...
# Themes

In [None]:
# Pass the Age vector directly to `plot`.
# NOTE(review): no aesthetic (x= / y=) is mapped in this call — confirm it renders as intended.
plot(df.Age)

In [None]:
# Age mapped to both axes, passing the vectors directly as aesthetics.
plot(x=df.Age, y=df.Age)

In [None]:
# Data-frame form: columns are referenced by name, Age on x and Survived on y.
plot(df, x=:Age, y=:Survived)

In [None]:
# Histogram of the Survived column.
plot(df, x=:Survived, Geom.histogram)

In [None]:
# Read iris.csv again under its own name, since `df` now holds the Titanic table.
iris = CSV.read("iris.csv", DataFrame);
describe(iris)

In [None]:
# Petal length on x against petal width on y.
plot(iris, x=:PetalLengthCm, y=:PetalWidthCm)

In [None]:
# Same plot with color mapped to the Species column.
plot(iris, x=:PetalLengthCm, y=:PetalWidthCm, color=:Species)

In [None]:
# Color mapped to the SepalLengthCm column instead.
plot(iris, x=:PetalLengthCm, y=:PetalWidthCm, color=:SepalLengthCm)

In [None]:
# Build the plot inside a do-block passed to Gadfly.with_theme with the :dark theme.
Gadfly.with_theme(:dark) do
    plot(iris, x=:PetalLengthCm, y=:PetalWidthCm, color=:Species)
end

In [None]:
# Set the default plot size to 21cm x 12cm, then map Species to the point shape.
set_default_plot_size(21cm, 12cm)
plot(iris, x=:PetalLengthCm, y=:PetalWidthCm, shape=:Species)

In [None]:
# Species as shape and SepalLengthCm as color in the same plot.
plot(iris, x=:PetalLengthCm, y=:PetalWidthCm, shape=:Species, color=:SepalLengthCm)

In [None]:
# Two line curves on the same axes over 0:0.01:10: cos(x) drawn by the top-level
# plot call in purple, and sin(x) added as an extra layer in red.
x = 0:0.01:10
p = plot(x=x, y=cos.(x), Geom.line, Theme(default_color=colorant"purple"),
    layer(x=x, y=sin.(x), Geom.line, Theme(default_color=colorant"red")))

In [None]:
# Two 100x2 matrices of random draws, from Normal(1.0) and Normal(4.0) respectively.
# NOTE(review): the RNG is not seeded, so the points differ on every run.
x1 = rand(Normal(1.0),100,2)
x2 = rand(Normal(4.0),100,2);

In [None]:
# One layer per sample (column 1 on x, column 2 on y), each with its own default
# color, plus a hand-built color key because the colors are not mapped from data.
plot(layer(x=x1[:,1],y=x1[:,2], Theme(default_color=colorant"purple")),
    layer(x=x2[:,1],y=x2[:,2], Theme(default_color=colorant"red")),
    Guide.manual_color_key("Normal dists", # legend title
                        ["Normal(μ=1)","Normal(μ=4)"], # key labels
                        ["purple", "red"]))  # colors

In [None]:
# p1 maps Species to shape only; p2 maps Species to both shape and color.
# Side-by-side layout.
p1, p2 = (plot(iris, x=:PetalLengthCm, y=:PetalWidthCm, shape=:Species),
          plot(iris, x=:PetalLengthCm, y=:PetalWidthCm, shape=:Species, color=:Species))

hstack(p1, p2)

In [None]:
# Same two plots, one above the other.
p1, p2 = (plot(iris, x=:PetalLengthCm, y=:PetalWidthCm, shape=:Species),
          plot(iris, x=:PetalLengthCm, y=:PetalWidthCm, shape=:Species, color=:Species))

vstack(p1, p2)

In [None]:
# Two vertical stacks placed next to each other.
p1, p2 = (plot(iris, x=:PetalLengthCm, y=:PetalWidthCm, shape=:Species),
          plot(iris, x=:PetalLengthCm, y=:PetalWidthCm, shape=:Species, color=:Species))

hstack(vstack(p1, p2), vstack(p1, p2))

In [None]:
# 2x2 grid given as a matrix of plots.
p1, p2 = (plot(iris, x=:PetalLengthCm, y=:PetalWidthCm, shape=:Species),
          plot(iris, x=:PetalLengthCm, y=:PetalWidthCm, shape=:Species, color=:Species))

gridstack([p1 p2; p2 p1])

In [None]:
# Petal plot with a title and axis labels supplied through Guide elements.
plot(iris, x=:PetalLengthCm, y=:PetalWidthCm, shape=:Species, 
    color=:Species,
    Guide.title("Petal"),
    Guide.xlabel("Length (cm)"),
    Guide.ylabel("Width (cm)"))

In [None]:
# changing the scales

In [None]:
# Integers 1 to 100 and their cubes.
x = 1:100
y = x.^3;

In [None]:
# Points with both axes on a log10 scale.
plot(x=x, y=y, Geom.point, Scale.y_log10, Scale.x_log10)

In [None]:
# Same data and scales, drawn as a line.
plot(x=x, y=y, Geom.line, Scale.y_log10, Scale.x_log10)

In [None]:
# Histogram of Age from `df` (the Titanic table at this point) with bincount=10.
p = plot(df, x=:Age, 
    Geom.histogram(bincount=10))

In [None]:
# Same histogram with Geom.histogram's default arguments; `plot` is called
# through its module-qualified name here.
p = Gadfly.plot(df, x=:Age, 
    Geom.histogram())

In [None]:
# Violin plot of SepalLengthCm per Species, colored by Species.
p = plot(iris, x=:Species, y=:SepalLengthCm, Geom.violin, color=:Species)

In [None]:
# Box plot of the same variables.
p = plot(iris, x=:Species, y=:SepalLengthCm, Geom.boxplot, color=:Species)

In [None]:
# x squared plus one uniform random number per point.
x = 0.0:0.1:2.0
y = x .^ 2 .+ rand(length(x));

In [None]:
# Same points with a loess smoother at two smoothing values (0.9 and 0.1),
# shown side by side.
p1, p2 = (plot(x=x, y=y, Geom.point, Geom.smooth(method=:loess, smoothing=s)) for s in (0.9, 0.1))
hstack(p1, p2)

In [None]:
# Dp: the ggplot2 "presidential" data set from its third row onward.
# De: the ggplot2 "economics" data set, with Unemploy divided by 10^3
# (the y-axis label below reads "x10³" accordingly).
Dp = dataset("ggplot2","presidential")[3:end,:]
De = dataset("ggplot2","economics")
De.Unemploy /= 10^3;

In [None]:
# Unemploy over Date as a line, with one vertical band per row of Dp spanning
# Start..End and colored by Party. The colors are set manually, the view is
# limited to dates from 1965-01-01 and y up to 12, and the key sits on top.
plot(De, x=:Date, y=:Unemploy, Geom.line,
    layer(Dp, xmin=:Start, xmax=:End, Geom.vband, color=:Party, alpha=[0.7]),
    Scale.color_discrete_manual("deepskyblue", "lightcoral"),
    Coord.cartesian(xmin=Date("1965-01-01"), ymax=12),
    Guide.xlabel("Time"), Guide.ylabel("Unemployment (x10³)"), Guide.colorkey(title=""),
    Theme(default_color="black", key_position=:top))

In [None]:
# Noisy straight line: y = 2x + 5 plus one draw from Normal() per point.
x = 0:0.1:5
y = 2 .* x .+5 .+ rand(Normal(), length(x));

In [None]:
# Plot of the noisy points.
p = plot(x=x, y=y, Geom.point)

In [None]:
# Add a layer to the existing plot `p`: a dashed red line with intercept 5 and
# slope 2, i.e. the line used to generate y before the noise was added.
push!(p, layer(x=x, y=y, intercept=[5], slope=[2],Geom.abline(color="red", style=:dash)))