In [1]:
using DelimitedFiles
using StatsBase
using DataFrames
using Plots
using Flux

Attribute Information:
1. name	Name of the country concerned
2. landmass	1=N.America, 2=S.America, 3=Europe, 4=Africa, 4=Asia, 6=Oceania
3. zone	Geographic quadrant, based on Greenwich and the Equator
               1=NE, 2=SE, 3=SW, 4=NW
4. area	in thousands of square km
5. population	in round millions
6. language 1=English, 2=Spanish, 3=French, 4=German, 5=Slavic, 6=Other 
            Indo-European, 7=Chinese, 8=Arabic, 
            9=Japanese/Turkish/Finnish/Magyar, 10=Others
7. religion 0=Catholic, 1=Other Christian, 2=Muslim, 3=Buddhist, 4=Hindu,
            5=Ethnic, 6=Marxist, 7=Others
8. bars     Number of vertical bars in the flag
9. stripes  Number of horizontal stripes in the flag
10. colours  Number of different colours in the flag
11. red      0 if red absent, 1 if red present in the flag
12. green    same for green
13. blue     same for blue
14. gold     same for gold (also yellow)
15. white    same for white
16. black    same for black
17. orange   same for orange (also brown)
18. mainhue  predominant colour in the flag (tie-breaks decided by taking
            the topmost hue, if that fails then the most central hue,
            and if that fails the leftmost hue)
19. circles  Number of circles in the flag
20. crosses  Number of (upright) crosses
21. saltires Number of diagonal crosses
22. quarters Number of quartered sections
23. sunstars Number of sun or star symbols
24. crescent 1 if a crescent moon symbol present, else 0
25. triangle 1 if any triangles present, 0 otherwise
26. icon     1 if an inanimate image present (e.g., a boat), otherwise 0
27. animate  1 if an animate image (e.g., an eagle, a tree, a human hand)
            present, 0 otherwise
28. text     1 if any letters or writing on the flag (e.g., a motto or
            slogan), 0 otherwise
29. topleft  colour in the top-left corner (moving right to decide 
            tie-breaks)
30. botright Colour in the bottom-left corner (moving left to decide 
            tie-breaks)

In [2]:
rawdata = readdlm("flag.data", ',')
df = DataFrame(rawdata, :auto);

In [3]:
names = ["name","landmass","zone", "area", "population","language", "religion",
"bars", "stripes",  "colours", "red", "green","blue", "gold", "white", "black", "orange",
"mainhue", "circles", "crosses", "saltires", "quarters", "sunstars", "crescent",
"triangle", "icon", "animate", "text", "topleft", "botright"]
rename!(df, names)

Row,name,landmass,zone,area,population,language,religion,bars,stripes,colours,red,green,blue,gold,white,black,orange,mainhue,circles,crosses,saltires,quarters,sunstars,crescent,triangle,icon,animate,text,topleft,botright
Unnamed: 0_level_1,Any,Any,Any,Any,Any,Any,Any,Any,Any,Any,Any,Any,Any,Any,Any,Any,Any,Any,Any,Any,Any,Any,Any,Any,Any,Any,Any,Any,Any,Any
1,Afghanistan,5,1,648,16,10,2,0,3,5,1,1,0,1,1,1,0,green,0,0,0,0,1,0,0,1,0,0,black,green
2,Albania,3,1,29,3,6,6,0,0,3,1,0,0,1,0,1,0,red,0,0,0,0,1,0,0,0,1,0,red,red
3,Algeria,4,1,2388,20,8,2,2,0,3,1,1,0,0,1,0,0,green,0,0,0,0,1,1,0,0,0,0,green,white
4,American-Samoa,6,3,0,0,1,1,0,0,5,1,0,1,1,1,0,1,blue,0,0,0,0,0,0,1,1,1,0,blue,red
5,Andorra,3,1,0,0,6,0,3,0,3,1,0,1,1,0,0,0,gold,0,0,0,0,0,0,0,0,0,0,blue,red
6,Angola,4,2,1247,7,10,5,0,2,3,1,0,0,1,0,1,0,red,0,0,0,0,1,0,0,1,0,0,red,black
7,Anguilla,1,4,0,0,1,1,0,1,3,0,0,1,0,1,0,1,white,0,0,0,0,0,0,0,0,1,0,white,blue
8,Antigua-Barbuda,1,4,0,0,1,1,0,1,5,1,0,1,1,1,1,0,red,0,0,0,0,1,0,1,0,0,0,black,red
9,Argentina,2,3,2777,28,2,0,0,3,2,0,0,1,0,1,0,0,blue,0,0,0,0,0,0,0,0,0,0,blue,blue
10,Argentine,2,3,2777,28,2,0,0,3,3,0,0,1,1,1,0,0,blue,0,0,0,0,1,0,0,0,0,0,blue,blue


   4. area	in thousands of square km
   5. population in round millions

   zatem 0 poniżej tych wartości
   <br>
   <br> icon and animate: kandydaci do zmergowania
   <br> topleft, botright, zone - do usunięcia
   <br> RELIGION marixists naprawic
   <br> 
   <br> POMYSŁ: logarytmizacja area (albo inna mądrzejsza funkcja)

In [4]:
# for i in names[2:30]
#     display(Plots.bar(collect(keys(countmap(df[:,i]))), collect(values(countmap(df[:,i]))),
#     orientation=:vertical, label = i))
# end

In [5]:
# VARIABLES TRANSFORMATIONS

df[!,:mainhue] = [x == "brown" ? 0 : x for x in df[!,:mainhue]]
df[!,:mainhue] = [x == "gold" ? 1 : x for x in df[!,:mainhue]]
df[!,:mainhue] = [x == "white" ? 2 : x for x in df[!,:mainhue]]
df[!,:mainhue] = [x == "orange" ? 3 : x for x in df[!,:mainhue]]
df[!,:mainhue] = [x == "blue" ? 4 : x for x in df[!,:mainhue]]
df[!,:mainhue] = [x == "green" ? 5 : x for x in df[!,:mainhue]]
df[!,:mainhue] = [x == "black" ? 6 : x for x in df[!,:mainhue]]
df[!,:mainhue] = [x == "red" ? 7 : x for x in df[!,:mainhue]]

df[!,:topleft] = [x == "gold" ? 1 : x for x in df[!,:topleft]]
df[!,:topleft] = [x == "white" ? 2 : x for x in df[!,:topleft]]
df[!,:topleft] = [x == "orange" ? 3 : x for x in df[!,:topleft]]
df[!,:topleft] = [x == "blue" ? 4 : x for x in df[!,:topleft]]
df[!,:topleft] = [x == "green" ? 5 : x for x in df[!,:topleft]]
df[!,:topleft] = [x == "black" ? 6 : x for x in df[!,:topleft]]
df[!,:topleft] = [x == "red" ? 7 : x for x in df[!,:topleft]]

df[!,:botright] = [x == "brown" ? 0 : x for x in df[!,:botright]]
df[!,:botright] = [x == "gold" ? 1 : x for x in df[!,:botright]]
df[!,:botright] = [x == "white" ? 2 : x for x in df[!,:botright]]
df[!,:botright] = [x == "orange" ? 3 : x for x in df[!,:botright]]
df[!,:botright] = [x == "blue" ? 4 : x for x in df[!,:botright]]
df[!,:botright] = [x == "green" ? 5 : x for x in df[!,:botright]]
df[!,:botright] = [x == "black" ? 6 : x for x in df[!,:botright]]
df[!,:botright] = [x == "red" ? 7 : x for x in df[!,:botright]]

# numbers of circles in the flag
df[!,:circles] = [x >= 1 ? 1 : 0 for x in df[!,:circles]]
# crosses upright
df[!,:crosses] = [x >= 1 ? 1 : 0 for x in df[!,:crosses]]
# saltires
df[!,:saltires] = [x >= 1 ? 1 : 0 for x in df[!,:saltires]]
# sunstars
df[!,:sunstars] = [x >= 1 ? 1 : 0 for x in df[!,:sunstars]];

# # removing botright and topleft

# df = df[:,Not([:topleft,:botright])]

#  df = df[(df.religion .!= "6"), :]

In [6]:
# # for now: RECODING RELIGION

# df[!,:religion] = [x <= 1 ? 1 : 0 for x in df[!,:religion]];     # 1 jeśli jest christian (catolics and other christians)
#                                                                 # 0 if other

In [7]:
# NORMALIZACJA

function scale(X)

    # zwraca wektor z parametrami dla każdej ze zmiennych
    μ_X = mean(X, dims=2)
    σ_X = std(X, dims=2)

    X_norm = (X .- μ_X) ./ σ_X

    return (X_norm, μ_X, σ_X);
end

scale (generic function with 1 method)

In [8]:
X = Matrix(df[:, Not(1, 7)])'
y = df[:, 7]

194-element Vector{Any}:
 2
 6
 2
 1
 0
 5
 1
 1
 0
 0
 1
 0
 1
 ⋮
 1
 1
 6
 1
 0
 0
 6
 1
 6
 5
 5
 5

In [9]:
X_norm, μ_X, σ_X = scale(X)
X_norm;

In [10]:
n_neurons = 1

W = rand(n_neurons, size(X_norm)[1])
b = rand(n_neurons)

layer₁(x) = σ.(W * x .+ b)
m = Chain(layer₁)

Chain(layer₁)

In [11]:
layer₁(X_norm)  # wartości na neuronach (jedna warstwa)

# albo tak, też dobry zapis:
m(X_norm)

1×194 Matrix{Float64}:
 0.97613  0.337818  0.429142  0.990591  …  0.956181  0.0657672  0.99866

In [12]:
# ŷ = deepcopy(vec(m(X_norm)'));

In [13]:
# loss(ŷ, y)  = Flux.crossentropy(ŷ,y)
# loss(ŷ, y)  = Flux.crossentropy(vec(m(X_norm)'),y)

loss(ŷ, y) = Flux.mse(vec(m(X_norm)'), y)
# opt_state = Flux.setup(Flux.Adam(0.01), m)

loss (generic function with 1 method)

In [14]:
loss(m(X_norm), y)

7.448267988682697

In [15]:
using LinearAlgebra
opt = ADAM()

Adam(0.001, (0.9, 0.999), 1.0e-8, IdDict{Any, Any}())

<hr/>

Warto jednak zaznaczyć, że pozwala ona na uczenie jedynie przez pojedynczą epokę. Aby móc kontynuować proces uczenia dalej musimy w odpowiedni sposób przystować dane z których korzystamy:

In [16]:
using Base.Iterators: repeated
dataset = repeated((X_norm, y), 200)

Base.Iterators.Take{Base.Iterators.Repeated{Tuple{Matrix{Float64}, Vector{Any}}}}(Base.Iterators.Repeated{Tuple{Matrix{Float64}, Vector{Any}}}(([0.9193939892483797 -0.3684214180742606 … 0.2754862855870595 0.2754862855870595; -0.92590719196647 -0.92590719196647 … -0.16154125476861825 -0.16154125476861825; … ; 0.709452739052039 1.2230118113509028 … 0.1958936667531751 0.1958936667531751; -0.002740111349637368 1.0604230923096654 … -2.660648120497894 -0.002740111349637368], Any[2, 6, 2, 1, 0, 5, 1, 1, 0, 0  …  6, 1, 0, 0, 6, 1, 6, 5, 5, 5])), 200)

albo skorzystać z makra <tt>@epochs</tt>:

In [17]:
# Flux.@epochs

Pozwala ona też na definiowanie wywołań, które pozwolą nam kontrolować przebieg uczenia.

In [18]:
# evalcb = () -> @show(loss(tX, tY))

In [19]:
dataset = repeated((X_norm, y), 200)
evalcb = () -> @show(loss(X_norm, y))
opt = ADAM()

Adam(0.001, (0.9, 0.999), 1.0e-8, IdDict{Any, Any}())

In [20]:
Flux.train!(loss, Flux.params(m), dataset, opt)
# Flux.train!(loss, Flux.params(m), dataset, opt, cb = throttle(evalcb, 10))
m(X_norm)

1×194 Matrix{Float64}:
 0.97613  0.337818  0.429142  0.990591  …  0.956181  0.0657672  0.99866

In [21]:
y_fitted = m(X_norm).>= 0.5
y_fitted .== y'
countmap(y_fitted .== y')

Dict{Bool, Int64} with 2 entries:
  0 => 135
  1 => 59

In [22]:
accuracy(X, y, T = 0.5) = sum((X.≥ T).== y')/length(y)
accuracy(m(X_norm), y)

0.30412371134020616

<hr/>

Może zastosować ten cały dropout np dla sieci skoro mamy mały zbiór?

In [23]:
# for i in 19:30
#     println(names[i])
#     println(countmap(df[:,i]))
# end

In [24]:
# for i in names[19:30]
#     display(Plots.bar(collect(keys(countmap(df[:,i]))), collect(values(countmap(df[:,i]))),
#     orientation=:vertical, label = i))
# end

In [25]:
histogram(df[:,7], bar_width = .8, label = "religion");

In [None]:
describe(df);
cmap_religion = countmap(df[:,7]);

In [None]:
describe(df);
cmap_religion = countmap(df[:,7]);

In [None]:
# tutaj są te ćwiartki, widać kolonie fajnie
df[df.quarters.>=1,:];

In [None]:
# stare kraje ZSRR oznaczone jako marxists
df[df.religion.==6,:];

In [None]:
# countries having less then 1 million population (chyba głównie wyspy)
df[df.population.==0,:];

In [None]:
# countries that have less than 1 thousands sqare meters
df[df.area.==0,:];