In [1]:
using DelimitedFiles
using StatsBase
using DataFrames
using Plots
using Flux

Attribute Information:
1. name	Name of the country concerned
2. landmass	1=N.America, 2=S.America, 3=Europe, 4=Africa, 4=Asia, 6=Oceania
3. zone	Geographic quadrant, based on Greenwich and the Equator
               1=NE, 2=SE, 3=SW, 4=NW
4. area	in thousands of square km
5. population	in round millions
6. language 1=English, 2=Spanish, 3=French, 4=German, 5=Slavic, 6=Other 
            Indo-European, 7=Chinese, 8=Arabic, 
            9=Japanese/Turkish/Finnish/Magyar, 10=Others
7. religion 0=Catholic, 1=Other Christian, 2=Muslim, 3=Buddhist, 4=Hindu,
            5=Ethnic, 6=Marxist, 7=Others
8. bars     Number of vertical bars in the flag
9. stripes  Number of horizontal stripes in the flag
10. colours  Number of different colours in the flag
11. red      0 if red absent, 1 if red present in the flag
12. green    same for green
13. blue     same for blue
14. gold     same for gold (also yellow)
15. white    same for white
16. black    same for black
17. orange   same for orange (also brown)
18. mainhue  predominant colour in the flag (tie-breaks decided by taking
            the topmost hue, if that fails then the most central hue,
            and if that fails the leftmost hue)
19. circles  Number of circles in the flag
20. crosses  Number of (upright) crosses
21. saltires Number of diagonal crosses
22. quarters Number of quartered sections
23. sunstars Number of sun or star symbols
24. crescent 1 if a crescent moon symbol present, else 0
25. triangle 1 if any triangles present, 0 otherwise
26. icon     1 if an inanimate image present (e.g., a boat), otherwise 0
27. animate  1 if an animate image (e.g., an eagle, a tree, a human hand)
            present, 0 otherwise
28. text     1 if any letters or writing on the flag (e.g., a motto or
            slogan), 0 otherwise
29. topleft  colour in the top-left corner (moving right to decide 
            tie-breaks)
30. botright Colour in the bottom-left corner (moving left to decide 
            tie-breaks)

In [2]:
rawdata = readdlm("flag.data", ',')
df = DataFrame(rawdata, :auto);

In [3]:
names = ["name","landmass","zone", "area", "population","language", "religion",
"bars", "stripes",  "colours", "red", "green","blue", "gold", "white", "black", "orange",
"mainhue", "circles", "crosses", "saltires", "quarters", "sunstars", "crescent",
"triangle", "icon", "animate", "text", "topleft", "botright"]
rename!(df, names);

   4. area	in thousands of square km
   5. population in round millions

   zatem 0 poniżej tych wartości
   <br>
   <br> icon and animate: kandydaci do zmergowania
   <br> topleft, botright, zone - do usunięcia
   <br> RELIGION marixists naprawic
   <br> 
   <br> POMYSŁ: logarytmizacja area (albo inna mądrzejsza funkcja)

In [4]:
# tutaj są te ćwiartki, widać kolonie fajnie
df[df.quarters.>=1,:];

In [5]:
# stare kraje ZSRR oznaczone jako marxists
df[df.religion.==6,:];

In [6]:
# for Poland
df[df.name.=="Poland",:];

In [7]:
# countries having less then 1 million population (chyba głównie wyspy)
df[df.population.==0,:];

In [8]:
# countries that have less than 1 thousands sqare meters
df[df.area.==0,:];

In [9]:
# for i in names[2:30]
#     display(Plots.bar(collect(keys(countmap(df[:,i]))), collect(values(countmap(df[:,i]))),
#     orientation=:vertical, label = i))
# end

In [10]:
countmap(df[:,:botright])

Dict{Any, Int64} with 8 entries:
  "brown"  => 2
  "gold"   => 9
  "white"  => 17
  "orange" => 1
  "blue"   => 47
  "green"  => 40
  "black"  => 9
  "red"    => 69

In [11]:
# VARIABLES TRANSFORMATIONS

df[!,:mainhue] = [x == "brown" ? 0 : x for x in df[!,:mainhue]]
df[!,:mainhue] = [x == "gold" ? 1 : x for x in df[!,:mainhue]]
df[!,:mainhue] = [x == "white" ? 2 : x for x in df[!,:mainhue]]
df[!,:mainhue] = [x == "orange" ? 3 : x for x in df[!,:mainhue]]
df[!,:mainhue] = [x == "blue" ? 4 : x for x in df[!,:mainhue]]
df[!,:mainhue] = [x == "green" ? 5 : x for x in df[!,:mainhue]]
df[!,:mainhue] = [x == "black" ? 6 : x for x in df[!,:mainhue]]
df[!,:mainhue] = [x == "red" ? 7 : x for x in df[!,:mainhue]]

df[!,:topleft] = [x == "gold" ? 1 : x for x in df[!,:topleft]]
df[!,:topleft] = [x == "white" ? 2 : x for x in df[!,:topleft]]
df[!,:topleft] = [x == "orange" ? 3 : x for x in df[!,:topleft]]
df[!,:topleft] = [x == "blue" ? 4 : x for x in df[!,:topleft]]
df[!,:topleft] = [x == "green" ? 5 : x for x in df[!,:topleft]]
df[!,:topleft] = [x == "black" ? 6 : x for x in df[!,:topleft]]
df[!,:topleft] = [x == "red" ? 7 : x for x in df[!,:topleft]]

df[!,:botright] = [x == "brown" ? 0 : x for x in df[!,:botright]]
df[!,:botright] = [x == "gold" ? 1 : x for x in df[!,:botright]]
df[!,:botright] = [x == "white" ? 2 : x for x in df[!,:botright]]
df[!,:botright] = [x == "orange" ? 3 : x for x in df[!,:botright]]
df[!,:botright] = [x == "blue" ? 4 : x for x in df[!,:botright]]
df[!,:botright] = [x == "green" ? 5 : x for x in df[!,:botright]]
df[!,:botright] = [x == "black" ? 6 : x for x in df[!,:botright]]
df[!,:botright] = [x == "red" ? 7 : x for x in df[!,:botright]]

# numbers of circles in the flag
df[!,:circles] = [x >= 1 ? 1 : 0 for x in df[!,:circles]]
# crosses upright
df[!,:crosses] = [x >= 1 ? 1 : 0 for x in df[!,:crosses]]
# saltires
df[!,:saltires] = [x >= 1 ? 1 : 0 for x in df[!,:saltires]]
# sunstars
df[!,:sunstars] = [x >= 1 ? 1 : 0 for x in df[!,:sunstars]];

In [12]:
# # for now: RECODING RELIGION

# df[!,:religion] = [x <= 1 ? 1 : 0 for x in df[!,:religion]];     # 1 jeśli jest christian (catolics and other christians)
#                                                                 # 0 if other

In [13]:
# NORMALIZACJA

function scale(X)

    # zwraca wektor z parametrami dla każdej ze zmiennych
    μ_X = mean(X, dims=2)
    σ_X = std(X, dims=2)

    X_norm = (X .- μ_X) ./ σ_X

    return (X_norm, μ_X, σ_X);
end

# function scale(y, μ_X, σ_X)
#     y_norm = (y .- μ_X) ./ σ_X
#     return y_norm;
# end

scale (generic function with 1 method)

In [14]:
X = Matrix(df[:, Not(1, 7)])'
y = df[:, 7];

In [15]:
X_norm, μ_X, σ_X = scale(X)
X_norm;

In [16]:
n_neurons = 1

W = rand(n_neurons, size(X_norm)[1])
b = rand(n_neurons)

layer₁(x) = σ.(W * x .+ b)
m = Chain(layer₁)

Chain(layer₁)

In [17]:
layer₁(X_norm)  # wartości na neuronach

1×194 Matrix{Float64}:
 0.97328  0.352178  0.614596  0.998077  …  0.995687  0.941874  0.999387

In [18]:
ŷ = deepcopy(vec(m(X_norm)'));

In [19]:
# loss(ŷ, y)  = Flux.crossentropy(ŷ,y)
loss(ŷ, y)  = Flux.crossentropy(ŷ,y)

loss (generic function with 1 method)

In [20]:
size(ŷ)
loss(ŷ, y)

1399.909559640892

In [21]:
# opt = ADAM(0.0001)

In [22]:
using LinearAlgebra

dataset = Iterators.repeated((X_norm, y), 200)
evalcb = () -> @show(loss(X_train, Y))
opt = ADAM()


# accuracy(X_norm, y) # chcemy żeby accuracy było mniej więcej 0.1

Adam(0.001, (0.9, 0.999), 1.0e-8, IdDict{Any, Any}())

In [23]:
Flux.train!(loss, Flux.params(m), dataset = (X_norm, y), opt, cb = throttle(evalcb, 8))

LoadError: UndefVarError: throttle not defined

<hr/>

Może zastosować ten cały dropout np dla sieci skoro mamy mały zbiór?

In [None]:
# for i in 19:30
#     println(names[i])
#     println(countmap(df[:,i]))
# end

In [None]:
# for i in names[19:30]
#     display(Plots.bar(collect(keys(countmap(df[:,i]))), collect(values(countmap(df[:,i]))),
#     orientation=:vertical, label = i))
# end

In [None]:
histogram(df[:,7], bar_width = .8, label = "religion");

In [None]:
describe(df);
cmap_religion = countmap(df[:,7]);

In [None]:
describe(df);
cmap_religion = countmap(df[:,7]);