In [None]:
# Report the Julia runtime this notebook is running on.
println("Julia version: ", string(VERSION))

In [None]:
using DataFrames
using Plots
using FreqTables
using TextAnalysis
using Languages
using WordCloud

In [None]:
# Portuguese stop-word list, reused by the "without stop words" cells.
# The language is passed as an instance (`Portuguese()`), the same way it is
# given to `language!` in the other cells of this notebook.
Mystopwords = stopwords(Languages.Portuguese())
println("Number of stopwords: ", length(Mystopwords))

Create a corpus

In [None]:
# Load every file found in ../atas. Each entry of `corpus` is one file,
# stored as the vector of its lines.
listAtas = readdir("../atas")
corpus = Vector{Vector{String}}()

for name in listAtas
    # `readlines(path)` opens and closes the file itself, so no handle is
    # left open after the loop.
    push!(corpus, readlines("../atas/" * name))
end

println(length(corpus), " atas")

In [None]:
# Build a TextAnalysis corpus with one Portuguese FileDocument per entry of
# `listAtas`, convert the documents to StringDocument, remove case, numbers
# and punctuation, then refresh the lexicon and the inverse index.
docs = []
for name in listAtas
    fd = FileDocument("../atas/" * name)
    language!(fd, Languages.Portuguese())
    push!(docs, fd)
end
crps = Corpus(docs)
standardize!(crps, StringDocument)
remove_case!(crps)
for flag in (strip_numbers, strip_punctuation)
    prepare!(crps, flag)
end
update_lexicon!(crps)
update_inverse_index!(crps)
crps

### char frequency


In [None]:
# Count how often each character occurs across all files in `corpus`.
# Characters are read from the lines themselves: `string` applied to a whole
# vector of lines returns its printed form (brackets, quotes, ", " separators
# and escape backslashes), and those would be counted as if they were text.
dict_char = Dict{Char, Int}()
for txt in corpus
    for line in txt, char in line
        dict_char[char] = get(dict_char, char, 0) + 1
    end
end


In [None]:
# Turn the character counts into a table sorted by descending count (ties
# broken by character) and show its first 20 rows.
# `keys` and `values` of an unmodified Dict iterate in the same order, so the
# two vectors stay aligned.
chars = Any[k for k in keys(dict_char)];
frequency = Any[v for v in values(dict_char)];
char_df = DataFrame(Dict("Char" => chars, "n" => frequency));
sort!(char_df, [:n, :Char], rev=[true, false])
char_df[1:20, :]

In [None]:
# Bar chart of the first ten rows of the sorted character table.
bar(char_df[1:10, :Char], char_df[1:10, :n]; title="char frequency")

### word frequency

#### with stop words

In [None]:
# Word frequencies (stop words included) taken from the corpus lexicon, as a
# table sorted by descending count with ties broken by word.
dict_words = lexicon(crps);
# `keys` and `values` of an unmodified Dict iterate in the same order, so the
# two vectors stay aligned; `collect` keeps the lexicon's own key/value types
# instead of producing `Any` columns.
words = collect(keys(dict_words));
frequency = collect(values(dict_words));
words_df = DataFrame(Dict("Words" => words, "n" => frequency));
sort!(words_df, [:n, :Words], rev=[true, false])
# `first` shows at most 20 rows and does not throw on a smaller lexicon.
first(words_df, 20)

In [None]:
# Bar chart of the first ten rows of the sorted word table (stop words included).
bar(words_df[1:10, :Words], words_df[1:10, :n]; title="word frequency with stopwords")

#### without stop words

In [None]:
# Word cloud of the most frequent words (stop words included).
# The rows come from `words_df`, which is sorted by descending count, so the
# cloud shows the top entries rather than 200 entries in arbitrary Dict order.
# `first` also copes with a lexicon of fewer than 200 words.
wc = let top = first(words_df, 200)
    wordcloud(convert(Vector{String}, top.Words), convert(Vector{Float64}, top.n),
        density = 0.1,
        outline = 4,
        linecolor = "purple",
        colors = :Set1_5,
        angles = (0),
        fonts = "Verdana Bold",
        mask = shape(box, 900, 300, cornerradius=0, color=0.95),
        minfontsize = 2, maxfontsize = 12)
end

In [None]:
# Independent copy of the corpus with stop words removed: first through the
# `strip_stopwords` flag, then explicitly with the `Mystopwords` list.
# The lexicon and the inverse index are rebuilt afterwards.
crps2 = deepcopy(crps);
for flag in (strip_stopwords, strip_numbers, strip_punctuation)
    prepare!(crps2, flag)
end
remove_case!(crps2)
remove_words!(crps2, Mystopwords)
update_lexicon!(crps2)
update_inverse_index!(crps2)
crps2

In [None]:
# Word frequencies after stop-word removal, taken from the lexicon of `crps2`,
# as a table sorted by descending count with ties broken by word.
dict_words2 = lexicon(crps2);
# `keys` and `values` of an unmodified Dict iterate in the same order, so the
# two vectors stay aligned; `collect` keeps the lexicon's own key/value types
# instead of producing `Any` columns.
words2 = collect(keys(dict_words2));
frequency2 = collect(values(dict_words2));
words_df2 = DataFrame(Dict("Words" => words2, "n" => frequency2));
sort!(words_df2, [:n, :Words], rev=[true, false])
# `first` shows at most 20 rows and does not throw on a smaller lexicon.
first(words_df2, 20)

In [None]:
# Bar chart of the ten most frequent words once stop words are removed.
# Labels and bar heights both come from `words_df2`, so every bar shows the
# count of the word it is labelled with.
bar(words_df2[1:10, 1], words_df2[1:10, 2], title="word frequency without stopwords")

In [None]:
# Word cloud of the most frequent words after stop-word removal.
# The rows come from `words_df2`, which is sorted by descending count, so the
# cloud shows the top entries rather than 200 entries in arbitrary Dict order.
# `first` also copes with a lexicon of fewer than 200 words.
wc = let top = first(words_df2, 200)
    wordcloud(convert(Vector{String}, top.Words), convert(Vector{Float64}, top.n),
        density = 0.1,
        outline = 4,
        linecolor = "purple",
        colors = :Set1_5,
        angles = (0),
        fonts = "Verdana Bold",
        mask = shape(box, 900, 300, cornerradius=0, color=0.95),
        minfontsize = 2, maxfontsize = 12)
end

### bigram

#### with stop words

In [None]:
# Count bigrams (stop words included) over every file in `corpus`.
dict_bigram = Dict{String, Int}()
for sent in corpus
    # Join the lines with spaces to obtain the file's text. `string(sent)`
    # would give the printed form of the vector instead, whose escape
    # sequences (e.g. `\t`) leave stray tokens once punctuation is stripped.
    doc = StringDocument(lowercase(join(sent, " ")))
    language!(doc, Languages.Portuguese())
    remove_case!(doc)
    prepare!(doc, strip_numbers)
    prepare!(doc, strip_punctuation)
    # `ngrams(doc, 2)` returns 1-grams as well as 2-grams; only keys that
    # contain a space are bigrams.
    for (gram, n) in ngrams(doc, 2)
        if ' ' in gram
            dict_bigram[gram] = get(dict_bigram, gram, 0) + n
        end
    end
end

In [None]:
# Bigram counts (stop words included) as a table sorted by descending count,
# ties broken by bigram.
# `keys` and `values` of an unmodified Dict iterate in the same order, so the
# two vectors stay aligned. The typed comprehensions give `String`/`Int`
# columns instead of `Any`.
bigrams = String[k for k in keys(dict_bigram)];
frequency = Int[v for v in values(dict_bigram)];
bigram_df = DataFrame(Dict("Bigram" => bigrams, "n" => frequency));
sort!(bigram_df, [:n, :Bigram], rev=[true, false])
# `first` shows at most 20 rows and does not throw on a shorter table.
first(bigram_df, 20)

In [None]:
# Bar chart of the first ten rows of the sorted bigram table (stop words included).
bar(bigram_df[1:10, :Bigram], bigram_df[1:10, :n]; title="bigram frequency with stopwords")

In [None]:
# Word cloud of the most frequent bigrams (stop words included).
# The rows come from `bigram_df`, which is sorted by descending count, so the
# cloud shows the top entries rather than 200 entries in arbitrary Dict order.
# `first` also copes with fewer than 200 bigrams.
wc = let top = first(bigram_df, 200)
    wordcloud(convert(Vector{String}, top.Bigram), convert(Vector{Float64}, top.n),
        density = 0.1,
        outline = 4,
        linecolor = "purple",
        colors = :Set1_5,
        angles = (0),
        fonts = "Verdana Bold",
        mask = shape(box, 900, 300, cornerradius=0, color=0.95),
        minfontsize = 2, maxfontsize = 12)
end

#### without stop words

In [None]:
# Count bigrams over every file in `corpus` after removing stop words.
dict_bigram = Dict{String, Int}()
for sent in corpus
    # Join the lines with spaces to obtain the file's text. `string(sent)`
    # would give the printed form of the vector instead, whose escape
    # sequences (e.g. `\t`) leave stray tokens once punctuation is stripped.
    doc = StringDocument(lowercase(join(sent, " ")))
    language!(doc, Languages.Portuguese())
    remove_case!(doc)
    prepare!(doc, strip_numbers)
    prepare!(doc, strip_punctuation)
    # Stop words are removed twice over: through the `strip_stopwords` flag
    # and explicitly with the `Mystopwords` list.
    prepare!(doc, strip_stopwords)
    remove_words!(doc, Mystopwords)
    # `ngrams(doc, 2)` returns 1-grams as well as 2-grams; only keys that
    # contain a space are bigrams.
    for (gram, n) in ngrams(doc, 2)
        if ' ' in gram
            dict_bigram[gram] = get(dict_bigram, gram, 0) + n
        end
    end
end

In [None]:
# Bigram counts (stop words removed) as a table sorted by descending count,
# ties broken by bigram.
# `keys` and `values` of an unmodified Dict iterate in the same order, so the
# two vectors stay aligned. The typed comprehensions give `String`/`Int`
# columns instead of `Any`.
bigrams2 = String[k for k in keys(dict_bigram)];
frequency2 = Int[v for v in values(dict_bigram)];
bigram_df = DataFrame(Dict("Bigram" => bigrams2, "n" => frequency2));
sort!(bigram_df, [:n, :Bigram], rev=[true, false])
# `first` shows at most 20 rows and does not throw on a shorter table.
first(bigram_df, 20)

In [None]:
# Bar chart of the first ten rows of the sorted bigram table (stop words removed).
bar(bigram_df[1:10, :Bigram], bigram_df[1:10, :n]; title="bigram frequency without stopwords")

In [None]:
# Word cloud of the most frequent bigrams after stop-word removal.
# The rows come from `bigram_df`, which is sorted by descending count, so the
# cloud shows the top entries rather than 200 entries in arbitrary Dict order.
# `first` also copes with fewer than 200 bigrams.
wc = let top = first(bigram_df, 200)
    wordcloud(convert(Vector{String}, top.Bigram), convert(Vector{Float64}, top.n),
        density = 0.1,
        outline = 4,
        linecolor = "purple",
        colors = :Set1_5,
        angles = (0),
        fonts = "Verdana Bold",
        mask = shape(box, 900, 300, cornerradius=0, color=0.95),
        minfontsize = 2, maxfontsize = 12)
end

### trigram

#### with stop words

In [None]:
# Count trigrams (stop words included) over every file in `corpus`.
dict_trigram = Dict{String, Int}()
for sent in corpus
    # Join the lines with spaces to obtain the file's text. `string(sent)`
    # would give the printed form of the vector instead, whose escape
    # sequences (e.g. `\t`) leave stray tokens once punctuation is stripped.
    doc = StringDocument(lowercase(join(sent, " ")))
    language!(doc, Languages.Portuguese())
    remove_case!(doc)
    prepare!(doc, strip_numbers)
    prepare!(doc, strip_punctuation)
    # `ngrams(doc, 3)` returns 1-grams and 2-grams as well; a trigram key
    # contains exactly two spaces.
    for (gram, n) in ngrams(doc, 3)
        if count(==(' '), gram) == 2
            dict_trigram[gram] = get(dict_trigram, gram, 0) + n
        end
    end
end

In [None]:
# Trigram counts (stop words included) as a table sorted by descending count,
# ties broken by trigram.
# `keys` and `values` of an unmodified Dict iterate in the same order, so the
# two vectors stay aligned. The typed comprehensions give `String`/`Int`
# columns instead of `Any`.
trigrams = String[k for k in keys(dict_trigram)];
frequency = Int[v for v in values(dict_trigram)];
trigram_df = DataFrame(Dict("Trigram" => trigrams, "n" => frequency));
sort!(trigram_df, [:n, :Trigram], rev=[true, false])
# `first` shows at most 20 rows and does not throw on a shorter table.
first(trigram_df, 20)

In [None]:
# Bar chart of the first ten rows of the sorted trigram table (stop words included).
bar(trigram_df[1:10, :Trigram], trigram_df[1:10, :n]; title="trigram frequency with stopwords")

In [None]:
# Word cloud of the most frequent trigrams (stop words included).
# The rows come from `trigram_df`, which is sorted by descending count, so the
# cloud shows the top entries rather than 200 entries in arbitrary Dict order.
# `first` also copes with fewer than 200 trigrams.
wc = let top = first(trigram_df, 200)
    wordcloud(convert(Vector{String}, top.Trigram), convert(Vector{Float64}, top.n),
        density = 0.1,
        outline = 4,
        linecolor = "purple",
        colors = :Set1_5,
        angles = (0),
        fonts = "Verdana Bold",
        mask = shape(box, 900, 300, cornerradius=0, color=0.95),
        minfontsize = 2, maxfontsize = 12)
end

#### without stop words

In [None]:
# Count trigrams over every file in `corpus` after removing stop words.
dict_trigram = Dict{String, Int}()
for sent in corpus
    # Join the lines with spaces to obtain the file's text. `string(sent)`
    # would give the printed form of the vector instead, whose escape
    # sequences (e.g. `\t`) leave stray tokens once punctuation is stripped.
    doc = StringDocument(lowercase(join(sent, " ")))
    language!(doc, Languages.Portuguese())
    remove_case!(doc)
    prepare!(doc, strip_numbers)
    prepare!(doc, strip_punctuation)
    # Stop words are removed twice over: through the `strip_stopwords` flag
    # and explicitly with the `Mystopwords` list.
    prepare!(doc, strip_stopwords)
    remove_words!(doc, Mystopwords)
    # `ngrams(doc, 3)` returns 1-grams and 2-grams as well; a trigram key
    # contains exactly two spaces.
    for (gram, n) in ngrams(doc, 3)
        if count(==(' '), gram) == 2
            dict_trigram[gram] = get(dict_trigram, gram, 0) + n
        end
    end
end

In [None]:
# Trigram counts (stop words removed) as a table sorted by descending count,
# ties broken by trigram.
# `keys` and `values` of an unmodified Dict iterate in the same order, so the
# two vectors stay aligned. The typed comprehensions give `String`/`Int`
# columns instead of `Any`.
trigrams2 = String[k for k in keys(dict_trigram)];
frequency2 = Int[v for v in values(dict_trigram)];
trigram_df = DataFrame(Dict("Trigram" => trigrams2, "n" => frequency2));
sort!(trigram_df, [:n, :Trigram], rev=[true, false])
# `first` shows at most 20 rows and does not throw on a shorter table.
first(trigram_df, 20)

In [None]:
# Bar chart of the first ten rows of the sorted trigram table (stop words removed).
bar(trigram_df[1:10, :Trigram], trigram_df[1:10, :n]; title="trigram frequency without stopwords")

In [None]:
# Word cloud of the most frequent trigrams after stop-word removal.
# The rows come from `trigram_df`, which is sorted by descending count, so the
# cloud shows the top entries rather than 200 entries in arbitrary Dict order.
# `first` also copes with fewer than 200 trigrams.
wc = let top = first(trigram_df, 200)
    wordcloud(convert(Vector{String}, top.Trigram), convert(Vector{Float64}, top.n),
        density = 0.1,
        outline = 4,
        linecolor = "purple",
        colors = :Set1_5,
        angles = (0),
        fonts = "Verdana Bold",
        mask = shape(box, 900, 300, cornerradius=0, color=0.95),
        minfontsize = 2, maxfontsize = 12)
end