In [None]:
import re
import nltk
import spacy
import os
from sklearn.feature_extraction.text import CountVectorizer
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import wordcloud
from collections import Counter

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Default size (width, height in inches) for every figure created below.
plt.rcParams["figure.figsize"] = (15,8)

In [None]:
# Names of the ata files to load. os.listdir returns entries in arbitrary
# order and also lists sub-directories (e.g. .ipynb_checkpoints), so keep only
# regular files and sort them to make the corpus order reproducible.
listAtas = sorted(
    name for name in os.listdir("../atas")
    if os.path.isfile(os.path.join("../atas", name))
)

In [None]:
# Build the corpus: one string per non-empty ata file.
corpus = []

for ata in listAtas:
    with open("../atas/" + ata, 'rt', encoding='utf-8') as f:
        # Iterating the handle yields the same lines readlines() would return;
        # they are glued together with a single space between them.
        content = ' '.join(f)
    # An empty file yields an empty string and is left out of the corpus.
    if content:
        corpus.append(content)

print(len(corpus),"atas")

# Corpus Characteristics 

In [None]:
%%time
# Derived views of the whole corpus used by the statistics cell below.
# NOTE: the misspelled "corups..." names are kept because the statistics cell reads them.
corpusJoined = ' '.join(corpus)
# Drop every character that is neither a word character nor whitespace.
corpusJoinedWithoutPunctuation = re.sub(r'[^\w\s]', '', corpusJoined)
# The corpus is Portuguese, so use the Portuguese Punkt model instead of
# NLTK's default English one (affects abbreviation / sentence-boundary handling).
corupsWordTokenized = nltk.word_tokenize(corpusJoined, language='portuguese')
corupsWordTokenizedWithoutPunctuation = nltk.word_tokenize(
    corpusJoinedWithoutPunctuation, language='portuguese'
)
# Remove all whitespace (spaces, newlines, tabs) so that line breaks are not
# counted as characters in the "without spaces" and characters-per-word figures.
corpusJoinedWithoutSpaces = re.sub(r'\s', '', corpusJoined)
corpusSentences = nltk.tokenize.sent_tokenize(corpusJoined, language='portuguese')

In [None]:
# Sizes reused by several of the statistics printed below.
wordCount = len(corupsWordTokenizedWithoutPunctuation)
sentenceCount = len(corpusSentences)
charCountWithoutSpaces = len(corpusJoinedWithoutSpaces)

# Print each statistic as "<label> <value>", one per line.
for label, value in [
    ("Number of characters with spaces: ", len(corpusJoined)),
    ("Number of characters without spaces: ", charCountWithoutSpaces),
    ("Number of words: ", wordCount),
    ("Number of sentences: ", sentenceCount),
    ("Number of characters per words: ", charCountWithoutSpaces / wordCount),
    ("Number of words per sentence: ", wordCount / sentenceCount),
]:
    print(label, value)

# Frequencies 

In [None]:
# Lower-case the joined text used by the word clouds.
corpusJoined = corpusJoined.lower()
# Lower-case every ata and strip its newline characters, updating the list in place.
corpus[:] = [ata.lower().replace('\n', '') for ata in corpus]

In [None]:
# Stop-word list: a few extra words, month/time words, the digits 0-9 and
# NLTK's Portuguese stop words, concatenated in that order.
Mystopwords = (
    ['ainda', 'ante', 'p', 'r', 'sobre']
    + [
        'janeiro', 'fevereiro', 'março', 'abril', 'maio', 'junho',
        'julho', 'agosto', 'setembro', 'outubro', 'novembro', 'dezembro',
        'mês', 'meses', 'ano', 'anos',
    ]
    + list(map(str, range(10)))
    + nltk.corpus.stopwords.words('portuguese')
)


## Character frequency 

In [None]:
# Character-level vectorizer fitted on the corpus (fit returns the estimator itself).
charCountVect = CountVectorizer(analyzer='char').fit(corpus)

In [None]:
# Document-by-character count matrix and its column totals.
bagOfChar = charCountVect.transform(corpus)
sumChars = bagOfChar.sum(axis=0)
# (character, total count) pairs, most frequent first.
charsFreq = sorted(
    ((char, sumChars[0, idx]) for char, idx in charCountVect.vocabulary_.items()),
    key=lambda pair: pair[1],
    reverse=True,
)

In [None]:
# Keep at most the 20 most frequent characters. The count is clamped to the
# number of distinct characters so a small vocabulary cannot raise IndexError.
numberOfChars = min(20, len(charsFreq))
yPos = np.arange(numberOfChars)
topChars = charsFreq[:numberOfChars]
# Bar labels (characters) and bar lengths (counts), in descending frequency.
objects = [char for char, _ in topChars]
performance = [count for _, count in topChars]

In [None]:
# Horizontal bar chart: one bar per character, labelled on the y axis.
fig, ax = plt.subplots()
ax.barh(yPos, performance, align='center', alpha=0.5)
ax.set_yticks(yPos)
ax.set_yticklabels(objects)
ax.set_xlabel('Frequency')
ax.set_ylabel('Characters')
ax.set_title('Character Frequency')
plt.show()


## Word Frequency

In [None]:
def frequencyPlot(listText, number_of_words=20, stopwords=None, ngramRange=(1, 1), vocabulary=None):
    """Plot a horizontal bar chart of the most frequent tokens (or n-grams).

    Parameters
    ----------
    listText : iterable of str
        Documents to count tokens in.
    number_of_words : int, default 20
        Maximum number of bars. Fewer are drawn when the vocabulary is smaller.
    stopwords : list of str or None, default None
        Passed to ``CountVectorizer(stop_words=...)``; ``None`` removes nothing.
    ngramRange : tuple of (int, int), default (1, 1)
        Passed to ``CountVectorizer(ngram_range=...)``.
    vocabulary : mapping or iterable or None, default None
        Passed to ``CountVectorizer(vocabulary=...)``.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    count_vect = CountVectorizer(
        analyzer='word',
        stop_words=stopwords,
        ngram_range=ngramRange,
        vocabulary=vocabulary
    )
    # fit_transform learns the vocabulary and counts it in one pass over the texts.
    bag_of_words = count_vect.fit_transform(listText)
    sum_words = bag_of_words.sum(axis=0)
    # (token, total count) pairs, most frequent first.
    word_freq = sorted(
        ((word, sum_words[0, idx]) for word, idx in count_vect.vocabulary_.items()),
        key=lambda pair: pair[1],
        reverse=True,
    )
    # Slicing (instead of indexing 0..number_of_words-1) tolerates vocabularies
    # with fewer entries than requested.
    top = word_freq[:max(number_of_words, 0)]
    objects = [word for word, _ in top]
    performance = [count for _, count in top]
    # Bar positions must come from this call, not from a notebook-level global,
    # so the function works for any number_of_words and on a fresh kernel.
    y_pos = np.arange(len(top))
    plt.barh(y_pos, performance, align='center', alpha=0.5)
    plt.yticks(y_pos, objects)
    plt.xlabel('Frequency')
    plt.ylabel('Tokens')
    plt.title('Frequency of tokens')
    plt.show()


In [None]:
def wordcloudPlot(text, stopwords=None, max_font_size=50, max_words=100, background_color="white"):
    """Render a word cloud of ``text`` and display it.

    Parameters
    ----------
    text : str
        Text to draw; it is lower-cased before being passed to WordCloud.
    stopwords : iterable of str or None, default None
        Words to exclude. ``None`` means no word is excluded.
    max_font_size : int, default 50
        Passed to ``WordCloud(max_font_size=...)``.
    max_words : int, default 100
        Passed to ``WordCloud(max_words=...)``.
    background_color : str, default "white"
        Passed to ``WordCloud(background_color=...)``.

    Returns
    -------
    None
        The image is displayed with ``plt.show()``.
    """
    # WordCloud replaces stopwords=None with its built-in English STOPWORDS,
    # which would silently filter words from a "with stop words" cloud.
    # Passing an explicit (possibly empty) set disables that fallback.
    stopword_set = set() if stopwords is None else set(stopwords)
    cloud = wordcloud.WordCloud(stopwords=stopword_set, max_font_size=max_font_size,
                                max_words=max_words, background_color=background_color).generate(text.lower())

    # Display the generated image without axes.
    plt.imshow(cloud, interpolation='bilinear')
    plt.axis("off")
    plt.show()

In [None]:
# Number of top tokens (words or n-grams) requested from frequencyPlot in the
# bar-plot cells below.
numberOfWords = 20

### With stop words

In [None]:
# Top unigrams; no stop-word list is passed to the vectorizer.
frequencyPlot(
    listText=corpus,
    number_of_words=numberOfWords,
    stopwords=None,
    ngramRange=(1, 1),
)

### Wordcloud with stop words

In [None]:
# Word cloud of the joined corpus, called without a custom stop-word list.
wordcloudPlot(text=corpusJoined, stopwords=None)

### Without stop words

In [None]:
# Top unigrams with the Mystopwords list passed to the vectorizer.
frequencyPlot(
    listText=corpus,
    number_of_words=numberOfWords,
    stopwords=Mystopwords,
    ngramRange=(1, 1),
)

### Wordcloud without stop words

In [None]:
# Word cloud of the joined corpus with the Mystopwords list excluded.
wordcloudPlot(text=corpusJoined, stopwords=Mystopwords)

## Bigram Frequency
 

### With stop words

In [None]:
# Top bigrams; no stop-word list is passed to the vectorizer.
frequencyPlot(
    listText=corpus,
    number_of_words=numberOfWords,
    stopwords=None,
    ngramRange=(2, 2),
)

### Without stop words

In [None]:
# Top bigrams with the Mystopwords list passed to the vectorizer.
frequencyPlot(
    listText=corpus,
    number_of_words=numberOfWords,
    stopwords=Mystopwords,
    ngramRange=(2, 2),
)

## Trigram Frequency

### With stop words

In [None]:
# Top trigrams; no stop-word list is passed to the vectorizer.
frequencyPlot(
    listText=corpus,
    number_of_words=numberOfWords,
    stopwords=None,
    ngramRange=(3, 3),
)

### Without stop words

In [None]:
# Top trigrams with the Mystopwords list passed to the vectorizer.
frequencyPlot(
    listText=corpus,
    number_of_words=numberOfWords,
    stopwords=Mystopwords,
    ngramRange=(3, 3),
)

# Parsing

In [None]:
# Load the spaCy pipeline named 'pt_core_news_sm' (Portuguese); the model
# package must already be installed in the kernel's environment.
nlp = spacy.load('pt_core_news_sm')

In [None]:
%%time
# Parallel per-token lists (surface form, POS tag, morphology string,
# dependency label) and parallel per-entity lists (text, label).
text, pos, tag, dep = [], [], [], []
ent_text, ent_label = [], []

for ata in corpus:
    doc = nlp(ata)
    # Token attributes, appended in document order.
    text.extend(token.text for token in doc)
    pos.extend(token.pos_ for token in doc)
    # `tag` stores the string form of each token's morphological features.
    tag.extend(str(token.morph) for token in doc)
    dep.extend(token.dep_ for token in doc)
    # Named entities found in this document.
    ent_text.extend(ent.text for ent in doc.ents)
    ent_label.extend(ent.label_ for ent in doc.ents)

In [None]:
# One row per token: its surface form and its part-of-speech tag.
df = pd.DataFrame({'word': text, 'pos': pos})

## nouns

In [None]:
# 20 most frequent words tagged as proper noun or noun.
df.loc[df['pos'].isin(['PROPN', 'NOUN']), 'word'].value_counts().head(20).plot.bar(rot=45);

## adjective 

In [None]:
# 20 most frequent words tagged as adjective.
df.loc[df['pos'] == 'ADJ', 'word'].value_counts().head(20).plot.bar(rot=45);

## verb

In [None]:
# 20 most frequent words tagged as auxiliary or verb.
df.loc[df['pos'].isin(['AUX', 'VERB']), 'word'].value_counts().head(20).plot.bar(rot=45);

## adverb

In [None]:
# 20 most frequent words tagged as adverb.
df.loc[df['pos'] == 'ADV', 'word'].value_counts().head(20).plot.bar(rot=45);

## conjunction

In [None]:
# 10 most frequent words tagged with any of the conjunction tags.
df.loc[df['pos'].isin(['CONJ', 'CCONJ', 'SCONJ']), 'word'].value_counts().head(10).plot.bar(rot=45);

## punctuation

In [None]:
# 5 most frequent tokens tagged as punctuation.
df.loc[df['pos'] == 'PUNCT', 'word'].value_counts().head(5).plot.bar(rot=45);

## determiner

In [None]:
# 10 most frequent words tagged as determiner.
df.loc[df['pos'] == 'DET', 'word'].value_counts().head(10).plot.bar(rot=45);

## pronoun

In [None]:
# 10 most frequent words tagged as pronoun.
df.loc[df['pos'] == 'PRON', 'word'].value_counts().head(10).plot.bar(rot=45);

## numbers

In [None]:
# 10 most frequent tokens tagged as numeral.
df.loc[df['pos'] == 'NUM', 'word'].value_counts().head(10).plot.bar(rot=45);

## symbol

In [None]:
# 5 most frequent tokens tagged as symbol.
df.loc[df['pos'] == 'SYM', 'word'].value_counts().head(5).plot.bar(rot=45);

## Part-of-speech

In [None]:
# Count of every POS tag, highest first (most_common() without an argument
# returns all items sorted by descending count).
pos_ = dict(Counter(pos).most_common())
plt.bar(range(len(pos_)), list(pos_.values()), align='center')
plt.xticks(range(len(pos_)), list(pos_), rotation=45)
plt.show()

## morphology tags

In [None]:
# The 10 most frequent morphology strings and their counts.
tag_ = dict(Counter(tag).most_common(10))
plt.bar(range(len(tag_)), list(tag_.values()), align='center')
plt.xticks(range(len(tag_)), list(tag_), rotation=60)
plt.show()

## dependencies

In [None]:
# Count of every dependency label, highest first (most_common() without an
# argument returns all items sorted by descending count).
dep_ = dict(Counter(dep).most_common())
plt.bar(range(len(dep_)), list(dep_.values()), align='center')
plt.xticks(range(len(dep_)), list(dep_), rotation=45)
plt.show()

## entities

### texts

In [None]:
# The 15 most frequent entity texts and their counts.
ent_ = dict(Counter(ent_text).most_common(15))
plt.bar(range(len(ent_)), list(ent_.values()), align='center')
plt.xticks(range(len(ent_)), list(ent_), rotation=45)
plt.show()

### labels

In [None]:
# The 5 most frequent entity labels and their counts.
ent_ = dict(Counter(ent_label).most_common(5))
plt.bar(range(len(ent_)), list(ent_.values()), align='center')
plt.xticks(range(len(ent_)), list(ent_), rotation=45)
plt.show()