In [None]:
#############################################################################################################
##### Notebook Processamento de Linguagem natural (PLN)
##### Baseado em:
## Natural Language Processing with Python (book)
##
##############################################################################################################
## Objetivos:
##   Mostrar varios metodos de linguagem natural utilizando Python
###################################################################################################################

In [None]:
################################################
### 01 - Tokenizacao
################################################

from nltk.tokenize import WhitespaceTokenizer
from nltk.tokenize import WordPunctTokenizer
from nltk.tokenize import TreebankWordTokenizer

# Sample sentence containing a possessive ("Mary's") and a contraction ("isn't").
text = "This is Mary's car, isn't it?"

# One instance of each tokenizer; they are applied in this order.
tk_list = [
    WhitespaceTokenizer(),
    WordPunctTokenizer(),
    TreebankWordTokenizer(),
]

# Print the token list each tokenizer produces for the same sentence.
for tk in tk_list:
    result = tk.tokenize(text)
    print(result)


In [None]:
### Tokenizacao em portugues

import nltk
from nltk import tokenize

# Portuguese sample text (the original comment attributes it to Graciliano Ramos).
text = (
    "guarda-chuva Se a única coisa que de o homem terá certeza é a morte; "
    "a única certeza do brasileiro é o carnaval no próximo ano."
)

# Word-level tokenization with the Portuguese language setting.
result = tokenize.word_tokenize(text, language='portuguese')

print(result)

In [None]:
################################################
### 02 - Stemming
################################################

import nltk
from nltk.stem import PorterStemmer

ps = PorterStemmer()

example_words = [
    "program",
    "programming",
    "programer",
    "programs",
    "programmed",
]

# Two columns of width 20: a header row, then one row per word with its stem.
print("{0:20}{1:20}".format("--Word--", "--Stem--"))
for word in example_words:
    stem = ps.stem(word)
    print("{0:20}{1:20}".format(word, stem))

In [None]:
# Same two-column table as the previous cell, for a second word list.
# Uses the stemmer `ps` created in the previous cell.
example_words = ["programmers", "because", "people"]
print("{0:20}{1:20}".format("--Word--", "--Stem--"))
for word in example_words:
    stem = ps.stem(word)
    print("{0:20}{1:20}".format(word, stem))


In [None]:
################################################
### 03 - Lemmatization
################################################

from nltk.stem import WordNetLemmatizer
# Download the WordNet data used by the lemmatizer.
nltk.download('wordnet')

wnl = WordNetLemmatizer()
words = ['dogs', 'churches', 'aardwolves', 'abaci', 'hardrock']

# Print the lemma of each word, one per line.
for w in words:
    lemma = wnl.lemmatize(w)
    print(lemma)

In [None]:
################################################
### 04 - Stopwords
################################################

# Download the stopword corpus so this cell also runs on a fresh environment,
# like the other cells that download their own NLTK resources.
nltk.download('stopwords')

# Portuguese stopword list: show the first 15 entries, then the total count
# as the cell's output.
stopwords = nltk.corpus.stopwords.words('portuguese')
print(stopwords[:15])
len(stopwords)


In [None]:
################################################
### 04 - Caracterizacao das palavras - Tfidf
################################################

from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
# Five tiny documents used as the corpus.
texts = ["bad movie", "not a good movie", "did not like", "i like it", "good me"]

# Unigrams and bigrams, filtered by document frequency (min_df=2, max_df=0.5).
tfidf = TfidfVectorizer(ngram_range=(1, 2), min_df=2, max_df=0.5)
features = tfidf.fit_transform(texts)

# Dense table with one row per document and one column per retained term.
feature_names = tfidf.get_feature_names_out()
df = pd.DataFrame(features.toarray(), columns=feature_names)
print(df)

In [None]:
################################################
### Extra - Lembrando como fazer hashing
################################################

import hashlib

def hash_token(token, b):
    """Hash a string token into one of ``2**b`` buckets using SHA-256.

    Args:
        token: Text to hash; it is encoded with ``str.encode()`` (UTF-8 by default).
        b: Exponent of the bucket count; the digest is reduced modulo ``2**b``.

    Returns:
        The SHA-256 digest read as a big-endian integer, modulo ``2**b``.
    """
    digest = hashlib.sha256(token.encode()).digest()
    # Big-endian bytes give the same integer as parsing the hex digest in base 16.
    return int.from_bytes(digest, "big") % (2 ** b)

# Example usage
b = 10  # exponent: hash_token reduces modulo 2**b, i.e. 2**10 = 1024 buckets
token = "dfadfasdfasdfasdfadfadfadfasdfasdfasdfa"
hashed_value = hash_token(token, b)  # integer in the range 0..1023
print(hashed_value)


In [None]:
################################################
### 05 - Caracterizacao das palavras - wordvector
################################################

from gensim.models import Word2Vec

# Sentences: each one is already a list of tokens.
sentences = [["gato", "persegue", "rato"], ["cachorro", "late", "muito"], ["lobo", "uiva"]]

# Train the Word2Vec model; min_count=1 keeps every word, because words with
# a frequency below min_count are ignored.
model = Word2Vec(sentences, min_count=1)

# Vector learned for the word 'gato'.
vector = model.wv['gato']
print("Vetor representando 'gato':", vector)

# Words most similar to 'gato' (the label previously repeated 'gato' twice).
similar_words = model.wv.most_similar('gato')
print("Similaridade das palavras em relacao a 'gato':", similar_words)


In [None]:
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
import nltk
#nltk.download('punkt')

# Sample corpus
corpus = [
    "I like to eat apples",
    "I like bananas",
    "I enjoy eating oranges"
]

# Lower-case every sentence and split it into word tokens.
tokenized_corpus = []
for sentence in corpus:
    tokenized_corpus.append(word_tokenize(sentence.lower()))

# Train the Word2Vec model on the tokenized sentences.
model = Word2Vec(
    sentences=tokenized_corpus,
    vector_size=100,
    window=5,
    min_count=1,
    sg=0,
)

# Vector learned for "apples".
word_vector = model.wv["apples"]
print("Vector for 'apples':", word_vector)

# Words most similar to "apples".
similar_words = model.wv.most_similar("apples")
print("Similar words to 'apples':", similar_words)

In [None]:
##################################################################################################################
##################################################################################################################
##################################################################################################################

In [None]:
def readData():
    """Return the built-in toy corpus as one flat list of words.

    The four hard-coded sentences are split on whitespace and concatenated
    in order. The resulting list is printed before it is returned.

    Returns:
        list[str]: every word of every sentence, in corpus order.
    """
    data = ['This is a  dog','This is a cat','I love my cat','This is my name ']
    # str.split() without arguments drops the repeated and trailing spaces.
    dat = [word for sentence in data for word in sentence.split()]
    print(dat)
    return dat

def createBigram(data):
    """Collect bigrams and unigram/bigram counts from a flat list of words.

    A pair ``(data[i], data[i + 1])`` is recorded as a bigram only when the
    second word is all lower-case (``str.islower()``), so pairs whose second
    word starts with a capital letter are skipped.

    Every word is counted as a unigram, including the last one. The previous
    version stopped one element early and left the final word out of the
    unigram counts.

    Args:
        data: list of word strings.

    Returns:
        tuple: ``(listOfBigrams, unigramCounts, bigramCounts)`` where
        ``listOfBigrams`` lists every recorded bigram in order (with repeats),
        ``unigramCounts`` maps word -> occurrences, and ``bigramCounts`` maps
        bigram tuple -> occurrences.
    """
    listOfBigrams = []
    bigramCounts = {}
    unigramCounts = {}
    for i, word in enumerate(data):
        unigramCounts[word] = unigramCounts.get(word, 0) + 1

        # The last word has no successor, so it cannot start a bigram.
        if i + 1 < len(data) and data[i + 1].islower():
            bigram = (word, data[i + 1])
            listOfBigrams.append(bigram)
            bigramCounts[bigram] = bigramCounts.get(bigram, 0) + 1
    return listOfBigrams, unigramCounts, bigramCounts


def calcBigramProb(listOfBigrams, unigramCounts, bigramCounts):
    """Compute the probability of each bigram as count(w1, w2) / count(w1).

    Args:
        listOfBigrams: iterable of ``(word1, word2)`` tuples (repeats allowed).
        unigramCounts: mapping word -> occurrences.
        bigramCounts: mapping bigram tuple -> occurrences.

    Returns:
        dict mapping each bigram to its count divided by the count of its
        first word.
    """
    return {
        pair: bigramCounts.get(pair) / unigramCounts.get(pair[0])
        for pair in listOfBigrams
    }


if __name__ == '__main__':
    # Build the bigram model from the toy corpus and show each intermediate table.
    data = readData()
    listOfBigrams, unigramCounts, bigramCounts = createBigram(data)

    print("\n All the possible Bigrams are ")
    print(listOfBigrams)

    print("\n Bigrams along with their frequency ")
    print(bigramCounts)

    print("\n Unigrams along with their frequency ")
    print(unigramCounts)

    bigramProb = calcBigramProb(listOfBigrams, unigramCounts, bigramCounts)

    print("\n Bigrams along with their probability ")
    print(bigramProb)

    # Score a sentence as the product of the probabilities of its bigrams.
    inputList = "This is my cat"
    splt = inputList.split()
    bilist = list(zip(splt, splt[1:]))  # consecutive word pairs

    print("\n The bigrams in given sentence are ")
    print(bilist)

    outputProb1 = 1
    for bigram in bilist:
        # A bigram absent from the model contributes 0 and zeroes the product.
        outputProb1 *= bigramProb.get(bigram, 0)

    # The message is built from inputList so it cannot drift from the scored sentence.
    print('\n' + 'Probability of sentence "' + inputList + '" = ' + str(outputProb1))

In [None]:
import joblib
from nltk import word_tokenize

# NOTE(review): hard-coded absolute path to one user's Windows desktop, so this
# cell only runs on a machine that has this folder. `folder` is also used by the
# later cell that writes 'POS_tagger_bigram2.pkl'.
folder = 'C:/Users/dealbuqc/Desktop/UFPB/Classes/SBC/'
# NOTE(review): the file loaded here is the one written further down with
# pickle.dump(t_bi, ...), so on a fresh run it must already exist from an
# earlier session. Unpickling can execute arbitrary code; only load files you
# created yourself.
teste_tagger = joblib.load(folder+'POS_tagger_bigram2.pkl')
phrase = 'O rato roeu a roupa do rei de Roma'
# Tokenize the sentence and tag each token; the tagged list is the cell output.
teste_tagger.tag(word_tokenize(phrase))


In [None]:
import nltk
from nltk import word_tokenize
import numpy as np
import joblib

nltk.download('mac_morpho')

In [None]:
dataset = list(nltk.corpus.mac_morpho.tagged_sents())

In [None]:
dataset[500]

In [None]:
# 80/20 split in corpus order (no shuffling): the first 80% of the sentences,
# rounded up, are used for training and the remainder for testing.
tot = len(dataset)
tot_train_samples = int(np.ceil(0.8 * tot))

train_data, test_data = dataset[:tot_train_samples], dataset[tot_train_samples:]

In [None]:
# Backoff chain: a default tagger that always answers 'N', then suffix taggers
# of length 2..6, each one backing off to the tagger built just before it.
t_def = nltk.DefaultTagger('N')
affix_chain = [t_def]
for suffix_len in range(2, 7):
    affix_chain.append(
        nltk.AffixTagger(train_data, affix_length=-suffix_len, backoff=affix_chain[-1])
    )
t_affix2, t_affix3, t_affix4, t_affix5, t_affix6 = affix_chain[1:]

# Accuracy in percent on the test sentences, in the order the taggers were built.
acc_def, acc_af2, acc_af3, acc_af4, acc_af5, acc_af6 = (
    tagger.evaluate(test_data) * 100 for tagger in affix_chain
)

report = '''Performance dos taggers:
         - Default:                     {:.2f}%
         - Sufixo tamanho 2 + Default:  {:.2f}%
         - Sufixo tamanho 3 + Sufixo 2: {:.2f}%
         - Sufixo tamanho 4 + Sufixo 3: {:.2f}%
         - Sufixo tamanho 5 + Sufixo 4: {:.2f}%
         - Sufixo tamanho 6 + Sufixo 5: {:.2f}%'''
print(report.format(acc_def, acc_af2, acc_af3, acc_af4, acc_af5, acc_af6))

In [None]:
# NOTE(review): this cell repeats the preceding cell exactly, so it re-trains
# and re-evaluates the same taggers and rebinds the same names.
#
# Backoff chain: a default tagger that always answers 'N', then suffix taggers
# of length 2..6, each one backing off to the tagger built just before it.
t_def = nltk.DefaultTagger('N')
affix_chain = [t_def]
for suffix_len in range(2, 7):
    affix_chain.append(
        nltk.AffixTagger(train_data, affix_length=-suffix_len, backoff=affix_chain[-1])
    )
t_affix2, t_affix3, t_affix4, t_affix5, t_affix6 = affix_chain[1:]

# Accuracy in percent on the test sentences, in the order the taggers were built.
acc_def, acc_af2, acc_af3, acc_af4, acc_af5, acc_af6 = (
    tagger.evaluate(test_data) * 100 for tagger in affix_chain
)

report = '''Performance dos taggers:
         - Default:                     {:.2f}%
         - Sufixo tamanho 2 + Default:  {:.2f}%
         - Sufixo tamanho 3 + Sufixo 2: {:.2f}%
         - Sufixo tamanho 4 + Sufixo 3: {:.2f}%
         - Sufixo tamanho 5 + Sufixo 4: {:.2f}%
         - Sufixo tamanho 6 + Sufixo 5: {:.2f}%'''
print(report.format(acc_def, acc_af2, acc_af3, acc_af4, acc_af5, acc_af6))

In [None]:
# Unigram tagger that backs off to the length-5 suffix tagger (t_affix5).
t_uni = nltk.UnigramTagger(train_data, backoff=t_affix5)

acc_uni = t_uni.evaluate(test_data) * 100

# The unigram row names "Sufixo 5" because the backoff really is t_affix5;
# the label previously said "Sufixo 6".
print('''Performance dos taggers:
         - Default:                     {:.2f}%
         - Sufixo tamanho 2 + Default:  {:.2f}%
         - Sufixo tamanho 3 + Sufixo 2: {:.2f}%
         - Sufixo tamanho 4 + Sufixo 3: {:.2f}%
         - Sufixo tamanho 5 + Sufixo 4: {:.2f}%
         - Sufixo tamanho 6 + Sufixo 5: {:.2f}%
         - Unigrama + Sufixo 5:         {:.2f}%'''.format(acc_def, acc_af2, acc_af3,
                                                          acc_af4, acc_af5, acc_af6,
                                                          acc_uni))

In [None]:
# Bigram tagger backing off to the unigram tagger, and a trigram tagger
# backing off to the bigram tagger.
t_bi = nltk.BigramTagger(train_data, backoff=t_uni)
t_tri = nltk.TrigramTagger(train_data, backoff=t_bi)

acc_bi = t_bi.evaluate(test_data) * 100
acc_tri = t_tri.evaluate(test_data) * 100

# The unigram row names "Sufixo 5": t_uni is built with backoff=t_affix5, and
# the label previously said "Sufixo 6".
print('''Performance dos taggers:
         - Default:                     {:.2f}%
         - Sufixo tamanho 2 + Default:  {:.2f}%
         - Sufixo tamanho 3 + Sufixo 2: {:.2f}%
         - Sufixo tamanho 4 + Sufixo 3: {:.2f}%
         - Sufixo tamanho 5 + Sufixo 4: {:.2f}%
         - Sufixo tamanho 6 + Sufixo 5: {:.2f}%
         - Unigrama + Sufixo 5:         {:.2f}%
         - Bigrama + Unigrama:          {:.2f}%
         - Trigrama + Bigrama:          {:.2f}%'''.format(acc_def, acc_af2, acc_af3,
                                                          acc_af4, acc_af5, acc_af6,
                                                          acc_uni, acc_bi, acc_tri))

In [None]:
import pickle

# Save the bigram tagger to disk. The `with` block closes the file, so the
# data is flushed; the previous inline open() never closed the handle.
with open(folder+'POS_tagger_bigram2.pkl', 'wb') as tagger_file:
    pickle.dump(t_bi, tagger_file)

In [None]:
#---

def viterbi(obs, states, start_p, trans_p, emit_p):
    """Build the Viterbi trellis for an observation sequence.

    Args:
        obs: sequence of observations.
        states: iterable of state labels (iterated several times).
        start_p: mapping state -> initial probability.
        trans_p: nested mapping ``trans_p[prev_state][state]``.
        emit_p: nested mapping ``emit_p[state][observation]``.

    Returns:
        list of dicts, one per observation. Entry ``[t][state]`` is
        ``{"prob": p, "prev": s}``, where ``p`` is the probability of the best
        path that ends in ``state`` at step ``t`` and ``s`` is the predecessor
        state on that path (``None`` at ``t == 0``).
    """
    # First column: start probability times the emission of the first observation.
    trellis = [
        {
            st: {"prob": start_p[st] * emit_p[st][obs[0]], "prev": None}
            for st in states
        }
    ]

    for t in range(1, len(obs)):
        previous = trellis[t - 1]
        column = {}
        for st in states:
            # max() returns the first maximal item, so a tie goes to the
            # earliest state in `states`.
            best_prev = max(
                states, key=lambda p: previous[p]["prob"] * trans_p[p][st]
            )
            best_tr_prob = previous[best_prev]["prob"] * trans_p[best_prev][st]
            column[st] = {
                "prob": best_tr_prob * emit_p[st][obs[t]],
                "prev": best_prev,
            }
        trellis.append(column)
    return trellis

In [None]:
# State labels of the toy weather model.
states = ('Rainy', 'Sunny')

# Observation sequence to decode.
observations = ('walk', 'shop', 'clean')

# Probability of each state at the first step.
start_probability = {'Rainy': 0.6, 'Sunny': 0.4}

# transition_probability[previous_state][next_state]
transition_probability = {
    'Rainy': {'Rainy': 0.7, 'Sunny': 0.3},
    'Sunny': {'Rainy': 0.4, 'Sunny': 0.6},
}

# emission_probability[state][observation]
emission_probability = {
    'Rainy': {'walk': 0.1, 'shop': 0.4, 'clean': 0.5},
    'Sunny': {'walk': 0.6, 'shop': 0.3, 'clean': 0.1},
}



In [None]:
# Run Viterbi on the toy weather model; the returned trellis is the cell output.
viterbi(
    obs=observations,
    states=states,
    start_p=start_probability,
    trans_p=transition_probability,
    emit_p=emission_probability,
)

In [None]:
#-----------------
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import regularizers
import tensorflow.keras.utils as ku 
import numpy as np

In [None]:
tokenizer = Tokenizer()

# Read the whole text file. The `with` block closes the handle, which the
# previous open(...).read() left open.
with open('../input/dtspeech/DTSpeech.txt') as speech_file:
    data = speech_file.read()

# One lower-cased line per corpus entry.
corpus = data.lower().split("\n")
tokenizer.fit_on_texts(corpus)

# Vocabulary size plus one.
# NOTE(review): presumably the +1 accounts for index 0, which is not assigned
# to any word — confirm against the Keras Tokenizer docs.
total_words = len(tokenizer.word_index) + 1