# Imports and package installation

In [1]:
# imports 
from pydub import AudioSegment
from google.cloud import speech
from huggingsound import SpeechRecognitionModel
from transformers import AutoTokenizer, Wav2Vec2ForCTC
import os
import io
import pandas as pd
import IPython

%load_ext autoreload

%autoreload 2

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
#installs
#!pip install pydub
#!pip install leia
#!pip install nltk
#!pip install huggingsound
#!pip install vaderSentiment
#!pip install transformers
#!pip install spacy
#!pip install openai
#!pip install python-dotenv
#!pip install googletrans==3.1.0a0
#install the ffmpeg library for audio conversion: sudo apt-get install ffmpeg

# Data Import and processing

In [3]:
#import tsv with data
text_df = pd.read_csv('../raw_data/commonvoicedataset/validated.tsv', sep='\t', header=0, low_memory=False)

In [4]:
#check df
text_df.head(2)

Unnamed: 0,client_id,path,sentence,up_votes,down_votes,age,gender,accents,locale,segment
0,09e8441e62e3c8da70b667874fa75e96731f6e43e359a1...,common_voice_pt_27283586.mp3,"Se esta primeira condição for satisfeita, é se...",2,0,,,,pt,
1,12baee49ea5072cfd8392765aeb6d0e518a51a23224aa8...,common_voice_pt_25643625.mp3,Nós temos tempo suficiente.,2,1,,,,pt,


In [5]:
# Map each clip's file name to its reference sentence for the first 100 rows
first_rows = text_df[['path', 'sentence']].iloc[0:100]
sample = dict(zip(first_rows['path'], first_rows['sentence']))

In [6]:
# Convert every sampled MP3 clip to a 16 kHz WAV file under ../raw_data
for file in sample:
    sound = AudioSegment.from_mp3(f"../raw_data/commonvoicedataset/clips/{file}")
    sound = sound.set_frame_rate(16000)
    # splitext removes the extension whatever its length, instead of
    # blindly slicing off the last four characters
    wav_path = f"../raw_data/{os.path.splitext(file)[0]}.wav"
    # export() hands back the open output file; close it so one handle
    # per clip is not left dangling
    sound.export(wav_path, format="wav").close()

In [7]:
# save audios names
audio_names = list(sample.keys())

In [8]:
#create paths to audio
audio_paths = []
for name in audio_names:
    audio_paths.append(f"../raw_data/{name[:-4]}.wav")

# Wav2vec Models for transcribing audio

In [None]:
# Load the first Portuguese Wav2Vec2 checkpoint through huggingsound
MODEL_1_ID = "jonatasgrosman/wav2vec2-large-xlsr-53-portuguese"
model = SpeechRecognitionModel(MODEL_1_ID)

# Transcribe every converted clip in one call
transcriptions = model.transcribe(audio_paths)

04/09/2022 15:14:42 - INFO - huggingsound.speech_recognition.model - Loading model...


 29%|▎| 29/100 [00:33<02:00,  1.69s/

In [None]:
#create list of transcriptions results
transcriptions_list = []
for transcript in transcriptions:
    transcriptions_list.append(transcript['transcription'])

In [None]:
#create sample dataframe with filename and sentence
sample_df = pd.DataFrame(list(sample.items()), columns=['file_name', 'sentence'])

In [None]:
#add transcribed sentence list
sample_df['transc_sentence'] = transcriptions_list

In [None]:
#check sample df
sample_df.head()

In [None]:
#check true sentences vs transcribed sentences 
for i, j in sample_df.iterrows():
    print(i, j)

In [None]:
#test another model
  
tokenizer = AutoTokenizer.from_pretrained("Edresson/wav2vec2-large-xlsr-coraa-portuguese")
 
model_2 = SpeechRecognitionModel("Edresson/wav2vec2-large-xlsr-coraa-portuguese")

In [None]:
transcriptions_2 = model_2.transcribe(audio_paths)

## Comparing transcriptions to actual sentences

In [None]:
# Pull the text out of each result returned by the second model
transcriptions_list_2 = [result['transcription'] for result in transcriptions_2]

# Reference sentences and second-model transcriptions side by side
sample_df_2 = pd.DataFrame(
    list(sample.items()), columns=['file_name', 'sentence']
).assign(transc_sentence=transcriptions_list_2)

# Print every row to compare the true sentence with the transcription
for row_idx, row in sample_df_2.iterrows():
    print(row_idx, row)

## Tests with sample audios from WhatsApp

In [None]:
#import audio from Senna speech
senna_sound = AudioSegment.from_ogg("../raw_data/senna.ogg")
senna_sound = senna_sound.set_frame_rate(16000)
senna_sound.export(f"../raw_data/senna.wav", format="wav")

In [None]:
# Transcribe Senna audio with model 1
senna_transc = model.transcribe(["../raw_data/senna.wav"])
senna_transc[0]['transcription']

In [None]:
# Transcribe Senna audio with model 2
senna_transc_2 = model_2.transcribe(["../raw_data/senna.wav"])
senna_transc_2[0]['transcription']

In [None]:
#import sample WhatsApp audio
w_audio = AudioSegment.from_ogg("../raw_data/audio_test.ogg")
w_audio = w_audio.set_frame_rate(16000)
w_audio.export(f"../raw_data/audio_test.wav", format="wav")

In [None]:
# Transcribe sample audio with model 2
w_audio = model_2.transcribe(["../raw_data/audio_test.wav"])
w_audio[0]['transcription']

# LDA Approach to summarization

## Text pre-processing

In [None]:
#clean text

from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize

# Portuguese stop-word list from NLTK; rmv_sw filters tokens against it.
# NOTE(review): presumably needs the NLTK 'stopwords' corpus downloaded beforehand — confirm

stop_words = stopwords.words('portuguese')

def rmv_sw(text):
    """Return `text` with every token found in `stop_words` dropped.

    The text is split with `word_tokenize`, tokens present in the
    module-level `stop_words` collection are discarded (exact,
    case-sensitive match), and the remaining tokens are joined back
    together with single spaces.
    """
    kept = (tok for tok in word_tokenize(text) if tok not in stop_words)
    return ' '.join(kept)

# Stop-word-free version of the WhatsApp sample transcription
texto_audio_w =  rmv_sw(w_audio[0]['transcription'])

# Stop-word-free version of the model-2 Senna transcription (shown as the cell output)
texto_senna = rmv_sw(senna_transc_2[0]['transcription'])
texto_senna

## LDA Model

In [None]:
# Using LDA to extract main topics from text

from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer

def print_topics(texto, n_top_words=10):
    """Fit a one-topic LDA model on `texto` and print its highest-weighted terms.

    Args:
        texto: A single document (string) to analyse.
        n_top_words: How many terms to print per topic. Defaults to 10,
            the value that was previously hard-coded.

    Prints a "Topic <idx>:" line followed by a list of (term, weight)
    pairs in descending weight order. Returns None.
    """
    vectorizer = TfidfVectorizer(max_df = 2).fit([texto])

    data_vectorized = vectorizer.transform([texto])

    # Fixed seed so repeated runs print the same weights
    lda_model = LatentDirichletAllocation(n_components=1, random_state=0).fit(data_vectorized)

    # Look the vocabulary up once instead of once per printed term
    feature_names = vectorizer.get_feature_names_out()

    for idx, topic in enumerate(lda_model.components_):
        print("Topic %d:" % (idx))
        # Indices of the highest weights first, truncated to the requested size
        top_indices = topic.argsort()[::-1][:n_top_words]
        print([(feature_names[i], topic[i]) for i in top_indices])
    

In [None]:
# Print the topic terms extracted from the cleaned Senna transcription

print_topics(texto_senna)

In [None]:
# Print the topic terms extracted from the cleaned WhatsApp sample transcription

print_topics(texto_audio_w)

# Sentiment analysis

In [None]:
#Sentiment analysis

#Use portuguese sentiment analysis lib

from libs.leia.leia import SentimentIntensityAnalyzer

# Analyzer instance shared by every sentence below
s = SentimentIntensityAnalyzer()

# Score the reference sentence of each sampled clip

lista = sample_df['sentence'].tolist()

for frase in lista:
    scores = s.polarity_scores(frase)
    print(frase, scores)

# Spacy library approach to text pre processing

In [None]:
#use this in bash to download portuguese pipeline: python -m spacy download pt_core_news_lg

In [None]:
#import spacy and load portuguese model

import spacy

# Load the large Portuguese pipeline (see the download command in the cell above)
nlp = spacy.load("pt_core_news_lg")

In [None]:
# Parse the model-2 Senna transcription into a spaCy document
texto_completo = nlp(senna_transc_2[0]['transcription'])

In [None]:
#extract tokens from text

for token in texto_completo:
    print (token, token.idx)

In [None]:
#save portuguese stopwords

spacy_stopwords = spacy.lang.pt.stop_words.STOP_WORDS

In [None]:
#remove stopwords from text

for token in texto_completo:
    if not token.is_stop:
        print (token)

In [None]:
#lemmatize words

for token in texto_completo:
    print (token, token.lemma_)

In [None]:
# Part of Speech Tagging

for token in texto_completo:
    print (token, token.tag_, token.pos_, spacy.explain(token.tag_))

In [None]:
#function to extract full name from text

from spacy.matcher import Matcher

# Rule-based matcher built on the loaded pipeline's vocabulary; shared by extract_full_name
matcher = Matcher(nlp.vocab)

def extract_full_name(nlp_doc):
    """Return the first pair of consecutive proper nouns found in `nlp_doc`.

    Args:
        nlp_doc: A parsed spaCy document.

    Returns:
        The text of the first two-token PROPN/PROPN match, or None when
        the document contains no such pair.
    """
    # Register the rule only once: calling add() on every invocation keeps
    # appending the same pattern to the shared module-level matcher.
    if 'FULL_NAME' not in matcher:
        matcher.add('FULL_NAME', [[{'POS': 'PROPN'}, {'POS': 'PROPN'}]])

    # Only the first match is of interest
    for _match_id, start, end in matcher(nlp_doc):
        return nlp_doc[start:end].text
    return None

In [None]:
# Sample sentence containing a two-word proper name, used to exercise extract_full_name
texto_test = nlp('''Testando capacidade do spacy de detectar nome dentro 
                 de um texto, Alexandre Carvalho testando pra ver se funciona''')

extract_full_name(texto_test)

In [None]:
# displacy Visualization

from spacy import displacy

#displacy.serve(texto_completo, style='dep')

In [None]:
from heapq import nlargest
from spacy.lang.pt.stop_words import STOP_WORDS

#function for counting word frequency

def word_freq(text, count):
    """Return the `count` most frequent non-stop-word tokens in `text`.

    Args:
        text: Text to analyse. A string is parsed with the
            'pt_core_news_lg' pipeline; any other value is treated as an
            already-parsed document and iterated directly, which avoids
            loading the pipeline and parsing a second time.
        count: Maximum number of words to return.

    Returns:
        A list of token texts ordered from most to least frequent (ties
        keep first-seen order). Empty when no token survives the
        stop-word filter.
    """
    if isinstance(text, str):
        doc = spacy.load('pt_core_news_lg')(text)
    else:
        doc = text

    word_frequencies = {}
    for word in doc:
        # Stop-word check is case-insensitive; counting keeps the original casing
        if word.text.lower() in STOP_WORDS:
            continue
        word_frequencies[word.text] = word_frequencies.get(word.text, 0) + 1

    if not word_frequencies:
        return []

    # Rank by count. Without `key`, nlargest compares the dict keys
    # themselves and returns the alphabetically largest words. Dividing by
    # the maximum count would not change the ranking, so raw counts are used.
    return nlargest(count, word_frequencies, key=word_frequencies.get)

In [None]:
# Relative frequency of every non-stop-word token in the Senna transcription.
# `texto_completo` is already a parsed spaCy document, so it is iterated
# directly instead of reloading the pipeline and parsing it a second time.

word_frequencies = {}
for word in texto_completo:
    # Stop-word check is case-insensitive; counting keeps the original casing
    if word.text.lower() in STOP_WORDS:
        continue
    word_frequencies[word.text] = word_frequencies.get(word.text, 0) + 1

# Normalise by the highest count so the most frequent word scores 1.0;
# default=1 keeps max() from raising when every token was filtered out.
max_frequency = max(word_frequencies.values(), default=1)
word_frequencies = {
    word: occurrences / max_frequency
    for word, occurrences in word_frequencies.items()
}

In [None]:
# Top 3 words returned by word_freq for the parsed Senna transcription
word_freq(texto_completo, 3)

In [None]:
# Normalised frequency table built by the frequency cell earlier in this section
word_frequencies

# Tests with GPT-3

In [None]:
import os
from dotenv import load_dotenv
import openai

# Load the OpenAI key from the env file stored beside this notebook. A
# relative path keeps the notebook runnable outside one developer's home
# directory.
# NOTE(review): assumes the kernel's working directory is the notebooks/ folder,
# as the "../raw_data" paths in the other cells already do — confirm
load_dotenv(dotenv_path="openai.env")
openai.api_key = os.environ.get('OPENAI_KEY')
# Fail here with a clear message instead of at the first API call
if not openai.api_key:
    raise RuntimeError("OPENAI_KEY is not set; add it to openai.env or export it in the environment")
# NOTE(review): `completion` is not used by any other visible cell — confirm it is needed
completion = openai.Completion()

In [None]:
def text_analysis(text):
    """Ask the OpenAI completion endpoint to summarise `text` and state its sentiment.

    Args:
        text: The text to analyse.

    Returns:
        The text of the first completion choice, stripped of surrounding
        whitespace.
    """
    response = openai.Completion.create(
        engine="text-davinci-002",
        # Interpolate the `text` argument; the name `texto` is not defined
        # in this function and raised NameError (or picked up a stray global).
        prompt=f"Summarize and return the sentiment of the following text:{text}",
        temperature=0.7,
        max_tokens=60,
        top_p=1.0,
        frequency_penalty=0.0,
        presence_penalty=0.0,
    )
    answer = response.choices[0].text.strip()
    return answer

In [None]:
summarize("Boa Tarde. Estamos com a esperança de que Abril seja o inicio da recuperação do mercado. Ja estamos sentindo bastante necessidade de comprar coisas novas. A principio estamos priorizando os fornecedores que vendem a prazo, enquanto isso estamos tentando viabilizar mais investimento em novidades. Os primeiros 3 meses do ano foi bastante complicado.")

In [None]:
IPython.display.Audio("../raw_data/senna.wav")

In [None]:
#text summarization with full text
senna_sum = summarize(senna_transc_2[0]['transcription'])
senna_sum

In [None]:
#text summarization with clean text
summarize(texto_senna)

In [None]:
IPython.display.Audio("../raw_data/audio_test.wav")

In [None]:
w_audio[0]['transcription']

In [None]:
#text summarization with full text
audio_w_sum = summarize(w_audio[0]['transcription'])
audio_w_sum

In [None]:
#text summarization with clean text
audio_w_sum = summarize(texto_audio_w)
audio_w_sum

In [None]:
from googletrans import Translator

# Translate the summary of the WhatsApp sample into Portuguese and show it
translator = Translator()
result = translator.translate(audio_w_sum, dest='pt').text
result

# Test functions

In [None]:
# importing sys
import sys

# Make the project root importable. Resolve the parent of the working
# directory instead of hard-coding one developer's absolute home path.
# NOTE(review): assumes the kernel's working directory is the notebooks/ folder,
# one level below the project root — confirm
sys.path.insert(0, os.path.abspath(os.pardir))

from transcriber import load_model, transcribe
from gpt3 import text_analysis, translate

# NOTE(review): this rebinds `model` (the first Wav2Vec model above) and
# `text_analysis` (defined in the GPT-3 section) to the project versions
model = load_model()
transcribed_text = transcribe("../raw_data/senna.wav")

In [None]:
# Run the imported project helpers end to end: analyse the transcription, then translate the result
analysis = text_analysis(transcribed_text)
translated_text = translate(analysis)
translated_text

# To Do

In [None]:
# Train word2vec on the stop-word list so that words missing from the list can be matched to similar ones