<h2>Código utilizado para experimentação e testes com diferentes métodos e parâmetros para reconhecer padrões de valência e contexto narrativo (assunto) nas letras de músicas</h2>

In [None]:
import BaseDados
import string
import collections
import numpy as np
import math
import json
import datetime
from __future__ import print_function
import warnings
from time import time
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.datasets import fetch_20newsgroups
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import f1_score,confusion_matrix
from nltk.corpus import floresta,stopwords
from nltk import DefaultTagger,UnigramTagger,BigramTagger,pos_tag,word_tokenize
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold,permutation_test_score
import sklearn.base

warnings.filterwarnings('ignore');

In [None]:
def StopWords(incluir=None):
    """Return the Portuguese stop-word list extended with punctuation and extras.

    Args:
        incluir: optional iterable of additional tokens to treat as stop
            words. ``None`` (the default) adds nothing.

    Returns:
        A new list containing, in order: the NLTK Portuguese stop words,
        every character of ``string.punctuation`` and the items of
        ``incluir``.
    """
    # Copy so the list handed back by NLTK is never mutated in place.
    stp = list(stopwords.words('portuguese'))
    # Extending with a string appends each punctuation character individually.
    stp.extend(string.punctuation)
    if incluir is not None:
        stp.extend(incluir)
    return stp

# Keep only the main part of a part-of-speech tag (verb, noun, adjective, etc.)
def simplify_tag(t):
    """Reduce a composite tag to its core label.

    ``None`` maps to ``'.'``. Otherwise everything up to and including the
    last ``+`` is discarded, and then everything from the first remaining
    ``-`` onwards is discarded.
    """
    if t is None:
        return '.'
    # Dropping the "+" prefixes one at a time ends at the text after the
    # last "+"; cutting at the first "-" afterwards leaves no separator.
    after_plus = t.rsplit("+", 1)[-1]
    return after_plus.split("-", 1)[0]

# Train the NLTK part-of-speech taggers for Portuguese, using the tagged
# sentences of the `floresta` corpus imported from nltk.corpus.
train = floresta.tagged_sents()
# Unigram tagger built from the training sentences; serves as the backoff below.
tagger1 = UnigramTagger(train)
# Bigram tagger that backs off to the unigram tagger. This is the tagger
# LetrasToPartOfSpeech uses to tag the lyrics.
tagger2 = BigramTagger(train, backoff=tagger1)

In [None]:
# Convert every lyric into a string of simplified part-of-speech tags
def LetrasToPartOfSpeech(Letras):
    """Return, for each lyric, its simplified PoS tags joined by single spaces.

    Each lyric is tokenized with ``word_tokenize`` and tagged with
    ``tagger2``; every tag goes through ``simplify_tag``. Tags that are
    substrings of ``string.punctuation`` are dropped, which includes the
    ``'.'`` placeholder that ``simplify_tag`` returns for untagged tokens.
    """
    def _tags_da_letra(letra):
        # Only the tags are kept; the words themselves are not part of the output.
        simplificadas = [simplify_tag(tag)
                         for _, tag in tagger2.tag(word_tokenize(letra))]
        return ' '.join(tag for tag in simplificadas
                        if tag not in string.punctuation)

    return [_tags_da_letra(letra) for letra in Letras]


def CountMatrix(lista_texto):
    """Build a relative-frequency matrix of space-separated tokens.

    Args:
        lista_texto: sequence of strings whose tokens are separated by a
            single space (splitting is done with ``split(' ')``).

    Returns:
        A list with one row per input text. Each row holds, for every
        distinct token found across all texts (in order of first
        appearance), that token's count in the text divided by the text's
        total number of tokens. An empty input yields an empty list.
    """
    # Column order: distinct tokens of the whole collection.
    vocabulario = list(collections.Counter(' '.join(lista_texto).split(' ')))
    matriz = []
    for texto in lista_texto:
        contagem = collections.Counter(texto.split(' '))
        # The row denominator does not depend on the column, so compute it
        # once per text instead of once per vocabulary entry.
        total = sum(contagem.values())
        matriz.append([contagem[token] / total for token in vocabulario])
    return matriz

# Convert the lyrics into a numeric feature matrix
def RepresentarLetras(Letras,
                      Binario,
                      PoS,
                      RemoveStopWords,
                      n_features,
                      n_components):
    """Represent each lyric as NMF topic weights, optionally with PoS frequencies.

    Args:
        Letras: list of lyric strings. It is consumed more than once, so it
            must be a re-iterable sequence.
        Binario: when truthy, the topic weights are replaced by 1.0 where
            the weight is positive and 0.0 elsewhere.
        PoS: when truthy, the relative frequencies of the part-of-speech
            tags (LetrasToPartOfSpeech + CountMatrix) are appended as
            extra columns.
        RemoveStopWords: when truthy, the TF-IDF vocabulary excludes the
            StopWords() list plus a few interjections.
        n_features: ``max_features`` passed to the TF-IDF vectorizer.
        n_components: number of NMF components.

    Returns:
        A 2-D array with one row per lyric.
    """
    # Named differently from the imported NLTK `stopwords` corpus object so
    # that the module-level name is not shadowed inside this function.
    if RemoveStopWords:
        palavras_ignoradas = StopWords(['oh','la','lá','ah','alô','aí','nena'])
    else:
        palavras_ignoradas = []

    tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2,
                                       max_features=n_features,
                                       stop_words=palavras_ignoradas)
    tfidf = tfidf_vectorizer.fit_transform(Letras)

    # NOTE(review): newer scikit-learn releases replaced `alpha` with
    # `alpha_W`/`alpha_H` - confirm against the installed version.
    nmf = NMF(n_components=n_components, random_state=3,
              alpha=.1, l1_ratio=.5)
    # fit_transform already fits the model; the previous extra .fit() call
    # repeated the whole factorization and its result was discarded.
    W = nmf.fit_transform(tfidf)

    if Binario:
        W = (W > 0).astype(float)

    if PoS:
        partofspeech = LetrasToPartOfSpeech(Letras)
        partofspeech_columns = CountMatrix(partofspeech)
        W = np.c_[W, partofspeech_columns]

    return W

# Evaluate one ML configuration and record the results (overall and for the
# separate groups G5, G4 and G3)
def _criar_classificador(method, hyperP):
    """Return the unfitted classifier selected by ``method``."""
    if method == 'SVC':
        return SVC(C=hyperP)
    if method == 'KNN':
        return KNeighborsClassifier(n_neighbors=hyperP)
    if method == 'RD':
        return RandomForestClassifier(n_estimators=hyperP*10, max_depth=2,
                                      random_state=2)
    if method == 'GNB':
        return GaussianNB()
    raise ValueError("Unknown method: %r" % (method,))


def _folds_do_grupo(idx_grupo, alvo):
    """Yield (train_idx, test_idx) pairs whose test rows come from one group only."""
    idx_grupo = np.asarray(idx_grupo, dtype=int)
    # One fold per 10 group members, as in the original experiment design.
    n_splits = math.trunc(len(idx_grupo)/10)
    skf = StratifiedKFold(n_splits)
    todos = np.arange(len(alvo))
    for _, test_pos in skf.split(np.zeros(len(idx_grupo)), alvo[idx_grupo]):
        test_idx = idx_grupo[test_pos]
        # Train on every row outside the test fold, including rows from
        # the other groups; setdiff1d keeps them in ascending order.
        yield np.setdiff1d(todos, test_idx), test_idx


def _avaliar_folds(modelo, W, alvo, folds, labels, normalizar):
    """Fit a fresh clone of ``modelo`` per fold; return (f1 array, accuracy array, summed confusion matrix)."""
    f1s = []
    acuracias = []
    cfn_total = np.zeros((len(labels), len(labels)))
    for train_idx, test_idx in folds:
        clf = sklearn.base.clone(modelo)
        X_train, X_test = W[train_idx], W[test_idx]
        if normalizar:
            # Fit the scaler on the training rows only and apply the same
            # transformation to both sides, so the classifier is trained and
            # queried in the same feature space.
            scaler = StandardScaler().fit(X_train)
            X_train = scaler.transform(X_train)
            X_test = scaler.transform(X_test)
        clf.fit(X_train, alvo[train_idx])
        y_true = alvo[test_idx]
        y_pred = clf.predict(X_test)
        f1s.append(f1_score(y_true, y_pred, average='macro'))
        acuracias.append(clf.score(X_test, y_true))
        # Explicit labels keep the matrix shape fixed even when a class is
        # absent from a fold, so the accumulation never fails on shape.
        cfn_total += confusion_matrix(y_true, y_pred, labels=labels)
    return np.array(f1s), np.array(acuracias), cfn_total


def TestarParametros(Dados,
                     idx_g5,
                     idx_g4,
                     idx_g3,
                     Teste,
                     Target,
                     method,
                     hyperP,
                     kfold,
                     Binario,
                     PoS,
                     RemoveStopWords,
                     n_features,
                     n_components):
    """Cross-validate one classifier/representation configuration and store the results.

    Args:
        Dados: sequence of records exposing ``Letra`` and, depending on
            ``Target``, ``Assunto`` or ``Valencia``.
        idx_g5, idx_g4, idx_g3: positions in ``Dados`` of the records in
            groups G5, G4 and G3. Each group is evaluated with
            ``trunc(len(group)/10)`` stratified folds, testing on group
            rows only and training on every other row.
        Teste: label of the experiment, forwarded to ``BaseDados.InserirTeste``.
        Target: ``'Assunto'`` or ``'Valência'``; selects the ground truth.
        method: ``'SVC'``, ``'KNN'``, ``'RD'`` (random forest) or ``'GNB'``.
        hyperP: hyper-parameter of the method (C, n_neighbors, or
            n_estimators/10); not used by ``'GNB'``.
        kfold: number of stratified folds for the evaluation on all rows.
        Binario, PoS, RemoveStopWords, n_features, n_components: forwarded
            to ``RepresentarLetras``. Features are standardized unless the
            representation is purely binary (``Binario`` and not ``PoS``).

    Returns:
        ``[f1score, f1score_g5, f1score_g4, f1score_g3, cfn_matrices,
        cfn_matrices_g5, cfn_matrices_g4, cfn_matrices_g3]``: mean
        macro-F1 values followed by the confusion matrices summed over folds.

    Raises:
        ValueError: for an unknown ``method`` or ``Target``. StratifiedKFold
            also raises it when a fold count is below 2, which happens for
            any group with fewer than 20 records.
    """
    s_main = _criar_classificador(method, hyperP)

    # Ground truth; validated before the expensive feature extraction.
    if Target == 'Assunto':
        vetor_spec = np.array([l.Assunto for l in Dados])
    elif Target == 'Valência':
        vetor_spec = np.array([l.Valencia for l in Dados])
    else:
        raise ValueError("Unknown Target: %r" % (Target,))

    Letras = [l.Letra for l in Dados]
    W = np.array(RepresentarLetras(Letras=Letras,
                                   Binario=Binario,
                                   PoS=PoS,
                                   RemoveStopWords=RemoveStopWords,
                                   n_features=n_features,
                                   n_components=n_components))

    labels = sorted(set(vetor_spec))
    normalizar = bool((not Binario) or PoS)

    # Per-group evaluation (G5, G4, G3)
    f1scores_g5, scores_g5, cfn_matrices_g5 = _avaliar_folds(
        s_main, W, vetor_spec, _folds_do_grupo(idx_g5, vetor_spec),
        labels, normalizar)
    f1scores_g4, scores_g4, cfn_matrices_g4 = _avaliar_folds(
        s_main, W, vetor_spec, _folds_do_grupo(idx_g4, vetor_spec),
        labels, normalizar)
    f1scores_g3, scores_g3, cfn_matrices_g3 = _avaliar_folds(
        s_main, W, vetor_spec, _folds_do_grupo(idx_g3, vetor_spec),
        labels, normalizar)

    # Main evaluation over all rows
    f1scores, scores, cfn_matrices = _avaliar_folds(
        s_main, W, vetor_spec, StratifiedKFold(kfold).split(W, vetor_spec),
        labels, normalizar)

    score = scores.mean()
    score_g5 = scores_g5.mean()
    score_g4 = scores_g4.mean()
    score_g3 = scores_g3.mean()
    f1score = f1scores.mean()
    f1score_g5 = f1scores_g5.mean()
    f1score_g4 = f1scores_g4.mean()
    f1score_g3 = f1scores_g3.mean()
    # Per-fold F1 values serialized as "main;g5;g4;g3", each a JSON list.
    f1scores = ';'.join([json.dumps(f1scores.tolist()),
                         json.dumps(f1scores_g5.tolist()),
                         json.dumps(f1scores_g4.tolist()),
                         json.dumps(f1scores_g3.tolist())])

    BaseDados.InserirTeste(Teste=Teste,
                           Target=Target,
                           method=method,
                           hyperP=hyperP,
                           Binario=Binario,
                           PoS=PoS,
                           RemoveStopWords=RemoveStopWords,
                           features=n_features,
                           components=n_components,
                           f1score=f1score,
                           f1score_g5=f1score_g5,
                           f1score_g4=f1score_g4,
                           f1score_g3=f1score_g3,
                           f1scores=f1scores,
                           score=score,
                           score_g5=score_g5,
                           score_g4=score_g4,
                           score_g3=score_g3,
                           cfn_matrix=json.dumps(cfn_matrices.tolist()))

    return [f1score, f1score_g5, f1score_g4, f1score_g3,
            cfn_matrices, cfn_matrices_g5, cfn_matrices_g4, cfn_matrices_g3]


In [None]:
# Sweep a grid of models and parameters
def VarrerParametrosTestes(Dados,
                             idx_g5,
                             idx_g4,
                             idx_g3,
                             Teste,
                             Target,
                             kfold):
    """Run TestarParametros for every combination in the parameter grid.

    The grid covers the methods SVC, RD and KNN with hyper-parameters
    1, 3, ..., 17, followed by GNB with hyper-parameter 0. Each is combined
    with the 0/1 flags for binary weights, PoS features and stop-word
    removal, 48..124 features (step 4) and 9..27 components (step 3).
    Progress is printed each time another tenth of the trials has been run.
    """
    valores_hyper = range(1, 18, 2)
    valores_feat = range(48, 128, 4)
    valores_comp = range(9, 30, 3)
    flags = (0, 1)

    trials = []
    # GNB comes last and takes no hyper-parameter, so it gets the single value 0.
    for method in ('SVC', 'RD', 'KNN', 'GNB'):
        hyper_do_metodo = (0,) if method == 'GNB' else valores_hyper
        for binario in flags:
            for pos in flags:
                for sw in flags:
                    for hyperP in hyper_do_metodo:
                        for feat in valores_feat:
                            for comp in valores_comp:
                                trials.append((method, hyperP, binario,
                                               pos, sw, feat, comp))

    qtde = len(trials)
    ultima_pct = 0
    for i, (method, hyperP, binario, pos, sw, feat, comp) in enumerate(trials):
        TestarParametros(Dados=Dados,
                         idx_g5=np.array(idx_g5),
                         idx_g4=np.array(idx_g4),
                         idx_g3=np.array(idx_g3),
                         Teste=Teste,
                         Target=Target,
                         method=method,
                         hyperP=hyperP,
                         kfold=kfold,
                         Binario=binario,
                         PoS=pos,
                         RemoveStopWords=sw,
                         n_features=feat,
                         n_components=comp)
        decil = math.floor(10*i/qtde)
        if decil > ultima_pct:
            ultima_pct = decil
            print(datetime.datetime.now(),' - ',i,'/',qtde,' - ',ultima_pct*10,'%')

    print('- FIM -')

In [None]:
# Driver code: runs the parameter/model sweeps for different groups of lyrics

# Attributes referenced on the lyric records (presumably the fields returned by
# BaseDados.GetLetraClassificacoes_MostCommon - TODO confirm):
#Cod_Letra,Letra,Total,Valencia,Valencia_QTD_MC,Valencia_QTD_MC_2,Assunto,Assunto_QTD_MC,Assunto_QTD_MC_2

# --- Sweep 1: target 'Assunto' (subject) over all lyrics ---
Teste = 'Todos'
print(Teste)
# Keep only the lyrics whose two Assunto counts differ
# (presumably this discards ties between the two most voted classes - verify).
dados = [l for l in BaseDados.GetLetraClassificacoes_MostCommon() if l.Assunto_QTD_MC != l.Assunto_QTD_MC_2]
idx_g5 = [i for i, d in enumerate(dados) if d.Assunto_QTD_MC >= 5]; # Indices of the G5 lyrics
idx_g4 = [i for i, d in enumerate(dados) if d.Assunto_QTD_MC == 4]; # Indices of the G4 lyrics
idx_g3 = [i for i, d in enumerate(dados) if d.Assunto_QTD_MC == 3]; # Indices of the G3 lyrics
kfold = 12;
VarrerParametrosTestes(Dados=dados,
                             idx_g5=idx_g5,
                             idx_g4=idx_g4,
                             idx_g3=idx_g3,
                            Teste=Teste,
                            Target='Assunto', 
                            kfold=kfold);

# --- Sweep 2: target 'Valência' (valence) over all lyrics ---
# Same procedure as above, with the groups derived from the Valencia counts.
Teste = 'Todos'
print(Teste)
dados = [l for l in BaseDados.GetLetraClassificacoes_MostCommon() if l.Valencia_QTD_MC != l.Valencia_QTD_MC_2]
idx_g5 = [i for i, d in enumerate(dados) if d.Valencia_QTD_MC >= 5];
idx_g4 = [i for i, d in enumerate(dados) if d.Valencia_QTD_MC == 4];
idx_g3 = [i for i, d in enumerate(dados) if d.Valencia_QTD_MC == 3];
kfold = 12;
VarrerParametrosTestes(Dados=dados,
                             idx_g5=idx_g5,
                             idx_g4=idx_g4,
                             idx_g3=idx_g3,
                            Teste=Teste,
                            Target='Valência', 
                            kfold=kfold);

# Disabled experiment: the 'Assunto' sweep restricted to records with Assunto in [1,2].
#Teste = 'Relacionamentos e Reflexões'
#print(Teste)
#dados = [l for l in BaseDados.GetLetraClassificacoes_MostCommon() if l.Assunto_QTD_MC != l.Assunto_QTD_MC_2 and l.Assunto in [1,2]]
#idx_g5 = [i for i, d in enumerate(dados) if d.Assunto_QTD_MC >= 5];
#idx_g4 = [i for i, d in enumerate(dados) if d.Assunto_QTD_MC == 4];
#idx_g3 = [i for i, d in enumerate(dados) if d.Assunto_QTD_MC == 3];
#kfold = 12;
#VarrerParametrosTestes(Dados=dados,
#                             idx_g5=idx_g5,
#                             idx_g4=idx_g4,
#                             idx_g3=idx_g3,
#                            Teste=Teste,
#                            Target='Assunto', 
#                            kfold=kfold);
#

In [None]:
# Create the database that will record the ML test results.
# NOTE(review): presumably this has to run before any sweep stores results
# through BaseDados.InserirTeste - confirm the intended cell order.
BaseDados.CreateBaseTestes()