# 1. Import Packages and Libraries

In [1]:
import tensorflow as tf
from tensorflow import keras
from keras.layers import Embedding
import keras.backend as K
from keras.models import load_model
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
import xgboost

from sklearn.feature_extraction.text import CountVectorizer

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score,f1_score,confusion_matrix,recall_score

import scipy
import pandas as pd
import numpy as np
import gensim

import nltk
from nltk.data import find
import matplotlib.pyplot as plt
import shap

import matplotlib
import sklearn
import pickle
import random
import multiprocessing
import os
import sys

import lyricsgenius as lg
%load_ext dotenv
%dotenv

# 2. Load Preprocessing Functions and Model

#### Preprocess Text

In [2]:
def preprocess_text(text):
    """Normalize raw text: lowercase it and flatten it onto a single line.

    Parameters
    ----------
    text : str
        Raw text, possibly spanning several lines.

    Returns
    -------
    str
        The lowercased text with every newline replaced by a space and every
        run of consecutive spaces collapsed to exactly one space.
    """
    text = text.lower().replace('\n', ' ')
    # A single replace('  ', ' ') pass only shortens each run of spaces
    # (three spaces would still leave two), so repeat until none remain.
    # NOTE(review): if a vectorizer was fit on text produced by the old
    # single-pass version, confirm the extra collapsing does not matter to
    # its tokenizer.
    while '  ' in text:
        text = text.replace('  ', ' ')
    return text

#### Label Maps

In [9]:
# Language name -> integer class label.
label_maps = {
    'Spanish': 0,
    'Portuguese': 1,
    'English': 2,
    'Kinyarwanda': 3,
    'Italian': 4,
    'French': 5,
    'German': 6,
    'Other': 7,
    'Finnish': 8,
    'Swedish': 9,
    'Romanian': 10,
}
# Reverse lookup: integer class label -> language name.
label_to_language = {label: language for language, label in label_maps.items()}

#### Class Recall + Load Model

In [10]:
def class_recall(y_true,y_pred):
    """Average per-class recall computed from a confusion matrix.

    Parameters
    ----------
    y_true : object exposing ``.numpy()``
        True class labels, one per sample.
    y_pred : object exposing ``.numpy()``
        Predicted probability of each class for each sample; the predicted
        class of a sample is the position of its largest probability.

    Returns
    -------
    float
        Sum of (correct predictions / true samples) over the classes that have
        at least one true sample, divided by the total number of classes in
        the confusion matrix. Classes without true samples add nothing to the
        sum but still count in the divisor.
    """
    actual = y_true.numpy()
    probabilities = y_pred.numpy()
    # Most probable class per sample.
    predicted = np.array([row.argmax() for row in probabilities])

    matrix = confusion_matrix(actual, predicted)
    # Row totals = number of true samples of each class.
    support = matrix.sum(axis=1)

    recall_total = sum(
        hits / total
        for hits, total in zip(matrix.diagonal(), support)
        if total != 0
    )
    return recall_total / len(support)

In [12]:
def load_language_detection_model(filepath):
    """Load the saved language-detection model stored at ``filepath``.

    Parameters
    ----------
    filepath : str
        Path handed unchanged to ``load_model``.

    Returns
    -------
    The model object returned by ``load_model``.

    Notes
    -----
    The custom metric is registered under the name ``'class_recall'`` so the
    loader can resolve it (presumably the model was compiled with that
    metric -- confirm against the training notebook).
    """
    # Use the notebook-level class_recall instead of a second, nested copy of
    # the same function: one definition means one place to fix or change it.
    return load_model(filepath, custom_objects={'class_recall': class_recall})

# Load the saved model once; the prediction cells below reuse this object.
language_detection_model = load_language_detection_model(
    filepath='language_detection_ff_tf.h5',
)

#### Word Vectorizer

In [3]:
# NOTE: unpickling can execute arbitrary code embedded in the file -- only
# load a vectorizer pickle that comes from a trusted source.
# The `with` block closes the file handle as soon as loading finishes.
with open('word_vectorizer.pkl', 'rb') as vectorizer_file:
    vectorizer = pickle.load(vectorizer_file)

#### Function to pull song lyrics via the Genius API

In [4]:
# Genius client; the access token is read from the `genius_token`
# environment variable (None when the variable is not set).
genius_api = lg.Genius(os.environ.get('genius_token'))
# Switch off the client's verbose flag.
genius_api.verbose = False

def get_song_lyrics(song_name,artist_name,genius_api):
    """Fetch the lyrics of a song through a Genius API client (best effort).

    Parameters
    ----------
    song_name : str
        Title of the song to search for.
    artist_name : str
        Artist to search for.
    genius_api : object
        Client exposing ``search_song(song_name, artist_name)`` whose result
        carries the lyrics in a ``lyrics`` attribute.

    Returns
    -------
    str
        The lyrics, or an empty string when the lookup fails, finds no song,
        or the song has no lyrics.
    """
    try:
        song = genius_api.search_song(song_name, artist_name)
    except Exception:
        # Best effort: a failed lookup yields empty lyrics. Catching Exception
        # (not a bare except) lets KeyboardInterrupt/SystemExit propagate so
        # the cell can still be interrupted.
        return ''

    # No match: handle it explicitly instead of relying on an AttributeError.
    if song is None:
        return ''

    # Always hand back a string, even if the lyrics attribute is missing/None.
    return getattr(song, 'lyrics', None) or ''

#### Text to Term Density

In [34]:
# Short sentences in several languages, used to sanity-check the predictor.
sample_lyrics = [
    'hello my name is Amit and I am going to the store',
    'hola mi nombre es Amit y voy a la tienda',
    "bonjour je m'appelle Amit et je vais au magasin",
    "Olá meu nome é Amit e estou indo para a loja",
    "ciao mi chiamo Amit e vado al negozio",
    "Halō nā pēru amit mariyu nēnu dukāṇāniki veḷtunnānu",
    "me gustan los pepinillos",
]

def lyrics_to_term_density(lyrics,vectorizer):
    """Convert lyrics into term densities: each term count divided by the
    document's total count.

    Parameters
    ----------
    lyrics : iterable of str
        One text per document.
    vectorizer : object
        Fitted vectorizer whose ``transform(lyrics)`` returns a sparse
        document-term matrix (presumably the pickled CountVectorizer --
        confirm against the training notebook).

    Returns
    -------
    Dense matrix of shape (n_documents, n_terms)
        Entry (i, j) is count(i, j) / total count of document i. A document
        with no counted terms gets an all-zero row.
    """
    bag_of_words = vectorizer.transform(lyrics).todense()

    # Total count per document as an (n_documents, 1) column. Broadcasting
    # against it replaces the old repeat/reshape, which needed
    # vectorizer.get_feature_names() -- removed in scikit-learn 1.2.
    word_counts = np.asarray(bag_of_words.sum(axis=1)).reshape(-1, 1)

    # Documents with no counted terms would give 0/0 (NaN plus a
    # RuntimeWarning); dividing their all-zero row by 1 yields zeros directly.
    safe_counts = np.where(word_counts == 0, 1, word_counts)

    return bag_of_words / safe_counts

#### Lyrics to Language Prediction

In [35]:
def language_prediction(lyrics,vectorizer,model,label_to_language):
    """Predict the language of each lyric text.

    Parameters
    ----------
    lyrics : str or iterable of str
        One text per document; a single string is treated as one document.
    vectorizer : object
        Fitted vectorizer passed on to ``lyrics_to_term_density``.
    model : object
        Model whose ``predict(features, verbose=0)`` returns one row of class
        scores per document.
    label_to_language : dict
        Maps a predicted class index to its language name.

    Returns
    -------
    (list, list)
        ``preds`` -- predicted class index per document, and
        ``preds_language`` -- the matching language names. Both are empty when
        no lyrics are given.
    """
    # A bare string would otherwise be rejected by the vectorizer; treat it as
    # a one-document batch.
    if isinstance(lyrics, str):
        lyrics = [lyrics]
    lyrics = list(lyrics)
    if not lyrics:
        return [], []

    # Use the notebook-level helper rather than a nested copy of it, so a fix
    # to the term-density computation applies here as well.
    term_density_lyrics = lyrics_to_term_density(lyrics, vectorizer)

    class_scores = model.predict(term_density_lyrics, verbose=0)
    # Highest-scoring class per document.
    preds = [row.argmax() for row in class_scores]
    preds_language = [label_to_language[label] for label in preds]
    return preds, preds_language

In [36]:
# Sanity check: predict the language of every sample sentence.
language_prediction(
    lyrics=sample_lyrics,
    vectorizer=vectorizer,
    model=language_detection_model,
    label_to_language=label_to_language,
)

invalid value encountered in true_divide


([2, 0, 5, 1, 4, 7, 0],
 ['English', 'Spanish', 'French', 'Portuguese', 'Italian', 'Other', 'Spanish'])

#### Lyrics to Language Prediction w/ Song From Genius

In [37]:
def language_prediction_genius(song_name,artist_name,genius_api,vectorizer,model,label_to_language):
    """Look up a song's lyrics through Genius and predict their language.

    Parameters
    ----------
    song_name : str
        Title of the song to look up.
    artist_name : str
        Artist of the song.
    genius_api : object
        Genius API client passed on to ``get_song_lyrics``.
    vectorizer, model, label_to_language
        Passed unchanged to ``language_prediction``.

    Returns
    -------
    (list, list)
        One-element lists: the predicted class index and its language name.

    Notes
    -----
    ``get_song_lyrics`` returns an empty string when the lookup fails; the
    prediction is then made on empty text and says nothing about the song.
    """
    # Delegate to the notebook-level helpers instead of nested copies of them,
    # so each helper has a single definition to maintain.
    lyrics = get_song_lyrics(song_name, artist_name, genius_api)

    # language_prediction works on a batch of documents; wrap the single song.
    preds, preds_language = language_prediction([lyrics], vectorizer, model, label_to_language)
    return preds, preds_language

In [40]:
# Example: look up one song on Genius and predict the language of its lyrics.
language_prediction_genius(
    song_name='Gasolina',
    artist_name='Daddy Yankee',
    genius_api=genius_api,
    vectorizer=vectorizer,
    model=language_detection_model,
    label_to_language=label_to_language,
)

([0], ['Spanish'])