In [1]:
import pandas
import simplemma
import spacy
import nltk
from collections import Counter
import emoji

In [2]:
model_es = spacy.load("es_core_news_sm")


In [3]:
df_verbs = pandas.read_csv("./resources/verbs_database.csv")
df_verbs.head(3)


Unnamed: 0,infinitive,infinitive_english,mood,mood_english,tense,tense_english,verb_english,form_1s,form_2s,form_3s,form_1p,form_2p,form_3p,gerund,gerund_english,pastparticiple,pastparticiple_english
0,abandonar,"to abandon, leave behind, desert; to quit, giv...",Indicativo,Indicative,Presente,Present,"I abandon, am abandoning",abandono,abandonas,abandona,abandonamos,abandonáis,abandonan,abandonando,abandoning,abandonado,abandoned
1,abandonar,"to abandon, leave behind, desert; to quit, giv...",Indicativo,Indicative,Futuro,Future,I will abandon,abandonaré,abandonarás,abandonará,abandonaremos,abandonaréis,abandonarán,abandonando,abandoning,abandonado,abandoned
2,abandonar,"to abandon, leave behind, desert; to quit, giv...",Indicativo,Indicative,Imperfecto,Imperfect,"I was abandoning, used to abandon, abandoned",abandonaba,abandonabas,abandonaba,abandonábamos,abandonabais,abandonaban,abandonando,abandoning,abandonado,abandoned


In [4]:
def diversidad_lexica(texto):
    tokens = simplemma.simple_tokenizer(texto)
    return len(set(tokens)) / len(tokens)


def numero_oraciones(text: str) -> int:
    oraciones = nltk.tokenize.sent_tokenize(text, language="spanish")
    return len(oraciones)


def palabras_promedio_oraciones(text: str) -> float:
    oraciones = nltk.tokenize.sent_tokenize(text, language="spanish")
    longitud_promedio = sum(len(oracion.split()) for oracion in oraciones) / len(oraciones)
    return longitud_promedio


def numero_oraciones(text: str) -> int:
    oraciones = nltk.tokenize.sent_tokenize(text, language="spanish")
    return len(oraciones)


def count_gramatical_categories_and_forms(text: str) -> pandas.Series:
    GRAMATICAL_CATEGORIES_AND_FORMS = {
        "stop_words": 0,
        "alpha_words": 0,
        "NOUN": 0,
        "ADJ": 0,
        "VERB": 0,
        "PRON": 0,
        "ADV": 0,
        "DET": 0,
        "CCONJ": 0,
        "INTJ": 0,
        "SCONJ": 0,
        "AUX": 0,
        "PUNCT": 0,
        "PROPN": 0,
        "NUM": 0,
        "form_1s": 0,
        "form_2s": 0,
        "form_3s": 0,
        "form_1p": 0,
        "form_2p": 0,
        "form_3p": 0,
        "gerund": 0,
        "infinitive": 0,
        "Presente": 0,
        "Futuro": 0,
        "Imperfecto": 0,
        "Pretérito": 0,
        "Condicional": 0,
        "Presente perfecto": 0,
        "Futuro perfecto": 0,
        "Pluscuamperfecto": 0,
        "Pretérito anterior": 0,
        "Condicional perfecto": 0,
        "Indicativo": 0,
        "Subjuntivo": 0,
        "Imperativo Afirmativo": 0,
        "Imperativo Negativo": 0,
    }
    doc = model_es(text.lower())
    for token in doc:
        if token.is_stop:
            GRAMATICAL_CATEGORIES_AND_FORMS["stop_words"] += 1
        if token.is_alpha:
            GRAMATICAL_CATEGORIES_AND_FORMS["alpha_words"] += 1
        if token.pos_ in GRAMATICAL_CATEGORIES_AND_FORMS:
            GRAMATICAL_CATEGORIES_AND_FORMS[token.pos_] += 1
        if token.pos_ == "VERB":
            try:
                idx, form = df_verbs.where(df_verbs == token.text.lower()).stack().index[0]
                if form in GRAMATICAL_CATEGORIES_AND_FORMS:
                    GRAMATICAL_CATEGORIES_AND_FORMS[form] += 1
                GRAMATICAL_CATEGORIES_AND_FORMS[df_verbs.loc[idx, "tense"]] += 1
                GRAMATICAL_CATEGORIES_AND_FORMS[df_verbs.loc[idx, "mood"]] += 1
            except:
                pass
    return pandas.Series(GRAMATICAL_CATEGORIES_AND_FORMS)


In [5]:
df = pandas.read_csv("../../../data/data_processed/normalize2.csv")
df = df.dropna()
# df["fuente_num"] = df["Tipo de fuente"].apply(label_num_fuente)
# df["clasificacion_num"] = df["Clasificacion"].apply(label_num_clasificacion)
df


Unnamed: 0,Genero,Artista,Titulo,Cancion,normalize_2
0,0,Shakira,Waka Waka (Esto Es África),"Llegó el momento, caen las murallas\r\nVa a co...",llegó el momento caen las murallas va a comenz...
1,0,Shakira,Gitana,Nunca usé un antifaz\r\nVoy de paso por este m...,nunca usé un antifaz voy de paso por este mund...
2,0,Shakira,"Te Aviso, Te Anuncio (Tango)",Nunca pensé que doliera el amor así\r\nCuando ...,nunca pensé que doliera el amor así cuando se ...
3,0,Shakira,Addicted To You,Debe ser el perfume que usas\r\nO el agua con ...,debe ser el perfume que usas o el agua con la ...
4,0,Shakira,Monotonía (part. Ozuna),"No fue culpa tuya, ni tampoco mía\r\nFue culpa...",no fue culpa tuya ni tampoco mía fue culpa de ...
...,...,...,...,...,...
3060,4,Willie Gonzalez,No Es Casualidad,Siempre cada encuentro parece el primero\r\nSe...,siempre cada encuentro parece el primero se de...
3061,4,Willie Gonzalez,No Podras Escapar de Mi,Poder tocar tu mano\r\nEstar siempre a tu lado...,poder tocar tu mano estar siempre a tu lado es...
3062,4,Willie Gonzalez,Doble Vida,Juntos viven tu y el\r\nY estas tan sola...\r\...,juntos viven tu y el y estas tan sola en tus h...
3063,4,Willie Gonzalez,Tan Solo,Me has vuelto a llamar\r\nY como un niño emoci...,me has vuelto a llamar y como un niño emociona...


In [6]:
df["diversidad_lexica"] = df["Cancion"].apply(diversidad_lexica)
df["numero_oraciones"] = df["Cancion"].apply(numero_oraciones)
df["palabras_promedio_oraciones"] = df["Cancion"].apply(palabras_promedio_oraciones)
df["numero_oraciones"] = df["Cancion"].apply(numero_oraciones)
df[
    [
        "stop_words",
        "alpha_words",
        "noun",
        "adj",
        "verb",
        "pron",
        "adv",
        "det",
        "cconj",
        "intj",
        "sconj",
        "aux",
        "punct",
        "propn",
        "num",
        "form_1s",
        "form_2s",
        "form_3s",
        "form_1p",
        "form_2p",
        "form_3p",
        "gerund",
        "infinitive",
        "Presente",
        "Futuro",
        "Imperfecto",
        "Pretérito",
        "Condicional",
        "Presente perfecto",
        "Futuro perfecto",
        "Pluscuamperfecto",
        "Pretérito anterior",
        "Condicional perfecto",
        "Indicativo",
        "Subjuntivo",
        "Imperativo Afirmativo",
        "Imperativo Negativo",
    ]
] = (
    df["Cancion"].apply(count_gramatical_categories_and_forms).reset_index(drop=True)
)
# df["emoji_count"] = df["Cancion"].apply(lambda text: emoji.emoji_count(text))
# df["hashtags_count"] = df["Cancion"].apply(lambda text: text.count("#"))
# df["links_count"] = df["Cancion"].apply(lambda text: text.count("http"))
# df["tags_count"] = df["Cancion"].apply(lambda text: text.count("@"))
# df["retweets_count"] = df["Cancion"].apply(lambda text: text.count("RT"))
df


Unnamed: 0,Genero,Artista,Titulo,Cancion,normalize_2,diversidad_lexica,numero_oraciones,palabras_promedio_oraciones,stop_words,alpha_words,...,Condicional,Presente perfecto,Futuro perfecto,Pluscuamperfecto,Pretérito anterior,Condicional perfecto,Indicativo,Subjuntivo,Imperativo Afirmativo,Imperativo Negativo
0,0,Shakira,Waka Waka (Esto Es África),"Llegó el momento, caen las murallas\r\nVa a co...",llegó el momento caen las murallas va a comenz...,0.375839,1,274.000000,81.0,274.0,...,0.0,0.0,0.0,0.0,0.0,0.0,20.0,3.0,0.0,0.0
1,0,Shakira,Gitana,Nunca usé un antifaz\r\nVoy de paso por este m...,nunca usé un antifaz voy de paso por este mund...,0.463203,1,221.000000,127.0,221.0,...,0.0,0.0,0.0,0.0,0.0,0.0,36.0,5.0,0.0,0.0
2,0,Shakira,"Te Aviso, Te Anuncio (Tango)",Nunca pensé que doliera el amor así\r\nCuando ...,nunca pensé que doliera el amor así cuando se ...,0.357143,1,314.000000,217.0,314.0,...,0.0,0.0,0.0,0.0,0.0,0.0,36.0,6.0,0.0,0.0
3,0,Shakira,Addicted To You,Debe ser el perfume que usas\r\nO el agua con ...,debe ser el perfume que usas o el agua con la ...,0.495763,1,220.000000,117.0,212.0,...,1.0,0.0,0.0,0.0,0.0,0.0,23.0,0.0,0.0,0.0
4,0,Shakira,Monotonía (part. Ozuna),"No fue culpa tuya, ni tampoco mía\r\nFue culpa...",no fue culpa tuya ni tampoco mía fue culpa de ...,0.425676,1,269.000000,196.0,266.0,...,3.0,0.0,0.0,0.0,0.0,0.0,36.0,2.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3060,4,Willie Gonzalez,No Es Casualidad,Siempre cada encuentro parece el primero\r\nSe...,siempre cada encuentro parece el primero se de...,0.365591,5,66.600000,180.0,287.0,...,0.0,0.0,0.0,0.0,0.0,0.0,33.0,4.0,0.0,0.0
3061,4,Willie Gonzalez,No Podras Escapar de Mi,Poder tocar tu mano\r\nEstar siempre a tu lado...,poder tocar tu mano estar siempre a tu lado es...,0.342767,1,287.000000,189.0,349.0,...,0.0,0.0,0.0,0.0,0.0,0.0,42.0,4.0,1.0,0.0
3062,4,Willie Gonzalez,Doble Vida,Juntos viven tu y el\r\nY estas tan sola...\r\...,juntos viven tu y el y estas tan sola en tus h...,0.347594,3,116.333333,305.0,448.0,...,0.0,0.0,0.0,0.0,0.0,0.0,62.0,3.0,0.0,0.0
3063,4,Willie Gonzalez,Tan Solo,Me has vuelto a llamar\r\nY como un niño emoci...,me has vuelto a llamar y como un niño emociona...,0.292247,1,448.000000,195.0,321.0,...,17.0,0.0,0.0,0.0,0.0,0.0,31.0,2.0,0.0,0.0


In [7]:
df.to_csv("./database_no_lexico.csv", index=False)