In [459]:
import pandas as pd
from statistics import median
import re

In [451]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/alisayanovski/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [452]:
STOPWORDS_ENG = stopwords.words('english')

In [502]:
# helper functions
def clean_text(text):
    '''
    Takes text (str) as parameter. 
    Returns list of words from text without capital letters and punctuation.
    '''
    return list(filter(None, re.sub(r'[^\w\\s]', ' ', text.lower()).split(' ')))

def remove_stopwords(words_list, stopwords_list):
    '''
    Takes list of words and list of stopwords as parameters.
    Returns list of words cleaned of stopwords.
    '''
    unique_words = set(words_list)
    return [word for word in unique_words if word not in stopwords_list]

def get_unique_words_list(text, stopwords_list):
    '''
    Takes text (str) and list of stopwords as parameters. 
    Returns list of words with no repetitions.
    '''
    words_list = clean_text(text)
    unique_words = set(words_list)
    unique_words_no_stopwords = remove_stopwords(words_list, stopwords_list)
    
    return unique_words_no_stopwords

In [474]:
def count_unique_words_rate(text, stopwords_list):
    '''
    Takes text (str) and list of stopwords as parameters.
    Returns ratio of unique words in text (float).
    '''
    total_words = clean_text(text)
    unique_words_no_stopwords = get_unique_words_list(' '.join(total_words), stopwords_list)
    
    return round(len(unique_words_no_stopwords) / len(total_words), 2)


def count_total_char(text):
    '''
    Takes text (str) as parameter. 
    Returns total number of characters without punctuation.
    '''
    return len(re.sub(r'[^\w\\s]', '', text))


def get_repetitions_rate(text, stopwords_list):
    '''
    Takes text (str) and list of stopwords as parameters. 
    Returns mean average number of repetitions of word, if it appears more than once (float).
    '''
    unique_words = get_unique_words_list(text, stopwords_list)

    dict_words = {}

    for word in text:
        if word:
            if word in dict_words:
                dict_words[word] += 1

            else:
                dict_words[word] = 1

    repeating_words = []  

    for key, value in dict_words.items():
        if value > 1:
            repeating_words.append(value)
    
    if repeating_words:
        return round(sum(repeating_words) / len(repeating_words), 2)
    else: 
        return 0


def get_repetitions_rate_median(text, stopwords_list):
    '''
    Takes text (str) and list of stopwords as parameters. 
    Returns median average number of repetitions of word, if it appears more than once (float).
    '''
    unique_words = get_unique_words_list(text, stopwords_list)

    dict_words = {}

    for word in text:
        if word:
            if word in dict_words:
                dict_words[word] += 1

            else:
                dict_words[word] = 1

    repeating_words = []  

    for key, value in dict_words.items():
        if value > 1:
            repeating_words.append(value)
    
    if repeating_words:
        return median(repeating_words)
    else: 
        return 0
    
def count_total_words(text):
    '''
    Takes text (str) as parameter. 
    Returns number of words (int).
    '''
    return len(clean_text(text))


def get_mean_word_len(text, stopwords_list):
    '''
    Takes text (str) and list of stopwords as parameters.  
    Returns average number of characters per word (float).
    '''
    words_list = clean_text(text)
    total_words_no_stopwords = remove_stopwords(words_list, stopwords_list)

    return round(len(''.join(total_words_no_stopwords)) / len(get_unique_words_list(text, stopwords_list)), 2)


def get_mean_sentence_len(text):
    '''
    Takes text (str) as parameter.  
    Returns average number of words per line (float).
    '''
    sentence_list = list(filter(None, text.split('\n')))
    line_len_list = [len(el) for el in sentence_list]
    return round(sum(line_len_list) / len(sentence_list), 2)

def count_word(word, text):
    '''
    Takes text (str) and a word as parameters.  
    Returns number of word appearance in the text.
    '''
    words_list = clean_text(text)
    return words_list.count(word)

In [624]:
def get_metrics(text):
    '''
    Takes text (str) as parameter. 
    Returns tuple of metrics for this text.
    '''
    STOPWORDS_LIST = stopwords.words('english')
    
    repetitions_rate = get_repetitions_rate(text, STOPWORDS_LIST)
    mean_sentence_len = get_mean_sentence_len(text)
    mean_word_len = get_mean_word_len(text, STOPWORDS_LIST)
    unique_words_rate = count_unique_words_rate(text, STOPWORDS_LIST)
    total_words = count_total_words(text)
    total_char = count_total_char(text)

    # new functions
    repetitions_rate_median = get_repetitions_rate_median(text, STOPWORDS_LIST)
    
    # count words
    words_counts = []
    for word in top_repeaded_words_no_stopwords: ###
        words_counts.append(count_word(word, text))
        
        
    return (total_char, total_words, mean_word_len, mean_sentence_len, unique_words_rate, repetitions_rate,
           repetitions_rate_median, *words_counts)


def write_row_to_csv(df, values_tuple):
    '''
    Takes df and metrics tuple as parameters. 
    Writes values to a df. Df should already exist!
    '''
    for i in range(len(values_tuple)):
        df['total_char'][i] = values_tuple[i][0]
        df['total_words'][i] = values_tuple[i][1]
        df['mean_word_len'][i] = values_tuple[i][2]
        df['mean_sentence_len'][i] = values_tuple[i][3]
        df['unique_words_rate'][i] =values_tuple[i][4]
        df['repetitions_rate'][i] = values_tuple[i][5]
        
        #new
        df['repetitions_rate_median'][i] = values_tuple[i][6]
        
        # count words
        j = 0
        for word in top_repeaded_words_no_stopwords: ###
            j += 1
            df[f'count_word_{word}'][i] = values_tuple[i][6 + j]


In [630]:
def count_top_repeating_words(text, words_num):
    cleaned_text = clean_text(text.replace('\\n', ' ').replace('\\', ' '))
    
    dict_all_words = {}

    for word in cleaned_text:
        if word:
            if word in dict_all_words:
                dict_all_words[word] += 1

            else:
                dict_all_words[word] = 1
                
    sorted_words = [k for k, v in sorted(dict_all_words.items(), key=lambda item: item[1], reverse=True)]
    
    return sorted_words[:words_num]

In [631]:
top_repeated_words = count_top_repeating_words(df_lyrics['lyrics'].to_string(index=False), 50)

In [632]:
top_repeaded_words_no_stopwords = [word for word in top_repeated_words if word not in STOPWORDS_ENG]

In [633]:
lyrics_raw = df_lyrics['lyrics'].to_string(index=False)
cleaned_lyrics_list = clean_text(lyrics_raw)
lyrics_no_stopwords = [word for word in cleaned_lyrics_list if word not in STOPWORDS_ENG]
clean_lyrics_string = ' '.join(lyrics_no_stopwords)

top_repeaded_words_no_stopwords = count_top_repeating_words(clean_lyrics_string, 20)

In [599]:
df_lyrics = pd.read_csv('final_csv.csv')

In [635]:
additional_columns = [f'count_word_{word}' for word in top_repeaded_words_no_stopwords]

df_metrics = pd.DataFrame(df_lyrics, columns=['repetitions_rate',
                                            'mean_sentence_len',
                                            'mean_word_len',
                                            'unique_words_rate',
                                            'total_words',
                                            'total_char',
                                            'repetitions_rate_median',
                                             *additional_columns])

lyrics = df_lyrics['lyrics']
write_row_to_csv(df_metrics, lyrics.apply(get_metrics))

In [636]:
df_final = pd.concat([df_lyrics, df_metrics], axis="columns")
df_final.to_csv('lyrics_metrics.csv', index=False)

In [637]:
pd.read_csv('lyrics_metrics.csv')

Unnamed: 0,song,artist_name,lyrics,artist_id,track_id,repetitions_rate,mean_sentence_len,mean_word_len,unique_words_rate,total_words,...,count_word_one,count_word_love,count_word_the,count_word_go,count_word_time,count_word_get,count_word_me,count_word_when,count_word_come,count_word_all
0,(You Drive Me) Crazy,Britney Spears,"Crazy\nOh, oh\n\nBaby, I'm so into you\nYou go...",26dSoYclwsYLMAKD3tpOr4,1DSJNBNhGZCigg9ll5VeZv,30.41,31.68,4.98,0.15,271.0,...,1.0,0.0,4.0,0.0,1.0,0.0,17.0,0.0,0.0,5.0
1,100 Years,Five For Fighting,I'm 15 for a moment\nCaught in between 10 and ...,7FgMLbnZVrEnir95O0YujA,2lFlveK1y13WWp3vnQtrr3,29.66,26.55,4.43,0.23,281.0,...,0.0,0.0,7.0,0.0,11.0,0.0,0.0,3.0,0.0,1.0
2,11 Blocks,Wrabel,11 blocks from my door to your doorstep\nThree...,7r2uG6BlFXKcwmh9ItqlII,7nZBRPj89rgeZ5eBLp2J7P,38.60,33.20,4.95,0.19,345.0,...,0.0,2.0,7.0,0.0,0.0,2.0,4.0,4.0,0.0,0.0
3,1985,Bowling For Soup,"Debbie just hit the wall, she never had it all...",5ND0mGcL9SKSjWIjPd0xIb,5oQcOu1omDykbIPSdSQQNJ,36.77,48.19,4.94,0.31,343.0,...,2.0,0.0,9.0,0.0,1.0,1.0,0.0,4.0,0.0,2.0
4,2002,Anne-Marie,I will always remember\nThe day you kissed my ...,1zNqDE7qDGCsyzJwohVaoX,2BgEsaKNfHUdlh97KmvFyo,44.02,28.54,4.95,0.19,387.0,...,3.0,10.0,26.0,3.0,3.0,0.0,9.0,4.0,0.0,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4386,Burnin' Up,Jonas Brothers,\n\nI'm hot\nYou're cold\nYou go around\nLike ...,7gOdHgIoIKoe4i9Tta6qdD,2VEsmoek0sol9MnJFyoG9e,39.89,25.98,4.64,0.22,314.0,...,0.0,0.0,15.0,1.0,0.0,1.0,3.0,0.0,3.0,2.0
4387,We R Who We R,Kesha,"\n\nHot and dangerous\nIf you're one of us, th...",6LqNN22kT3074XbTVUrhzX,3LUWWox8YYykohBbHUrrxd,47.88,38.29,5.01,0.16,456.0,...,1.0,1.0,7.0,4.0,1.0,0.0,0.0,1.0,0.0,2.0
4388,Domino,Jessie Ware,\n\nWhat can I say? What can I do?\nIf it's al...,5Mq7iqCWBzofK39FBqblNc,6MAdEUilV2p9RQUqE5bMAK,36.85,33.58,5.30,0.14,293.0,...,0.0,0.0,1.0,0.0,0.0,0.0,11.0,1.0,0.0,2.0
4389,Your Love Is My Drug,Kesha,\n\nMaybe I need some rehab\nOr maybe just nee...,6LqNN22kT3074XbTVUrhzX,6vc2Jq2vaGu8z326kSrw92,46.77,31.14,5.13,0.20,397.0,...,0.0,55.0,5.0,0.0,2.0,3.0,2.0,2.0,0.0,5.0
