In [1]:
import pandas as pd
import csv
import tensorflow as tf
import tensorflow_hub as hub
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Lambda, LSTM, Dense, Dropout, Input
import os
import re

## Data Preprocessing

In [2]:
# Parse the genre file. Records are read with the csv module's default (comma)
# dialect, and the first comma-separated field is then split on tabs.
csv_path = 'data/id_genres.csv'
rows = []
with open(csv_path, 'r', newline='') as csvfile:
    reader = csv.reader(csvfile)

    # The first record carries the tab-separated column names.
    header = next(reader)[0].split('\t')
    print(header)

    for record in reader:
        fields = record[0].split('\t')
        # Anything from the third comma-separated field onwards is appended.
        # NOTE(review): record[1] is never used -- confirm that is intentional.
        fields += record[2:]
        rows.append(fields)

['id', 'genres']


In [3]:
# Column labels for the genre table: 'id' followed by genre1 .. genre7.
header = ['id'] + ['genre' + str(slot) for slot in range(1, 8)]

In [4]:
# Build the genre table from the parsed rows, labelling the columns with
# `header` (an id column plus seven genre slots).
df = pd.DataFrame(rows, columns=header)
df

Unnamed: 0,id,genre1,genre2,genre3,genre4,genre5,genre6,genre7
0,0009fFIM1eYThaPg,pop,,,,,,
1,0010xmHR6UICBOYT,underground hip hop,,,,,,
2,002Jyd0vN4HyCpqL,hard rock,classic rock,,,,,
3,006TYKNjNxWjfKjy,symphonic metal,symphonic power metal,,,,,
4,007LIJOPQ4Sb98qV,post-punk,,,,,,
...,...,...,...,...,...,...,...,...
109264,zzyyPUs7hC9Nz2e1,hardcore punk,hardcore,,,,,
109265,zzz0n04uuTUA7fNh,pop,,,,,,
109266,zzzj3LYaZtYtbzSr,singer-songwriter,,,,,,
109267,zzznMjZAKnJJXQSj,synthpop,pop,,,,,


In [5]:
# Long form: one row per (id, genre slot), with the genre name in 'value'.
melt = pd.melt(df, id_vars=['id'], value_vars=df.columns[1:])
# Occurrences of every genre across all slots; value_counts() sorts these
# from most to least frequent.
allocc = melt['value'].value_counts()
# The ten most frequent genres, taken from the counts computed above rather
# than running value_counts() over the whole column a second time.
top10occ = allocc.head(10)
top10occ

value
pop                 16810
rock                15947
electronic           8855
indie rock           5730
alternative rock     5292
folk                 4559
soul                 3985
classic rock         3809
metal                3639
rap                  3623
Name: count, dtype: int64

In [6]:
# Wider look at the ranking (top 17), reusing the counts already held in
# `allocc` instead of recomputing value_counts() on the melted column.
allocc.head(17)

value
pop                  16810
rock                 15947
electronic            8855
indie rock            5730
alternative rock      5292
folk                  4559
soul                  3985
classic rock          3809
metal                 3639
rap                   3623
singer-songwriter     3445
experimental          2980
hard rock             2947
indie pop             2810
post-hardcore         2506
ambient               2458
punk                  2458
Name: count, dtype: int64

In [7]:
# Names of the ten most frequent genres, as a set for membership tests.
top10genres = set(top10occ.index)
# Total number of genre occurrences covered by those ten genres.
top10occ.sum()

72249

In [8]:
# Ids that carry at least one top-10 genre in any slot, each listed once.
in_top10 = melt['value'].isin(top10genres)
users_with_top10 = melt.loc[in_top10, 'id'].drop_duplicates()

In [9]:
# Keep only the rows of the wide genre table whose id has a top-10 genre.
has_top10 = df['id'].isin(users_with_top10)
filtered_users = df.loc[has_top10]

In [10]:
# Inspect the rows that survived the top-10 genre filter.
filtered_users

Unnamed: 0,id,genre1,genre2,genre3,genre4,genre5,genre6,genre7
0,0009fFIM1eYThaPg,pop,,,,,,
2,002Jyd0vN4HyCpqL,hard rock,classic rock,,,,,
5,00CH4HJdxQQQbJfu,indie rock,shoegaze,experimental,,,,
6,00DZ3XCAQb2FdCc6,ambient,electronic,,,,,
8,00IeldeA9ijZOL0P,pop,,,,,,
...,...,...,...,...,...,...,...,...
109262,zzx8CWdM7qkxKQpC,indie rock,,,,,,
109263,zzyb5LvKJTWLVnrk,soul,mpb,funk,,,,
109265,zzz0n04uuTUA7fNh,pop,,,,,,
109267,zzznMjZAKnJJXQSj,synthpop,pop,,,,,


In [11]:
# Start the label table from the id column of the filtered rows. It is taken
# as an independent copy that keeps the original row index.
final_df = filtered_users[['id']].copy()

In [12]:
# Label each kept row with the first genre slot (scanning left to right) whose
# value is one of the top-10 genres; None if no slot matches.
# Only the rows in `filtered_users` are scanned. `final_df` was built from
# those rows and shares their index, so running the row-wise apply over all of
# `df` would compute labels that index alignment then throws away.
final_df['genre'] = filtered_users.iloc[:, 1:].apply(
    lambda row: next((x for x in row if x in top10genres), None),
    axis=1,
)

In [13]:
# Inspect the id -> single top-10 genre label table.
final_df

Unnamed: 0,id,genre
0,0009fFIM1eYThaPg,pop
2,002Jyd0vN4HyCpqL,classic rock
5,00CH4HJdxQQQbJfu,indie rock
6,00DZ3XCAQb2FdCc6,electronic
8,00IeldeA9ijZOL0P,pop
...,...,...
109262,zzx8CWdM7qkxKQpC,indie rock
109263,zzyb5LvKJTWLVnrk,soul
109265,zzz0n04uuTUA7fNh,pop
109267,zzznMjZAKnJJXQSj,pop


In [14]:
# Parse the language file the same way as the genre file: comma-dialect csv
# records whose first field is split on tabs.
# NOTE(review): this rebinds `rows` and `header`, which the genre cells also
# use -- re-running those cells after this one would pick up language data.
csv_path = 'data/music4all_subset/music4all_subset/id_lang.csv'
rows = []
with open(csv_path, 'r', newline='') as csvfile:
    reader = csv.reader(csvfile)

    # The first record carries the tab-separated column names.
    header = next(reader)[0].split('\t')
    print(header)

    for record in reader:
        fields = record[0].split('\t')
        # Anything from the third comma-separated field onwards is appended.
        # record[1] is never used.
        fields += record[2:]
        rows.append(fields)

['id', 'lang']


In [15]:
# Table of (id, lang) pairs built from the rows parsed from id_lang.csv.
# Despite the name, no language filtering happens here; the 'en' filter is
# applied where this frame is used.
df_filter_en = pd.DataFrame(rows, columns=['id', 'lang'])

In [16]:
# Restrict the labelled ids to those whose language is recorded as 'en'.
english_ids = df_filter_en.loc[df_filter_en['lang'] == 'en', 'id']
is_english = final_df['id'].isin(english_ids)
fin_fil_df = final_df[is_english]

In [17]:
# Inspect the English-only id -> genre label table.
fin_fil_df

Unnamed: 0,id,genre
0,0009fFIM1eYThaPg,pop
2,002Jyd0vN4HyCpqL,classic rock
5,00CH4HJdxQQQbJfu,indie rock
8,00IeldeA9ijZOL0P,pop
9,00KSCJkYb8JKa4Y3,rock
...,...,...
109258,zzoFYDMlqU1X2zz1,soul
109262,zzx8CWdM7qkxKQpC,indie rock
109265,zzz0n04uuTUA7fNh,pop
109267,zzznMjZAKnJJXQSj,pop


In [19]:
# Load the lyric text for every id in fin_fil_df from '<id>.txt' files.
directory_path = 'data/music4all_subset/music4all_subset/lyrics'
file_prefixes = fin_fil_df['id'].tolist()
lyrics_d = {}  # id -> raw lyric text
lyrics_l = []  # raw lyric texts, in the order their ids appear in fin_fil_df
for prefix in file_prefixes:
    file_pattern = prefix + '.txt'
    file_path = os.path.join(directory_path, file_pattern)
    # NOTE(review): ids without a lyric file are skipped silently, so lyrics_l
    # can be shorter than fin_fil_df; use lyrics_d to pair texts with ids.
    if os.path.exists(file_path):
        # Decode explicitly: without `encoding`, open() falls back to the
        # platform locale, so the same file can decode differently (or fail)
        # on another machine. Assumes the lyric files are UTF-8 -- TODO confirm.
        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read()
        lyrics_d[prefix] = content
        lyrics_l.append(content)

In [None]:
def preprocess_lyrics(lyli):
    """Return a cleaned copy of each lyric string in ``lyli``.

    Per string, in order: drop every character that is neither a word
    character nor whitespace, lowercase the result, turn newlines into
    spaces, and collapse runs of spaces into one. Other whitespace (such as
    tabs) and leading/trailing spaces are left in place.

    Parameters:
        lyli: iterable of lyric strings.

    Returns:
        A new list with one cleaned string per input, in the same order.
    """
    cleaned = []
    for raw in lyli:
        without_punct = re.sub(r'[^\w\s]', '', raw)
        single_line = without_punct.lower().replace('\n', ' ')
        cleaned.append(re.sub(' +', ' ', single_line))
    return cleaned

In [None]:
# Clean every loaded lyric text; the result keeps the order of lyrics_l.
prep_lyrics = preprocess_lyrics(lyrics_l)

In [None]:
# Load the ELMo (version 2) module from TF Hub and keep its 'default'
# signature as the callable used to embed text.
elmo = hub.load('https://tfhub.dev/google/elmo/2').signatures['default']

In [None]:
# Embed all cleaned lyrics in a single call and keep the 'elmo' output.
# NOTE(review): the whole corpus goes through the model as one batch, which
# may exhaust memory for a large number of lyrics -- consider batching.
embeddings = elmo(tf.constant(prep_lyrics))['elmo']