In [None]:
import numpy as np
import pandas as pd 
import os

In [None]:
# Collect the path of every .tsv file found anywhere under the input directory,
# echoing each one as it is discovered and the full list at the end.
input_dir = '/kaggle/input'
tsvfiles = []
for dirname, _, filenames in os.walk(input_dir):
    tsv_names = [name for name in filenames if name.endswith('.tsv')]
    for filename in tsv_names:
        file_path = os.path.join(dirname, filename)
        tsvfiles.append(file_path)
        print(file_path)

print(tsvfiles)

In [None]:
# Paths of the raw tag table and its cleaned-tag counterpart; both are read as
# tab-separated text.
tsv_raw = '/kaggle/input/moodtheme-1/raw_30s - raw_30s.tsv'
tsv_raw_Clean = '/kaggle/input/moodtheme-1/raw_30s_cleantags.tsv'
df_raw = pd.read_csv(tsv_raw, sep='\t')
df_raw_cleantags = pd.read_csv(tsv_raw_Clean, sep='\t')

In [None]:
# Report (rows, columns) of both freshly loaded tables.
print(f"Original raw df shape: {df_raw.shape}")
print(f"Original raw df cleantags shape: {df_raw_cleantags.shape}")

In [None]:
# Cell-by-cell comparison of the two tables. In pandas NaN != NaN evaluates to
# True, so a cell that is missing in BOTH tables would count as a difference;
# those positions are masked out so only genuine differences are reported.
# NOTE: `!=` still requires both frames to carry identical row/column labels.
both_missing = df_raw.isna() & df_raw_cleantags.isna()
mismatch = (df_raw != df_raw_cleantags) & ~both_missing
mismatched_cols = mismatch.any(axis=0)
columns_with_mismatches = mismatched_cols[mismatched_cols].index.tolist()

print("Columns with mismatches:", columns_with_mismatches)

In [None]:
# Spot check: TAGS-17 cell of one specific track in the raw table.
print(df_raw.loc[df_raw.PATH == '94/7394.mp3', 'TAGS-17'])

In [None]:
def filter_tagged_musical_instruments(df):
    """Keep rows tagged with an instrument and a mood/theme, but not with voice.

    Tag cells are taken to be every column from position 5 onward (the first
    five columns are skipped). A row is kept when at least one tag cell
    contains 'instrument---', at least one contains 'mood/theme---', and none
    contains 'instrument---voice'.

    All three checks look at the tag columns only; previously the voice check
    scanned every column of the frame, unlike the other two.

    Parameters
    ----------
    df : pandas.DataFrame
        Table whose columns from index 5 onward hold tag strings (or NaN).

    Returns
    -------
    pandas.DataFrame
        The matching rows of ``df`` with all original columns.
    """
    # Convert the tag cells to text once instead of once per check.
    tag_text = df.iloc[:, 5:].astype(str)

    def any_tag_contains(fragment):
        # regex=False: the fragments are literal text, not patterns.
        # na=False keeps the mask strictly boolean if a missing value survives
        # the string conversion.
        return tag_text.apply(
            lambda column: column.str.contains(fragment, regex=False, na=False)
        ).any(axis=1)

    keep = (
        any_tag_contains('instrument---')
        & any_tag_contains('mood/theme---')
        & ~any_tag_contains('instrument---voice')
    )
    return df[keep]

In [None]:
# Run the instrument + mood (no voice) filter on both tables and report the
# resulting shapes.
df_raw_filtered = filter_tagged_musical_instruments(df_raw)
df_raw_cleantags_filtered = filter_tagged_musical_instruments(df_raw_cleantags)
print(f"Filtered Raw shape: {df_raw_filtered.shape}")
print(f"Filtered Raw clean tags shape: {df_raw_cleantags_filtered.shape}")

In [None]:
# Spot check: TAGS-17 cell of one specific track in the filtered raw table.
print(df_raw_filtered.loc[df_raw_filtered.PATH == '94/7394.mp3', 'TAGS-17'])

In [None]:
def filter_unknown(df):
    """Return the rows of ``df`` having 'unknown' in any column from position 5 on.

    Each of those columns is converted to text before searching, so the match
    is a plain substring test on the cell's string form.
    """
    def mentions_unknown(column):
        return column.astype(str).str.contains('unknown', na=False)

    tag_columns = df.iloc[:, 5:]
    row_has_unknown = tag_columns.apply(mentions_unknown).any(axis=1)
    return df[row_has_unknown]

# Rows of the filtered raw table in which some tag cell contains 'unknown'.
unknow_df = filter_unknown(df_raw_filtered)
print(unknow_df)

In [None]:
# Replace every missing cell with the literal string 'NaN' in both filtered
# tables (presumably so that cells missing in both compare as equal later on —
# confirm). Note that both names are rebound to the filled frames.
df_raw_filtered = df_raw_filtered.fillna('NaN')
df_raw_cleantags_filtered = df_raw_cleantags_filtered.fillna('NaN')

In [None]:
# Spot check: TAGS-17 cell of one specific track after the missing-value fill.
print(df_raw_filtered.loc[df_raw_filtered.PATH == '94/7394.mp3', 'TAGS-17'])

In [None]:
# The two tables were filtered independently, so they are not guaranteed to
# hold the same rows; comparing differently-labelled frames with `!=` raises
# ValueError. Restrict both to the row/column labels they share first. When
# the labels already match, this leaves the comparison unchanged.
shared_rows = df_raw_filtered.index.intersection(df_raw_cleantags_filtered.index)
shared_cols = df_raw_filtered.columns.intersection(df_raw_cleantags_filtered.columns)
raw_aligned = df_raw_filtered.loc[shared_rows, shared_cols]
cleantags_aligned = df_raw_cleantags_filtered.loc[shared_rows, shared_cols]

filtered_mismatches = raw_aligned != cleantags_aligned
row_mismatches = filtered_mismatches.any(axis=1)
# Rows (from each table) that differ in at least one cell.
rows_df_raw = raw_aligned[row_mismatches]
rows_df_raw_cleantags = cleantags_aligned[row_mismatches]

In [None]:
# TAGS-5 value of the first differing row, raw table then clean-tag table.
print(rows_df_raw['TAGS-5'].iloc[0])
print(rows_df_raw_cleantags['TAGS-5'].iloc[0])

In [None]:
# Spot check: TAGS-17 cell of one specific track in the filtered raw table.
print(df_raw_filtered.loc[df_raw_filtered.PATH == '94/7394.mp3', 'TAGS-17'])

In [None]:

def generate_unique_moods(df):
    """Collect the distinct mood labels that occur anywhere in ``df``.

    Every string cell starting with 'mood/theme---' contributes the text that
    follows that prefix. The whole remainder is kept, so a label that itself
    contains '-' is no longer cut at its first hyphen.

    Parameters
    ----------
    df : pandas.DataFrame
        Table whose cells may hold tag strings; non-string cells are ignored.

    Returns
    -------
    list of str
        The unique labels in sorted order, so the result is the same on every
        run (iteration order of a set of strings is not).
    """
    mood_prefix = 'mood/theme---'

    unique_mood_values = {
        value[len(mood_prefix):]
        for value in df.values.flatten()
        if isinstance(value, str) and value.startswith(mood_prefix)
    }
    return sorted(unique_mood_values)

In [None]:
# Distinct mood labels found in each filtered table.
raw_unique_moods = generate_unique_moods(df_raw_filtered)
raw_cleantags_unique = generate_unique_moods(df_raw_cleantags_filtered)

In [None]:
# Spot check: TAGS-17 cell of one specific track in the filtered raw table.
print(df_raw_filtered.loc[df_raw_filtered.PATH == '94/7394.mp3', 'TAGS-17'])

In [None]:
# Display the mood labels extracted from the filtered raw table.
raw_unique_moods

In [None]:
!pip install gensim


In [None]:
import numpy as np
import gensim
from sklearn.metrics.pairwise import cosine_similarity

In [None]:

def load_glove_embeddings(file_path):
    """Read a whitespace-separated word-vector text file into a dict.

    Each non-blank line is expected to hold a word followed by its vector
    components. Blank lines (for example a trailing empty line) are skipped
    instead of raising IndexError.

    Parameters
    ----------
    file_path : str
        Path of the UTF-8 encoded embeddings file.

    Returns
    -------
    dict
        Maps each word to a float32 numpy array of its components. If a word
        occurs on several lines, the last one wins.
    """
    glove_model = {}
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            if not values:
                continue  # nothing on this line
            glove_model[values[0]] = np.array(values[1:], dtype='float32')
    return glove_model


# Load the word vectors from the embeddings file (the file name suggests the
# 100-dimensional GloVe 6B set — confirm against the dataset).
glove_file_path = '/kaggle/input/glove6b100d/glove.6B.100d.txt'
glove_embeddings = load_glove_embeddings(glove_file_path)


# Anchor emotion words that mood tags are classified into. The commented-out
# line below is an alternative anchor set that is not in use.
#emotions = ['happy', 'sad', 'calm', 'energetic']
emotions = ['happy', 'calm', 'tense', 'energetic']



def get_vector(word, glove_model):
    """Look up ``word`` in ``glove_model``; return None when it is absent."""
    vector = glove_model.get(word, None)
    return vector


def classify_tags(raw_unique_moods, emotions, glove_model):
    """Assign each tag to the emotion whose vector is most similar to the tag's.

    Similarity is the cosine similarity between the tag's vector and each
    emotion's vector, both looked up through ``get_vector``. On a tie the
    emotion that comes first in ``emotions`` wins. Tags without a vector are
    reported on stdout and collected separately.

    Returns
    -------
    tuple
        ``(classified, not_found)`` where ``classified`` maps every emotion to
        the list of tags assigned to it and ``not_found`` lists the tags that
        had no vector.
    """
    buckets = {}
    for name in emotions:
        buckets[name] = []

    anchors = {}
    for name in emotions:
        anchors[name] = get_vector(name, glove_model)

    missing = []
    for tag in raw_unique_moods:
        embedding = get_vector(tag, glove_model)

        if embedding is None:
            print(f"Tag '{tag}' not found in GloVe embeddings.")
            missing.append(tag)
            continue

        scores = {}
        for name in emotions:
            scores[name] = cosine_similarity([embedding], [anchors[name]])[0][0]

        winner = max(scores, key=scores.get)
        buckets[winner].append(tag)

    return buckets, missing


# First pass: classify the raw mood labels; labels without a vector end up in
# unclassified_tags.
classified_tags, unclassified_tags = classify_tags(raw_unique_moods, emotions, glove_embeddings)

In [None]:

# Show which mood tags were assigned to each emotion.
for emotion in classified_tags:
    tags_list = classified_tags[emotion]
    print(f"{emotion.capitalize()}: {tags_list}\n")

In [None]:
# Spot check: TAGS-17 cell of one specific track in the filtered raw table.
print(df_raw_filtered.loc[df_raw_filtered.PATH == '94/7394.mp3', 'TAGS-17'])

In [None]:
# How many labels had no embedding in the first classification pass.
print("Length of unclassified tags:", len(unclassified_tags))

In [None]:
# Hand-written replacements: label as it appears in the data -> substitute word
# to use in its place (presumably a word present in the embedding vocabulary —
# confirm). Several labels share a substitute (e.g. both film labels -> 'film').
map_unclassified = {'silentfilm': 'silent',
                    'melancolic':'melancholic',
                    'energic':'energetic',
                    'festif': 'festive',
                    'epicmusic':'epic',
                    'lovemusic':'love',
                    'sciencefiction':'scifi',
                    'trailermusic':'trailer',
                    'relaxingmusic':'relaxing',
                    'planant':'spacy',
                    'oldschool':'retro',
                    'filmmusic':'film',
                    'folkrock': 'folk',
                    'filmscore': 'film'
                   }

# Apply the manual replacements to the raw labels. Different labels can map to
# the same substitute (and a substitute may already be a raw label), so repeats
# are dropped; dict.fromkeys keeps the first-seen order.
updated_words_list = list(
    dict.fromkeys(map_unclassified.get(word, word) for word in raw_unique_moods)
)

print(updated_words_list)

In [None]:
# Check whether 'trailermusic' is present in the remapped list and in the raw list.
print('trailermusic' in updated_words_list)
print('trailermusic' in raw_unique_moods)

In [None]:
# Second pass: classify the labels again after the manual replacements.
new_classified_tags, new_unclassified_tags = classify_tags(updated_words_list, emotions, glove_embeddings)

In [None]:
# Number of labels still without an embedding after the replacements.
print(len(new_unclassified_tags))

In [None]:
# Spot check: TAGS-17 cell of one specific track in the filtered raw table.
print(df_raw_filtered.loc[df_raw_filtered.PATH == '94/7394.mp3', 'TAGS-17'])

In [None]:

# Show the per-emotion tag lists produced by the second classification pass.
for emotion in new_classified_tags:
    tags_list = new_classified_tags[emotion]
    print(f"{emotion.capitalize()}: {tags_list}\n")

In [None]:
# Shell command: recursively and irreversibly deletes everything under /kaggle/working.
!rm -rf /kaggle/working/*

In [None]:
# Invert the emotion -> [labels] mapping into label -> emotion. If a label is
# listed under several emotions, the last one visited wins.
category_dict = new_classified_tags
label_to_category = {}
for category, labels in category_dict.items():
    label_to_category.update(dict.fromkeys(labels, category))

In [None]:
# Emotion assigned to the label 'trailer' (raises KeyError if it was not classified).
print(label_to_category['trailer'])

In [None]:
# Spot checks: one tag cell for each of two specific tracks in the filtered raw table.
print(df_raw_filtered.loc[df_raw_filtered.PATH == '94/7394.mp3', 'TAGS-17'])
print(df_raw_filtered.loc[df_raw_filtered.PATH == '48/948.mp3', 'TAGS-6'])

In [None]:
# Convert every column of the filtered raw table to strings (rebinds the name).
df_raw_filtered = df_raw_filtered.astype(str)

In [None]:
# Independent copy to work on, so df_raw_filtered itself is not modified below.
subset_df = df_raw_filtered.copy()

In [None]:

mood_prefix = 'mood/theme---'
def replace_mood_labels(cell_value):
    """Rewrite a ``mood/theme---<label>`` cell as ``mood/theme---<category>``.

    The label is first passed through ``map_unclassified`` (manual
    replacements) and then looked up in ``label_to_category``. Both are
    module-level dicts read at call time.

    Parameters
    ----------
    cell_value : object
        A single cell value, or a pandas Series of cell values.

    Returns
    -------
    object
        * For a Series: a new Series with every element converted (previously
          only the first element was inspected).
        * For a string starting with the mood prefix: the prefixed category,
          or the prefixed text 'unknown' when no category is known.
        * Anything else (other strings, numbers, None, NaN): returned
          unchanged. Non-string values used to raise AttributeError because
          ``isinstance(x, object)`` is always true.
    """
    if isinstance(cell_value, pd.Series):
        return cell_value.map(replace_mood_labels)

    if not isinstance(cell_value, str) or not cell_value.startswith(mood_prefix):
        return cell_value

    mood_label = cell_value.split(mood_prefix)[1]
    # Apply the manual replacement when there is one, else keep the label.
    mood_label = map_unclassified.get(mood_label, mood_label)

    category_label = label_to_category.get(mood_label)
    if category_label is None:
        # NOTE(review): this fallback (kept from the original) can yield a
        # replacement word rather than an emotion category if the replaced
        # label is itself a key of map_unclassified — confirm that is intended.
        category_label = map_unclassified.get(mood_label, 'unknown')
    return f"{mood_prefix}{category_label}"


In [None]:
# Trial run of the label replacement on the rows of a single track.
sample_df = subset_df[subset_df.PATH == '94/7394.mp3'].apply(replace_mood_labels)

In [None]:
# Build the row mask from sample_df itself: a mask taken from the larger
# subset_df does not share sample_df's index and has to be re-aligned by pandas.
print(sample_df.loc[sample_df.PATH == '94/7394.mp3', 'TAGS-17'])

In [None]:

def update_df_new_moods(dfx):
    """Return a copy of ``dfx`` with ``replace_mood_labels`` applied to every cell.

    The input frame is left untouched (it used to be overwritten column by
    column), so calling this again on the same input gives the same result.

    Parameters
    ----------
    dfx : pandas.DataFrame
        Table whose cells may hold mood tag strings.

    Returns
    -------
    pandas.DataFrame
        New frame with the same index and columns as ``dfx``.
    """
    updated = dfx.copy()
    for col in updated.columns:
        updated[col] = updated[col].map(replace_mood_labels)
    return updated


# Apply the mood-label replacement to every cell of subset_df.
modified_raw_moods = update_df_new_moods(subset_df)
#print(modified_raw_moods)

In [None]:
# Compare the same cell after replacement (first) and in the filtered raw table (second).
print(modified_raw_moods.loc[modified_raw_moods.PATH == '94/7394.mp3', 'TAGS-17'])
print(df_raw_filtered.loc[df_raw_filtered.PATH == '94/7394.mp3', 'TAGS-17'])

In [None]:
# Display the table after mood-label replacement.
modified_raw_moods

In [None]:
def get_cell_indices(dfx):
    """List every cell of ``dfx`` holding a string that starts with 'mood/theme---'.

    Cells are visited row by row and, within a row, in column order.

    Returns
    -------
    list of tuple
        One ``(row label, column name, cell value)`` triple per matching cell.
    """
    hits = []
    for row_label, row in dfx.iterrows():
        for column in dfx.columns:
            value = row[column]
            if isinstance(value, str) and value.startswith('mood/theme---'):
                hits.append((row_label, column, value))
    return hits


In [None]:
# Locate every mood cell in the replaced table and show which distinct mood
# strings occur.
indices = get_cell_indices(modified_raw_moods)
moods = [cell_value for _row_label, _column, cell_value in indices]

print("Matching moods:", set(moods))

In [None]:

# Group the matches by mood string: mood -> list of (row label, column) pairs.
result_dict = {}
for row_label, column, mood_value in indices:
    result_dict.setdefault(mood_value, []).append((row_label, column))


In [None]:
# Distinct mood strings present after replacement.
result_dict.keys()

In [None]:
###DEBUG PURPOSE ONLY, RUN IFF ABOVE HAS >4 LABELS
# Direct label lookup instead of walking every row with iterrows().
debug_row_label = 21375  # row label under inspection
if debug_row_label in modified_raw_moods.index:
    extracted_row = modified_raw_moods.loc[debug_row_label]
    print(f"Extracted Row:\n{extracted_row}")

In [None]:
# Cells whose value is 'mood/theme---epic' (raises KeyError when there are none).
result_dict['mood/theme---epic']

In [None]:
def extract_final_mood(row):
    """Return the mood of the first cell in ``row`` that contains 'mood/theme---'.

    Cells are examined in the order of ``row.index``. The result is the text
    right after the marker in the first matching cell, or None when no cell
    matches.
    """
    marker = 'mood/theme---'
    for column in row.index:
        cell = row[column]
        if cell is not None and marker in str(cell):
            return cell.split(marker)[1]
    return None


# Add each row's first mood as `final_mood`, then keep only the path and that label.
df2 = modified_raw_moods.assign(
    final_mood=modified_raw_moods.apply(extract_final_mood, axis=1)
)
final_df = df2.loc[:, ['PATH', 'final_mood']]

print(final_df)

In [None]:
# Number of rows per final mood label.
final_df['final_mood'].value_counts()

In [None]:
# Rows whose final mood contains the keyword (case-insensitive; missing values
# count as no match).
keyword = 'unknown'
is_unknown = final_df['final_mood'].str.contains(keyword, case=False, na=False)
result = final_df.loc[is_unknown]
print(result)

In [None]:
# Write the path/mood table as a tab-separated file without the index column.
final_df.to_csv('/kaggle/working/final_mood_labels.tsv', sep='\t', index=False)