# Cleaning and filtering 26k Pixabay dataset

- File name: `Bon _Export_20k_full_tags.csv`
- Path: `S.A.M/Modèles/data/`



## 1. Mount Google Drive and load the dataset

In [None]:
# Mount Google Drive at /content/drive so files stored there can be read by path.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd

# Absolute location of the exported tag CSV inside the mounted Drive.
csv_path = "/content/drive/MyDrive/_France-Paris 2020/Data Scientist/S.A.M/S.A.M/Modèles/data/Bon _Export_20k_full_tags.csv"

# Load the raw export and preview its first rows.
# NOTE(review): in the preview, each track appears on two consecutive rows
# (a Genre/Mood row followed by a Movement/Theme row) - confirm this holds
# for the whole file, since the processing step relies on it.
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,Title,Title_URL,Artist Name,Artist Link,Duration,Category,Keyword1,Category.1,Keyword2,Other keywords,Other keywords.1
0,Ethereal Horizons,https://pixabay.com/music/upbeat-ethereal-hori...,Top-Flow,https://pixabay.com/users/top-flow-28521292/,2:18,Genre,Upbeat Electronic Pop Corporate,Mood,Restless Bright Dreamy Laid Back Uplifting Ene...,background\ncorporate,background corporate ambient minimalist inspir...
1,Ethereal Horizons,https://pixabay.com/music/upbeat-ethereal-hori...,Top-Flow,https://pixabay.com/users/top-flow-28521292/,2:18,Movement,Chasing Smooth Running Elegant Medium Fast,Theme,Music For Videos Music For Youtube Videos Vlog...,background\ncorporate,background corporate ambient minimalist inspir...
2,Let's meet Michelle Synthpop Background music ...,https://pixabay.com/music/dance-letx27s-meet-m...,White_Records,https://pixabay.com/users/white_records-32584949/,0:54,Genre,Dance Upbeat Electronic Synth Pop Intro/Outro,Mood,Restless Bright Uplifting Energetic,background,background background music synthpop music mus...
3,Let's meet Michelle Synthpop Background music ...,https://pixabay.com/music/dance-letx27s-meet-m...,White_Records,https://pixabay.com/users/white_records-32584949/,0:54,Movement,Chasing Running Fast Heavy & Ponderous,Theme,Music For Videos Music For Youtube Videos Vlog...,background,background background music synthpop music mus...
4,Let's meet Michelle. Synthpop version Backgrou...,https://pixabay.com/music/dance-letx27s-meet-m...,White_Records,https://pixabay.com/users/white_records-32584949/,1:38,Genre,Dance Upbeat Electronic Synth Pop,Mood,Bright Uplifting Energetic Restless,background,background background music synthpop music mus...


## 2. Process dataset

In [None]:
def process_dataframe(df):
    """Collapse the two-rows-per-track export into one row per track.

    The export stores each track on two consecutive rows. On the first row,
    ``Keyword1`` holds the Genre tags and ``Keyword2`` the Mood tags; on the
    second row, ``Keyword1`` holds the Movement tags and ``Keyword2`` the
    Theme tags. This function merges each pair into a single row with
    dedicated ``Genre``, ``Mood``, ``Movement`` and ``Theme`` columns.

    Rows are paired purely by position (1st with 2nd, 3rd with 4th, ...), so
    the result is independent of the input's index labels. The ``Category``
    and ``Category.1`` labels are not checked.
    NOTE(review): if the export can contain a track with a single row, every
    following pair would be shifted - validate the labels upstream if needed.

    Args:
        df: Raw export with at least the columns ``Title``, ``Title_URL``,
            ``Artist Name``, ``Artist Link``, ``Duration``, ``Category``,
            ``Keyword1``, ``Category.1``, ``Keyword2``, ``Other keywords``
            and ``Other keywords.1``. It is not modified.

    Returns:
        A new DataFrame with a fresh 0..n-1 index and the columns ``Title``,
        ``Title_URL``, ``Artist Name``, ``Artist Link``, ``Duration``,
        ``Genre``, ``Mood``, ``Movement``, ``Theme``, ``Other keywords``,
        ``Other keywords.1`` (in that order). A trailing row without a
        partner keeps its Genre/Mood and gets NaN for Movement/Theme.
    """
    tag_columns = ['Category', 'Keyword1', 'Category.1', 'Keyword2']

    # Split by position: even positions carry Genre/Mood, odd positions carry
    # Movement/Theme. Resetting both indexes lets the halves align row by row.
    first_rows = df.iloc[::2].reset_index(drop=True)
    second_rows = df.iloc[1::2].reset_index(drop=True)

    # Per-track metadata comes from the first row of each pair.
    result = first_rows.drop(columns=tag_columns)
    result['Genre'] = first_rows['Keyword1']
    result['Mood'] = first_rows['Keyword2']
    # Assignment aligns on the index, so an unpaired last row simply gets NaN.
    result['Movement'] = second_rows['Keyword1']
    result['Theme'] = second_rows['Keyword2']

    # Place 'Other keywords' and 'Other keywords.1' last and drop any column
    # that is not part of the final schema.
    new_order = ['Title', 'Title_URL', 'Artist Name', 'Artist Link', 'Duration', 'Genre', 'Mood', 'Movement', 'Theme', 'Other keywords', 'Other keywords.1']
    return result[new_order]

def has_duration(name):
    """Return True when a track title mentions a duration.

    A title matches when it contains either digits immediately followed by
    ``s`` at a word boundary (e.g. ``'38s'``) or the standalone word
    ``second`` / ``seconds`` (e.g. ``'43 second'``, ``'43 seconds'``).
    Matching is case-sensitive.

    NOTE(review): the pattern also matches decade-style names such as
    ``'80s'`` and the bare word ``second`` without a number - confirm that
    filtering those titles out is intended.

    Args:
        name: The title to inspect. Non-string values (e.g. NaN for a missing
            title) are treated as having no duration.

    Returns:
        bool: True if the title matches the duration pattern, else False.
    """
    # Local import: this cell uses `re` but no cell shown imports it.
    import re

    # re.search raises TypeError on NaN/None, so missing titles are "no match".
    if not isinstance(name, str):
        return False

    return bool(re.search(r'\d+s\b|\bseconds?\b', name))

# Merge each track's two raw rows into a single row.
df_processed = process_dataframe(df)

# Keep only the tracks whose title does not mention a duration.
df_processed = df_processed.loc[lambda frame: ~frame['Title'].apply(has_duration)]

In [None]:
# Preview the first two rows of the processed and filtered table.
df_processed.head(2)

Unnamed: 0,Title,Title_URL,Artist Name,Artist Link,Duration,Genre,Mood,Movement,Theme,Other keywords,Other keywords.1
0,Ethereal Horizons,https://pixabay.com/music/upbeat-ethereal-hori...,Top-Flow,https://pixabay.com/users/top-flow-28521292/,2:18,Upbeat Electronic Pop Corporate,Restless Bright Dreamy Laid Back Uplifting Ene...,Chasing Smooth Running Elegant Medium Fast,Music For Videos Music For Youtube Videos Vlog...,background\ncorporate,background corporate ambient minimalist inspir...
2,Let's meet Michelle. Synthpop version Backgrou...,https://pixabay.com/music/dance-letx27s-meet-m...,White_Records,https://pixabay.com/users/white_records-32584949/,1:38,Dance Upbeat Electronic Synth Pop,Bright Uplifting Energetic Restless,Chasing Running Heavy & Ponderous Fast,Music For Videos Music For Youtube Videos Vlog...,background,background background music synthpop music mus...


## 3. Create new datasets

In [None]:
def create_genre_df(df, genre_variations, columns, num_rows=500):
    """Select up to ``num_rows`` tracks that belong to a genre.

    Rows with a missing value in any of ``columns`` are discarded first.
    Tracks whose ``Genre`` matches one of ``genre_variations`` are taken
    first; if there are fewer than ``num_rows`` of them, tracks matching in
    any of ``columns`` are appended. Within each group the input order is
    kept and the result is truncated to the first ``num_rows`` rows, so
    fewer rows are returned when there are not enough matches.

    Args:
        df: Track table containing a ``Genre`` column and every column named
            in ``columns``.
        genre_variations: Spellings of the genre. They are joined with ``|``
            and matched as a case-insensitive regular expression, so each
            entry matches as a substring and regex metacharacters keep their
            special meaning.
        columns: Text columns searched when ``Genre`` alone does not yield
            enough rows; also the columns that must not be missing.
        num_rows: Maximum number of rows to return.

    Returns:
        A DataFrame with at most ``num_rows`` rows, keeping the original
        index labels: genre matches first, then keyword matches.
    """
    # Keep only rows where every search column is filled in.
    complete = df[~df[columns].isna().any(axis=1)]

    pattern = '|'.join(genre_variations)

    def matches(series):
        # na=False makes a missing value count as "no match"; without it the
        # mask contains NaN and boolean indexing raises ValueError.
        return series.str.contains(pattern, case=False, regex=True, na=False)

    # Search in the 'Genre' column first.
    genre_mask = matches(complete['Genre'])
    genre_rows = complete[genre_mask]

    # If there are enough genre matches, return them directly.
    if len(genre_rows) >= num_rows:
        return genre_rows.head(num_rows)

    # Otherwise top up with rows that match in any of the other columns,
    # skipping the rows already selected through 'Genre'.
    keyword_mask = complete[columns].apply(matches).any(axis=1)
    additional_rows = complete[keyword_mask & ~genre_mask]

    return pd.concat([genre_rows, additional_rows]).head(num_rows)

# Spellings to search for per genre. create_genre_df matches them as
# case-insensitive substrings, so 'classic' alone already covers the other
# two classical entries (and 'lofi' covers 'chill lofi').
classical_variations = ['classical', 'classical music', 'classic']
lofi_variations = ['lofi', 'chill lofi']

# Columns searched when 'Genre' alone does not yield enough tracks; rows with
# a missing value in any of them are excluded from the result.
columns_to_search = ['Mood', 'Movement', 'Theme', 'Other keywords', 'Other keywords.1']

# Build one table per genre, capped at 500 tracks each (fewer if there are
# not enough matches).
classical_df = create_genre_df(df_processed, classical_variations, columns_to_search, num_rows=500)
lofi_df = create_genre_df(df_processed, lofi_variations, columns_to_search, num_rows=500)

# Write both tables to CSV in the current working directory, without the
# row index.
classical_df.to_csv('500_classical_tracks.csv', index=False)
lofi_df.to_csv('500_lofi_tracks.csv', index=False)