### 1) Mount Google Drive and load the .csv files

In [1]:
# Colab-only step: mount Google Drive at /content/drive so the dataset paths
# used by the following cells (all under /content/drive/MyDrive/...) resolve.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd

# Folder on the mounted Drive that holds every input CSV. Keeping it in one
# constant means a change of Drive layout is a one-line edit.
DATA_DIR = "/content/drive/MyDrive/_France-Paris 2020/Data Scientist/S.A.M/S.A.M/Modèles/data"

# Full 20k-track export with all tag columns.
csv_path = f"{DATA_DIR}/Bon _Export_20k_full_tags.csv"

df = pd.read_csv(csv_path)

# 500-track subsets, one per genre.
classical_df = pd.read_csv(f"{DATA_DIR}/500_classical_tracks.csv")
lofi_df = pd.read_csv(f"{DATA_DIR}/500_lofi_tracks.csv")

#### 1.1) dataset for testing

In [None]:
#test_df = classical_df[:20]
#test_df.to_csv('/content/drive/MyDrive/_France-Paris 2020/Data Scientist/S.A.M/S.A.M/Modèles/data/test_df.csv')

### 2) Create .json files

In [5]:
import pandas as pd
import json
import os
import re
from collections import Counter
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import CountVectorizer
import nltk

# Download the 'wordnet' resource.
# The WordNet corpus is required by wn.synsets(), which is_meaningful_phrase()
# below uses to decide whether a word is known.
nltk.download('wordnet')

# Initialize the lemmatizer.
# NOTE(review): `lemmatizer` is not referenced by any function defined in this
# cell — confirm whether it is still needed.
lemmatizer = nltk.WordNetLemmatizer()

def extract_ngrams(text, n):
    """
    Split a text into its word n-grams of exactly `n` words.

    The splitting is delegated to the analyzer of a scikit-learn
    CountVectorizer configured with ngram_range=(n, n) and no stop-word
    filtering.

    Parameters:
    - text: The string to analyse.
    - n: Number of words in each n-gram.

    Returns:
    - A list of n-grams.
    """
    ngram_analyzer = CountVectorizer(ngram_range=(n, n), stop_words=None).build_analyzer()
    return ngram_analyzer(text)

def is_meaningful_phrase(phrase):
    """
    Tell whether every word of a phrase is known to WordNet.

    The phrase is split on whitespace and each word is looked up with
    wn.synsets(); only the individual words are checked, not whether they
    make sense together.

    Parameters:
    - phrase: A string representing the phrase.

    Returns:
    - True if every word has at least one synset (also for an empty phrase),
      False as soon as one word has none.
    """
    return all(wn.synsets(token) for token in phrase.split())

def find_common_phrases(df, columns, min_count=5):
    """
    Collect the frequent, WordNet-recognised bigrams and trigrams of a dataframe.

    Every non-missing cell of the given columns is lower-cased and split into
    bigrams and trigrams with extract_ngrams(); occurrences are counted over
    all columns together.

    Parameters:
    - df: The input dataframe.
    - columns: List of column names to process.
    - min_count: Minimum number of occurrences for a phrase to be kept.

    Returns:
    - A set of phrases seen at least `min_count` times whose words all pass
      is_meaningful_phrase().
    """
    phrase_counts = Counter()
    for col in columns:
        for cell in df[col].dropna():
            lowered = cell.lower()
            for size in (2, 3):  # bigrams first, then trigrams
                phrase_counts.update(extract_ngrams(lowered, size))

    return {
        phrase
        for phrase, seen in phrase_counts.items()
        if seen >= min_count and is_meaningful_phrase(phrase)
    }

def process_keywords(row, columns, common_phrases):
    """
    Build a comma-separated keyword prompt from the tag columns of one row.

    Each non-missing cell is lower-cased and split on whitespace (newlines
    count as whitespace). The words 'main' and 'title' and every word of one
    or two characters are discarded. Phrases from `common_phrases` that occur
    in the cell are kept as composite keywords. The result contains, in this
    order, at most 4 three-word phrases, at most 3 two-word phrases that are
    not contained in a selected three-word phrase, and at most 8 single words
    that are not contained in a selected two-word phrase.

    Composite phrases are matched in sorted order so that the same inputs
    always give the same prompt; iterating the raw set would make the
    selection depend on Python's per-process string hash seed.

    Parameters:
    - row: A pandas Series representing a row in the dataframe.
    - columns: List of column names to process.
    - common_phrases: Set of common phrases to preserve as composite keywords.

    Returns:
    - A string containing the selected keywords joined by ', ' (empty string
      when nothing is selected).
    """
    exclude_keywords = {'main', 'title'}
    keywords = []
    composite_keywords = []

    for col in columns:
        if pd.isna(row[col]):
            continue

        cell_text = row[col].lower()
        words = [
            word
            for word in cell_text.replace('\n', ' ').split()
            if word not in exclude_keywords and len(word) > 2  # drops 1- and 2-character words
        ]
        if not words:
            continue

        # Phrases are searched in the cell with its newlines intact, so a
        # phrase only matches when its words sit on the same line separated
        # by single spaces. sorted() fixes the iteration order.
        composite = sorted(phrase for phrase in common_phrases if phrase in cell_text)
        composite_keywords.extend(composite)

        # Composite phrases first, then the remaining single words, no duplicates.
        for candidate in composite + words:
            if candidate not in keywords:
                keywords.append(candidate)

    # Put composite phrases that are not a substring of an earlier-kept
    # phrase at the front; everything else follows in discovery order.
    unique_keywords = []
    for kw in composite_keywords:
        if all(kw not in kept for kept in unique_keywords):
            unique_keywords.append(kw)

    unique_keywords.extend([kw for kw in keywords if kw not in unique_keywords])

    # Select keywords by type, ensuring no overlap (substring containment)
    three_word_phrases = [kw for kw in unique_keywords if len(kw.split()) == 3][:4]
    two_word_phrases = [
        kw for kw in unique_keywords
        if len(kw.split()) == 2 and all(kw not in three_word for three_word in three_word_phrases)
    ][:3]
    single_words = [
        kw for kw in unique_keywords
        if len(kw.split()) == 1 and all(kw not in two_word for two_word in two_word_phrases)
    ][:8]

    # Combine all keywords and join into a single string
    final_keywords = three_word_phrases + two_word_phrases + single_words

    return ', '.join(final_keywords)

def sanitize_filename(name):
    """
    Make a string usable as a file name.

    Each of the characters < > : " / \\ | ? * is replaced by an underscore;
    every other character is kept as is.

    Parameters:
    - name: The original filename string.

    Returns:
    - The sanitized filename string.
    """
    forbidden_chars = r'[<>:"/\\|?*]'
    return re.sub(forbidden_chars, '_', name)

def create_json_files(csv_path, output_dir):
    """
    Create one JSON prompt file per row of a CSV dataset.

    For every row, a keyword prompt is built with process_keywords() from the
    tag columns and written as {"prompt": ...} to
    "<row number>_<sanitized title>.json" inside `output_dir`. The path of
    each file is printed once it has been written.

    Parameters:
    - csv_path: Path to the input CSV file.
    - output_dir: Directory where JSON files will be saved (created if missing).
    """
    # Read CSV file
    df = pd.read_csv(csv_path)

    # Columns to process
    columns = ['Genre', 'Mood', 'Movement', 'Theme', 'Other keywords', 'Other keywords.1']

    # Find common phrases in the dataset
    common_phrases = find_common_phrases(df, columns)

    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    # Process each row and create JSON files
    for index, row in df.iterrows():
        prompt = process_keywords(row, columns, common_phrases)

        # Use the 'Title' column value as the filename. row.get() only falls
        # back when the column is absent; an empty cell is read by pandas as
        # NaN and a numeric-looking title as a number, and neither has
        # .strip(), so both cases are handled explicitly.
        fallback_title = f'file_{index+1}'
        raw_title = row.get('Title', fallback_title)
        title = fallback_title if pd.isna(raw_title) else str(raw_title).strip()
        sanitized_title = sanitize_filename(title)
        file_name = f"{index+1}_{sanitized_title}.json"
        file_path = os.path.join(output_dir, file_name)

        # Prepare the data to be saved in JSON
        data = {"prompt": prompt}

        # Write JSON file
        with open(file_path, 'w') as f:
            json.dump(data, f, indent=4)

        print(f"Created {file_path}")

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
# Usage example: generate the JSON prompt files for the lofi subset.
# NOTE: this rebinds the module-level `csv_path` that the loading cell set to
# the 20k export; later cells reading `csv_path` will see the lofi path.
csv_path = '/content/drive/MyDrive/_France-Paris 2020/Data Scientist/S.A.M/S.A.M/Modèles/data/500_lofi_tracks.csv'  # Update with your CSV file path
output_dir = '/content/drive/MyDrive/_France-Paris 2020/Data Scientist/S.A.M/S.A.M/Modèles/data/json_files_lofi/'  # Update with your desired output directory

# Writes one "<row number>_<title>.json" file per CSV row into output_dir.
create_json_files(csv_path, output_dir)

### Testing new code - 18h39, 06/08/2024

In [38]:
import pandas as pd
import json
import os
import re
from collections import Counter
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
import nltk

# Download the 'wordnet' resource (used by WordNetLemmatizer).
nltk.download('wordnet')

# Initialize the lemmatizer.
# NOTE(review): none of the functions defined in this cell reference
# `lemmatizer` or WordNet — confirm whether this setup is still needed.
lemmatizer = WordNetLemmatizer()

def extract_ngrams(text, n):
    """
    Return the word n-grams of a text, each made of exactly `n` words.

    Uses the analyzer of a scikit-learn CountVectorizer built with
    ngram_range=(n, n) and stop_words=None.

    Parameters:
    - text: A string of text.
    - n: The number of words in the n-gram.

    Returns:
    - A list of n-grams.
    """
    size_range = (n, n)
    return CountVectorizer(ngram_range=size_range, stop_words=None).build_analyzer()(text)

def find_common_phrases(df, columns, min_count=5):
    """
    Collect the frequent bigrams and trigrams found in the given columns.

    Every non-missing cell is lower-cased and split into bigrams and trigrams
    with extract_ngrams(); occurrences are counted over all columns together.

    Parameters:
    - df: The input dataframe.
    - columns: List of column names to process.
    - min_count: Minimum number of occurrences for a phrase to be kept.

    Returns:
    - A set of the phrases that occur at least `min_count` times.
    """
    phrases = [
        phrase
        for col in columns
        for text in df[col].dropna()
        for size in (2, 3)  # bigrams, then trigrams, for each cell
        for phrase in extract_ngrams(text.lower(), size)
    ]

    tally = Counter(phrases)
    return {phrase for phrase, seen in tally.items() if seen >= min_count}

def process_keywords(row, columns, common_phrases):
    """
    Build a comma-separated keyword prompt from the tag columns of one row.

    Each non-missing cell is lower-cased and split on whitespace (newlines
    count as whitespace). The words 'main' and 'title' and single-character
    words are discarded. Phrases from `common_phrases` that occur in the cell
    are kept as composite keywords. The result contains, in this order, at
    most 3 three-word phrases, at most 3 two-word phrases and at most 4
    single words.

    Composite phrases are matched in sorted order so that the same inputs
    always give the same prompt; iterating the raw set would make the
    selection depend on Python's per-process string hash seed.

    Parameters:
    - row: A pandas Series representing a row in the dataframe.
    - columns: List of column names to process.
    - common_phrases: Set of common phrases to preserve as composite keywords.

    Returns:
    - A string containing the selected keywords joined by ', ' (empty string
      when nothing is selected).
    """
    exclude_keywords = {'main', 'title'}
    keywords = []
    composite_keywords = []

    for col in columns:
        if pd.isna(row[col]):
            continue

        cell_text = row[col].lower()
        words = [
            word
            for word in cell_text.replace('\n', ' ').split()
            if word not in exclude_keywords and len(word) > 1  # drops single characters
        ]
        if not words:
            continue

        # Phrases are searched in the cell with its newlines intact, so a
        # phrase only matches when its words sit on the same line separated
        # by single spaces. sorted() fixes the iteration order.
        composite = sorted(phrase for phrase in common_phrases if phrase in cell_text)
        composite_keywords.extend(composite)

        # Composite phrases first, then the remaining single words, no duplicates.
        for candidate in composite + words:
            if candidate not in keywords:
                keywords.append(candidate)

    # Put composite phrases that are not a substring of an earlier-kept
    # phrase at the front; everything else follows in discovery order.
    unique_keywords = []
    for kw in composite_keywords:
        if all(kw not in kept for kept in unique_keywords):
            unique_keywords.append(kw)

    unique_keywords.extend([kw for kw in keywords if kw not in unique_keywords])

    # Select keywords by type.
    # NOTE(review): the two `not in` tests below are list-membership checks
    # between keywords of different word counts, so they can never exclude
    # anything; if overlap filtering (e.g. dropping 'hip' when 'hip hop' is
    # selected) is wanted, a substring test is needed — confirm the intent.
    three_word_phrases = [kw for kw in unique_keywords if len(kw.split()) == 3][:3]
    two_word_phrases = [kw for kw in unique_keywords if len(kw.split()) == 2 and kw not in three_word_phrases][:3]
    single_words = [kw for kw in unique_keywords if len(kw.split()) == 1 and kw not in two_word_phrases][:4]

    # Combine all keywords and join into a single string
    final_keywords = three_word_phrases + two_word_phrases + single_words

    return ', '.join(final_keywords)

def sanitize_filename(name):
    """
    Replace characters that are unsafe in file names.

    Any of < > : " / \\ | ? * becomes an underscore; all other characters are
    returned unchanged.

    Parameters:
    - name: The original filename string.

    Returns:
    - A sanitized filename string.
    """
    return re.compile(r'[<>:"/\\|?*]').sub('_', name)

def create_json_files(csv_path, output_dir):
    """
    Create one JSON prompt file per row of a CSV dataset.

    For every row, a keyword prompt is built with process_keywords() from the
    tag columns and written as {"prompt": ...} to
    "<row number>_<sanitized title>.json" inside `output_dir`. The path of
    each file is printed once it has been written.

    Parameters:
    - csv_path: Path to the input CSV file.
    - output_dir: Directory where JSON files will be saved (created if missing).
    """
    # Read CSV file
    df = pd.read_csv(csv_path)

    # Columns to process
    columns = ['Genre', 'Mood', 'Movement', 'Theme', 'Other keywords', 'Other keywords.1']

    # Find common phrases in the dataset
    common_phrases = find_common_phrases(df, columns)

    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    # Process each row and create JSON files
    for index, row in df.iterrows():
        prompt = process_keywords(row, columns, common_phrases)

        # Use the 'Title' column value as the filename. row.get() only falls
        # back when the column is absent; an empty cell is read by pandas as
        # NaN and a numeric-looking title as a number, and neither has
        # .strip(), so both cases are handled explicitly.
        fallback_title = f'file_{index+1}'
        raw_title = row.get('Title', fallback_title)
        title = fallback_title if pd.isna(raw_title) else str(raw_title).strip()
        sanitized_title = sanitize_filename(title)
        file_name = f"{index+1}_{sanitized_title}.json"
        file_path = os.path.join(output_dir, file_name)

        # Prepare the data to be saved in JSON
        data = {"prompt": prompt}

        # Write JSON file
        with open(file_path, 'w') as f:
            json.dump(data, f, indent=4)

        print(f"Created {file_path}")

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [39]:
# Usage example: generate the JSON prompt files for the classical subset.
# NOTE: this rebinds the module-level `csv_path` and `output_dir`; any later
# cell reading them will see the classical paths.
csv_path = '/content/drive/MyDrive/_France-Paris 2020/Data Scientist/S.A.M/S.A.M/Modèles/data/500_classical_tracks.csv'  # Update with your CSV file path
output_dir = '/content/drive/MyDrive/_France-Paris 2020/Data Scientist/S.A.M/S.A.M/Modèles/data/json_files_classical2/'  # Update with your desired output directory

# Writes one "<row number>_<title>.json" file per CSV row into output_dir.
create_json_files(csv_path, output_dir)

Created /content/drive/MyDrive/_France-Paris 2020/Data Scientist/S.A.M/S.A.M/Modèles/data/json_files_classical2/1_Inspiring Cinematic Ambient.json
Created /content/drive/MyDrive/_France-Paris 2020/Data Scientist/S.A.M/S.A.M/Modèles/data/json_files_classical2/2_Risk.json
Created /content/drive/MyDrive/_France-Paris 2020/Data Scientist/S.A.M/S.A.M/Modèles/data/json_files_classical2/3_Science Documentary.json
Created /content/drive/MyDrive/_France-Paris 2020/Data Scientist/S.A.M/S.A.M/Modèles/data/json_files_classical2/4_Majestic Voyage.json
Created /content/drive/MyDrive/_France-Paris 2020/Data Scientist/S.A.M/S.A.M/Modèles/data/json_files_classical2/5_Drive to Triumph.json
Created /content/drive/MyDrive/_France-Paris 2020/Data Scientist/S.A.M/S.A.M/Modèles/data/json_files_classical2/6_Motivational Epic Music _ Inspiring Cinematic Background Music.json
Created /content/drive/MyDrive/_France-Paris 2020/Data Scientist/S.A.M/S.A.M/Modèles/data/json_files_classical2/7_Awaken.json
Crea

### 4) Verify first row from table

In [9]:
# Tag columns whose first-row values are written to the report
columns = ['Genre', 'Mood', 'Movement', 'Theme', 'Other keywords', 'Other keywords.1']

# Take the first row of the lofi dataset and flatten multi-line string cells
# onto a single line (non-string values are left untouched)
first_row = lofi_df.loc[0, columns].apply(
    lambda cell: cell.replace('\n', ' ') if isinstance(cell, str) else cell
)

# Write one "column: value" line per column to a text file
output_path = '/content/first_row_output.txt'
with open(output_path, 'w') as f:
    f.write("First row values for specified columns:\n")
    for col in columns:
        value = first_row[col]
        # Cap long strings at 100 characters to avoid very long lines
        value = value[:100] + '...' if (isinstance(value, str) and len(value) > 100) else value
        f.write(f"{col}: {value}\n")

print(f"Output saved to {output_path}")

Output saved to /content/first_row_output.txt
