# Code for Testing how to recognize songs

Notes: moviepy requires installation of ffmpeg, you can download from [this link](https://ffmpeg.org/download.html). You will also need ffprobe to run ShazamAPI, which can be downloaded from the same link. Move the executable to your usr/local/bin file

* We use moviepy to convert the file to mp3
* We pass the mp3 to ShazamAPI to get the relevant information

Author: Audrey Yip and Jyontika Kapoor

In [1]:
# !pip3 install moviepy
# !pip3 install ShazamAPI

In [3]:
import os
import json
from moviepy.editor import *
from ShazamAPI import Shazam
import pandas as pd

#### First, we need to convert the .mp4 files to .mp3
##### This is so it is compatible with the Shazam API 

In [3]:
folder_path = "/Users/jyontika/Documents/GitHub/CS315-Final-Project/data-collection/videos"

input_path = os.path.join(folder_path, "videos_mp4")

# create a new folder for .mp3 files
output_folder = os.path.join(folder_path, "videos_mp3")
os.makedirs(output_folder, exist_ok=True)

#initalize counter vars
num_original_files = 0
num_converted_files = 0
missing_files = []

In [7]:
# # no need to run this again - we have all the .mp3 now

# #iterate through each .mp4
# for filename in os.listdir(input_path):
#     if filename.endswith(".mp4"):
#         num_original_files += 1
        
#         video_path = os.path.join(input_path, filename)
#         output_path = os.path.join(output_folder, filename.replace(".mp4", ".mp3"))
        
#         # load the video
#         video = VideoFileClip(video_path)
        
#         # convert video to audio and save as .mp3
#         video.audio.write_audiofile(output_path)
        
#         num_converted_files += 1

# print("Conversion complete!")

In [6]:
# #Double check the numbers of files are the same

# num_files_original = len([name for name in os.listdir(folder_path) if os.path.isfile(os.path.join(folder_path, name))])

# # Count the number of files in the new folder
# num_files_converted = len([name for name in os.listdir(output_folder) if os.path.isfile(os.path.join(output_folder, name))])


### Now we can use Shazam API on these .mp3 files

In [1]:
# Function to extract song information using Shazam API

def extract_song_info(mp3_file_content):
    try:
        shazam = Shazam(mp3_file_content)
        recognize_generator = shazam.recognizeSong()
        song_info = next(recognize_generator)
        return song_info
    except json.JSONDecodeError:
        return None
        

In [9]:
cwd = os.getcwd()
mp3_folder = f'{cwd}/../data-collection/videos_mp3/'
files = os.listdir(mp3_folder)
print(files)


['share_video_7309247847898090794_.mp3', 'share_video_7333834299415760171_.mp3', 'share_video_7329274198096973099_.mp3', 'share_video_7330250329839160607_.mp3', 'share_video_7339685917759769899_.mp3', 'share_video_7329990500528753953_.mp3', 'share_video_7325577933299485958_.mp3', 'share_video_7324083003284573482_.mp3', 'share_video_7306802296640326945_.mp3', 'share_video_7325879255898934561_.mp3', 'share_video_7303373812387679490_.mp3', 'share_video_7306351228974288159_.mp3', 'share_video_7339675920799730987_.mp3', 'share_video_7334356059106053409_.mp3', 'share_video_7341080338803821866_.mp3', 'share_video_7340493999309933867_.mp3', 'share_video_7339691972225010975_.mp3', 'share_video_7326288796163607841_.mp3', 'share_video_7318071341666618656_.mp3', 'share_video_7330409084144979243_.mp3', 'share_video_7281505949615918382_.mp3', 'share_video_7324469503927061803_.mp3', 'share_video_7341498941852159264_.mp3', 'share_video_7302616034597555502_.mp3', 'share_video_7331230833476390175_.mp3',

In [17]:
## testing this function out
#want to compare structure of data of when there is a Shazam API match vs no match

mp3_test = open('/Users/jyontika/Documents/GitHub/CS315-Final-Project/data-collection/videos/videos_mp3/share_video_6958280269531057413_.mp3', 'rb').read()

no_match = extract_song_info(mp3_test)
#no_match

mp3_test2 = open('test_sound_.mp3', 'rb').read()
match_found = extract_song_info(mp3_test2)
#match_found


In [33]:

mp3_test = open('/Users/jyontika/Documents/GitHub/CS315-Final-Project/data-collection/videos/videos_mp3/share_video_7319250533057875232_.mp3', 'rb').read()

no_match = extract_song_info(mp3_test)

JSONDecodeError: Expecting value: line 1 column 1 (char 0)

In [32]:
# empty lists 
no_song_found = [] 
songs_info = []

mp3_folder = output_folder


# loop through each file in the folder
for filename in os.listdir(mp3_folder):
    if filename.endswith(".mp3"):
        mp3_path = os.path.join(mp3_folder, filename)
        mp3_file_content = open(mp3_path, 'rb').read()
        
        print("Processing file:", filename)
        
        
        # use function to extract song data
        song_info = extract_song_info(mp3_file_content)
        if song_info is None:
            print("Error: Failed to decode JSON response for file:", filename)
            continue  # Skip to the next iteration of the loop or handle the error as needed
        
        # check if there are any matches
        if len(song_info[1]['matches']) == 0:

            # if no matched, add filename to no_song_found DF
            no_song_found.append({'File_Name': filename})

        else:
            # if matches found, extract relevant information
            title = song_info[1]['track']['title']
            artist = song_info[1]['track']['subtitle']
            subject = song_info[1]["track"]["share"]["subject"]

              # Check if 'actions' key exists
            if 'actions' in song_info[1]['track']['hub']:
                
                # Check if there are any actions available
                if len(song_info[1]['track']['hub']['actions']) > 1:
                    preview_link = song_info[1]['track']['hub']['actions'][1]['uri']
                else:
                    preview_link = None

            # If 'actions' key doesn't exist, look for preview link in other possible locations
            else:
                # Check if preview link exists in 'share' key
                if 'href' in song_info[1]['track']['share']:
                    preview_link = song_info[1]['track']['share']['href']
                # If preview link not found, set it to None
                else:
                    preview_link = None

            ###preview_link = song_info[1]['track']['hub']['actions'][1]['uri']
            
            # append information to the songs_info list
            songs_info.append({'Title': title, 'Artist': artist, 'Subject': subject, 'Preview_Link': preview_link})

print("Song information extraction complete!")


Processing file: share_video_7309247847898090794_.mp3
Processing file: share_video_7333834299415760171_.mp3
Processing file: share_video_7329274198096973099_.mp3
Processing file: share_video_7330250329839160607_.mp3
Processing file: share_video_7339685917759769899_.mp3
Processing file: share_video_7329990500528753953_.mp3
Processing file: share_video_7325577933299485958_.mp3
Processing file: share_video_7324083003284573482_.mp3
Processing file: share_video_7306802296640326945_.mp3
Processing file: share_video_7325879255898934561_.mp3
Processing file: share_video_7303373812387679490_.mp3
Processing file: share_video_7306351228974288159_.mp3
Processing file: share_video_7339675920799730987_.mp3
Processing file: share_video_7334356059106053409_.mp3
Processing file: share_video_7341080338803821866_.mp3
Processing file: share_video_7340493999309933867_.mp3
Processing file: share_video_7339691972225010975_.mp3
Processing file: share_video_7326288796163607841_.mp3
Processing file: share_video

JSONDecodeError: Expecting value: line 1 column 1 (char 0)

In [None]:
song_df = pd.DataFrame(songs_info)
no_songs_df = pd.DataFrame(no_song_found)

#### Below is Audrey's code, testing Shazam

## Testing a Different Loop (this one works)

In [1]:
import os

# get all mp3s
cwd = os.getcwd()
mp3_folder = f'{cwd}/../data-collection/videos_mp3/'
mp3_files = os.listdir(mp3_folder)

mp3_files_filtered = [file for file in mp3_files if file.endswith(".mp3")]

print(len(mp3_files_filtered))



1049


In [2]:
import os
import csv
import time
from ShazamAPI import Shazam
from json.decoder import JSONDecodeError

# Define CSV file path
csv_file_path = "songs_info.csv"
# Define file for ones where song is not found
no_match_file_path = "no_match_songs.txt"

# Initialize lists
no_song_found = []
songs_info = []
decoding_error = []

# Check if CSV file already exists
if os.path.exists(csv_file_path):
    with open(csv_file_path, mode='r') as file:
        csv_reader = csv.DictReader(file)
        # Add existing songs to songs_info to avoid duplication
        for row in csv_reader:
            songs_info.append(row)

# Check if the file for storing skipped songs already exists
if os.path.exists(no_match_file_path):
    with open(no_match_file_path, mode='r') as file:
        # Read the lines from the file and add them to the no_song_found list
        no_song_found.extend(file.read().splitlines())

# Process each file
for i, file in enumerate(mp3_files):
    print(f"\nProcessing {file}, {i+1} of {len(mp3_files)}")

    mp3_path = os.path.join(mp3_folder, file)

    try:
        # Check if the song is already in songs_info
        if any(song['file_name'] == file for song in songs_info):
            print(f"Song {file} already exists in CSV, skipping...")
            continue
        
        # Check if the filename is already in the list of songs where no match was found
        if file in no_song_found:
            print(f"Song {file} already marked as no match, skipping...")
            continue

        # Try recognizing
        mp3_file_content_to_recognize = open(mp3_path, 'rb').read()
        shazam = Shazam(mp3_file_content_to_recognize)
        recognize_generator = shazam.recognizeSong()

        # Get the first recognized song
        first_song = next(recognize_generator)

        if first_song[1]['matches'] == []:
            no_song_found.append(file)
            print(f"No song found for {file}")

            # Write the song filename to the file for storing skipped songs
            with open(no_match_file_path, mode='a') as no_match_file:
                no_match_file.write(file + "\n")

        else:
            # Get title, artist, and subject
            song = first_song[1]["track"]["title"]
            artist = first_song[1]["track"]["subtitle"]
            track_subject = first_song[1]["track"]["share"]["subject"]

            # Add song information to the list
            song_dict = {
                'file_name': file,
                'track_name': song,
                'artist': artist,
                'track_subject': track_subject
            }
            songs_info.append(song_dict)

            # Print song information
            print("Song name: ", song)
            print("Artist name: ", artist)
            print("Song subject: ", track_subject)

            # Write the song information to CSV
            with open(csv_file_path, mode='a', newline='') as csv_file:
                fieldnames = ['file_name', 'track_name', 'artist', 'track_subject']
                writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
                if os.stat(csv_file_path).st_size == 0:  # Check if file is empty
                    writer.writeheader()  # Write header if file is empty
                writer.writerow(song_dict)

    except JSONDecodeError as e:
        print(f"Error decoding JSON for file {file}: {e}")
        decoding_error.append(file)

    time.sleep(3)

print("All songs processed and written to CSV!")



Processing share_video_7309247847898090794_.mp3, 1 of 1049
Song share_video_7309247847898090794_.mp3 already exists in CSV, skipping...

Processing share_video_7333834299415760171_.mp3, 2 of 1049
Song share_video_7333834299415760171_.mp3 already marked as no match, skipping...

Processing share_video_7329274198096973099_.mp3, 3 of 1049
Song share_video_7329274198096973099_.mp3 already exists in CSV, skipping...

Processing share_video_7330250329839160607_.mp3, 4 of 1049
Song share_video_7330250329839160607_.mp3 already marked as no match, skipping...

Processing share_video_7339685917759769899_.mp3, 5 of 1049
Song share_video_7339685917759769899_.mp3 already exists in CSV, skipping...

Processing share_video_7329990500528753953_.mp3, 6 of 1049
Song share_video_7329990500528753953_.mp3 already exists in CSV, skipping...

Processing share_video_7325577933299485958_.mp3, 7 of 1049
Song share_video_7325577933299485958_.mp3 already marked as no match, skipping...

Processing share_video_73

In [3]:
# check that all were decoded
decoding_error

[]

### OLD CODE

In [11]:
# debug decoding_error

# debug file 
test_song = 'share_video_7337036241898163498_.mp3'
mp3_path_test = os.path.join(mp3_folder, test_song)

mp3_file_content_to_recognize = open(mp3_path_test, 'rb').read()
shazam = Shazam(mp3_file_content_to_recognize)
recognize_generator = shazam.recognizeSong()

# get the first recognized song
first_song = next(recognize_generator)
print(first_song)

# get title
song = first_song[1]["track"]["title"]

# get artist
artist = first_song[1]["track"]["subtitle"]

# subject appears to give both track name and artist
track_subject = first_song[1]["track"]["share"]["subject"]

print("Song name: ", song)
print("Artist name: ", artist)
print("Song subject: ", track_subject)

(5.008, {'matches': [], 'tagid': 'D8A4DC84-E314-401C-8334-4B82AB90CDC0', 'retryms': 9000})


KeyError: 'track'

In [6]:
import os
import csv
from ShazamAPI import Shazam
from json.decoder import JSONDecodeError 
import time

# Define CSV file path
csv_file_path = "songs_info.csv"

# Check if CSV file already exists
if os.path.exists(csv_file_path):
    with open(csv_file_path, mode='r') as file:
        csv_reader = csv.reader(file)
        # Skip the header
        next(csv_reader)
        # Add existing songs to songs_info to avoid duplication
        for row in csv_reader:
            song_dict = {
                'track_name': row[0],
                'artist': row[1],
                'track_subject': row[2]
            }
            songs_info.append(song_dict)

# Process each file
for i, file in enumerate(mp3_files):
    print(f"\nProcessing {file}, {i+1} of {len(mp3_files)}")

    mp3_path = os.path.join(mp3_folder, file)

    try:
        # Check if the song is already in songs_info
        if any(song['track_name'] == file for song in songs_info):
            print(f"Song {file} already exists in CSV, skipping...")
            continue

        # try recognizing 
        mp3_file_content_to_recognize = open(mp3_path, 'rb').read()
        shazam = Shazam(mp3_file_content_to_recognize)
        recognize_generator = shazam.recognizeSong()

        # get the first recognized song
        first_song = next(recognize_generator)

        if first_song[1]['matches'] == []:
            no_song_found.append(file)
            print(f"No song found for {file}")
        else:
            song_dict = {}
            # get title
            song = first_song[1]["track"]["title"]

            # get artist (?)
            artist = first_song[1]["track"]["subtitle"]

            # subject appears to give both track name and artist
            track_subject = first_song[1]["track"]["share"]["subject"]

            song_dict['track_name'] = song
            song_dict['artist'] = artist
            song_dict['track_subject'] = track_subject

            songs_info.append(song_dict)

            print("Song name: ", song)
            print("Artist name: ", artist)
            print("Song subject: ", track_subject)
    
    except JSONDecodeError as e:
        print(f"Error decoding JSON for file {file}: {e}")
        decoding_error.append(file)

    # add a delay between files
    # time.sleep(10)  # Delay for 10 seconds between files

# write songs_info to CSV
with open(csv_file_path, mode='w', newline='') as file:
    fieldnames = ['track_name', 'artist', 'track_subject']
    writer = csv.DictWriter(file, fieldnames=fieldnames)

    writer.writeheader()
    for song_info in songs_info:
        writer.writerow(song_info)



Processing share_video_7309247847898090794_.mp3, 1 of 1049
Error decoding JSON for file share_video_7309247847898090794_.mp3: Expecting value: line 1 column 1 (char 0)

Processing share_video_7333834299415760171_.mp3, 2 of 1049
Error decoding JSON for file share_video_7333834299415760171_.mp3: Expecting value: line 1 column 1 (char 0)

Processing share_video_7329274198096973099_.mp3, 3 of 1049
Error decoding JSON for file share_video_7329274198096973099_.mp3: Expecting value: line 1 column 1 (char 0)

Processing share_video_7330250329839160607_.mp3, 4 of 1049
Error decoding JSON for file share_video_7330250329839160607_.mp3: Expecting value: line 1 column 1 (char 0)

Processing share_video_7339685917759769899_.mp3, 5 of 1049
Error decoding JSON for file share_video_7339685917759769899_.mp3: Expecting value: line 1 column 1 (char 0)

Processing share_video_7329990500528753953_.mp3, 6 of 1049
Error decoding JSON for file share_video_7329990500528753953_.mp3: Expecting value: line 1 colu

KeyboardInterrupt: 

In [10]:
import os
from ShazamAPI import Shazam
from json.decoder import JSONDecodeError 

no_song_found = [] 
songs_info = []
decoding_error = []

for i in range(len(mp3_files)):
    num_files = len(mp3_files)
    file = mp3_files[i]
    print(f"\nProcessing {file}, {i} of {num_files}")
    mp3_path = os.path.join(mp3_folder, file)

    try:
        # try recognizing 
        mp3_file_content_to_recognize = open(mp3_path, 'rb').read()
        shazam = Shazam(mp3_file_content_to_recognize)
        recognize_generator = shazam.recognizeSong()

        # get the first recognized song
        first_song = next(recognize_generator)

        if first_song[1]['matches'] == []:
            no_song_found.append(file)
            print(f"No song found for {file}")
        else:
            song_dict = {}
            # get title
            song = first_song[1]["track"]["title"]

            # get artist (?)
            artist = first_song[1]["track"]["subtitle"]

            # subject appears to give both track name and artist
            track_subject = first_song[1]["track"]["share"]["subject"]

            song_dict['track_name'] = song
            song_dict['artist'] = artist
            song_dict['track_subject'] = track_subject

            songs_info.append(song_dict)

            print("Song name: ", song)
            print("Artist name: ", artist)
            print("Song subject: ", track_subject)
    
    except JSONDecodeError as e:
        print(f"Error decoding JSON for file {file}: {e}")
        decoding_error.append(file)




Processing share_video_7309247847898090794_.mp3, 0 of 1049
Song name:  TAKI TA TRIBALERO (feat. DJ Erandes)
Artist name:  DJ Mecca
Song subject:  TAKI TA TRIBALERO (feat. DJ Erandes) - DJ Mecca

Processing share_video_7333834299415760171_.mp3, 1 of 1049
No song found for share_video_7333834299415760171_.mp3

Processing share_video_7329274198096973099_.mp3, 2 of 1049
Song name:  Montagem Mysterious Game
Artist name:  LXNGVX
Song subject:  Montagem Mysterious Game - LXNGVX

Processing share_video_7330250329839160607_.mp3, 3 of 1049
No song found for share_video_7330250329839160607_.mp3

Processing share_video_7339685917759769899_.mp3, 4 of 1049
Song name:  Dumptruck
Artist name:  Kinfolk Thugs
Song subject:  Dumptruck - Kinfolk Thugs

Processing share_video_7329990500528753953_.mp3, 5 of 1049
Song name:  Old Future
Artist name:  Koday Jackson
Song subject:  Old Future - Koday Jackson

Processing share_video_7325577933299485958_.mp3, 6 of 1049
No song found for share_video_73255779332994

KeyboardInterrupt: 

In [12]:
# debug file 
test_song = 'share_video_7319572010827336993_.mp3'
mp3_path_test = os.path.join(mp3_folder, test_song)

mp3_file_content_to_recognize = open(mp3_path, 'rb').read()
shazam = Shazam(mp3_file_content_to_recognize)
recognize_generator = shazam.recognizeSong()

# get the first recognized song
first_song = next(recognize_generator)
print(first_song)

# get title
song = first_song[1]["track"]["title"]

# get artist
artist = first_song[1]["track"]["subtitle"]

# subject appears to give both track name and artist
track_subject = first_song[1]["track"]["share"]["subject"]

print("Song name: ", song)
print("Artist name: ", artist)
print("Song subject: ", track_subject)

(8.0, {'matches': [], 'tagid': 'DC6E9D98-9B77-4BFE-937E-626F0BF5D582', 'retryms': 9000})


KeyError: 'track'

## Old Testing Code (Audrey)

In [None]:
# testing ShazamAPI

from ShazamAPI import Shazam

mp3_file_content_to_recognize = open('test_sound_.mp3', 'rb').read()

shazam = Shazam(
    mp3_file_content_to_recognize,
    #lang='en',
    #time_zone='Europe/Paris'
)
recognize_generator = shazam.recognizeSong()

# this gets all recognized songs
# while True:
#	print(next(recognize_generator))

# get the first recognized song
first_song = next(recognize_generator)

print(first_song)

# get title
song = first_song[1]["track"]["title"]

# get artist (?)
artist = first_song[1]["track"]["subtitle"]

# subject appears to give both track name and artist
track_subject = first_song[1]["track"]["share"]["subject"]

# downloads preview of the song
song_preview = first_song[1]["track"]["hub"]["actions"][1]["uri"]

print("Song name: ", song)
print("Artist name: ", artist)
print("Song subject: ", track_subject)
print("Link to song preview: ", song_preview)

