Source : https://www.kaggle.com/datasets/diamondsnake/eurovision-song-contest-data?resource=download



In [3]:
#import the libraries
import requests
import os
import pandas as pd
from bs4 import BeautifulSoup
import numpy as np
!pip install xlrd

In [31]:
songs_df = pd.read_csv('./data/song_data_0104.csv', encoding="ISO-8859-1")

#changing loudness category into just integer values
songs_df['loudness_db'] = songs_df['loudness'].str.replace('dB', ' ').str.strip()

#dropping all columns that only have 'unknown' as their single value & original loudness
songs_df = songs_df.drop(['race', 'age', 'selection', 'release_date', 'key_change_10', 'loudness'], axis='columns')


In [32]:
songs_df[songs_df['artist_name'].str.lower() == 'let 3']


Unnamed: 0,year,semi_final,semi_draw_position,final_draw_position,country,artist_name,song_name,language,style,direct_qualifier_10,...,final_jury_votes,final_place,final_total_points,semi_place,semi_televote_points,semi_jury_points,semi_total_points,favourite_10,host_10,loudness_db
6,2023,1,7,25,Croatia,Let 3,Mama C!,Croatian,Pop,0,...,2.0,13.0,123.0,8.0,76.0,,76.0,0,0,7


In [33]:
songs_df[songs_df['song_name'] == 'Mama ŠČ!']

Unnamed: 0,year,semi_final,semi_draw_position,final_draw_position,country,artist_name,song_name,language,style,direct_qualifier_10,...,final_jury_votes,final_place,final_total_points,semi_place,semi_televote_points,semi_jury_points,semi_total_points,favourite_10,host_10,loudness_db


In [34]:
# Replace the song name
songs_df.loc[songs_df['song_name'] == 'Mama C!', 'song_name'] = 'Mama ŠČ!'

In [35]:
songs_df.head()

Unnamed: 0,year,semi_final,semi_draw_position,final_draw_position,country,artist_name,song_name,language,style,direct_qualifier_10,...,final_jury_votes,final_place,final_total_points,semi_place,semi_televote_points,semi_jury_points,semi_total_points,favourite_10,host_10,loudness_db
0,2023,1,1,20.0,Norway,Alessandra,Queen of Kings,English,Pop,0,...,11.0,5.0,268.0,6.0,102.0,,102.0,0,0,10
1,2023,1,2,,Malta,The Busker,Dance (Our Own Party),English,Pop,-,...,,,,15.0,3.0,,3.0,0,0,6
2,2023,1,3,5.0,Serbia,Luke Black,Samo mi se spava,"Serbian, English",Pop,0,...,6.0,24.0,30.0,10.0,37.0,,37.0,0,0,10
3,2023,1,4,,Latvia,Sudden Lights,Aija,English,Rock,-,...,,,,11.0,34.0,,34.0,0,0,8
4,2023,1,5,2.0,Portugal,Mimicat,Ai coração,Portuguese,Pop,0,...,9.0,23.0,59.0,9.0,74.0,,74.0,0,0,8


In [36]:
contestants_df = pd.read_csv('./data/contestants.csv', encoding="ISO-8859-1")

In [39]:
contestants_df['song']
songs_df['song_name']


0                                Refrain
1                  De Vogels Van Holland
2       Messieurs Les NoyÃ©s De La Seine
3        Im Wartesaal Zum GroÃen GlÃ¼ck
4                         Le Temps Perdu
                      ...               
1729                               AijÄ
1730                          We Are One
1731                    Burning Daylight
1732                        Tell Me More
1733               Dance (Our Own Party)
Name: song, Length: 1734, dtype: object

In [40]:
# Normalize contestants_df['song']
contestants_df['song_clean'] = contestants_df['song'].str.lower().str.strip()

# Normalize songs_df['song_name']
songs_df['song_name_clean'] = songs_df['song_name'].str.lower().str.strip()

In [44]:
contestants_df['song_clean']

0                                refrain
1                  de vogels van holland
2       messieurs les noyã©s de la seine
3        im wartesaal zum groãen glã¼ck
4                         le temps perdu
                      ...               
1729                               aijä
1730                          we are one
1731                    burning daylight
1732                        tell me more
1733               dance (our own party)
Name: song_clean, Length: 1734, dtype: object

In [45]:
# Get matches (songs present in both DataFrames)
matching_songs = contestants_df[contestants_df['song_clean'].isin(songs_df['song_name_clean'])]
print("Matching songs:", matching_songs[['song']].drop_duplicates())

Matching songs:                        song
324               Mata Hari
358               Boom Boom
527        Soldiers Of Love
656               This Time
706                    Amen
...                     ...
1714      Breaking My Heart
1715           Tell Me More
1716  Dance (Our Own Party)
1717    D.G.T. (Off and On)
1718         Like An Animal

[499 rows x 1 columns]


In [42]:
# Get non-matches (songs in contestants_df but not in songs_df)
non_matching_songs = contestants_df[~contestants_df['song_clean'].isin(songs_df['song_name_clean'])]
print("Non-matching songs:", non_matching_songs[['song']].drop_duplicates())

Non-matching songs:                                   song
0                              Refrain
1                De Vogels Van Holland
2     Messieurs Les NoyÃ©s De La Seine
3      Im Wartesaal Zum GroÃen GlÃ¼ck
4                       Le Temps Perdu
...                                ...
1694                        Mama Å Ä!
1697                       Ãvidemment
1699                  Soarele Èi Luna
1704                      Ai CoraÃ§Ã£o
1709                             AijÄ

[1179 rows x 1 columns]


In [43]:
contestants_df['is_matching'] = contestants_df['song_clean'].isin(songs_df['song_name_clean'])
print(contestants_df[['song', 'is_matching']])

                                  song  is_matching
0                              Refrain        False
1                De Vogels Van Holland        False
2     Messieurs Les NoyÃ©s De La Seine        False
3      Im Wartesaal Zum GroÃen GlÃ¼ck        False
4                       Le Temps Perdu        False
...                                ...          ...
1729                             AijÄ        False
1730                        We Are One         True
1731                  Burning Daylight         True
1732                      Tell Me More         True
1733             Dance (Our Own Party)         True

[1734 rows x 2 columns]


In [63]:
# Pushing these tables to the AWS

from sqlalchemy import create_engine, types
from sqlalchemy import text # to be able to pass string


# Let's load values from the .env file
from dotenv import dotenv_values

config = dotenv_values()

# define variables for the login
pg_user = config['POSTGRES_USER']  # align the key label with your .env file !
pg_host = config['POSTGRES_HOST']
pg_port = config['POSTGRES_PORT']
pg_db = config['POSTGRES_DB']
pg_schema = config['POSTGRES_SCHEMA']
pg_pass = config['POSTGRES_PASS']


url = f'postgresql://{pg_user}:{pg_pass}@{pg_host}:{pg_port}/{pg_db}'

engine = create_engine(url, echo=False)

In [64]:
songs_df.to_sql(name = 'songs', 
                       con = engine, 
                       schema = pg_schema, # pandas is allowing to specify, in which schema the table shall be created
                       if_exists='replace', 
                       index=False
                      )

565