# <font color='darkblue'> DATA EXTRACTION - THE NETWORK OF BEATLES' SONGS

## <font color='darkblue'>Analysis of the network of Beatles'Songs: Getting into their mind

In [1]:
#We import all the necessary packages we used for this project

import pandas as pd
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
import ast
import operator
import powerlaw
from fa2 import ForceAtlas2
import re
from urllib.request import urlopen
import urllib.request as urllib2
import os
from nltk.tokenize import WordPunctTokenizer
from nltk.corpus import stopwords
from nltk.probability import FreqDist
import community.community_louvain
import matplotlib.cm as cm

###  <font color='darkblue'>Part 1a: Extraction of information from Beatles Official Page.

In [2]:
#We have create a function to obtain the 'HTLM' of Beatles Official Page. 
def extract_html(url):
    page = urlopen(url)
    html_bytes = page.read()
    html = html_bytes.decode("utf-8")
    return html

In [3]:
#We obtain all the list of songs and we stored it in the Variable Songs_Titles
pattern1= 'hreflang="en">(.*?)</a>'
pattern2= 'chronological-date">(.*?)</td>'

songs_titles=[]
song_years=[]

for i in range(0,9): 
    url = 'https://www.thebeatles.com/songs?page='+str(i)
    html=extract_html(url)
    a=re.findall(pattern1,html)
    b=re.findall(pattern2,html)
    
    for el in a:
        el=el.replace('&#039;', '\'')
        songs_titles.append(el)
    
    for el2 in b:
        song_years.append(el2)

print('The official Beatles page contains information of a number of' , len(songs_titles), 'songs.')

The official Beatles page contains information of a number of 301 songs.


In [4]:
data00={'Song':songs_titles,'Year':song_years}
data0=pd.DataFrame(data00)

<font color='darkblue'>Below, we extract the release data of the song in terms of **day, month and year.**

In [5]:
years=[]
months=[]
days=[]

for date in data0['Year']:
    date=date.split(' ')
    years.append(date[3])
    months.append(date[2][:-1])
    days.append(date[1][:-2])

data11={'Song':songs_titles,'Year':years,'Month':months,'Day':days}
data_df=pd.DataFrame(data11)
data_df

Unnamed: 0,Song,Year,Month,Day
0,12-Bar Original,1996,March,18
1,1822!,1994,November,30
2,A Beginning,1996,October,28
3,A Day In The Life,1967,June,1
4,A Hard Day's Night,1964,July,10
...,...,...,...,...
296,You'll Be Mine,1995,November,20
297,You're Going To Lose That Girl,1965,August,6
298,You've Got To Hide Your Love Away,1965,August,6
299,Young Blood,1994,November,30


In [6]:
months_value={'January':1,'February':2,'March':3,'April':4,'May':5,'June':6,
             'July':7,'August':8,'September':9,'October':10,'November':11,'December':12}

months_sorted=data_df.replace({'Month': months_value}).sort_values(by=['Year','Month','Day']).index
data_df=data_df.reindex(months_sorted)
data_df=data_df.reset_index(drop=True)
data_df

Unnamed: 0,Song,Year,Month,Day
0,Love Me Do,1962,October,5
1,P.S. I Love You,1962,October,5
2,Ask Me Why,1963,January,11
3,Please Please Me,1963,January,11
4,A Taste Of Honey,1963,March,22
...,...,...,...,...
296,You Know My Name,2012,December,14
297,You Like Me Too Much,2012,December,14
298,Sie Liebt Dich,2013,February,7
299,Slow Down,2013,February,7


In [7]:
remove_list=[' a ',' in ',' the ',' of ',' to ',' is ',' at ',' for ',' that ',' by ',' as ',' from ',' into ',
             ' on ',' with ',' off ',' this ',' up ',' like ']
remove_list_start=['a-','in-','the-','of-','to-','is-','at-','for-','that-','by-','as-','from-','like-','this-',
                  'with-']
remove_list_end=['-by','-to','-on','-is','-that','-before']
remove_list2=['\'','.','!','(',')',',','/',':']

pattern6='<div class="col-md-6 middle-content border-left border-right"><p>'
pattern7='<figure class="wp-block-table table-expander table table-imported">'

songs_titles2=[]
for title in songs_titles:
    try:
        title=title.lower()
        for el in remove_list2: # delete special characters
            title=title.replace(el,'')
        for el in remove_list: # replace single words with a space
            title=title.replace(el,' ')
        title=title.replace(' ','-')
        title=title.replace('--','-')
        # delete words from the start
        for i in range(0,6):
            if title[:i] in remove_list_start:
                title=title[i:]
        # delete words end
        for i in range(0,8):
            if title[-i:] in remove_list_end:
                title=title[:-i]
        if title[0]=='-':
            title=title[1:]
        url='https://www.thebeatles.com/'
        query=extract_html(url+title)
        # write the extracted text in a .txt file
        file=open('raw_songs/'+title+'.txt','w+')
        file.write(query)
        file.close()
        songs_titles2.append(title)
    except:
        print(title) #To know the songs where the url could not be obtained. 
        

words-love


<font color='darkblue'>Sometimes, the page of the song has a number at the end, being a 0 or 1. This is the case of words-love-0. 

In [8]:
pattern6='<div class="col-md-6 middle-content border-left border-right"><p>'
pattern7='<figure class="wp-block-table table-expander table table-imported">'
remove_list=['<br />','\n','</p>','<p>']

def extract_lyrics(song_path):
    song=open(song_path).read()
    idx_init = re.search(pattern6,song).end()
    idx_final = re.search(pattern7,song).start()
    lyrics=song[idx_init:idx_final]
    for el in remove_list: # delete special characters
        lyrics=lyrics.replace(el,' ')
    return lyrics

<font color='darkblue'>Then, we extract the lyrics of each songs specifying the patterns. Provided that the song has no lyrics, we classified it as an instrumental song and will not be part of our network. 

In [9]:
txt_files = os.listdir('raw_songs')
songs_no_lyrics=[]
for el in txt_files:
    el=el[:-4]
    song_path='raw_songs/'+el+'.txt'
    try:
        lyrics=extract_lyrics(song_path)
        file=open('lyrics_songs/'+el+'.txt','w+')
        file.write(lyrics)
        file.close()
    except:
        songs_no_lyrics.append(el)


In [10]:
lyrics_files = os.listdir('lyrics_songs')
for el in songs_no_lyrics:
    el=el[:-4]
    url='https://www.thebeatles.com/'
    # check if the song has lyrics or not
    for i in range(0,4):
        try:
            query=extract_html(url+el+'-'+str(i))
            idx_init = re.search(pattern6,query).end()
            idx_final = re.search(pattern7,query).start()
            # rewrite the extracted text in a .txt file
            file=open('raw_songs/'+el+'.txt','w+')
            file.write(query)
            file.close()
            print(el, i)
            break
        except:
            pass


In [11]:
print('The official Beatles page contains' , len(lyrics_files)+len(songs_no_lyrics), 'Songs, with',len(lyrics_files),
     'Lyrics Songs and',len(songs_no_lyrics),'Instrumental ones. ') 

The official Beatles page contains 302 Songs, with 198 Lyrics Songs and 104 Instrumental ones. 


###  <font color='darkblue'>Part 1b: Extraction of information from Beatles Wikipages of Songs.

###  <font color='darkblue'>Part 2: Clean Lyrics of Songs.

In [12]:
stop_words = list(set(stopwords.words('english')))
stop_words = [el.replace('\'','') for el in stop_words]
stop_words.append('im')

<font color='darkblue'> A function to **clean the lyrics** is built.

In [13]:
def clean_lyrics(file_path):
    data=open(file_path).read()
    # import WordPunctTokenizer() method from nltk
    # Create a reference variable for Class WordPunctTokenizer
    tk = WordPunctTokenizer()
    # define punctuation
    punctuations = '''!()-[]{};:'"\,<>./?@#$%^&*_~'''
    # remove punctuation from the string
    no_punct = ""
    for char in data:
        if char not in punctuations:
            no_punct = no_punct + char
    # Remove all the special characters as \n and single = left
    char=['\n','=']
    for el in char:
        raw=re.sub(el,'',no_punct)
    # remove stop words
    token_txt = tk.tokenize(raw.lower()) # set to lower case
    token_txt = tk.tokenize(raw.lower()) # set to lower case
    token_final = [x for x in token_txt if x not in stop_words and len(x)>2]
    return token_final

In [14]:
lyrics_files=[el[:-4] for el in os.listdir('lyrics_songs')]
lyrics_files.remove('.DS_S') # remove this element that is introduced when using os.listdir

###  <font color='darkblue'>Part 3: Build the Network.

In [15]:
#After cleaning the lyrics of songs, we found the first five most common words in each song. 
#d_top5 is a dictionary with the most common words used for each song. 
d_top5={}
for file in lyrics_files:
    path='lyrics_songs/'+file+'.txt'
    clean=clean_lyrics(path)
    
    top5=[]
    for el in FreqDist(clean).most_common(5):
        top5.append(el[0])
    d_top5[file]=top5


In [21]:
subs={}
for a,b in enumerate(songs_titles2):
    subs[songs_titles[a]]=b

data_df2=data_df.replace({'Song': subs})


Unnamed: 0,Song,Year,Month,Day
0,love-me-do,1962,October,5
1,ps-i-love-you,1962,October,5
2,ask-me-why,1963,January,11
3,please-please-me,1963,January,11
4,taste-honey,1963,March,22
...,...,...,...,...
296,you-know-what-do,2012,December,14
297,you-never-give-me-your-money,2012,December,14
298,sie-liebt-dich,2013,February,7
299,slow-down,2013,February,7


In [26]:
lyrics_bool=[]
for song in list(data_df2['Song']):
    if song in songs_no_lyrics:
        lyrics_bool.append('No')
    else:
        lyrics_bool.append('Yes')
        
data_df2['Lyrics']=lyrics_bool
data_df2.to_csv('Songs_Sorted_Years.csv',index=False)