<a href="https://colab.research.google.com/github/athibaut2017/web_scrapper/blob/main/Song_Scrapper_bs.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Projet Web scrapper

L'idée est ici d'élaborer un webscrapper qui permette d'aller collecter l'ensemble des paroles des chansons du groupe Gojira, et d'effectuer une courte étude sur les mots qui y reviennent le plus fréquemment. 

# Import des librairies

In [1]:
#librairie qui permet d'effectuer des requêtes HTTP
import requests

import pandas as pd

from bs4 import BeautifulSoup

import yaml

from lxml import etree

# Première étape : gestion des proxies

In [None]:
def proxies_scrapper():
  """Permet de recuperer une liste de proxies gratuits et de les stocker dans un df
  """
  r_proxies = requests.get('https://free-proxy-list.net/')
  proxy_list = pd.read_html(r_proxies.text)[0]
  
  #format de l'adresse url
  proxy_list['url'] = 'http://' + proxy_list['IP Address'] + ':' + proxy_list['Port'].astype(str)
  return proxy_list

df = proxies_scrapper()

In [None]:
# On filtre le df pour n'avoir que les adresses fr
#df_fr = df.where(df.Code == 'FR').dropna()
#print(df_fr)

# On filtre le df pour n'obtenir que les adresses qui supportent le Https
df_clean = df.where(df.Https == 'yes').dropna()

print(df_clean)

          IP Address    Port Code        Country    Anonymity Google Https  \
2     146.158.19.130  8080.0   UZ     Uzbekistan  elite proxy     no   yes   
3      202.40.177.69    80.0   BD     Bangladesh  elite proxy     no   yes   
4        198.27.74.6  9300.0   CA         Canada  elite proxy     no   yes   
7      112.217.162.5  3128.0   KR    South Korea    anonymous     no   yes   
9       23.229.80.23  3129.0   US  United States  elite proxy     no   yes   
..               ...     ...  ...            ...          ...    ...   ...   
292  163.116.158.116  8081.0   US  United States  elite proxy     no   yes   
293    200.119.89.19    80.0   CO       Colombia  elite proxy     no   yes   
295   47.243.167.134  8889.0   HK      Hong Kong    anonymous     no   yes   
297     45.32.69.105  3128.0   US  United States  elite proxy     no   yes   
299      51.79.50.31  9300.0   CA         Canada  elite proxy     no   yes   

    Last Checked                          url  
2    25 secs ag

In [None]:
def define_headers():
  """Permet de definir les header à partir d'un fichier Yaml
  """
  with open("headers.yml") as f_headers:
    header = yaml.safe_load(f_headers)
  return header

In [None]:

def check_proxies(df):
  """Permet de verifier si les proxies contenus dans un df sont valides
  """
  url = 'https://httpbin.org/ip' #renvoie la requête envoyée
  good_proxies = set()
  headers = define_headers()
  for proxy_url in df['url']:
    proxies = { "http": proxy_url,
                "https": proxy_url,
    }
    try:
      r = requests.get(url, headers=headers['Firefox'], proxies=proxies, timeout=3)
      good_proxies.add(proxy_url)
      print('Proxy ' + proxy_url + ' ajouté à la liste des proxies valides')
    except Exception:
      pass
    if len(good_proxies) >= 5:
      break
  return good_proxies

check_proxies(df_clean)

Proxy http://202.40.177.69:80 ajouté à la liste des proxies valides
Proxy http://112.217.162.5:3128 ajouté à la liste des proxies valides
Proxy http://46.4.242.149:3128 ajouté à la liste des proxies valides
Proxy http://217.64.14.171:8080 ajouté à la liste des proxies valides
Proxy http://115.144.101.200:10000 ajouté à la liste des proxies valides


{'http://112.217.162.5:3128',
 'http://115.144.101.200:10000',
 'http://202.40.177.69:80',
 'http://217.64.14.171:8080',
 'http://46.4.242.149:3128'}

In [50]:
def rotate_header_proxies():

  url='https://httpbin.org/ip'

  headers = define_headers()
  good_proxies = check_proxies(df_clean)
  for browser, header in headers.items():
    print(f'On utilise les {browser} headers')
    for proxy_url in good_proxies:
      proxies = proxies = { 
          "http": proxy_url,
          "https": proxy_url,
      }
      try:
        r = requests.get(url, headers=headers, proxies=proxies, timeout=2)
        print(r.json())
      except Exception:
        print(f'Proxy {proxy_url} failed, trying an other one')

rotate_header_proxies()

Proxy http://112.217.162.5:3128 ajouté à la liste des proxies valides
Proxy http://207.188.11.31:80 ajouté à la liste des proxies valides
Proxy http://115.144.101.200:10000 ajouté à la liste des proxies valides
Proxy http://47.243.55.21:8080 ajouté à la liste des proxies valides
Proxy http://49.0.2.242:8090 ajouté à la liste des proxies valides
On utilise les Chrome headers
Proxy http://115.144.101.200:10000 failed, trying an other one
Proxy http://207.188.11.31:80 failed, trying an other one
Proxy http://112.217.162.5:3128 failed, trying an other one
Proxy http://49.0.2.242:8090 failed, trying an other one
Proxy http://47.243.55.21:8080 failed, trying an other one
On utilise les Edge headers
Proxy http://115.144.101.200:10000 failed, trying an other one
Proxy http://207.188.11.31:80 failed, trying an other one
Proxy http://112.217.162.5:3128 failed, trying an other one
Proxy http://49.0.2.242:8090 failed, trying an other one
Proxy http://47.243.55.21:8080 failed, trying an other one
O

# Deuxième étape : le scrapper

In [2]:
def define_headers():
  """Permet de definir les header à partir d'un fichier Yaml
  """
  with open("headers.yml") as f_headers:
    header = yaml.safe_load(f_headers)
  return header

In [3]:
#Affiche l'ensemble des morceau
def liste_morceaux(tables):
  for row in tables:
    print(row.text.strip())

#liste_morceaux(tables)

In [4]:
def fetch_all_urls():

  header = define_headers()
  url='http://www.darklyrics.com/g/gojira.html'
  r = requests.get(url, headers=header["Firefox"])
  #print(r.status_code)

  #raw HTML content
  #print(r.content)

  #on spécifie le parser HTLM que l'on veut utiliser pour mettre en forme r.content
  soup = BeautifulSoup(r.content, 'html5lib')
  #print(soup.prettify)

  url_songs = [] # une liste pour stocker les urls d'une chanson
  domain_name = 'http://www.darklyrics.com' #nom de domain du site

  tables = soup.find_all('div', class_="album")
  #si on veut lister les morceaux
  #liste_morceaux(tables)

  for row in tables: 
    a = row.find('a')
    #print(a)
    try:
      if 'href' in a.attrs:
        url_songs.append(domain_name + a.get('href')[2:-2])
    except:
      pass

  return url_songs



In [5]:
url_list = fetch_all_urls()
url_list

['http://www.darklyrics.com/lyrics/gojira/terraincognita.html',
 'http://www.darklyrics.com/lyrics/gojira/thelink.html',
 'http://www.darklyrics.com/lyrics/gojira/frommarstosirius.html',
 'http://www.darklyrics.com/lyrics/gojira/thewayofallflesh.html',
 'http://www.darklyrics.com/lyrics/gojira/lenfantsauvage.html',
 'http://www.darklyrics.com/lyrics/gojira/magma.html',
 'http://www.darklyrics.com/lyrics/gojira/fortitude.html',
 'http://www.darklyrics.com/lyrics/gojira/nonalbumsongs.html']

In [70]:
#url='http://www.darklyrics.com/lyrics/gojira/thelink.html'
def extract_lyrics(url):
  header = define_headers()
  r = requests.get(url, headers=header["Firefox"])
  soup = BeautifulSoup(r.content, 'html.parser')

  #on elimine les parties du HTML dont on ne veut pas
  soup.find('div', class_='note').decompose()
  soup.find('div', class_='thanks').decompose()
  for h3 in soup("h3"): h3.decompose()
  for i in soup("i"): i.decompose()
  for a in soup("a"): a.decompose()
  
  #on extrait les lyrics (pas idéal, il faudrait plutot extraire la section voulue dès le début)
  lyrics = soup.find('div', class_='lyrics')

  #on créé une liste pour ajouter tous les éléments de chaque phrase
  words = []
  for sentence in lyrics.stripped_strings:
    #on nettoie les mots un par un 
    for  word in sentence.split():
      word.lower().strip(',.')
    words.extend(sentence.split())

  #on passe tous les mots de la liste en minuscule
  words = [word.lower() for word in words]

  #permet de relancer la fonction si elle échoue une première fois
  if not words: return extract_lyrics(url)

  return words

In [71]:
words = extract_lyrics(url)
words

In [74]:
def get_all_lyrics():
  """Permet de recuperer les paroles de tous les morceaux de Gojira
  """
  url_list = fetch_all_urls()
  lyrics = []

  for url in url_list:
    lyrics.extend(extract_lyrics(url))

  return lyrics

In [75]:
lyrics = get_all_lyrics()

In [80]:
len(lyrics)

11130