# Groupe 2 : récolte d'informations météorologiques en Ile de France
Sites utilisés : https://www.linternaute.com/ville/ile-de-france/region-11/villes et https://www.meteo-villes.com/previsions-meteo-{ville}-{codepostal}.
Framework : Scrapy.

In [2]:
import scrapy

## 1. Création des classes

Pour commencer, nous créons les classes correspondant aux villes et à la météo.

In [3]:
class VilleItem(scrapy.Item):
    """City entry scraped from the regional city listing."""
    # City name as the spider stores it: lower-cased, ASCII-transliterated,
    # with its words joined by hyphens.
    name = scrapy.Field()
    # Postal code, taken from the last token of the link text with the
    # surrounding parentheses removed.
    cp = scrapy.Field()

class MeteoItem(scrapy.Item):
    """Weather summary scraped for one city.

    Scrapy items reject keys that were not declared as fields, so the keys
    assigned by ``MeteoSpider`` (``low_temp``, ``hight_temp``, ``vent``,
    ``pluie``) are declared here next to the original field names, which are
    kept for backward compatibility.
    """
    # Original field names.
    low = scrapy.Field()
    hight = scrapy.Field()
    wind = scrapy.Field()
    rain = scrapy.Field()
    date = scrapy.Field()

    # Field names used by MeteoSpider.parse_meteo(); they match the column
    # names of the ``meteo_paris`` table.
    low_temp = scrapy.Field()
    hight_temp = scrapy.Field()
    vent = scrapy.Field()
    pluie = scrapy.Field()

Ensuite, nous créons une classe représentant la base de données SQLite, avec les méthodes de base essentielles à son utilisation.

In [10]:
import sqlalchemy as db

class DataBase():
    """Small convenience wrapper around a SQLite file accessed via SQLAlchemy.

    Attributes:
        name: Database name, without the ``.db`` extension.
        url: SQLAlchemy URL built from ``name``.
        engine: Engine created from ``url``.
        connection: Connection opened at construction time, used for reads.
        metadata: ``MetaData`` holding the tables declared or reflected so far.
        table: Names of the tables present when the object was created.
    """

    def __init__(self, name_database="database"):
        """Connect to ``<name_database>.db`` and list its existing tables."""
        self.name = name_database
        self.url = f"sqlite:///{name_database}.db"
        self.engine = db.create_engine(self.url)
        self.connection = self.engine.connect()
        self.metadata = db.MetaData()
        # Engine has no get_table_names(); table listing lives on the inspector.
        self.table = db.inspect(self.engine).get_table_names()

    def create_table(self, name_table, **kwargs):
        """Declare ``name_table`` and create it in the database.

        Args:
            name_table: Name of the table to create.
            **kwargs: ``column_name=column_type`` pairs. Any column whose name
                contains ``'id_'`` becomes part of the primary key.
        """
        columns = [
            db.Column(col_name, col_type, primary_key='id_' in col_name)
            for col_name, col_type in kwargs.items()
        ]
        db.Table(name_table, self.metadata, *columns)
        self.metadata.create_all(self.engine)
        print(f"Table : '{name_table}' are created succesfully")

    def read_table(self, name_table, return_keys=False):
        """Reflect ``name_table`` from the database.

        Args:
            name_table: Name of the table to reflect.
            return_keys: When true, return the column names instead of the
                table object.

        Returns:
            The reflected ``Table``, or its list of column names.
        """
        # ``autoload_with`` alone requests reflection; the separate
        # ``autoload=True`` flag is legacy and rejected by SQLAlchemy 2.0.
        table = db.Table(name_table, self.metadata, autoload_with=self.engine)
        if return_keys:
            return table.columns.keys()
        return table

    def add_row(self, name_table, **kwarrgs):
        """Insert one row into ``name_table``.

        Args:
            name_table: Name of the target table.
            **kwarrgs: ``column_name=value`` pairs for the new row.
        """
        table = self.read_table(name_table)
        stmt = db.insert(table).values(kwarrgs)
        # engine.begin() commits on success on every SQLAlchemy version,
        # whereas a bare execute() is not auto-committed on 2.0.
        with self.engine.begin() as conn:
            conn.execute(stmt)
        print('Row id added')

    def delete_row_by_id(self, table, id_):
        """Delete the rows of ``table`` whose ``id_`` column equals ``id_``.

        Args:
            table: Name of the table; it must have a column named ``id_``.
            id_: Value of the ``id_`` column identifying the rows to delete.
        """
        target = self.read_table(table)
        stmt = db.delete(target).where(target.c.id_ == id_)
        with self.engine.begin() as conn:
            conn.execute(stmt)
        print(f'Row id {id_} deleted')

    def select_table(self, name_table):
        """Return every row of ``name_table`` as a list."""
        table = self.read_table(name_table)
        # Table.select() avoids the list-style select([...]) call that
        # SQLAlchemy 2.0 removed.
        return self.connection.execute(table.select()).fetchall()

##  2. Web Scraping

Dans un premier temps, `VilleSpider` récupère la liste des villes d'Ile de France sur le site `linternaute.com`. Le nom de la ville ainsi que son code postal seront utiles à la recherche de la météo journalière plus tard.

In [11]:
from scrapy import Request
import unidecode

class VilleSpider(scrapy.Spider):
    """Collect the cities listed on the linternaute.com regional page.

    Every list entry becomes a ``VilleItem`` (URL-friendly name plus postal
    code). When the database could be opened, the entry is also stored in the
    ``ville`` table of the ``region`` database.
    """

    name = 'ville'
    allowed_domains = ['www.linternaute.com']
    start_urls = ['https://www.linternaute.com/ville/ile-de-france/region-11/villes']

    # Storage is best-effort: the crawl must still yield items when the
    # database cannot be set up. ``base`` is pre-set to None so that
    # parse_ville() can skip persistence instead of failing on a missing
    # attribute.
    base = None
    try:
        base = DataBase('region')
        base.create_table('ville', name=db.String, cp=db.String)
    except Exception as exc:
        print(f"VilleSpider: database setup failed, rows may not be stored ({exc!r})")

    def start_requests(self):
        """Request every start URL and route the response to parse_ville()."""
        for url in self.start_urls:
            yield Request(url=url, callback=self.parse_ville)

    @staticmethod
    def _split_label(label):
        """Split a link text into ``(name, cp)``.

        The last space-separated token is taken as the postal code (its
        parentheses are removed); the remaining tokens are joined with
        hyphens, lower-cased and transliterated to ASCII. Both values are the
        string ``'None'`` when ``label`` is missing.
        """
        if label is None:
            return 'None', 'None'
        *name_parts, cp_part = label.split(' ')
        city_name = unidecode.unidecode("-".join(name_parts).lower())
        postal_code = cp_part.replace("(", "").replace(")", "")
        return city_name, postal_code

    def parse_ville(self, response):
        """Yield one ``VilleItem`` per city found in the second bullet list."""
        bullet_lists = response.css('.list--bullet')
        if len(bullet_lists) < 2:
            # The page layout is not the expected one; nothing to extract.
            self.logger.warning('City list not found at %s', response.url)
            return

        for ville in bullet_lists[1].css('li'):
            item = VilleItem()
            item['name'], item['cp'] = self._split_label(
                ville.css('a::text').extract_first()
            )

            if self.base is not None:
                self.base.add_row('ville', name=item['name'], cp=item['cp'])
            yield item

Une fois la liste des villes complétée, on la parcourt pour avoir la météo du jour dans chacune des villes. Sur le site `meteo-villes.com` avec la ville et le code postal dans l'url, on accède à des données sur la température, le vent et les précipitations.

In [15]:
import scrapy
from scrapy import Request
from parapy.items import MeteoItem, VilleItem,DataBase 
import time
import unidecode
from sqlalchemy import create_engine, select
import sqlalchemy as db
from scrapy.crawler import CrawlerProcess


class VilleSpider(scrapy.Spider):
    """Collect the cities listed on the linternaute.com regional page.

    Every list entry becomes a ``VilleItem`` (URL-friendly name plus postal
    code). When the database could be opened, the entry is also stored in the
    ``ville`` table of the ``region`` database.
    """

    name = 'city'
    allowed_domains = ['www.linternaute.com']
    start_urls = ['https://www.linternaute.com/ville/ile-de-france/region-11/villes']

    # Storage is best-effort: the crawl must still yield items when the
    # database cannot be set up. ``base`` is pre-set to None so that
    # parse_ville() can skip persistence instead of failing on a missing
    # attribute.
    base = None
    try:
        base = DataBase('region')
        base.create_table('ville', name=db.String, cp=db.String)
    except Exception as exc:
        print(f"VilleSpider: database setup failed, rows may not be stored ({exc!r})")

    def start_requests(self):
        """Request every start URL and route the response to parse_ville()."""
        for url in self.start_urls:
            yield Request(url=url, callback=self.parse_ville)

    @staticmethod
    def _split_label(label):
        """Split a link text into ``(name, cp)``.

        The last space-separated token is taken as the postal code (its
        parentheses are removed); the remaining tokens are joined with
        hyphens, lower-cased and transliterated to ASCII. Both values are the
        string ``'None'`` when ``label`` is missing.
        """
        if label is None:
            return 'None', 'None'
        *name_parts, cp_part = label.split(' ')
        city_name = unidecode.unidecode("-".join(name_parts).lower())
        postal_code = cp_part.replace("(", "").replace(")", "")
        return city_name, postal_code

    def parse_ville(self, response):
        """Yield one ``VilleItem`` per city found in the second bullet list."""
        bullet_lists = response.css('.list--bullet')
        if len(bullet_lists) < 2:
            # The page layout is not the expected one; nothing to extract.
            self.logger.warning('City list not found at %s', response.url)
            return

        for ville in bullet_lists[1].css('li'):
            item = VilleItem()
            item['name'], item['cp'] = self._split_label(
                ville.css('a::text').extract_first()
            )

            if self.base is not None:
                self.base.add_row('ville', name=item['name'], cp=item['cp'])
            yield item



class MeteoSpider(scrapy.Spider):
    """Scrape the weather summary of every city stored in the ``region`` database.

    One request is issued per row of the ``ville`` table, using the URL
    pattern ``previsions-meteo-<name>-<cp>``. Each response yields a
    ``MeteoItem`` which, when the database could be opened, is also stored in
    the ``meteo_paris`` table of the ``meteo`` database.
    """

    name = 'meteo'
    allowed_domains = ['www.meteo-villes.com']

    # Left empty on purpose: the city table is read when the crawl starts
    # (see start_requests) rather than when the class is defined, so defining
    # the class no longer fails when the table does not exist yet.
    start_urls = []

    # Storage is best-effort; ``base`` stays None when the setup fails so that
    # parse_meteo() can skip persistence.
    base = None
    try:
        base = DataBase('meteo')
        base.create_table('meteo_paris', low_temp=db.String, hight_temp=db.String, vent=db.String, pluie=db.String, date=db.String)
    except Exception as exc:
        print(f"MeteoSpider: database setup failed, rows may not be stored ({exc!r})")

    def _city_urls(self):
        """Build one forecast URL per row of the ``ville`` table.

        Returns an empty list (and logs the error) when the table cannot be
        read.
        """
        try:
            rows = DataBase('region').select_table('ville')
        except Exception as exc:
            self.logger.error('Cannot read the city table: %r', exc)
            return []
        return [
            f'https://www.meteo-villes.com/previsions-meteo-{row.name}-{row.cp}'
            for row in rows
        ]

    def start_requests(self):
        """Request every city URL and route the response to parse_meteo()."""
        # Explicitly provided start_urls take precedence over the database.
        urls = self.start_urls or self._city_urls()
        for url in urls:
            yield Request(url=url, callback=self.parse_meteo)

    def parse_meteo(self, response):
        """Yield a ``MeteoItem`` built from the first summary line of the page.

        Values missing from the page are stored as the string ``'None'``.
        """
        summary_lines = response.css('.city-summary-line')
        if not summary_lines:
            self.logger.warning('No weather summary found at %s', response.url)
            return
        meteo = summary_lines[0]

        # The first two temperature cells are the minimum and the maximum.
        temps = meteo.css(".city-summary-line__temp div::text").extract()

        item = MeteoItem()
        item['low_temp'] = temps[0] if len(temps) > 0 else 'None'
        item['hight_temp'] = temps[1] if len(temps) > 1 else 'None'
        item['vent'] = meteo.css(
            ".city-summary-line__wind--speed span::text"
        ).extract_first(default='None')
        item['pluie'] = meteo.css(
            ".city-summary-line__rain::text"
        ).extract_first(default='None').strip()
        # Timestamp of the scrape, not a date read from the page.
        item['date'] = str(time.ctime())

        if self.base is not None:
            self.base.add_row('meteo_paris', low_temp=item['low_temp'], hight_temp=item['hight_temp'], vent=item['vent'], pluie=item['pluie'], date=item['date'])
        yield item

# MeteoSpider's URLs come from the city table that VilleSpider fills, so the
# weather crawl is chained after the city crawl instead of being started
# concurrently with it.
process = CrawlerProcess()
ville_crawl = process.crawl(VilleSpider)
# crawl() returns a Deferred; returning the second crawl's Deferred from the
# callback keeps the first one pending until the weather crawl has finished.
ville_crawl.addCallback(lambda _result: process.crawl(MeteoSpider))
process.start()