# Website Scraper

## Introduction

This notebook presents a web scraping tool.

### User Story: Scraping Websites

### Goals

1. To retrieve the timetable of every user each hour.
    1. To convert timetables into CSV with semicolon as a separator:
        - unknown: number
        - class: string
        - debut: timestamp (seconds)
        - fin: timestamp (seconds)
        - matiere: string
        - nom: string
        - prenom: string

## Requirements

### Imports

#### Standard

In [None]:
from collections import deque
from datetime import date, datetime, timedelta
from time import sleep

#### Third-parties

- [requests](http://docs.python-requests.org/en/master/)
- [pandas](https://pandas.pydata.org/)
- [pyodbc](https://github.com/mkleehammer/pyodbc/wiki/)

In [None]:
import pandas
import requests
import pyodbc

## Arguments

## Parameters

## Classes

### oZe

In [None]:
class oZe:
    '''
        Scraper for the oZe platform.

        Every method is static: the class only groups the connection
        settings and the scraping helpers. Deadlines are absolute POSIX
        timestamps (seconds), as returned by datetime.now().timestamp().
    '''

    protocol = 'https://'
    credential = {
        'username': '',
        'password': ''
    }
    root = 'enc.hauts-de-seine.fr/'
    paths = {
        None: '',
        'login': 'my.policy',
        'users': 'v1/users',
        'classes': 'v1/cours/byEleve'
    }
    # Default time budget (seconds) used when a caller gives no deadline.
    timeout = 3600
    # Upper bound (seconds) handed to requests for each HTTP call, so that a
    # stalled connection cannot block far beyond the deadline.
    request_timeout = 60
    # Pause (seconds) between two failed attempts, to avoid hammering the
    # server in a tight loop.
    retry_delay = 1

    @staticmethod
    def get(url, timeout = None):
        '''
            Log in and download a JSON document, retrying until the deadline.

            url: address of the JSON resource to download.
            timeout: absolute deadline (POSIX timestamp); when None, the
                deadline is "now + oZe.timeout", computed at call time.

            Returns the parsed document as a pandas object, or None when the
            deadline is reached without a successful attempt.
        '''
        # Local import: read_json is fed a file-like object because passing
        # the raw payload directly is deprecated in recent pandas versions.
        from io import BytesIO

        if timeout is None:
            deadline = datetime.now().timestamp() + oZe.timeout
        else:
            deadline = timeout

        while True:
            remaining = deadline - datetime.now().timestamp()
            if remaining <= 0:
                break
            limit = min(oZe.request_timeout, remaining)
            # A fresh session per attempt: a failed login leaves no state.
            session = requests.Session()
            try:
                # Landing page first, then the credential, then the data.
                session.get(oZe.url(), timeout = limit)
                session.post(oZe.url('login'), data = oZe.credential, timeout = limit)
                response = session.get(url, timeout = limit)
                return pandas.read_json(BytesIO(response.content))
            except Exception as e:
                # Best effort: report the failure and try again.
                print(type(e))
                print(e.args)
                print(e)
            finally:
                session.close()
            pause = min(oZe.retry_delay, deadline - datetime.now().timestamp())
            if pause > 0:
                sleep(pause)
        return None

    @staticmethod
    def url(path = None, api = False):
        '''
            Build the address of a page of the platform.

            path: key of oZe.paths (None designates the root page).
            api: when True, target the "api-" host instead of the main host.
        '''
        host = 'api-' + oZe.root if api else oZe.root
        return oZe.protocol + host + oZe.paths[path]

    @staticmethod
    def users(timeout = None):
        '''
            Retrieve the users of category "Eleve".

            timeout: absolute deadline (POSIX timestamp), see oZe.get.

            Returns a list of dictionaries with the keys 'id', 'nom' and
            'prenom', an empty list when the platform returns no user, or
            None when the download failed before the deadline.
        '''
        url = oZe.url('users', True)
        url = url + '?aUai=0921241Z'
        url = url + '&aCategory=Eleve'

        data = oZe.get(url, timeout)
        if data is None:
            return None
        if data.empty:
            return []
        return data[['id', 'nom', 'prenom']].to_dict('records')

    @staticmethod
    def timetable(user, timeout = None):
        '''
            Retrieve the timetable of a user for the current week.

            user: dictionary with the keys 'id', 'nom' and 'prenom'.
            timeout: absolute deadline (POSIX timestamp), see oZe.get.

            Returns a list of dictionaries with the keys 'class',
            'dateDebut', 'dateFin' (both in seconds since the epoch),
            'matiere', 'nom' and 'prenom'; an empty list when the user has no
            course; None when the download failed before the deadline.
        '''
        # The period runs from the most recent Sunday (today included) to the
        # following Sunday.
        today = date.today()
        weekday = (today.weekday() + 1) % 7
        begin = today - timedelta(days = weekday)
        end = begin + timedelta(weeks = 1)

        url = oZe.url('classes', True)
        url = url + '?ctx_etab=0921241Z&aEleve='
        # str() keeps the concatenation valid when the identifier was parsed
        # as a number.
        url = url + str(user['id'])
        url = url + begin.strftime('&aDateDebut=%Y-%m-%dT23:00:00.000Z')
        url = url + end.strftime('&aDateFin=%Y-%m-%dT23:00:00.000Z')
        url = url + '&aDeletedStatus=0'

        data = oZe.get(url, timeout)
        if data is None:
            return None
        if data.empty:
            return []

        # Work on a copy so that the assignments below never target a view of
        # the downloaded frame.
        data = data[['classes', 'matieres', 'dateDebut', 'dateFin']].copy()
        data['dateDebut'] = oZe._epoch_seconds(data['dateDebut'])
        data['dateFin'] = oZe._epoch_seconds(data['dateFin'])

        data = oZe._unnest(data, 'classes', ['matieres', 'dateDebut', 'dateFin'], 'class')
        data = oZe._unnest(data, 'matieres', ['class', 'dateDebut', 'dateFin'], 'matiere')

        data['nom'] = user['nom']
        data['prenom'] = user['prenom']
        return data.to_dict('records')

    @staticmethod
    def _epoch_seconds(values):
        '''
            Convert 'YYYY-MM-DDTHH:MM:SSZ' strings into whole seconds since
            1970-01-01, whatever the internal resolution chosen by pandas.
        '''
        moments = pandas.to_datetime(pandas.Series(values), format = '%Y-%m-%dT%H:%M:%SZ')
        if moments.dt.tz is not None:
            # Normalise to naive UTC before subtracting the naive epoch.
            moments = moments.dt.tz_convert(None)
        return (moments - pandas.Timestamp('1970-01-01')) // pandas.Timedelta(seconds = 1)

    @staticmethod
    def _unnest(data, column, keep, name):
        '''
            Spread the list of records held by a column over several rows.

            Each position of the lists becomes one row per original row;
            the new column holds the 'libelle' of the record, or None when
            the position is empty or the record has no 'libelle'.

            data: frame to transform.
            column: name of the column holding the lists.
            keep: columns repeated on every produced row, in output order.
            name: name of the produced column.
        '''
        data = data[column].apply(pandas.Series).merge(data, right_index = True, left_index = True)
        data = data.drop([column], axis = 1)
        data = data.melt(id_vars = keep, value_name = name)
        data = data.drop('variable', axis = 1)
        data[name] = data[name].apply(lambda x: x['libelle'] if isinstance(x, dict) and 'libelle' in x else None)
        return data

### Database

In [None]:
class Database:
    '''
        Access to the SQL Server database through pyodbc.
    '''

    # Keyword arguments handed to pyodbc.connect: 'autocommit' is a pyodbc
    # option, the other entries are connection-string keywords.
    credential = {
        'driver': '{ODBC Driver 17 for SQL Server} ',
        'server': '',
        'database': '',
        'pwd': '',
        'autocommit': True
    }
    table = 'TABLE_SORTIES_JOURNALIERES'

    @staticmethod
    def clear(timeout):
        '''
            Delete every row of Database.table, retrying until the deadline.

            timeout: absolute deadline, as a POSIX timestamp in seconds.

            Returns the number of deleted rows as reported by the cursor, or
            None when the deadline is reached without a successful attempt.
        '''
        while datetime.now().timestamp() < timeout:
            connection = None
            try:
                connection = pyodbc.connect(**Database.credential)
                cursor = connection.cursor()
                # The table name is a class constant, not user input; an
                # identifier cannot be bound as a query parameter.
                return cursor.execute("DELETE FROM " + Database.table).rowcount
            except Exception as e:
                # Best effort: report the failure and try again shortly.
                print(type(e))
                print(e.args)
                print(e)
                sleep(1)
            finally:
                # The connection only exists when connect() succeeded.
                if connection is not None:
                    connection.close()
        return None

## Methods

## Program

In [None]:
# Main loop: one scraping cycle per oZe.timeout seconds.
while True:
    # Absolute deadline (POSIX timestamp) of the current cycle.
    halt = datetime.now().timestamp() + oZe.timeout

    roster = oZe.users(halt)
    if roster:
        users = deque(roster)
        # Share the remaining time evenly between the users.
        budget = (halt - datetime.now().timestamp()) / len(users)
        frames = []

        # A user whose download failed goes back to the other end of the
        # queue; the loop stops at the deadline so that a permanently failing
        # user cannot block the cycle forever.
        while users and datetime.now().timestamp() < halt:
            user = users.pop()
            deadline = min(datetime.now().timestamp() + budget, halt)
            classes = oZe.timetable(user, deadline)
            if classes is None:
                users.appendleft(user)
            elif classes:
                frames.append(pandas.DataFrame.from_dict(classes))

        if users:
            print(str(len(users)) + ' timetable(s) could not be retrieved before the deadline')

        # DataFrame.append no longer exists in recent pandas; the frames are
        # gathered once with concat, keeping their own indexes.
        data = pandas.concat(frames) if frames else pandas.DataFrame()

        # Persist the timetables: always try at least once, then retry while
        # the file is locked and the deadline has not been reached.
        while True:
            try:
                data.to_csv('classes.csv', sep = ';', encoding='utf-8-sig')
                break
            except PermissionError as e:
                print(type(e))
                print(e.args)
                print(e)
                if datetime.now().timestamp() >= halt:
                    break
                sleep(1)
    else:
        # Nothing to scrape: keep the previous file untouched.
        print('No user retrieved; skipping this cycle')

    # Wait for the next cycle.
    remaining = halt - datetime.now().timestamp()
    if remaining > 0:
        sleep(remaining)

## Execution

This web scraping tool is a script. To run it outside Jupyter, the notebook must first be converted into an executable script.

## Evaluation

## Observations

- Some classes do not have a value, a code or an identifier.
- oZe uses the Coordinated Universal Time (UTC) for temporal values.

## Lessons learned

## References