In [24]:
import requests
import urllib.request
import cv2
from pdf2image import convert_from_path
import pdfplumber
import re
import datetime
import numpy as np
import matplotlib.pyplot as plt
from datetime import *
from math import *
import os
from shutil import *
from pdfminer.layout import LAParams
from pdfminer.converter import PDFResourceManager, PDFPageAggregator
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from pdfminer.layout import LTTextBoxHorizontal
from ics import Calendar, Event

In [25]:
def cleanup():
    """
    Reset the tests/output directory to an empty state.

    The directory is resolved as ``<parent of the current working
    directory>/tests/output``. Existing content is deleted and the directory
    is recreated empty. Missing directories (``tests/`` or ``tests/output``)
    are created instead of raising, so the notebook also runs on a fresh
    checkout.
    """
    dir_path = os.path.dirname(os.getcwd())
    output_dir = dir_path + "/tests/output"

    # Only remove what exists: rmtree raises FileNotFoundError otherwise.
    if os.path.isdir(output_dir):
        rmtree(output_dir)

    # makedirs also creates the intermediate tests/ directory when needed.
    os.makedirs(output_dir, exist_ok=True)

In [26]:
def download_file(url,path):
    """
    Fetch the resource located at ``url`` and store it in the file ``path``.
    """
    destination = path
    urllib.request.urlretrieve(url, filename=destination)

In [27]:
def showimage(image):
    """
    Display ``image`` in an OpenCV window and block until a key is pressed,
    then close every OpenCV window.
    """
    window_title = ""
    cv2.imshow(window_title, image)
    # A delay of 0 makes waitKey wait for a key press without timeout.
    cv2.waitKey(0)
    cv2.destroyAllWindows()

In [28]:
def findContours(image, retr_mode, minWidth, maxWidth, show=True):
    """
    Find the bounding boxes of contours whose width lies in a given range.

    The image is binarised with an inverted threshold at 130, dilated once
    with a 5x5 kernel, and the contours of the result are extracted.

    Parameters
    ----------
    image : single-channel image passed to ``cv2.threshold``.
    retr_mode : OpenCV contour retrieval mode (e.g. ``cv2.RETR_TREE``).
    minWidth, maxWidth : inclusive bounds on the bounding-box width.
    show : when True (default, the historical behaviour) the dilated image
        is displayed with ``showimage``, which blocks until a key is
        pressed. Pass False for unattended runs.

    Returns
    -------
    list of ``(x, y, w, h)`` tuples, in the order OpenCV returned the
    contours.
    """
    _, thresh_value = cv2.threshold(image, 130, 255, cv2.THRESH_BINARY_INV)
    kernel = np.ones((5,5), np.uint8)

    dilated_value = cv2.dilate(thresh_value, kernel, iterations = 1)

    # OpenCV 3.x returns (image, contours, hierarchy) while 4.x returns
    # (contours, hierarchy); the contours are second-to-last in both.
    contours = cv2.findContours(dilated_value, retr_mode, cv2.CHAIN_APPROX_SIMPLE)[-2]

    if show:
        showimage(dilated_value)

    boxes = (cv2.boundingRect(cnt) for cnt in contours)

    return [(x, y, w, h) for (x, y, w, h) in boxes if minWidth <= w <= maxWidth]

In [29]:
def findContoursWeek(image, retr_mode, minWidth, maxWidth, show=True):
    """
    Find the bounding boxes of contours whose width lies in a given range,
    working directly on ``image`` (no thresholding or dilation is applied
    here, the caller is expected to have pre-processed the image).

    Parameters
    ----------
    image : single-channel image passed as-is to ``cv2.findContours``.
    retr_mode : OpenCV contour retrieval mode (e.g. ``cv2.RETR_EXTERNAL``).
    minWidth, maxWidth : inclusive bounds on the bounding-box width.
    show : when True (default, the historical behaviour) ``image`` is
        displayed with ``showimage``, which blocks until a key is pressed.
        Pass False for unattended runs.

    Returns
    -------
    list of ``(x, y, w, h)`` tuples, in the order OpenCV returned the
    contours.
    """
    # OpenCV 3.x returns (image, contours, hierarchy) while 4.x returns
    # (contours, hierarchy); the contours are second-to-last in both.
    contours = cv2.findContours(image, retr_mode, cv2.CHAIN_APPROX_SIMPLE)[-2]

    if show:
        showimage(image)

    boxes = (cv2.boundingRect(cnt) for cnt in contours)

    return [(x, y, w, h) for (x, y, w, h) in boxes if minWidth <= w <= maxWidth]

In [30]:
def round_timedelta(td, period):
    """
    Round ``td`` to the nearest multiple of ``period`` (both timedeltas).

    A value lying exactly halfway between two multiples is rounded up.
    """
    total = td.total_seconds()
    step = period.total_seconds()
    leftover = total % step

    # Go up to the next multiple from the halfway point on, otherwise drop
    # back to the previous multiple.
    adjustment = (step - leftover) if leftover >= step / 2 else -leftover

    return timedelta(seconds=total + adjustment)

In [31]:
def getHour(x, index, margin):
    """
    Convert a horizontal pixel position into a time-of-day string.

    ``margin + 5`` is first subtracted from ``x``. The remaining distance is
    then consumed column by column using the widths in ``warpList[index]``:
    the first column counts for a quarter of an hour, every later column
    that is fully crossed counts for one hour, and a partially crossed
    column contributes proportionally.

    The result is ``07:45 + (hours - 2)``, rounded to 15 minutes and
    returned as a string left-padded with zeros to 8 characters
    (``HH:MM:SS``).
    NOTE(review): the meaning of the 2-hour shift is not visible here
    (possibly a timezone adjustment) - confirm before changing it.
    """
    x -= (margin+5)

    # One list of column widths (in pixels) per timetable document.
    warpList = [[33, 169, 177, 150, 205, 139, 165, 182, 180, 168, 180, 167, 160],
                [33, 159, 173, 142, 220, 84, 132, 172, 209, 162, 166, 152, 160],
                [42, 177, 192, 170, 223, 110, 157, 200, 197, 218, 200]]
    widths = warpList[index]

    elapsed = 0
    for position, width in enumerate(widths):
        if not x > 0:
            break

        if position == 0:
            # Leading column: worth a quarter of an hour, always consumed whole.
            elapsed += 0.25
            x -= width
        elif x > width:
            elapsed += 1
            x -= width
        else:
            # Last, partially crossed column.
            elapsed += x/width
            x = 0

    day_start = timedelta(hours=7,minutes=45, seconds=0)
    moment = day_start + timedelta(hours=elapsed-2)
    rounded = round_timedelta(moment, timedelta(minutes=15))

    return "{:0>8}".format(str(rounded))


In [32]:
def getFileText(path, rows, dpi=200):
    """
    Extract the horizontal text boxes of the first page of a PDF together
    with their coordinates scaled to the rendered image.

    Parameters
    ----------
    path : path of the PDF file.
    rows : height in pixels of the rendered page image, used to flip the
        vertical axis.
    dpi : resolution the page image was rendered at (default 200, the value
        previously hard-coded). Coordinates are scaled by ``dpi / 72``.

    Returns
    -------
    list of ``(text, (x0, rows - y0, x1, rows - y1))`` tuples where
    ``(x0, y0, x1, y1)`` is the scaled pdfminer bounding box.
    NOTE(review): pdfminer boxes presumably have their origin at the
    bottom-left, so after the flip index 1 would be the lower edge and
    index 3 the upper edge in image coordinates - confirm against
    ``isinBound`` callers.
    """
    words = []
    scale = dpi / 72

    with open(path, "rb") as file:

        rsrcmgr = PDFResourceManager()
        laparams = LAParams()
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        for page in PDFPage.get_pages(file):
            interpreter.process_page(page)
            layout = device.get_result()

            for element in layout:
                if isinstance(element, LTTextBoxHorizontal):
                    x0, y0, x1, y1 = (e * scale for e in element.bbox)
                    words.append((element.get_text().strip(), (x0, rows - y0, x1, rows - y1)))

            # Only the first page is used: stop instead of parsing the rest
            # of the document for nothing.
            break

    return words

In [33]:
def preprocessImage(image):
    """
    Convert a BGR image to grayscale and return the result.
    """
    return cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

In [34]:
def isinBound(box, element):
    """
    Tell whether ``element`` fits within ``box``.

    Both arguments are 4-item sequences. The first two items of ``box`` must
    be lower than or equal to those of ``element`` and the last two items of
    ``box`` must be greater than or equal to those of ``element``.
    """
    lower_ok = all(box[i] <= element[i] for i in (0, 1))
    return lower_ok and all(box[i] >= element[i] for i in (2, 3))

In [35]:
def getLectureContent(cordinates, words):
    """
    Collect the text of every entry of ``words`` whose box (second item)
    is within ``cordinates`` according to ``isinBound``.

    ``words`` is a sequence of ``(text, box)`` pairs; the texts are returned
    in their original order.
    """
    enclosed = filter(lambda word: isinBound(cordinates, word[1]), words)
    return [word[0] for word in enclosed]

In [36]:
def getDayHeight(week):
    """
    Measure the day rows of a week image.

    Contours are searched in the 140 leftmost pixel columns; only boxes
    80 to 150 pixels wide and strictly between 50 and 80 pixels high are
    kept, ordered from top to bottom.

    Returns ``None`` when no such box is found, otherwise the tuple
    ``(marginX, marginY, avgDayH, nbDays)`` where ``avgDayH`` is the smallest
    of the retained day heights and ``nbDays`` is 5 for images taller than
    300 pixels and 1 otherwise.
    """
    rows, _cols, _channels = week.shape

    label_strip = preprocessImage(week[0:rows, 0:140])
    candidates = findContours(label_strip, cv2.RETR_TREE, 80, 150)
    day_boxes = sorted((box for box in candidates if 50 < box[3] < 80),
                       key=lambda box: box[1])

    if not day_boxes:
        return None

    # With only four boxes detected, the topmost one is duplicated to get
    # five entries.
    if len(day_boxes) == 4:
        day_boxes.insert(0, day_boxes[0])

    nbDays = 5 if rows > 300 else 1

    heights = [box[3] + 4 for box in day_boxes[-nbDays:]]

    marginY = rows - (sum(heights) + 6)
    marginX = day_boxes[0][2] + 6

    return marginX, marginY, min(heights), nbDays

In [37]:
def getWarpFactor(week):
    """
    Computes warp factor of each week (deprecated)

    Detects boxes 15 to 300 pixels wide and more than 20 pixels high in the
    grayscale week image, orders them by ``(x, y)`` and returns the list of
    horizontal distances between consecutive boxes, the first box being
    skipped.
    """
    # findContours requires a retrieval mode as second argument; it was
    # missing, which shifted the width bounds and raised a TypeError.
    # RETR_TREE is used like the other in-week detections of this notebook.
    cordinatesH = findContours(preprocessImage(week), cv2.RETR_TREE, 15, 300)
    cordinatesH = [cordinates for cordinates in cordinatesH if cordinates[3] > 20]
    cordinatesH.sort(key = lambda x: (x[0],x[1]))

    # Gap between the left edges of each box and the following one,
    # starting from the second box.
    return [following[0] - current[0]
            for current, following in zip(cordinatesH[1:], cordinatesH[2:])]

In [38]:
def processWeek(week, weekCordinates, words, calendar, weekDate, INDEX):
    """
    Processes the weekly planner: detects the lecture boxes of one week,
    adds one calendar event per lecture and prints a per-day summary.

    Parameters
    ----------
    week : BGR image of the week; detected lectures are outlined on it in
        place and the annotated image is written to ``../tests/output``.
    weekCordinates : ``(x, y, w, h)`` of the week inside the full page, used
        to translate lecture boxes to page coordinates.
    words : ``(text, box)`` pairs as returned by ``getFileText``.
    calendar : calendar receiving the events (through ``addEvent``).
    weekDate : date of the first day of the week, formatted ``%Y-%m-%d``.
    INDEX : index of the timetable document, forwarded to ``getHour``.

    Returns nothing. When no day label can be detected in ``week`` the week
    is skipped after printing its header (this used to raise a TypeError).
    """
    semaine = [["Lundi",[]], ["Mardi", []], ["Mercredi", []], ["Jeudi",[]], ["Vendredi",[]]]

    print("SEMAINE --")

    dayLayout = getDayHeight(week)
    if dayLayout is None:
        # getDayHeight found no day label: nothing can be placed on a day.
        return
    marginX, marginY, avgDayH, nbDays = dayLayout

    cordinatesLectures = findContours(preprocessImage(week), cv2.RETR_TREE, 145, 1000)
    cordinatesLectures.sort(key = lambda c: (c[0],c[1]))

    regex_location = re.compile(r'[a-z-A-Z]{1}[0-9]{1}-.*|Amphi .*|.*Zoom|.*ZOOM')

    for x, y, w, h in cordinatesLectures:

        cv2.rectangle(week, (x,y), (x+w,y+h), (0,0,255), 2)

        lectureAbsoluteCordinates = (weekCordinates[0]+x, weekCordinates[1]+y, weekCordinates[0]+x+w, weekCordinates[1]+y+h)

        lectureWords = getLectureContent(lectureAbsoluteCordinates, words)

        # Skip empty boxes and the ones containing an "ENTREPRISE" entry.
        if not lectureWords or "ENTREPRISE" in lectureWords:
            continue

        # First text entry that looks like a room, "" when there is none.
        matches = (regex_location.search(e) for e in lectureWords)
        location = next((m.group() for m in matches if m), "")

        title = ' '.join(lectureWords)
        beginHour = getHour(x, INDEX, marginX)
        endHour = getHour(x + w, INDEX, marginX)

        if nbDays == 1:
            # Single-row weeks: dated with weekDate itself and listed
            # under the third day, as before.
            dayofWeek = 2
            dayDate = weekDate
        else:
            dayofWeek = int((y - marginY + 5) // avgDayH)
            # Clamp to the five known days: a box slightly above the first
            # row or below the last one must not index outside `semaine`.
            dayofWeek = min(max(dayofWeek, 0), len(semaine) - 1)

            dayDate = (datetime.strptime(weekDate,'%Y-%m-%d') + timedelta(days=dayofWeek)).strftime('%Y-%m-%d')

        addEvent(calendar, title, None, location, f"{dayDate} {beginHour}", f"{dayDate} {endHour}")
        semaine[dayofWeek][1].append(f"{title}\t[{beginHour} - {endHour}]")

    for jour in semaine:

        if len(jour[1]) == 0:
            continue

        print(f"{jour[0]}")

        for matiere in jour[1]:

            print("\t", matiere)

    cv2.imwrite('../tests/output/detect-week-scanned-' + str(hash(datetime.now().strftime("%H:%M:%S.%f"))) + '.jpg', week)

In [39]:
def addEvent(calendar, lectureName, teacher, location, begin, end):
    """
    Build an event from the given fields and register it in ``calendar``.

    ``lectureName`` becomes the event name and ``teacher`` its description.
    """
    fields = {
        "name": lectureName,
        "description": teacher,
        "location": location,
        "begin": begin,
        "end": end,
    }

    calendar.events.add(Event(**fields))

In [40]:
# Index (in edt_url) of the timetable processed by this run.
INDEX = 2

# Timetable PDFs, one per study year.
edt_url = [
    'https://stri.fr/Gestion_STRI/TAV/L3/EDT_STRI1A_L3IRT.pdf',
    'https://stri.fr/Gestion_STRI/TAV/M1/EDT_STRI2A_M1RT.pdf',
    'https://stri.fr/Gestion_STRI/TAV/M2/EDT_STRI3A_M2STRI.pdf',
]

# Working files, located under <parent of cwd>/tests/output.
path = os.path.dirname(os.getcwd())+'/tests/output/edt.pdf'  # downloaded PDF
file = os.path.dirname(os.getcwd())+'/tests/output/edt-page1.jpg'  # rendered first page

In [41]:
# Empty and recreate the tests/output directory before producing new files.
cleanup()

In [42]:
# Download the timetable PDF selected by INDEX and store it at `path`.
download_file(edt_url[INDEX],path)

In [43]:
# Render only the first page of the PDF at 200 DPI (the resolution the
# coordinate scaling in getFileText relies on) and save it as a JPEG.
# Restricting the conversion to page 1 avoids rasterising the whole document;
# the result of save() (None) is no longer bound to a misleading `pages` name.
first_page_image = convert_from_path(path, 200, first_page=1, last_page=1)[0]
first_page_image.save(file, "JPEG")

In [44]:
# Week extraction, variant based on findContoursWeek: the page is cleaned
# with dilate/erode/open before the week frames are searched.
# Grayscale copy for detection, colour copies for cropping; 56 rows at the
# bottom and 20 columns on the right are cropped away.
image1 = cv2.imread(file, 0)

rows, cols = image1.shape
image1 = image1[0:rows-56, 0:cols-20]

image = cv2.imread(file)
image = image[0:rows-56, 0:cols-20]
duplimage = cv2.imread(file)[0:rows-56, 0:cols-20]

preprocess = cv2.erode(cv2.dilate(image1, np.ones((4,4), np.uint8), iterations=1), np.ones((3,3), np.uint8), iterations=1)
preprocess = cv2.morphologyEx(preprocess, cv2.MORPH_OPEN, np.ones((4,4), np.uint8))

# Week frames: 1950 to 2200 pixels wide, more than 50 pixels high, ordered
# from top to bottom.
cordinatesWeeks = findContoursWeek(preprocess, cv2.RETR_EXTERNAL, 1950, 2200)

cordinatesWeeks = [cordinates for cordinates in cordinatesWeeks if cordinates[3] > 50]

cordinatesWeeks.sort(key = lambda x: x[1])
weeks = []
words = getFileText(path, rows)

calendar = Calendar()

months = {"jan":"01", "fév":"02", "mar":"03", "avr":"04", "mai":"05", "juin":"06", "jui":"06", "juil":"07", "aou":"08","sep":"09", "oct":"10", "nov":"11", "déc":"12"}

# Day, 3-character month abbreviation and an optional 4th letter. The 4th
# letter is what tells "juin" from "juil": with only 3 letters both were read
# as "jui" and July was silently turned into June.
regex_week = re.compile(r'([0-9]{2})[/-]([a-z-é]{3})([a-zé]?)')

weeksD = []

for line in words:
    found = regex_week.search(line[0])
    if not found:
        continue

    day, abbreviation, extra = found.groups()
    # Prefer the 4-letter key when it exists, fall back to the 3-letter one.
    month = months.get(abbreviation + extra, months.get(abbreviation))
    if month is None:
        # Looks like "NN/xxx" but is not a month: ignore it instead of
        # failing with a KeyError.
        continue

    # NOTE(review): the current year is assumed for every week - confirm for
    # timetables spanning two calendar years.
    weeksD.append(str(datetime.now().year)+"-"+month+"-"+day)


for cordinates, weekDate in zip(cordinatesWeeks, weeksD):
    x, y, w, h = cordinates

    weeks.append(image[y : y + h, x : x + w])

    # NOTE(review): written relative to the working directory, unlike the
    # other outputs which go to tests/output - confirm data/table/ exists.
    cv2.imwrite('data/table/detect-week-'+str(len(weeks)-1)+'.jpg', weeks[-1])

    processWeek(weeks[-1], cordinates, words, calendar, weekDate, INDEX)

In [45]:
# Week extraction, variant based on findContours (threshold + dilation are
# applied inside the helper); this cell rebuilds `calendar` from scratch.
# Grayscale copy for detection, colour copies for cropping; 56 rows at the
# bottom and 20 columns on the right are cropped away.
image1 = cv2.imread(file, 0)

rows, cols = image1.shape
image1 = image1[0:rows-56, 0:cols-20]

image = cv2.imread(file)
image = image[0:rows-56, 0:cols-20]
duplimage = cv2.imread(file)[0:rows-56, 0:cols-20]

preprocess = cv2.dilate(image1, np.ones((4,4), np.uint8), iterations=1)

# Week frames: 1950 to 2200 pixels wide, more than 50 pixels high, ordered
# from top to bottom.
cordinatesWeeks = findContours(preprocess, cv2.RETR_EXTERNAL, 1950, 2200)

cordinatesWeeks = [cordinates for cordinates in cordinatesWeeks if cordinates[3] > 50]

cordinatesWeeks.sort(key = lambda x: x[1])
weeks = []
words = getFileText(path, rows)

calendar = Calendar()

months = {"jan":"01", "fév":"02", "mar":"03", "avr":"04", "mai":"05", "juin":"06", "jui":"06", "juil":"07", "aou":"08","sep":"09", "oct":"10", "nov":"11", "déc":"12"}

# Day, 3-character month abbreviation and an optional 4th letter. The 4th
# letter is what tells "juin" from "juil": with only 3 letters both were read
# as "jui" and July was silently turned into June.
regex_week = re.compile(r'([0-9]{2})[/-]([a-z-é]{3})([a-zé]?)')

weeksD = []

for line in words:
    found = regex_week.search(line[0])
    if not found:
        continue

    day, abbreviation, extra = found.groups()
    # Prefer the 4-letter key when it exists, fall back to the 3-letter one.
    month = months.get(abbreviation + extra, months.get(abbreviation))
    if month is None:
        # Looks like "NN/xxx" but is not a month: ignore it instead of
        # failing with a KeyError.
        continue

    # NOTE(review): the current year is assumed for every week - confirm for
    # timetables spanning two calendar years.
    weeksD.append(str(datetime.now().year)+"-"+month+"-"+day)


for cordinates, weekDate in zip(cordinatesWeeks, weeksD):
    x, y, w, h = cordinates

    weeks.append(image[y : y + h, x : x + w])

    cv2.imwrite(os.path.dirname(os.getcwd()) + '/tests/output/detect-week-'+str(len(weeks)-1)+'.jpg', weeks[-1])

    processWeek(weeks[-1], cordinates, words, calendar, weekDate, INDEX)

SEMAINE --
Lundi
	 U3-306 IoT/WoT MB/FC	[06:00:00 - 07:45:00]
	 Cloud Microsoft Azure / REX AZEO U3-306	[08:00:00 - 10:00:00]
	 Cloud Microsoft Azure / REX AZEO U3-306	[11:30:00 - 13:30:00]
	 Cloud Microsoft Azure / REX AZEO U3-306	[13:45:00 - 15:45:00]
Mardi
	 IoT/WoT FC U3-110	[08:00:00 - 10:00:00]
	 IoT/WoT (Suivi Travail) FC U3-306	[11:30:00 - 13:30:00]
	 IoT/WoT (Suivi Travail) MB U3-306	[13:45:00 - 15:45:00]
Mercredi
	 IoT/WoT MB U3-4	[08:00:00 - 10:00:00]
	 IOT (Suivi Travail) FC U3-307	[11:30:00 - 13:30:00]
	 (Travail sur Projet)	[13:45:00 - 15:45:00]
Jeudi
	 (Travail sur Projet)	[08:00:00 - 10:00:00]
	 IoT/WoT FC U3-306	[11:30:00 - 13:30:00]
	 IoT/WoT (Suivi Travail) FC U3-306	[13:45:00 - 15:45:00]
Vendredi
	 (Travail sur Projet)	[06:00:00 - 07:45:00]
	 IoT/WoT MB U3-306	[08:00:00 - 10:00:00]
	 APSYS (REX) RB / JLP U3-Amphi	[11:30:00 - 13:30:00]
SEMAINE --
Lundi
	 Cloud Microsoft Azure / REX AZEO A distance	[06:00:00 - 07:45:00]
	 Cloud Microsoft Azure / REX AZEO A distance	[0

In [46]:
# Export the calendar as <parent of cwd>/tests/output/<timetable name>.ics,
# the name being picked with the same INDEX as the downloaded PDF.
filename = os.path.dirname(os.getcwd()) + '/tests/output/' + ("EDT_STRI1A_L3IRT","EDT_STRI2A_M1RT","EDT_STRI3A_M2STRI")[INDEX]+".ics"

# Lecture names contain accented characters: write UTF-8 explicitly instead
# of depending on the platform default encoding.
with open(filename, 'w', encoding='utf-8') as f:
    f.write(str(calendar))