In [52]:

import requests, urllib.request, cv2, os, pdfplumber, re, numpy as np, datetime
from datetime import *
from math import *
from shutil import *
from pdf2image import convert_from_path
from collections import Counter
from pdfminer.layout import LAParams
from pdfminer.converter import PDFResourceManager, PDFPageAggregator
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from pdfminer.layout import LTTextBoxHorizontal
from ics import Calendar, Event

In [53]:
# Patterns used to classify the text extracted from the timetable PDF.
REGEX_HOUR = re.compile(r'[0-9]{1,2}h')  # one or two digits followed by "h"
REGEX_DAY = re.compile(r'Lundi|Mardi|Mercredi|Jeudi|Vendredi|Samedi|Dimanche')
REGEX_LOCATION = re.compile(r'[a-z-A-Z]{1}[0-9]{1}-.*|Amphi .*|.*Zoom|.*ZOOM')
REGEX_WEEK = re.compile(r'[0-9]{2}/[a-z-À-ÿ]{3}')  # two digits, "/", three characters

# Number of hours added to every hour computed by getHour().
TIMEZONE_DIFF = -2

# Month abbreviation -> two-digit month number.
# NOTE(review): REGEX_WEEK captures exactly three characters after the slash,
# so the "juil" entry looks unreachable and "jui" is the only key for both
# June and July labels -- confirm against the PDF.
MONTHS = {
    "jan": "01",
    "fév": "02",
    "mar": "03",
    "avr": "04",
    "mai": "05",
    "jui": "06",
    "juil": "07",
    "aoû": "08",
    "sep": "09",
    "oct": "10",
    "nov": "11",
    "déc": "12",
}

# Day name -> day index, stored as a string, starting at "0" for Lundi.
DAYS = {
    "Lundi": "0",
    "Mardi": "1",
    "Mercredi": "2",
    "Jeudi": "3",
    "Vendredi": "4",
    "Samedi": "5",
    "Dimanche": "6",
}

In [54]:
def cleanup():
    """
    Reset the ``tests/output`` directory to an empty state.

    The directory is ``<parent of the current working directory>/tests/output``.
    Existing content is deleted. The directory, and ``tests/`` itself, is
    created when missing, so the function also works on a first run where no
    output has been produced yet.
    """
    output_dir = os.path.dirname(os.getcwd()) + "/tests/output"

    # Only delete what exists: rmtree raises FileNotFoundError otherwise.
    if os.path.isdir(output_dir):
        rmtree(output_dir)

    # makedirs also creates the intermediate "tests" directory when needed.
    os.makedirs(output_dir, exist_ok=True)

In [55]:
def download_file(url,path):
    """
    Retrieve the resource located at ``url`` and store it in the local
    file ``path``.
    """
    destination = path
    urllib.request.urlretrieve(url, filename=destination)

In [56]:
def showimage(image, title = ""):
    """
    Display ``image`` in a window named ``title``, wait for a key press and
    then close every OpenCV window.
    """
    window_name = title
    cv2.imshow(window_name, image)
    # A delay of 0 is handed to waitKey before the windows are destroyed.
    cv2.waitKey(0)
    cv2.destroyAllWindows()

In [57]:
def findContours(image, retr_mode, minWidth, maxWidth):
    """
    Return the bounding boxes of the contours whose width lies in
    [minWidth, maxWidth].

    The image is thresholded (inverse binary, cut-off 130, maximum 255) and
    dilated once with a 4x4 kernel of ones before the contours are searched.

    image     -- image handed to cv2.threshold
    retr_mode -- retrieval mode forwarded to cv2.findContours
    minWidth  -- smallest accepted bounding-box width (inclusive)
    maxWidth  -- largest accepted bounding-box width (inclusive)

    Returns a list of (x, y, w, h) tuples, in the order in which
    cv2.findContours reported the contours.
    """
    _, binary = cv2.threshold(image, 130, 255, cv2.THRESH_BINARY_INV)
    thickened = cv2.dilate(binary, np.ones((4,4), np.uint8), iterations = 1)
    found, _ = cv2.findContours(thickened, retr_mode, cv2.CHAIN_APPROX_SIMPLE)

    boxes = []
    for x, y, w, h in map(cv2.boundingRect, found):
        if minWidth <= w <= maxWidth:
            boxes.append((x, y, w, h))
    return boxes

In [58]:
def findContoursWeek(image, retr_mode, minWidth, maxWidth, avgHeight):
    """
    Return one bounding box per week found in the image.

    The image is thresholded (inverse binary, cut-off 130, maximum 255) and
    its contours are searched. Every contour whose bounding-box width lies in
    [minWidth, maxWidth] is cut into round(h / avgHeight) slices of equal
    height stacked vertically. A box for which that count is 0 produces
    nothing.

    image     -- image handed to cv2.threshold
    retr_mode -- retrieval mode forwarded to cv2.findContours
    avgHeight -- expected height of a single week

    Returns a list of (x, y, w, h) tuples.
    """
    _, binary = cv2.threshold(image, 130, 255, cv2.THRESH_BINARY_INV)
    found, _ = cv2.findContours(binary, retr_mode, cv2.CHAIN_APPROX_SIMPLE)

    weekBoxes = []
    for x, y, w, h in map(cv2.boundingRect, found):
        if not (minWidth <= w <= maxWidth):
            continue
        nbWeek = round(h/avgHeight)
        # The generator is lazy, so h//nbWeek is never evaluated when nbWeek is 0.
        weekBoxes.extend(
            (x, y + sliceId*(h//nbWeek), w, h//nbWeek) for sliceId in range(nbWeek)
        )
    return weekBoxes

In [59]:
def findContoursDay(image, retr_mode, minWidth, maxWidth):
    """
    Return the bounding boxes of the contours whose width lies in
    [minWidth, maxWidth].

    The image is thresholded (inverse binary, cut-off 130, maximum 255)
    before the contours are searched; no dilation is applied.

    image     -- image handed to cv2.threshold
    retr_mode -- retrieval mode forwarded to cv2.findContours

    Returns a list of (x, y, w, h) tuples, in the order in which
    cv2.findContours reported the contours.
    """
    _, binary = cv2.threshold(image, 130, 255, cv2.THRESH_BINARY_INV)
    found, _ = cv2.findContours(binary, retr_mode, cv2.CHAIN_APPROX_SIMPLE)

    return [
        (x, y, w, h)
        for x, y, w, h in map(cv2.boundingRect, found)
        if minWidth <= w <= maxWidth
    ]

In [60]:
def getTimeWeek(words,coordinatesWeeks):
    """
    Build, for every week box, the x position of each quarter of an hour.

    words            -- list of (text, (c0, c1, c2, c3)) tuples. Only the
                        entries whose text matches REGEX_HOUR are used.
    coordinatesWeeks -- list of (x, y, w, h) week boxes.

    An hour label is attached to every week for which
    ``y < floor(c1)`` and ``y + h > floor(c3)``. Its x position is
    ``floor(c0) - 5`` and its hour is the number matched by REGEX_HOUR.

    Returns one list per week, in the order of coordinatesWeeks. Each list
    holds (x, hour, minute) tuples: four entries (minute 0, 15, 30, 45) per
    interval between two consecutive marks, plus a closing entry at the right
    border of the week. The marks are the hour labels, preceded by
    "first hour - 1" at the left border of the week and followed by
    "last hour + 1" at the right border. A week without any hour label gets
    an empty list instead of raising IndexError.
    """
    # Collect (text, bbox, hour) for the hour labels. The hour is read from
    # the regex match itself, so surrounding characters cannot break int().
    hours = []
    for word in words:
        match = REGEX_HOUR.search(word[0])
        if match:
            hours.append((word[0], word[1], int(match.group()[:-1])))
    # Left to right, ties broken top to bottom by the second bbox component.
    hours.sort(key = lambda label: (label[1][0], label[1][1]))

    # For each week, the (x, hour) marks of the labels that belong to it.
    hourWeek = [[] for _ in coordinatesWeeks]
    for _, bbox, time in hours:
        x1, y1, y2 = floor(bbox[0]), floor(bbox[1]), floor(bbox[3])
        for idWeek, week in enumerate(coordinatesWeeks):
            weekY, weekH = week[1], week[3]
            if weekY < y1 and weekY + weekH > y2:
                hourWeek[idWeek].append((x1-5, time))

    timeWeek = [[] for _ in hourWeek]
    for idWeek, hourList in enumerate(hourWeek):
        if not hourList:
            # No label to interpolate from: leave this week's grid empty.
            continue

        # Add one extra hour on each side, pinned to the week borders.
        weekX, weekW = coordinatesWeeks[idWeek][0], coordinatesWeeks[idWeek][2]
        hourList.insert(0, (weekX, hourList[0][1]-1))
        hourList.append((weekX+weekW, hourList[-1][1]+1))

        # Cut every interval between two marks into quarters of an hour.
        for (xHour, tHour), (xNext, _) in zip(hourList, hourList[1:]):
            wHour = xNext - xHour
            for quarter in range(4):
                timeWeek[idWeek].append((xHour + round(quarter*wHour/4), tHour, 15*quarter))

        lastHour = hourList[-1]
        timeWeek[idWeek].append((lastHour[0], lastHour[1], 0))
    return timeWeek

In [61]:
def getHour(x, timeWeek):
    """
    Return the time 'HH:MM:00' of the grid mark closest to ``x``.

    x        -- horizontal position to convert
    timeWeek -- list of (x, hour, minute) marks, as built by getTimeWeek

    The list is walked from its first entry for as long as the next mark is
    strictly closer to ``x`` than the current one. The walk stops at the last
    entry, so a position at or beyond the final mark maps to that mark
    instead of raising IndexError. TIMEZONE_DIFF is added to the hour of the
    selected mark.

    Raises IndexError when ``timeWeek`` is empty.
    """
    i = 0
    while i + 1 < len(timeWeek) and abs(timeWeek[i][0] - x) > abs(timeWeek[i+1][0] - x):
        i += 1

    hour, minute = timeWeek[i][1], timeWeek[i][2]
    return '{:02}:{:02}:{:02}'.format(int(hour + TIMEZONE_DIFF), int(minute), 0)


In [62]:
def removeText(image, cordinates, words):
    """
    Paint a filled white rectangle over every word and return the image.

    image      -- image drawn on in place with cv2.rectangle
    cordinates -- sequence whose first two items are subtracted from the
                  x and y components of each word box
    words      -- list of (text, (c0, c1, c2, c3)) tuples; each component is
                  floored before the offset is subtracted

    Returns the same ``image`` object.
    """
    white = (255,255,255)
    for word in words:
        box = word[1]
        first_corner = (floor(box[0]) - cordinates[0], floor(box[1]) - cordinates[1])
        second_corner = (floor(box[2]) - cordinates[0], floor(box[3]) - cordinates[1])
        # Thickness -1 asks OpenCV for a filled rectangle.
        cv2.rectangle(image, first_corner, second_corner, white, -1)
    return image

In [63]:
def getFileText(path, rows, dpi=200):
    """
    Extract the text lines of the first page of a PDF with their positions.

    path -- path of the PDF file
    rows -- value from which the scaled y components are subtracted
            (callers in this file pass the height of the rendered page)
    dpi  -- resolution of the image the positions must match. PDF
            coordinates are multiplied by ``dpi / 72``. The default, 200, is
            the resolution this notebook renders the page at.

    Returns a list of (text, (x0, rows - y0, x1, rows - y1)) tuples, one per
    line of every horizontal text box, where (x0, y0, x1, y1) is the scaled
    bounding box of the line and ``text`` is stripped of surrounding
    whitespace. The list is empty when the document has no page.
    """
    scale = dpi/72
    words = []

    with open(path, "rb") as file:
        rsrcmgr = PDFResourceManager()
        device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        # Only the first page is used, so no further page is pulled from the
        # page generator.
        firstPage = next(iter(PDFPage.get_pages(file)), None)

        if firstPage is not None:
            interpreter.process_page(firstPage)

            for element in device.get_result():
                if not isinstance(element, LTTextBoxHorizontal):
                    continue
                # Iterating a text box yields its lines (public equivalent of
                # reading the private ``_objs`` list).
                for textLine in element:
                    x0, y0, x1, y1 = (e*scale for e in textLine.bbox)
                    words.append((textLine.get_text().strip(), (x0, rows-y0, x1, rows-y1)))
    return words

In [64]:
def grayscaleFilter(image):
    """
    Return the result of converting ``image`` with the BGR-to-gray colour
    conversion of OpenCV.
    """
    return cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

In [65]:
def isinBound(box, element):
    """
    Compare two 4-component sequences component by component.

    Returns True when the first two components of ``box`` are lower than or
    equal to those of ``element`` and the last two components of ``box`` are
    greater than or equal to those of ``element``; False otherwise.
    """
    # Guard clauses evaluated in the same order as a chained "and".
    if not box[0] <= element[0]:
        return False
    if not box[1] <= element[1]:
        return False
    if not box[2] >= element[2]:
        return False
    return box[3] >= element[3]

In [66]:
def getLectureContent(cordinates, words):
    """
    Return the texts of the words accepted by isinBound for the given area.

    cordinates -- area handed to isinBound as its ``box`` argument
    words      -- list of (text, bbox) tuples

    Returns the list of texts, in the order of ``words``.
    """
    texts = []
    for word in words:
        if isinBound(cordinates, word[1]):
            texts.append(word[0])
    return texts

In [67]:
def getDayHeight(week):
    """
    Measure the day rows of a week image.

    week -- 3-dimensional image array of one week

    The day boxes are the contours returned by findContoursDay (width between
    80 and 150) whose height is strictly between 45 and 80 and whose x is
    below 100, sorted from top to bottom.

    Returns (marginX, marginY, avgDayH, nbDays):
      marginX -- right edge of the first day box plus 5
      marginY -- y of the first day box
      avgDayH -- smallest height among the day boxes
      nbDays  -- 5 when the image has more than 200 rows, otherwise 1
    or (None, None, None, None) when no day box is found.
    """
    rows, _, _ = week.shape

    dayBoxes = sorted(
        (
            box
            for box in findContoursDay(grayscaleFilter(week), cv2.RETR_TREE, 80, 150)
            if 45 < box[3] < 80 and box[0] < 100
        ),
        key = lambda box: box[1],
    )

    if not dayBoxes:
        return None, None, None, None

    if len(dayBoxes) == 4:
        # Original author's note: happens when the first line between the
        # table header and the table body is absent. The first box is
        # duplicated in that case.
        dayBoxes = [dayBoxes[0]] + dayBoxes

    firstBox = dayBoxes[0]
    marginX = firstBox[0] + firstBox[2] + 5
    marginY = firstBox[1]
    avgDayH = min(box[3] for box in dayBoxes)
    nbDays = 5 if rows > 200 else 1

    return marginX, marginY, avgDayH, nbDays

In [68]:
def getAttendee(dayHeight, lectureY, lectureH):
    """
    Return the audience code of a lecture box.

    dayHeight -- height of one day row
    lectureY  -- vertical offset of the box (the caller passes y - marginY)
    lectureH  -- height of the box

    Returns 0 (whole class), 1 (group 1) or 2 (group 2).

    A ``lectureY`` of 0 used to raise ZeroDivisionError. It is now classified
    like the small positive offsets next to it, for which the ratio
    ``dayHeight / lectureY`` is above the threshold, i.e. as group 2.

    NOTE(review): for positive values the test below reduces to
    ``lectureY > 3``, which does not depend on the position of the box
    inside its day row -- confirm that this is the intended rule.
    """
    if lectureY == 0:
        return 2 # group 2

    if dayHeight/lectureY < dayHeight*(1/3):
        if lectureH < dayHeight*(2/3):
            return 1 # group 1
        return 0 # whole class

    return 2 # group 2

In [69]:
def addEvent(calendar, lectureName, teacher, attendees, location, begin, end):
    """
    Create an event and add it to the calendar selected by ``attendees``.

    calendar    -- sequence of calendars; ``calendar[attendees]`` receives the event
    lectureName -- stored as the event name
    teacher     -- stored as the event description
    attendees   -- index of the target calendar
    location    -- stored as the event location
    begin, end  -- forwarded unchanged to Event as its begin and end
    """
    details = dict(name=lectureName, description=teacher, location=location, begin=begin, end=end)
    event = Event(**details)

    target = calendar[attendees]
    target.events.add(event)

In [70]:
def _splitLocation(texts):
    """Split the texts of a lecture box into (location string, other texts)."""
    locations = []
    others = []
    for text in texts:
        match = REGEX_LOCATION.search(text)
        if match:
            locations.append(match.group())
            # Drop the text only when it is nothing but the location, so a
            # line that merely contains a location is not lost.
            if match.group() == text:
                continue
        others.append(text)
    # dict.fromkeys removes repeated locations while keeping their order.
    return ", ".join(dict.fromkeys(locations)), others


def processWeek(week, weekCordinates, words, calendar, weekDate, timeWeek):
    """
    Detect the lecture boxes of one week image and add them to the calendars.

    week           -- image of the week. It is modified in place: the text is
                      blanked by removeText and every detected box is outlined.
    weekCordinates -- (x, y, w, h) of the week inside the full page
    words          -- list of (text, bbox) tuples in page coordinates
    calendar       -- sequence of calendars, indexed by the value returned by
                      getAttendee
    weekDate       -- date of the first day of the week, 'YYYY-MM-DD'
    timeWeek       -- time grid handed to getHour

    For every contour at least 145 and at most 1000 wide, the texts selected
    by getLectureContent are read. A box is skipped when it has no text, when
    one of its texts is exactly "ENTREPRISE", when getDayHeight found no day
    row in the week, or when nothing but a location is left. Otherwise the
    first remaining text is the event name and the second one (if any) its
    description. Texts matching REGEX_LOCATION provide the location; several
    locations are joined with ", ".

    The annotated image is finally written to ../tests/output/.
    """
    marginX, marginY, avgDayH, nbDays = getDayHeight(week)

    week = removeText(week, weekCordinates, words)

    cordinatesLectures = findContours(grayscaleFilter(week), cv2.RETR_TREE, 145, 1000)
    cordinatesLectures.sort(key = lambda box: (box[0], box[1]))

    for x, y, w, h in cordinatesLectures:
        cv2.rectangle(week, (x,y), (x+w,y+h), (0,0,255), 2)

        lectureAbsoluteCordinates = (weekCordinates[0]+x, weekCordinates[1]+y, weekCordinates[0]+x+w, weekCordinates[1]+y+h)
        content = getLectureContent(lectureAbsoluteCordinates, words)

        if not content or "ENTREPRISE" in content:
            continue

        if avgDayH is None:
            # getDayHeight returned its "no day row found" result: the box
            # cannot be mapped to a day or to an audience.
            continue

        location, content = _splitLocation(content)
        if not content:
            # Only a location was found: there is no name for the event.
            continue

        attendee = getAttendee(avgDayH, y-marginY, h)

        if nbDays == 1:
            dayDate = weekDate
        else:
            # Row index of the box, clamped to the first day.
            dayofWeek = max(0, round((y - marginY) // avgDayH))
            dayDate = datetime.strftime(datetime.strptime(weekDate,'%Y-%m-%d') + timedelta(days=dayofWeek), '%Y-%m-%d')

        begin = f"{dayDate} {getHour(lectureAbsoluteCordinates[0], timeWeek)}"
        end = f"{dayDate} {getHour(lectureAbsoluteCordinates[2], timeWeek)}"
        addEvent(calendar, content[0], content[1] if len(content) > 1 else None, attendee, location, begin, end)

    cv2.imwrite('../tests/output/detect-week-scanned-' + str(hash(datetime.now().strftime("%H:%M:%S.%f"))) + '.jpg', week)

In [71]:
# URLs of the timetable PDFs that can be processed.
edt_url = [
    'https://stri.fr/Gestion_STRI/TAV/L3/EDT_STRI1A_L3IRT.pdf',
    'https://stri.fr/Gestion_STRI/TAV/M1/EDT_STRI2A_M1RT_TAV.pdf',
    'https://stri.fr/Gestion_STRI/TAV/M2/EDT_STRI3A_M2STRI_22_23.pdf',
]
# Local PDF path and path of the JPEG rendering of its first page, both
# under <parent of the current working directory>/tests/output.
path = "".join([os.path.dirname(os.getcwd()), '/tests/output/edt.pdf'])
file = "".join([os.path.dirname(os.getcwd()), '/tests/output/edt-page1.jpg'])
# Position in edt_url of the timetable to process.
INDEX = 2

In [72]:
# Recreate tests/output so this run starts from an empty output directory.
cleanup()

In [73]:
# Download the timetable selected by INDEX and store it at `path`.
download_file(edt_url[INDEX],path)

In [74]:
# Render the PDF at 200 DPI and save its first page as a JPEG. Only page 1 is
# rasterised because nothing else is used. `pages` now holds the rendered
# page list instead of the None returned by save().
pages = convert_from_path(path, 200, first_page=1, last_page=1)
pages[0].save(file, "JPEG")

In [75]:
# Grayscale copy of the rendered page (imread flag 0), used for week detection.
image1 = cv2.imread(file, 0)

# `rows` / `cols` keep the size of the page BEFORE cropping; `rows` is later
# handed to getFileText. The crop drops the last 56 rows and 20 columns.
rows, cols = image1.shape
image1 = image1[0:rows-56, 0:cols-20]

# Colour copy of the same page, cropped identically; the week images are cut from it.
image = cv2.imread(file)
image = image[0:rows-56, 0:cols-20]

# Week boxes: contours 1900 to 2200 wide, split with an expected week height
# of 300, keeping only boxes taller than 50 and ordering them top to bottom.
cordinatesWeeks = findContoursWeek(image1, cv2.RETR_EXTERNAL, 1900, 2200,300)
cordinatesWeeks = [cordinates for cordinates in cordinatesWeeks if cordinates[3] > 50]
cordinatesWeeks.sort(key = lambda x: x[1])


weeks = []
words = getFileText(path, rows)
# The time grids are built while the hour labels are still part of `words`;
# the labels are removed from `words` right after.
timeWeeks = getTimeWeek(words,cordinatesWeeks)
words = [w for w in words if not REGEX_HOUR.search(w[0])]

calendar = [Calendar(),Calendar(),Calendar()] # index 0: whole class, 1: group 1, 2: group 2

weeksD = []

# Collect the week labels (two digits, "/", three characters) found in the text.
for line in words:
    if REGEX_WEEK.search(line[0]):
        short_week = REGEX_WEEK.search(line[0]).group()[:6]    
        weeksD.append(short_week)

# Turn each "dd/mon" label into "YYYY-MM-dd".
# NOTE(review): the year is always the current year at run time -- confirm
# this is acceptable for timetables that span two calendar years.
weeksD = [str(datetime.now().year)+"-"+MONTHS[re.split("/|-", d)[1]]+"-"+re.split("/|-", d)[0] for d in weeksD]

# zip() stops at the shorter list, so week boxes without a date label (or
# labels without a box) are silently ignored.
for cordinates, weekDate in zip(cordinatesWeeks, weeksD):
    x, y, w, h = cordinates
    weeks.append(image[y : y + h, x : x + w])
    cv2.imwrite(os.path.dirname(os.getcwd()) + '/tests/output/detect-week-'+str(len(weeks)-1)+'.jpg', weeks[-1])
    
    # NOTE(review): every week is processed with the grid of the LAST week
    # (timeWeeks[-1]) rather than its own -- confirm this is intended.
    processWeek(weeks[-1], cordinates, words, calendar, weekDate, timeWeeks[-1])

avgDayH: 54, y: 91, h: 51
avgDayH: 54, y: 204, h: 52
avgDayH: 54, y: 261, h: 51
avgDayH: 54, y: 148, h: 51
avgDayH: 54, y: 34, h: 52
avgDayH: 54, y: 91, h: 51
avgDayH: 54, y: 204, h: 52
avgDayH: 54, y: 261, h: 51
avgDayH: 54, y: 6, h: 23
avgDayH: 54, y: 34, h: 52
avgDayH: 54, y: 91, h: 51
avgDayH: 54, y: 148, h: 51
avgDayH: 54, y: 204, h: 23
avgDayH: 54, y: 261, h: 51
avgDayH: 54, y: 34, h: 52
avgDayH: 54, y: 91, h: 51
avgDayH: 54, y: 148, h: 51
avgDayH: 54, y: 233, h: 22
avgDayH: 54, y: 39, h: 52
avgDayH: 54, y: 96, h: 23
avgDayH: 54, y: 209, h: 23
avgDayH: 54, y: 266, h: 51
avgDayH: 54, y: 153, h: 51
avgDayH: 54, y: 39, h: 52
avgDayH: 54, y: 124, h: 23
avgDayH: 54, y: 209, h: 52
avgDayH: 54, y: 266, h: 51
avgDayH: 54, y: 39, h: 52
avgDayH: 54, y: 96, h: 51
avgDayH: 54, y: 153, h: 51
avgDayH: 54, y: 209, h: 52
avgDayH: 54, y: 266, h: 51
avgDayH: 54, y: 68, h: 22
avgDayH: 54, y: 96, h: 51
avgDayH: 54, y: 153, h: 51
avgDayH: 54, y: 209, h: 52
avgDayH: 54, y: 266, h: 51
avgDayH: 56, y: 9

In [76]:
# Write calendars 0, 1 and 2 to
# <parent of cwd>/tests/output/<name selected by INDEX><calendar index>.ics
for i in range(3):
    filename = "".join([
        os.path.dirname(os.getcwd()),
        '/tests/output/',
        ("EDT_STRI1A_L3IRT","EDT_STRI2A_M1RT","EDT_STRI3A_M2STRI")[INDEX],
        str(i),
        ".ics",
    ])
    with open(filename, 'w') as f:
        f.write(str(calendar[i]))