In [49]:
import requests, urllib.request, cv2, os, pdfplumber, re, numpy as np, datetime
from pdf2image import convert_from_path
from datetime import *
from math import *
from shutil import *
from collections import Counter
from pdfminer.layout import LAParams
from pdfminer.converter import PDFResourceManager, PDFPageAggregator
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from pdfminer.layout import LTTextBoxHorizontal
from ics import Calendar, Event
from classes import *

In [50]:
def cleanup():
    """
    Reset the tests/output directory to an empty state.

    The directory is ``<parent of the current working directory>/tests/output``.
    Existing content is deleted and the directory is recreated. The call also
    works when the directory (or its ``tests`` parent) does not exist yet,
    e.g. on the first run after a fresh checkout.
    """
    output_dir = os.path.dirname(os.getcwd()) + "/tests/output"

    # Only remove what is actually there: rmtree raises on a missing path.
    if os.path.isdir(output_dir):
        rmtree(output_dir)

    # makedirs also creates the intermediate "tests" directory when needed.
    os.makedirs(output_dir, exist_ok=True)

In [51]:
def download_file(url, path):
    """
    Fetch the resource located at `url` and store it in the local file `path`.

    Thin wrapper around urllib.request.urlretrieve; nothing is returned.
    """
    urllib.request.urlretrieve(url, filename=path)

In [52]:
def showimage(image, title = ""):
    """
    Display an image in an OpenCV window.

    Parameters:
        image: image handed to cv2.imshow.
        title: window name handed to cv2.imshow (empty string by default).

    After showing the window the function calls cv2.waitKey(0) (in OpenCV a
    delay of 0 waits for a key press) and then closes every OpenCV window.
    """
    cv2.imshow(title,image)
    cv2.waitKey(0)
    cv2.destroyAllWindows()

In [53]:
def findContours(image, retr_mode, minWidth, maxWidth):
    """
    Return the bounding boxes of contours whose width is within a range.

    The image is thresholded (inverted binary, threshold 130, max 255) and
    dilated once with a 4x4 kernel of ones before contours are extracted
    with `retr_mode` and cv2.CHAIN_APPROX_SIMPLE.

    Returns a list of (x, y, w, h) tuples, in the order OpenCV reports the
    contours, keeping only boxes with minWidth <= w <= maxWidth.
    """
    _, binary = cv2.threshold(image, 130, 255, cv2.THRESH_BINARY_INV)
    thickened = cv2.dilate(binary, np.ones((4, 4), np.uint8), iterations=1)
    outlines, _ = cv2.findContours(thickened, retr_mode, cv2.CHAIN_APPROX_SIMPLE)

    boxes = (cv2.boundingRect(outline) for outline in outlines)

    return [(x, y, w, h) for x, y, w, h in boxes if minWidth <= w <= maxWidth]

In [54]:
def findContoursWeek(image, retr_mode, minWidth, maxWidth):
    """
    Return the bounding boxes of contours whose width is within a range.

    Same filtering as findContours, but the contours are extracted straight
    from the thresholded image (inverted binary, threshold 130, max 255)
    without any dilation step.

    Returns a list of (x, y, w, h) tuples, in the order OpenCV reports the
    contours, keeping only boxes with minWidth <= w <= maxWidth.
    """
    _, binary = cv2.threshold(image, 130, 255, cv2.THRESH_BINARY_INV)
    outlines, _ = cv2.findContours(binary, retr_mode, cv2.CHAIN_APPROX_SIMPLE)

    boxes = (cv2.boundingRect(outline) for outline in outlines)

    return [(x, y, w, h) for x, y, w, h in boxes if minWidth <= w <= maxWidth]

In [55]:
def round_timedelta(td, period):
    """
    Round `td` to the nearest multiple of `period`.

    Both arguments are timedelta objects. A value lying exactly halfway
    between two multiples is rounded up. The result is a new timedelta.
    """
    total = td.total_seconds()
    step = period.total_seconds()
    excess = total % step

    # Move up to the next multiple from the half-way point on, down otherwise.
    adjustment = (step - excess) if excess >= step / 2 else -excess

    return timedelta(seconds=total + adjustment)

In [56]:
def getHour(x, index, margin):
    """
    Convert a horizontal pixel position into a time-of-day string.

    Parameters:
        x: horizontal position; `margin + 5` is subtracted before conversion.
        index: selects one of the three hard-coded lists of column widths.
        margin: left margin removed from `x` (together with 5 extra pixels).

    The remaining distance is consumed column by column: the first width
    counts for 0.25 hour, every following column that is fully exceeded
    counts for 1 hour, and the column where the distance ends contributes
    the covered fraction of its width. The time is 07:45 plus
    (hours - 2), rounded with round_timedelta to 15 minutes and returned as
    text left-padded with zeros to 8 characters.
    """
    columnWidths = (
        (33, 169, 177, 150, 205, 139, 165, 182, 180, 168, 180, 167, 160),
        (36, 169, 173, 142, 220, 84, 132, 172, 209, 162, 166, 152, 160),
        (42, 177, 192, 170, 223, 110, 157, 200, 197, 218, 200),
    )[index]

    remaining = x - (margin + 5)
    elapsed = 0

    for position, width in enumerate(columnWidths):
        if remaining <= 0:
            break

        if position == 0:
            # the first column always counts for a quarter of an hour
            elapsed += 0.25
            remaining -= width
        elif remaining > width:
            elapsed += 1
            remaining -= width
        else:
            # the position ends inside this column: add the covered fraction
            elapsed += remaining / width
            remaining = 0

    dayStart = timedelta(hours=7, minutes=45, seconds=0)
    rounded = round_timedelta(dayStart + timedelta(hours=elapsed - 2), timedelta(minutes=15))

    return "{:0>8}".format(str(rounded))


In [57]:
def removeText(image, cordinates, words):
    """
    Paint a filled white rectangle over every word box and return the image.

    Parameters:
        image: image drawn on in place with cv2.rectangle.
        cordinates: sequence whose first two items are the x and y offsets
            subtracted from the word boxes.
        words: iterable of entries whose item 1 is a 4-value box; each value
            is floored before the offset is subtracted.

    Returns the same `image` object that was passed in.
    """
    for word in words:
        box = word[1]
        firstCorner = (floor(box[0]) - cordinates[0], floor(box[1]) - cordinates[1])
        oppositeCorner = (floor(box[2]) - cordinates[0], floor(box[3]) - cordinates[1])

        # thickness -1 asks OpenCV for a filled rectangle
        cv2.rectangle(image, firstCorner, oppositeCorner, (255, 255, 255), -1)

    return image

In [58]:
def getFileText(path, rows, dpi=200):
    """
    Extract the text lines of the first page of a PDF with their boxes.

    Parameters:
        path: path of the PDF file (opened in binary mode).
        rows: value the scaled y coordinates are subtracted from; the
            notebook passes the pixel height of the rendered page image.
        dpi: resolution the coordinates are scaled to. Every bbox value is
            multiplied by dpi / 72 (presumably 72 PDF units per inch).
            Defaults to 200, the value that used to be hard-coded; it has
            to match the resolution used to rasterise the page.

    Returns:
        A list of (text, (x0, rows - y0, x1, rows - y1)) tuples, one per
        text line found inside a horizontal text box, where x0, y0, x1, y1
        are the scaled bbox values and text is stripped of surrounding
        whitespace. The list is empty when the document has no page.
    """
    scale = dpi / 72
    words = []

    with open(path, "rb") as file:
        rsrcmgr = PDFResourceManager()
        device = PDFPageAggregator(rsrcmgr, laparams=LAParams())
        interpreter = PDFPageInterpreter(rsrcmgr, device)

        # Only the first page is used, so stop reading the document there
        # instead of walking through every remaining page.
        firstPage = next(iter(PDFPage.get_pages(file)), None)
        if firstPage is None:
            return words

        interpreter.process_page(firstPage)

        for element in device.get_result():
            if not isinstance(element, LTTextBoxHorizontal):
                continue

            # A text box is iterable over its lines; no private attribute needed.
            for textLine in element:
                x0, y0, x1, y1 = (value * scale for value in textLine.bbox)
                words.append((textLine.get_text().strip(), (x0, rows - y0, x1, rows - y1)))

    return words

In [59]:
def grayscaleFilter(image):
    """
    Return `image` converted with cv2.cvtColor using cv2.COLOR_BGR2GRAY.
    """
    return cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

In [60]:
def isinBound(box, element):
    """
    Compare two 4-value boxes item by item.

    Returns True when the first two items of `box` are less than or equal to
    the matching items of `element` and the last two items of `box` are
    greater than or equal to the matching items of `element`.
    """
    # items 0 and 1 of the box must not exceed those of the element
    if not (box[0] <= element[0] and box[1] <= element[1]):
        return False

    # items 2 and 3 of the box must not be below those of the element
    return box[2] >= element[2] and box[3] >= element[3]

In [61]:
def getLectureContent(cordinates, words):
    """
    Collect the texts whose box passes isinBound against `cordinates`.

    `words` holds (text, box) entries; the texts of the entries accepted by
    isinBound(cordinates, box) are returned as a list, in input order.
    """
    selected = []

    for word in words:
        if isinBound(cordinates, word[1]):
            selected.append(word[0])

    return selected

In [62]:
def getDayHeight(week):
    """
    Measure the day cells of a week image.

    Contours 80 to 150 pixels wide are searched in the grayscale image and
    kept when their height is strictly between 45 and 80 and their x
    position is below 100; they are then ordered from top to bottom.

    Returns (marginX, marginY, avgDayH, nbDays):
        marginX: right edge of the first kept cell plus 5.
        marginY: image height minus the sum of the kept cell heights.
        avgDayH: the smallest kept cell height (despite the name).
        nbDays: 5 when the image is taller than 200 rows, otherwise 1.
    (None, None, None, None) is returned when no cell is kept.
    """
    rows, _, _ = week.shape

    candidates = findContoursWeek(grayscaleFilter(week), cv2.RETR_TREE, 80, 150)
    dayCells = sorted(
        (box for box in candidates if 45 < box[3] < 80 and box[0] < 100),
        key=lambda box: box[1],
    )

    if not dayCells:
        return None, None, None, None

    if len(dayCells) == 4:
        # when first line between table header and table body is absent
        dayCells = [dayCells[0]] + dayCells

    nbDays = 1 if rows <= 200 else 5

    heights = [box[3] for box in dayCells]
    firstCell = dayCells[0]

    marginY = rows - sum(heights)
    marginX = firstCell[0] + firstCell[2] + 5

    return marginX, marginY, min(heights), nbDays

In [63]:
def getWarpFactor(week):
    """
    Computes warp factor of each week (deprecated)

    Contours 15 to 300 pixels wide and more than 20 pixels high are searched
    in the grayscale image and ordered by (x, y). The returned list holds the
    x distance between consecutive boxes, starting with the pair made of the
    second and third box.
    """
    # findContours needs a retrieval mode as its second argument; the call
    # used to omit it and raised TypeError. cv2.RETR_TREE is what the other
    # findContours call in this notebook passes.
    cordinatesH = findContours(grayscaleFilter(week), cv2.RETR_TREE, 15, 300)
    cordinatesH = [cordinates for cordinates in cordinatesH if cordinates[3] > 20]
    cordinatesH.sort(key = lambda box: (box[0], box[1]))

    # NOTE(review): the range starts at 1, so the gap between the first two
    # boxes is never reported - kept as is, confirm this is intended.
    return [cordinatesH[i+1][0] - cordinatesH[i][0] for i in range(1, len(cordinatesH)-1)]

In [64]:
def getAttendee(dayHeight, lectureY, lectureH):
    """
    Returns the attendees of a given lecture.

    Parameters:
        dayHeight: height of a day row.
        lectureY: vertical position of the lecture box (the caller passes
            the box y minus the top margin).
        lectureH: height of the lecture box.

    Returns "GROUPE 2" when dayHeight / lectureY is not below a third of
    dayHeight. Otherwise returns "GROUPE 1" for boxes lower than two thirds
    of dayHeight and "PROMO ENTIÈRE" for taller ones.

    A lectureY of 0 used to raise ZeroDivisionError. It is now handled like
    the other positions that pass the first test, on the assumption that a
    box starting exactly at the top margin is not a lower-half lecture.

    NOTE(review): for a positive dayHeight the first test reduces to
    "lectureY > 3 or lectureY < 0", which does not look like a position
    inside a day row - confirm the intended rule.
    """
    if lectureY == 0 or dayHeight/(lectureY) < dayHeight*(1/3):

        if lectureH < dayHeight*(2/3):
            return "GROUPE 1"
        else:
            return "PROMO ENTIÈRE"

    else:
        return "GROUPE 2"

In [65]:
def addEvent(calendar, lectureName, teacher, attendees, location, begin, end):
    """
    Build an Event and add it to `calendar.events`.

    The event gets `lectureName` as name, `attendees` as description, and
    the given `location`, `begin` and `end`. `teacher` is accepted but is
    not stored in the event.
    """
    calendar.events.add(
        Event(
            name=lectureName,
            description=attendees,
            location=location,
            begin=begin,
            end=end,
        )
    )

In [66]:
def processWeek(week, weekCordinates, words, calendar, weekDate, INDEX):
    """
    Detect the lectures of one weekly planner image and add them to `calendar`.

    Parameters:
        week: image of the week; text is erased and detection rectangles
            are drawn on it in place.
        weekCordinates: (x, y, w, h) of the week inside the page image; x
            and y are used to translate between week and page coordinates.
        words: (text, box) entries of the page, as built by getFileText.
        calendar: calendar receiving the events through addEvent.
        weekDate: 'YYYY-MM-DD' date of the first day of the week.
        INDEX: forwarded to getHour to select its table of column widths.

    Texts matching the location pattern are moved to the event location
    (several matches are joined with ", ") and left out of the event name.
    Boxes without text, or containing an "ENTREPRISE" entry, are ignored.
    An annotated copy of the week is written to ../tests/output/.

    When getDayHeight finds no day row the week is skipped, because neither
    the day nor the attendees can be computed; a message is printed and no
    annotated image is written.
    """
    marginX, marginY, avgDayH, nbDays = getDayHeight(week)

    if marginX is None:
        # getDayHeight returns four None values when it detects no day row;
        # the arithmetic below would fail with a TypeError.
        print(f"processWeek: no day row detected for week {weekDate}, week skipped")
        return

    week = removeText(week, weekCordinates, words)
    cordinatesLectures = findContours(grayscaleFilter(week), cv2.RETR_TREE, 145, 1000)
    cordinatesLectures.sort(key = lambda box: (box[0], box[1]))

    regex_location = re.compile(r'[a-z-A-Z]{1}[0-9]{1}-.*|Amphi .*|.*Zoom|.*ZOOM')

    for x, y, w, h in cordinatesLectures:
        print(f"avgDayH: {avgDayH}, y: {y}, h: {h}")

        attendee = getAttendee(avgDayH, y-marginY, h)

        cv2.rectangle(week, (x,y), (x+w,y+h), (0,0,255), 2)

        # lecture box expressed in page coordinates, like the word boxes
        lectureAbsoluteCordinates = (weekCordinates[0]+x, weekCordinates[1]+y, weekCordinates[0]+x+w, weekCordinates[1]+y+h)

        content = getLectureContent(lectureAbsoluteCordinates, words)

        if not content or "ENTREPRISE" in content:
            continue

        # Search each text once and keep what the pattern matched.
        locations = []
        for text in content:
            match = regex_location.search(text)
            if match:
                locations.append(match.group())

        location = ", ".join(locations)

        # Drop the entries that are exactly a location. Removing the joined
        # string from the list raised ValueError as soon as several entries
        # matched, or when the match covered only a part of an entry.
        title = ' '.join(text for text in content if text not in locations)

        if nbDays == 1:
            dayDate = weekDate
        else:
            # row index of the lecture, never before the first day
            dayofWeek = max(0, round((y - marginY) // avgDayH))
            dayDate = datetime.strftime(datetime.strptime(weekDate,'%Y-%m-%d') + timedelta(days=dayofWeek), '%Y-%m-%d')

        addEvent(calendar, title, None, attendee, location, f"{dayDate} {getHour(x, INDEX, marginX)}", f"{dayDate} {getHour(x + w, INDEX, marginX)}")

    cv2.imwrite('../tests/output/detect-week-scanned-' + str(hash(datetime.now().strftime("%H:%M:%S.%f"))) + '.jpg', week)

In [67]:
# Index of the timetable to process: selects the entry of edt_url that is
# downloaded and is also passed on to processWeek.
INDEX = 1

# Timetable PDFs that can be processed.
edt_url = [
    'https://stri.fr/Gestion_STRI/TAV/L3/EDT_STRI1A_L3IRT.pdf',
    'https://stri.fr/Gestion_STRI/TAV/M1/EDT_STRI2A_M1RT_TAV.pdf',
    'https://stri.fr/Gestion_STRI/TAV/M2/EDT_STRI3A_M2STRI_22_23.pdf',
]

# Working files, located in tests/output under the parent of the current directory.
path = os.path.dirname(os.getcwd()) + '/tests/output/edt.pdf'        # downloaded PDF
file = os.path.dirname(os.getcwd()) + '/tests/output/edt-page1.jpg'  # first page as JPEG

In [68]:
# Empty the tests/output directory before anything is downloaded or generated.
cleanup()

In [69]:
# Download the timetable selected by INDEX to the local PDF path.
download_file(url=edt_url[INDEX], path=path)

In [70]:
# Only the first page of the timetable is used, so only that page is
# rasterised (at 200 dpi, the resolution getFileText scales its coordinates
# to) instead of converting the whole document.
# `pages` now holds the list of rendered pages; it used to be bound to the
# None returned by save().
pages = convert_from_path(path, 200, first_page=1, last_page=1)
pages[0].save(file, "JPEG")

In [71]:
# Grayscale copy of the page, used to locate the weekly planners.
image1 = cv2.imread(file, 0)

# `rows` keeps the height of the full page: getFileText needs it to flip the
# PDF y coordinates. Both copies are cropped by 56 rows and 20 columns.
rows, cols = image1.shape
image1 = image1[0:rows-56, 0:cols-20]

# Colour copy of the page, from which the week images are cut.
image = cv2.imread(file)
image = image[0:rows-56, 0:cols-20]

# A week is an outer contour 1950 to 2200 pixels wide and more than 50 high;
# weeks are ordered from the top of the page to the bottom.
cordinatesWeeks = findContoursWeek(image1, cv2.RETR_EXTERNAL, 1950, 2200)
cordinatesWeeks = [cordinates for cordinates in cordinatesWeeks if cordinates[3] > 50]
cordinatesWeeks.sort(key = lambda x: x[1])

weeks = []
words = getFileText(path, rows)

calendar = Calendar()

# Date labels look like "dd/mmm". Up to four letters are captured so that
# "juil" can be told apart from "jui(n)"; with exactly three letters both
# were read as "jui" and July ended up in June.
regex_week = re.compile(r'[0-9]{2}/[a-z-À-ÿ]{3,4}')

months = {"jan":"01", "fév":"02", "mar":"03", "avr":"04", "mai":"05", "jui":"06", "juil":"07", "aoû":"08","sep":"09", "oct":"10", "nov":"11", "déc":"12"}

# NOTE(review): every date receives the current calendar year; a timetable
# spanning two calendar years would need another rule - confirm.
weeksD = []

for line in words:
    match = regex_week.search(line[0])

    if match:
        day, monthLabel = re.split("/|-", match.group())[:2]
        # exact label first ("juil"), then its first three letters ("janv" -> "jan")
        month = months.get(monthLabel) or months[monthLabel[:3]]
        weeksD.append(str(datetime.now().year)+"-"+month+"-"+day)

# Weeks and dates are paired by position: n-th detected week, n-th date label.
for cordinates, weekDate in zip(cordinatesWeeks, weeksD):
    x, y, w, h = cordinates

    weeks.append(image[y : y + h, x : x + w])

    cv2.imwrite(os.path.dirname(os.getcwd()) + '/tests/output/detect-week-'+str(len(weeks)-1)+'.jpg', weeks[-1])

    processWeek(weeks[-1], cordinates, words, calendar, weekDate, INDEX)

avgDayH: 48, y: 41, h: 49
avgDayH: 48, y: 95, h: 48
avgDayH: 48, y: 148, h: 48
avgDayH: 48, y: 228, h: 21
avgDayH: 48, y: 255, h: 46
avgDayH: 48, y: 41, h: 49
avgDayH: 48, y: 95, h: 48
avgDayH: 48, y: 148, h: 48
avgDayH: 48, y: 201, h: 22
avgDayH: 48, y: 255, h: 46
avgDayH: 48, y: 95, h: 21
avgDayH: 48, y: 148, h: 21
avgDayH: 48, y: 175, h: 21
avgDayH: 48, y: 201, h: 49
avgDayH: 48, y: 255, h: 46
avgDayH: 48, y: 121, h: 22
avgDayH: 48, y: 175, h: 21
avgDayH: 48, y: 201, h: 22
avgDayH: 48, y: 255, h: 21
avgDayH: 48, y: 279, h: 21
avgDayH: 48, y: 148, h: 21
avgDayH: 48, y: 228, h: 21
avgDayH: 48, y: 41, h: 49
avgDayH: 48, y: 95, h: 48
avgDayH: 48, y: 148, h: 21
avgDayH: 48, y: 175, h: 21
avgDayH: 48, y: 201, h: 22
avgDayH: 48, y: 228, h: 21
avgDayH: 48, y: 255, h: 46
avgDayH: 48, y: 41, h: 49
avgDayH: 48, y: 95, h: 48
avgDayH: 48, y: 148, h: 21
avgDayH: 48, y: 175, h: 21
avgDayH: 48, y: 228, h: 21
avgDayH: 48, y: 255, h: 21
avgDayH: 48, y: 41, h: 49
avgDayH: 48, y: 148, h: 21
avgDayH: 48

In [72]:
# The calendar file is named after the timetable selected by INDEX.
filename = os.path.dirname(os.getcwd()) + '/tests/output/' + ("EDT_STRI1A_L3IRT","EDT_STRI2A_M1RT","EDT_STRI3A_M2STRI")[INDEX]+".ics"

# The events contain non-ASCII text ("PROMO ENTIÈRE"), so the encoding is
# set explicitly instead of depending on the locale default. newline=''
# writes the calendar's own line endings without platform translation.
with open(filename, 'w', encoding='utf-8', newline='') as f:
    f.write(str(calendar))