In [None]:
!pip install ics

In [385]:
from ics import Calendar, Event
# Minimal smoke test of the ics library: one calendar holding one event,
# serialized to disk.
c = Calendar()
e = Event()
e.name = "My cool event"
e.begin = '2014-01-01 00:00:00'
c.events.add(e)
# c.events is now: [<Event 'My cool event' begin:2014-01-01 00:00:00 end:2014-01-01 00:00:01>]

# encoding='utf-8': do not depend on the locale's default encoding (UTF-8 is
# the default charset of the iCalendar format).
# newline='': disable newline translation so whatever line endings the
# serializer emits are written to the file unchanged on every platform.
with open('data/ignored/my.ics', 'w', encoding='utf-8', newline='') as my_file:
    my_file.writelines(c.serialize_iter())

## Create calendar from PDF table

In [384]:
# From PDF
from unstructured.partition.auto import partition

# Extract the text elements of the PDF and dump them, one element per line,
# into a raw text file for the following cleaning steps.
# NOTE(review): `path` is a hard-coded absolute path on one machine - consider
# making it configurable/relative before sharing the notebook.
path = "/Users/az/Downloads/raport.pdf"
elements = partition(filename=path)
with open("data/ignored/raport/raport_unstructured_raw.txt", "w") as fh:
    fh.writelines(str(element) + "\n" for element in elements)

In [299]:
# Load the raw dump, drop blank lines and the standalone "RPS" lines, and put
# a "|" in front of every known place marker so that lines can be split there.
# NOTE(review): this reads from ".../raport/ed/..." while the previous cell
# writes to ".../raport/..." - confirm that the "ed" copy is intentional.
with open("data/ignored/raport/ed/raport_unstructured_raw.txt", "r") as fh:
    content = []
    for raw_line in fh:
        line = raw_line.strip()
        if not line or line == "RPS":
            continue
        # The markers are applied one after another, in this exact order.
        for marker in ("ONLINE", "WSS", "FB_3", "USK", "Rad_2_nr_4", "CSM_116_P", "Rad_9_nr_2", "FB_5"):
            line = line.replace(marker, "|" + marker)
        content.append(line)

# Break every line at the "|" separators inserted above; keep the non-empty,
# whitespace-trimmed pieces in their original order.
new_content = [
    piece.strip()
    for marked_line in content
    for piece in marked_line.split("|")
    if piece.strip()
]

# Persist the split lines (one per row) so they can be corrected by hand.
with open("data/ignored/raport/raport_unstructured_for_manual_clean.txt", "w") as fh:
    fh.writelines(entry + "\n" for entry in new_content)

In [351]:
# Reload the (manually cleaned) file as a list of trimmed, non-empty lines.
with open("data/ignored/raport/raport_unstructured_for_manual_clean.txt", "r") as fh:
    content = [stripped for stripped in map(str.strip, fh) if stripped]

In [352]:
from dataclasses import dataclass

@dataclass
class Day:
    """Token wrapping the raw text of a day line of the timetable."""

    day: str  # the unmodified line text

    @staticmethod
    def is_day(text):
        """Return True when ``text`` ends with one of the weekday abbreviations."""
        weekday_suffixes = ("Pon.", "Wt.", "Śr.", "Czw.", "Pt.", "Sb.", "Nd.")
        return any(text.endswith(suffix) for suffix in weekday_suffixes)
    
@dataclass
class Type:
    """Token wrapping the class-kind label of a timetable row."""

    type: str  # one of the labels accepted by is_type

    @staticmethod
    def is_type(text):
        """Return True when ``text`` is exactly one of the known kind labels."""
        known_labels = ("Sem", "Ćwi", "Wyk")
        return any(label == text for label in known_labels)

@dataclass
class Hour:
    """Token wrapping the raw hour text of a timetable row.

    The text is later split on "-" into a start and an end time; presumably
    it looks like "HH:MM-HH:MM" (assumption - confirm against the input data).
    """
    hour: str  # unmodified hour text taken from the line preceding a Type token

@dataclass
class Details:
    """One timetable slot: the day it falls on, its class kind and its hour text.

    Instances are built by pairing each Hour token with the token that follows
    it (its Type) and with the most recent Day token seen before it.
    """
    day: Day    # most recent Day token preceding the slot
    type: Type  # token that directly followed the Hour token
    hour: Hour  # hour token of the slot

In [353]:
# Wrap every plain-string day line in a Day token, in place; all other
# entries are left untouched.
content[:] = [
    Day(token) if isinstance(token, str) and Day.is_day(token) else token
    for token in content
]

In [354]:
# Wrap every string accepted by Type.is_type in a Type token and the entry
# right before it in an Hour token (the hour text directly precedes the kind).
for i, c in enumerate(content):
    if not (isinstance(c, str) and Type.is_type(c)):
        continue
    # Guard the look-behind: for i == 0 the index i - 1 would wrap around to
    # the LAST element of the list, and a predecessor that is no longer a
    # plain string (Day/Type/Hour) cannot be the hour text of this row.
    if i == 0 or not isinstance(content[i - 1], str):
        raise ValueError(
            f"Type token {c!r} at index {i} is not preceded by an hour line; "
            "fix the cleaned input file"
        )
    content[i] = Type(c)
    content[i - 1] = Hour(content[i - 1])

In [355]:
# Collapse the token stream
#     [Day1, Hour1, Type1, Hour2, Type2, ...]
# into
#     [Details(Day1, Type1, Hour1), Details(Day1, Type2, Hour2), ...]
# Anything that is not a Day/Hour/Type token is copied over unchanged.
new_content = []
last_day = None
for position, token in enumerate(content):
    if isinstance(token, Type):
        # Already consumed through the look-ahead of the preceding Hour.
        continue
    if isinstance(token, Day):
        # Remember the day; it applies to every following Hour until the next Day.
        last_day = token
        continue
    if isinstance(token, Hour):
        # The element right after an Hour is taken as its type.
        token = Details(day=last_day, type=content[position + 1], hour=token)
    new_content.append(token)
content = new_content

In [None]:
from itertools import groupby

# Aggregate the content list into runs of equal element type. A run of n
# Details is kept as one list; the run of strings after it is cut into four
# lists: the first n items (names), the next n (groups), the next n (places)
# and everything that remains (teachers).
new_content = []
for kind, run in groupby(content, key=type):
    run = list(run)
    if kind == Details:
        new_content.append(run)      # day, hour, type
        continue
    n = len(new_content[-1])         # size of the Details run just before
    new_content.extend([
        run[:n],                     # name
        run[n:2*n],                  # group
        run[2*n:3*n],                # place
        run[3*n:],                   # teacher
    ])
content = new_content

In [None]:
# Each batch contains [[details1, details2, ...], [name1, name2, ...], [group1, group2, ...], [place1, place2, ...], [teacher1, teacher2, ...]]
# Walk through `content` five lists at a time. The previous version zipped the
# start offsets with range(5, len(content), 5), which never yields
# len(content) itself, so the last complete batch was dropped whenever the
# length was a multiple of 5. The stop bound len(content) - 4 keeps every
# complete batch and still ignores an incomplete trailing one.
batches = [content[start:start + 5] for start in range(0, len(content) - 4, 5)]
# batches

In [377]:
from datetime import datetime
from ics import Calendar, Event

# strptime format of "<date> <time>" as assembled in Class.to_event.
strp = "%d-%m-%Y %H:%M"

@dataclass
class Class:
    """A single scheduled class, convertible to a calendar event.

    All fields hold the raw strings taken from the timetable. After
    construction the instance also carries ``start_hour`` and ``end_hour``
    when ``hour`` could be split on "-" into exactly two parts.
    """

    day: str
    type: str
    hour: str
    name: str
    group: str
    place: str
    teacher: str

    def __post_init__(self):
        """Derive ``start_hour``/``end_hour`` from ``hour``.

        Any failure is only printed; in that case neither attribute is set.
        """
        try:
            start, end = self.hour.split("-")
        except Exception as exc:
            print("ERROR ", self.hour, self.day, exc)
        else:
            self.start_hour = start
            self.end_hour = end

    def to_event(self):
        """Build an ``Event`` named "<name> (<group>)" for this class.

        The date is the part of ``day`` before the first space; begin and end
        combine it with ``start_hour`` and ``end_hour`` and are parsed with
        ``strp``.
        """
        event = Event()
        event.name = f"{self.name} ({self.group})"
        event.description = f"Prowadzący: {self.teacher}\nMiejsce: {self.place}"
        date_text = self.day.split(' ')[0]
        # NOTE(review): the parsed datetimes are naive (no tzinfo) - confirm
        # which timezone the calendar library assumes for them.
        event.begin = datetime.strptime(f"{date_text} {self.start_hour}", strp)
        event.end = datetime.strptime(f"{date_text} {self.end_hour}", strp)
        return event

In [378]:
# Flatten the batches: zip(*batch) walks the five parallel lists of a batch
# in lock-step, yielding one (details, name, group, place, teacher) row per
# scheduled class.
classes = []
for batch in batches:
    classes.extend(
        Class(
            day=details.day.day,
            type=details.type.type,
            hour=details.hour.hour,
            name=name,
            group=group,
            place=place,
            teacher=teacher,
        )
        for details, name, group, place, teacher in zip(*batch)
    )

In [379]:
# Collect one calendar event per scheduled class.
calendar = Calendar()
for scheduled_class in classes:
    event = scheduled_class.to_event()
    calendar.events.add(event)

In [380]:
# Write the finished calendar to disk.
# encoding='utf-8': the event texts contain non-ASCII characters, and UTF-8 is
# the default charset of the iCalendar format - do not depend on the locale.
# newline='': disable newline translation so whatever line endings the
# serializer emits are written to the file unchanged on every platform.
with open('data/ignored/raport/raport.ics', 'w', encoding='utf-8', newline='') as fh:
    fh.writelines(calendar.serialize_iter())