# Libraries

In [63]:
from fpdf import FPDF
import random
import string
from datetime import date, timedelta

# Template

In [199]:
# Page layout shared by every method of the PDF class below.
# The 297 x 210 page size presumably corresponds to FPDF's default
# A4/millimetre page in landscape orientation -- confirm if the unit changes.
margin = 5.0        # gap between the page edge and the outer table borders
partition = 30.0    # height of the current-info table at the top of the page
pdf_w = 297
pdf_h = 210
origin = (0, 0)     # reference point all coordinates are offset from

# The current-info table is split into four equal rows.
row_h = partition / 4

# Current-info table: spans the left half of the page and holds two
# label/value column pairs.
current_info_w = pdf_w / 2 - margin
curr_num_cols = 2 * 2
curr_col_w = current_info_w / curr_num_cols

# Posting-history table: spans the full width between the margins.
posting_table_w = pdf_w - 2 * margin
num_cols = 7
col_w = posting_table_w / num_cols

class PDF(FPDF):
    """Employee report page: a current-info table above a posting-history table.

    All geometry comes from the module-level layout constants (``origin``,
    ``margin``, ``partition``, ``row_h``, ``current_info_w``, ``curr_col_w``,
    ``posting_table_w``, ``num_cols``, ``col_w``, ``pdf_w``, ``pdf_h``).

    Call order per page, as used by the scripts in this file:
    ``init_num_rows`` -> ``add_page`` -> ``table_borders`` -> ``header_text``
    -> ``content_text`` -> ``add_rows`` (once per posting).
    """

    # Number of posting-history rows already written on the current page;
    # ``add_rows`` uses it to compute the next row's vertical offset.
    num_rows = 0

    # (column, row, caption) of each bold caption in the current-info grid.
    # Captions sit in columns 0 and 2; ``content_text`` fills columns 1 and 3.
    _INFO_LABELS = (
        (0, 1, 'Employee Name:'),
        (2, 1, 'Date Of Birth:'),
        (0, 2, 'Organization:'),
        (2, 2, 'Location:'),
        (0, 3, 'Department:'),
        (2, 3, 'Designation:'),
    )

    # Column titles of the posting-history table, left to right. ``add_rows``
    # expects its record values in this same order.
    _HISTORY_HEADERS = (
        'Designation',
        'Department',
        'Organization',
        'Location',
        'From Date',
        'To Date',
        'Pay Grade',
    )

    def init_num_rows(self):
        """Reset the posting-history row counter; call when starting a new page."""
        self.num_rows = 0

    def _text_cell(self, x, y, width, text):
        """Write ``text`` centred in a cell ``width`` wide and one row high at (x, y).

        ``text`` is converted with ``str()`` so dates and numbers can be
        passed directly.
        """
        self.set_xy(x, y)
        self.cell(width, row_h, txt=str(text), align='C')

    def table_borders(self):
        """Draw the outlines and grid lines of both tables on the current page."""
        self.set_line_width(0.5)

        left = origin[0] + margin
        top = origin[1] + margin

        # Current Info Table: outer box, three row separators, then three
        # column separators that start below the first (Employee ID) row.
        self.rect(left, top, pdf_w/2 - margin, partition)

        for i in range(1, 4):
            y = top + i*row_h
            self.line(left, y, pdf_w/2, y)

        for i in range(1, 4):
            x = left + i*curr_col_w
            self.line(x, top + row_h, x, top + partition)

        # Posting History Table: outer box down to the bottom margin, one
        # separator under the title row and one under the column headers.
        history_top = top + partition
        self.rect(left, history_top, pdf_w - 2*margin, pdf_h - 2*margin - partition)

        for i in (1, 2):
            y = history_top + i*row_h
            self.line(left, y, pdf_w - margin, y)

        # Column separators start below the title row, which stays undivided.
        for i in range(1, num_cols):
            x = left + i*col_w
            self.line(x, history_top + row_h, x, pdf_h - margin)

    def header_text(self, emp_id):
        """Write every fixed caption of the page in bold.

        Args:
            emp_id: Employee identifier shown in the first row; converted
                with ``str()``.
        """
        self.set_font("Arial", size = 9, style = 'B')

        left = origin[0] + margin
        top = origin[1] + margin

        # EMP_ID: spans the whole first row of the current-info table.
        self._text_cell(left, top, current_info_w, 'Employee ID: ' + str(emp_id))

        # Current Info captions
        for col, row, label in self._INFO_LABELS:
            self._text_cell(left + col*curr_col_w, top + row*row_h, curr_col_w, label)

        # Posting History Title
        self._text_cell(left, top + partition, posting_table_w, 'Posting History')

        # Posting History column headers
        for i, label in enumerate(self._HISTORY_HEADERS):
            self._text_cell(left + i*col_w, top + partition + row_h, col_w, label)

    def content_text(self, name, dob, org, loc, dept, title):
        """Fill the value cells of the current-info table in the regular font.

        Args:
            name: Employee name.
            dob: Date of birth.
            org: Current organization.
            loc: Current location.
            dept: Current department.
            title: Current designation.

        Every value is converted with ``str()`` before it is written.
        """
        self.set_font("Arial", size = 9)

        left = origin[0] + margin
        top = origin[1] + margin

        # (column, row, value): each value sits right of its caption.
        values = (
            (1, 1, name),
            (3, 1, dob),
            (1, 2, org),
            (3, 2, loc),
            (1, 3, dept),
            (3, 3, title),
        )
        for col, row, value in values:
            self._text_cell(left + col*curr_col_w, top + row*row_h, curr_col_w, value)

    def add_rows(self, record):
        """Append one posting-history row below the rows written so far.

        Args:
            record: Iterable of cell values in column order (designation,
                department, organization, location, from date, to date,
                pay grade). Values are converted with ``str()``.

        The row is written in whatever font is currently selected, and there
        is no check that it still fits inside the table outline.
        """
        base_x = origin[0] + margin
        # Skip the title row and the column-header row of the history table.
        base_y = origin[1] + margin + partition + 2*row_h
        y = base_y + self.num_rows*row_h

        for i, value in enumerate(record):
            self._text_cell(base_x + i*col_w, y, col_w, value)

        self.num_rows += 1

In [200]:
# Render a single hand-written employee page to check the template layout.
pdf = PDF(orientation='L')
pdf.add_page()

pdf.table_borders()
pdf.header_text(1)
pdf.content_text(
    'Auchinto Chatterjee',
    '06/11/1998',
    'AC Enterprise',
    'Bengaluru, KA',
    'Analytics',
    'Analyst',
)

# One posting-history row per past position, oldest first.
for posting in (
    ['Summer Intern', 'Engineering', 'LTTS', 'Vadodara, GJ', '01/06/2018', '01/08/2018', 'N/A'],
    ['Summer Intern', 'Engineering', 'Fivetran', 'Bengaluru, KA', '01/06/2019', '01/08/2019', 'N/A'],
    ['SE Intern', 'Engineering', 'Fivetran', 'Bengaluru, KA', '01/01/2020', '31/05/2020', '300000'],
    ['SE I', 'Engineering', 'Fivetran', 'Bengaluru, KA', '01/06/2020', '01/04/2022', '1200000'],
):
    pdf.add_rows(posting)

pdf.output("template_emp_history.pdf")

''

# Generating Bulk Data

- Page-to-Employee mapping is one-to-one: add a new page for every employee
- Personal Details
    - Name (Title + First Name + Last Name)
        - Title: M: Mr, F: Ms/Mrs - Would help in Gender classification
    - DOB : Any Date in the range (1970 - 2000)
    - Organization: Remains the same for all employees, since the whole dataset belongs to a single org
    - Department: 
        - Random pick from a Set of prelisted dept(s)
    - Location:
        - Random pick from a Set of Strings made out of random letters, in the format (ABC, XY) - City, State
        - Can work on City - State Mapping so that we can do geographical analysis as well
    - Designation:
        - Random pick from a Set of prelisted title(s) mapped under specific dept(s)
- Posting History
    - Designation, Dept, Location follow the same rules as above
    - Organization follows the same rule as Dept/Location/Designation
    - From_Date to To_Date:
        - Let Start at (DOB + 18 years)
        - Let End at (Present or Some Date in last 2 years)
        - Random number and Random size partitions between (Start, End) -> Each Interval will generate (From_Date, To_Date)
    - PayGrade
        - Follow Dept and Designation Mapping

In [36]:
# Number of employees to generate; the final generation loop adds one PDF page per employee.
num_employees = 300

## Personal Details

In [37]:
# Field names of the personal-details section, in the same order as the
# arguments of PDF.content_text(). Not referenced by the other visible cells.
personal_headers = ['Employee Name', 'Date Of Birth', 'Organization', 'Location', 'Department', 'Designation']

### Employee Name

In [192]:
# Titles a generated name can start with; get_name() picks one at random.
gender_title = ['Mr.', 'Ms.', 'Mrs.']

def random_string():
    """Return a random word of 4 to 8 ASCII letters with only the first one capitalised."""
    length = random.randint(4, 8)
    letters = random.choices(string.ascii_lowercase, k=length)
    letters[0] = letters[0].upper()
    return ''.join(letters)

def first_name():
    """Return a random first name; simply delegates to random_string()."""
    return random_string()

def last_name():
    """Return a random last name; simply delegates to random_string()."""
    return random_string()

def get_name():
    """Return a random full name formatted as '<title> <first name> <last name>'."""
    title = random.choice(gender_title)
    given = first_name()
    family = last_name()
    return f'{title} {given} {family}'

In [62]:
get_name()

'Mr. Rieka Phtezbdcqcy'

### Date Of Birth

In [135]:
# Default bounds (both inclusive) for generated dates of birth.
start_date, end_date = date(1970, 1, 1), date(2000, 1, 1)

def get_dob(earliest=None, latest=None):
    """Return a random date of birth between two dates, both inclusive.

    Args:
        earliest: First date that may be returned; defaults to ``start_date``.
        latest: Last date that may be returned; defaults to ``end_date``.

    Returns:
        A ``datetime.date`` with ``earliest <= result <= latest``.

    Raises:
        ValueError: If ``latest`` is earlier than ``earliest``.
    """
    earliest = start_date if earliest is None else earliest
    latest = end_date if latest is None else latest
    if latest < earliest:
        raise ValueError("latest must not be earlier than earliest")

    # The offset starts at 0 so that ``earliest`` itself can be drawn;
    # randint includes both of its endpoints.
    span_days = (latest - earliest).days
    return earliest + timedelta(days=random.randint(0, span_days))

In [136]:
get_dob()

datetime.date(1983, 8, 25)

### Organization

In [105]:
def get_org_for_set():
    """Return a random word from random_string() to serve as an organization name."""
    return random_string()

In [106]:
get_org_for_set()

'Vpkrony'

In [107]:
# Pool of 100 randomly generated organization names, built once.
# It is a list, so the same name may appear more than once.
org_set = [get_org_for_set() for _ in range(100)]

def get_org():
    """Return one organization name picked at random from org_set."""
    return random.choice(org_set)

In [108]:
get_org()

'Wdojesw'

### Department

In [85]:
# Departments an employee can belong to; each one is also a key of the
# `designation` mapping defined further down.
depts = ['Engineering', 'Analytics', 'Sales', 'Security', 'Marketing', 'IT & Operations', 'Human Resources', 'Accounts', 'Product', 'Recruitment', 'Customer Support']

In [86]:
def get_department():
    """Return one department name picked at random from depts."""
    return random.choice(depts)

In [89]:
get_department()

'Engineering'

### Location

In [100]:
def get_location_for_set():
    """Return a random location string: a random_string() word, ', ', and two uppercase letters."""
    city = random_string()
    state = ''.join(random.choices(string.ascii_uppercase, k=2))
    return f'{city}, {state}'

In [101]:
get_location_for_set()

'Valsglzt, YU'

In [102]:
# Pool of 100 randomly generated location strings, built once.
# It is a list, so the same location may appear more than once.
location_set = [get_location_for_set() for _ in range(100)]

def get_loc():
    """Return one location picked at random from location_set."""
    return random.choice(location_set)

In [103]:
get_loc()

'Qfvgfppf, JR'

### Designation

In [124]:
# Job titles available in each department, keyed by department name.
designation = {
    'Engineering': [
        'SE Intern',
        'Software Engineer I',
        'Software Engineer II',
        'Senior Software Engineer',
        'Staff Software Engineer',
        'Lead Software Engineer',
        'Engineering Manager',
    ],
    'Analytics': ['Junior Analyst', 'Senior Analyst', 'BI Analyst', 'Data Scientist'],
    'Sales': ['Junior Sales Engineer', 'Senior Sales Engineer', 'Regional Sales Manager'],
    'Security': ['Security Engineer I', 'Security Engineer II', 'Senior Security Engineer'],
    'Marketing': ['Junior', 'Senior', 'Manager'],
    'IT & Operations': ['IT Consultant', 'Operations Manager'],
    'Human Resources': ['HR'],
    'Accounts': ['Account Engineer', 'Account Manager'],
    'Product': ['Business Associate', 'Associate Product Manager', 'Product Manager'],
    'Recruitment': ['Recruitment'],
    'Customer Support': [
        'Junior Support Engineer',
        'Senior Support Engineer',
        'Customer Support Manager',
    ],
}

def get_designation(dept):
    """Return a random job title from the given department.

    Raises:
        KeyError: If ``dept`` is not a key of ``designation``.
    """
    titles = designation[dept]
    return random.choice(titles)

In [131]:
get_designation(get_department())

'Engineering Manager'

## Posting History

### From Date and To Date

In [155]:
def get_transition_dates(career_start, career_end):
    """Split a career into random checkpoints from career_start to career_end.

    Args:
        career_start: First day of the career.
        career_end: Last day of the career.

    Returns:
        A list of dates that starts with ``career_start``, ends with
        ``career_end`` and has up to 9 randomly placed dates in between.
        The list is strictly increasing, except when ``career_start`` equals
        ``career_end``, in which case both (equal) endpoints are returned.

    Raises:
        ValueError: If ``career_end`` is earlier than ``career_start``.
    """
    if career_end < career_start:
        raise ValueError("career_end must not be earlier than career_start")

    day_checkpoints = [career_start]

    curr_date = career_start
    interval_cnt = random.randrange(10)

    while curr_date < career_end and interval_cnt > 0:
        curr_date += timedelta(days=random.randint(1, (career_end - curr_date).days))
        interval_cnt -= 1
        # A jump can land exactly on career_end. That date is appended once
        # after the loop; adding it here too would duplicate it and make the
        # caller produce an interval that starts after it ends.
        if curr_date < career_end:
            day_checkpoints.append(curr_date)

    day_checkpoints.append(career_end)
    return day_checkpoints

def get_posting_intervals(career_start, career_end):
    """Return (from_date, to_date) tuples covering career_start to career_end.

    The boundaries come from get_transition_dates(). The first interval
    starts on the first checkpoint itself; every later interval starts the
    day after the previous checkpoint and ends on the next one.
    """
    checkpoints = get_transition_dates(career_start, career_end)
    one_day = timedelta(days=1)

    intervals = [(checkpoints[0], checkpoints[1])]
    intervals.extend(
        (previous + one_day, following)
        for previous, following in zip(checkpoints[1:], checkpoints[2:])
    )
    return intervals

In [158]:
# Demo: derive a career window from a random date of birth and split it
# into posting intervals.
dob = get_dob()
print(dob)
# "18 years" is approximated as 18 * 365 days; leap days are ignored, so the
# start falls a few days before the actual 18th birthday.
start = dob + timedelta(days=18*365)
# Fixed career end date used for this demo.
end = date(2022,1,6)
print(start, end)

get_posting_intervals(start, end)

1983-06-30
2001-06-25 2022-01-06


[(datetime.date(2001, 6, 25), datetime.date(2002, 7, 29)),
 (datetime.date(2002, 7, 30), datetime.date(2015, 6, 5)),
 (datetime.date(2015, 6, 6), datetime.date(2020, 10, 3)),
 (datetime.date(2020, 10, 4), datetime.date(2020, 11, 10)),
 (datetime.date(2020, 11, 11), datetime.date(2020, 11, 26)),
 (datetime.date(2020, 11, 27), datetime.date(2021, 10, 8)),
 (datetime.date(2021, 10, 9), datetime.date(2021, 10, 22)),
 (datetime.date(2021, 10, 23), datetime.date(2022, 1, 6))]

### Paygrade

In [174]:
def get_pay():
    """Return a random pay figure that is a multiple of 100000.

    randrange excludes its stop value, so results run from 300000 up to
    2900000 inclusive.
    """
    return random.randrange(300000, 3000000, 100000)

# Can later be replaced with a mapping keyed on department and designation

In [175]:
get_pay()

1800000

# Generating Final PDF

In [201]:
# Build the bulk PDF: one page per employee, each with random personal
# details and a random posting history.
num_employees = 300

pdf = PDF(orientation = 'L')
# Every employee currently works for the same organization.
current_org = get_org()

for emp_idx in range(num_employees):
    pdf.init_num_rows()
    pdf.add_page()
    pdf.table_borders()
    pdf.header_text(emp_idx + 1)  # employee IDs start at 1

    name = get_name()
    dob = get_dob()
    org = current_org
    loc = get_loc()
    dept = get_department()
    title = get_designation(dept)

    pdf.content_text(name, str(dob), org, loc, dept, title)

    # The career starts on the 18th birthday. A 29 February birthday has no
    # counterpart 18 years later (never a leap year), so use 1 March instead.
    try:
        start = dob.replace(year=dob.year + 18)
    except ValueError:
        start = dob.replace(year=dob.year + 18, month=3, day=1)
    end = date(2022,1,6)

    intervals = get_posting_intervals(start, end)

    # Past postings: same department as today, but a random organization,
    # location, designation and pay for each interval.
    for from_date, to_date in intervals[:-1]:
        hist_dept = dept
        hist_org = get_org()
        hist_loc = get_loc()
        hist_title = get_designation(dept)
        hist_pay = get_pay()
        row = [hist_title, hist_dept, hist_org, hist_loc, str(from_date), str(to_date), str(hist_pay)]
        pdf.add_rows(row)

    # The last interval is the current posting, so it repeats the
    # current-info values shown at the top of the page.
    current_from, current_to = intervals[-1]
    pdf.add_rows([title, dept, org, loc, str(current_from), str(current_to), str(get_pay())])

pdf.output('employee_history.pdf')

''

# Limitations

1. Department remains fixed throughout the employment history for a given employee
2. Designations are not tied to career growth or promotion over time; similarly, no pattern is enforced for pay grade