In [1]:
import pandas as pd
import re

from PyPDF2 import PdfReader

In [2]:
def get_title_cleanup() -> dict:
  """Return the lookup used to normalize job titles.

  Keys are titles as they appear in the extracted text (mostly plural
  headings shared by several people); values are the singular, corrected
  form that should be stored for each individual.

  Returns:
    dict mapping raw title -> cleaned title. Intended to be passed to
    ``Series.replace`` so that only exact, whole-string matches are changed.
  """
  return {
    # Plural "<Role>s" headings
    'Senior Advisors': 'Senior Advisor',
    'Technology Specialists': 'Technology Specialist',
    'Special Assistants to the General Manager': 'Special Assistant to the General Manager',
    'Player Development Coordinators': 'Player Development Coordinator',
    'Minor League Video Coordinators': 'Minor League Video Coordinator',
    'Corporate Partnerships Sales Directors': 'Corporate Partnerships Sales Director',
    'Corporate Partnerships Sales Managers': 'Corporate Partnerships Sales Manager',
    'Suite Sales Account Executives': 'Suite Sales Account Executive',
    'Promotions & Special Events Coordinators': 'Promotions & Special Events Coordinator',
    'Media Relations Coordinators': 'Media Relations Coordinator',
    'Security Managers': 'Security Manager',
    'Park Operations Coordinators': 'Park Operations Coordinator',
    'Park Operations Assistants': 'Park Operations Assistant',
    'Ballpark Electricians': 'Ballpark Electrician',
    'Housekeeping Supervisors': 'Housekeeping Supervisor',
    'Producer Editors': 'Producer Editor',
    'New Business Representatives': 'New Business Representative',
    'Group Sales Senior Executives': 'Group Sales Senior Executive',
    'Group Sales Executives': 'Group Sales Executive',
    'Client Services Representatives': 'Client Services Representative',
    'Ticket Services Managers': 'Ticket Services Manager',
    'Box Office Administrators': 'Box Office Administrator',
    'Sales Representatives': 'Sales Representative',
    'Assistant Athletic Trainers': 'Assistant Athletic Trainer',
    'Corporate Recruiters': 'Corporate Recruiter',
    'Full-Stack Software Engineers': 'Full-Stack Software Engineer',
    'Operations Managers': 'Operations Manager',
    'Security Supervisors': 'Security Supervisor',
    'Community Impact Managers': 'Community Impact Manager',
    'HVAC Technicians': 'HVAC Technician',
    'HVAC Techs': 'HVAC Technician',

    # Plural "<Role>s, <Area>" headings
    'Account Executives, Client Services': 'Account Executive, Client Services',
    'Account Executives, Group Sales': 'Account Executive, Group Sales',
    'Account Executives, New Business': 'Account Executive, New Business',
    'Account Executives, Premium Sales': 'Account Executive, Premium Sales',
    # Misspelled key kept on purpose -- presumably mirrors a typo in the
    # extracted text; verify against the source guide before removing.
    'Accout Executives, Premium Sales': 'Account Executive, Premium Sales',
    'Analysts, Baseball Operations': 'Analyst, Baseball Operations',
    'Coordinators, Corporate Partnership Activation': 'Coordinator, Corporate Partnership Activation',
    'Managers, Corporate Partnership Activation': 'Manager, Corporate Partnership Activation',
    'Managers, Ticket Services': 'Manager, Ticket Services',
    'Sales Executives, Premium Sales': 'Sales Executive, Premium Sales',
    'Sales Managers, Corporate Partnerships': 'Sales Manager, Corporate Partnerships',
    'Senior Account Executives, Client Services': 'Senior Account Executive, Client Services',
    'Senior Analysts, Baseball Operations': 'Senior Analyst, Baseball Operations',
    'Solutions Managers, Corporate Partnerships': 'Solutions Manager, Corporate Partnerships',

    # Spelling corrections
    'Assistant Director, Amatuer Scouting': 'Assistant Director, Amateur Scouting'
  }

In [3]:
def get_department_cleanup(year:int) -> dict:
  """Return the department-name replacement map for a given guide year.

  The map repairs department headings containing a stray space after a
  capital letter (presumably a text-extraction artifact) and sends lines
  that are really lists of people to ``None`` so they are not treated as
  departments. For the years 2019-2022 the social-media banner line is also
  mapped to the department that it stands in for.

  Args:
    year: Media guide year.

  Returns:
    dict mapping raw department text -> cleaned department (or None).
  """
  social_banner = 'tigers.com / facebook.com/tigers / @Tigers / @TigresdeDetroit / @DetroitTigersPR'

  # Department that the banner line represents, by year
  banner_department = {
    2019: 'Executive Office',
    2020: 'Business Operations Leadership',
    2021: 'Business Operations Leadership',
    2022: 'Business Operations Leadership',
  }

  cleanup = {
    'Information T echnology': 'Information Technology',
    'Athletic T raining/Strength & Conditioning': 'Athletic Training/Strength & Conditioning',
    'T raining/Conditioning': 'Training/Conditioning',
  }

  # Name-only lines are never departments
  name_only_lines = (
    'Jim Leyland, Mike Russell, Alan Trammell',
    'Jillian Walker, T .J. Wyrebek',
    'Br ennan Socha, Corey Wolfgang',
    'Kar a Grabowski, Kate Laura',
    'Jim Le yland, Lance Parrish, Mike Russell, Alan Trammell',
    'Sar ah Stachowicz, Corey Thomas',
  )
  for line in name_only_lines:
    cleanup[line] = None

  if year in banner_department:
    cleanup[social_banner] = banner_department[year]

  return cleanup

In [4]:
def get_start_index(year:int, team:str, page:int) -> int:
  """Return how many leading text lines to skip on a given page.

  Only one (year, team, page) combination has a non-zero offset; every
  other page is read from its first line.

  Args:
    year: Media guide year.
    team: Team abbreviation.
    page: Page identifier as supplied by the caller.

  Returns:
    Index of the first line to keep.
  """
  leading_lines_to_skip = {
    (2019, 'DET', 6): 8,
  }
  return leading_lines_to_skip.get((year, team, page), 0)

In [5]:
def cleanup_name_title(year:int, team:str, title:str, original_name:str, prior_title:str):
  """Normalize one (title, name) pair taken from a roster line.

  For team 'DET', certain lines consist only of a list of names. When the
  title matches one of those known lines, the text is moved into the name
  and the title is taken from the preceding row (``prior_title``).

  The name is always converted with ``str`` (so a missing name becomes the
  string 'None'), honorific/credential substrings are removed, surrounding
  spaces and commas are trimmed, and a space following a straight
  apostrophe is dropped.

  Args:
    year: Media guide year (not used by the current rules).
    team: Team abbreviation.
    title: Title text of the row.
    original_name: Name text of the row; may be None.
    prior_title: Title of the previous row.

  Returns:
    Tuple ``(title, name)`` after cleanup.
  """
  # Lines that hold only names and therefore inherit the previous title
  det_name_only_lines = (
    'Jim Leyland, Mike Russell, Alan Trammell',
    'Jillian Walker, T .J. Wyrebek',
    'Br ennan Socha, Corey Wolfgang',
    'Sar ah Stachowicz, Corey Thomas',
    'Jim Le yland, Lance Parrish, Mike Russell, Alan Trammell',
    'Brian Pick elsimer, Tyler VanderVlucht, Jillian Walker',
    'Brandon Hoffman, Nic Lash, Corey O’Donnell, Matt Olinik, Autumn Sharp, Brennan Socha',
    'Kar a Grabowski, Kate Laura',
    'Mik e Russell, Alan Trammell',
    'Maggie O’Har a, Danny Vargovick',
    'Jeff P ongracz, Mike Graham',
    'Thomas Kappel, Dono van Powell, John Wolski',
    'Maggie O’Har a, Jonah Simon, Danny Vargovick',
    'Jor dan Markowski, David Ocampo, Jacob Thomas, Kevin Wilson',
    'Matt Olinik, Dr ew Padgen, Lindsey Ray',
  )

  if team == 'DET' and title in det_name_only_lines:
    cleaned_title, cleaned_name = prior_title, title
  else:
    cleaned_title, cleaned_name = title, str(original_name)

  # Strip honorifics and credentials, in this order
  for unwanted in ('Dr.', 'O.D.', 'M.D.'):
    cleaned_name = cleaned_name.replace(unwanted, '')

  # Trim spaces first, then commas, then close up "O' Name" style gaps
  cleaned_name = cleaned_name.strip(' ')
  cleaned_name = cleaned_name.strip(',')
  cleaned_name = cleaned_name.replace("' ", "'")

  return cleaned_title, cleaned_name

In [6]:
def get_front_office_roster(pdf:PdfReader, year:int, team:str, first_page:int, last_page:int) -> pd.DataFrame:
  """Extract a front-office roster from a range of pages of a media guide PDF.

  Each text line of a page is split on its dotted leader ("Title .... Name")
  into a title and a name. Lines without a name are treated as department
  headings and forward-filled onto the rows that follow them within the
  same page. Rows listing several comma-separated names are expanded to one
  row per person.

  Args:
    pdf: Open PyPDF2 reader for the media guide.
    year: Media guide year, stored in the output and used for cleanup rules.
    team: Team abbreviation, stored in the output and used for cleanup rules.
    first_page: First page of the front-office section.
    last_page: Last page of the front-office section.

  Returns:
    DataFrame with columns Year, Team, Name, Title, Department and a fresh
    0..n-1 index; an empty DataFrame when the page range is empty.
  """

  def get_department(row):
    # cleanup_name_title turns a missing name into the string 'None';
    # such rows carry a department heading in their Title column.
    if row['Name'] == 'None':
      return row['Title']
    else:
      return None


  page_frames = []

  # NOTE(review): the reader is indexed at first_page + 1 .. last_page + 1,
  # and that same shifted index is what get_start_index receives -- confirm
  # the offset and which page number get_start_index is meant to match.
  pages = range(first_page + 1, last_page + 2)

  for p in pages:
    pdf_page = pdf.pages[p]
    start = get_start_index(year, team, p)

    records = pdf_page.extract_text().split('\n')[start:]

    # Split on a run of two or more dots; padding with [None, None]
    # guarantees at least two elements even when a line has no leader.
    title_name_split = [re.split(r' ?\.\.+ ?', record) + [None, None] for record in records]
    df = pd.DataFrame([e[:2] + [year, team] for e in title_name_split],
                      columns=['Title', 'Name', 'Year', 'Team'])


    # Cleanup job titles
    df['Title'] = df['Title'].str.strip(' ')
    df['Title'] = df['Title'].replace(get_title_cleanup())
    df['Prior Title'] = df.Title.shift(1)


    # Cleanup names
    df[['Title', 'Name']] = df.apply(lambda x: cleanup_name_title(x.Year, x.Team, x.Title, x.Name, x['Prior Title']), axis=1, result_type='expand')
    df['Name'] = df['Name'].str.split(', ')


    # Expand name lists to multiple rows, if necessary
    df = df.explode('Name', ignore_index=True)


    # Determine the department
    df['Department'] = df.apply(get_department, axis=1) \
                         .replace(get_department_cleanup(year))
    df['Department'] = df['Department'].ffill()


    # Remove heading-only rows (no name)
    df = df[df.Name != 'None']


    # Order the columns as specified by SABR BoB Committee
    df = df[['Year', 'Team', 'Name', 'Title', 'Department']]

    page_frames.append(df)

  if not page_frames:
    return pd.DataFrame()

  # Concatenate once at the end instead of growing a frame page by page
  return pd.concat(page_frames, ignore_index=True)

In [7]:
# First and last front-office page of each media guide, per team and year
pages = {
  'DET': {
    2019: [6, 7],
    2020: [4, 5],
    2021: [4, 5],
    2022: [4, 5]
  }
}

# Collect one roster per guide and concatenate once afterwards
roster_frames = []

for team, contents in pages.items():
  for year, pdf_pages in contents.items():
    first_page = pdf_pages[0]
    last_page = pdf_pages[1]
    temp_df = get_front_office_roster(
                PdfReader(f'../team-media-guides/{team}/{year}_{team}_Media_Guide.pdf'),
                year, team, first_page, last_page
              )

    roster_frames.append(temp_df)

# pd.concat raises on an empty list, so fall back to an empty frame
final_df = pd.concat(roster_frames) if roster_frames else pd.DataFrame()

In [10]:
# Export the combined roster without the index column.
# quoting=2 is the numeric value of csv.QUOTE_NONNUMERIC (quote every non-numeric field).
final_df.to_csv(
  path_or_buf='../out/det-front-office-rosters.csv',
  quoting=2,
  index=False,
)