In [1]:
import pandas as pd
import dateutil
import numpy as np
import datetime
import re
from collections import OrderedDict
pd.options.display.max_columns = None

In [2]:
# Load the CPDP officer profiles and index them by UID; drop=False keeps UID available
# as a regular column as well.
# Integer-like columns are read with the nullable "Int64" dtype.
# NOTE(review): the integer dtype keys 4 and 15 look like positional column references
# used to force mixed-type columns to str - confirm which columns they hit.
cpdp_profiles = pd.read_csv("final-profiles.csv",
                            dtype={4:"str", 15: "str", "birth_year": "Int64", "link_UID": "Int64",
                                  "current_status": "Int64", "current_star": "Int64",
                                  "current_unit": "Int64"},
                            parse_dates=["appointed_date", "resignation_date", "start_date", "org_hire_date"]
                           ).set_index("UID", drop=False)

In [3]:
# Keep only profiles up to and including UID 133667 (label-based .loc slice on the UID
# index); the tail end does not match cpdp online data. Copy so later column
# assignments do not touch cpdp_profiles.
cpdp_reduced = cpdp_profiles.loc[:133667].copy()

In [4]:
cpdp_reduced["middle_initial"] = cpdp_reduced["middle_initial"].fillna("").str.strip()

In [5]:
cpdp_reduced["UID"]=cpdp_reduced["UID"].astype("Int64")

In [6]:
cpdp_reduced.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 33667 entries, 100001 to 133667
Data columns (total 20 columns):
UID                 33667 non-null Int64
first_name          33667 non-null object
last_name           33667 non-null object
middle_initial      33667 non-null object
middle_initial2     8 non-null object
suffix_name         1689 non-null object
birth_year          32562 non-null Int64
race                33477 non-null object
gender              33643 non-null object
appointed_date      33594 non-null datetime64[ns]
resignation_date    20194 non-null datetime64[ns]
current_status      31789 non-null Int64
current_star        15364 non-null Int64
current_unit        31606 non-null Int64
current_rank        31473 non-null object
start_date          18971 non-null datetime64[ns]
org_hire_date       19786 non-null datetime64[ns]
profile_count       33667 non-null int64
cleaned_rank        31473 non-null object
link_UID            33667 non-null Int64
dtypes: Int64(6), datetim

In [7]:
# Officers currently in the production export, indexed by their "id"
# (the column is also kept as data because drop=False).
current_prod = pd.read_csv(
    "Chicago_Police_Department_Officers.csv",
    dtype={"birth year": "Int64", "badge number": "Int64"},
    parse_dates=["employment date"],
).set_index("id", drop=False)

In [8]:
current_prod.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11941 entries, 17511 to 11942
Data columns (total 13 columns):
id                    11941 non-null int64
unique identifier     0 non-null float64
last name             11941 non-null object
first name            11941 non-null object
middle initial        9705 non-null object
suffix                0 non-null float64
gender                11941 non-null object
race                  11941 non-null object
birth year            9699 non-null Int64
employment date       11941 non-null datetime64[ns]
badge number          11755 non-null Int64
job title             11941 non-null object
most recent salary    0 non-null float64
dtypes: Int64(2), datetime64[ns](1), float64(3), int64(1), object(6)
memory usage: 1.3+ MB


In [9]:
current_prod["middle initial"] = current_prod["middle initial"].fillna("").str.strip()
current_prod["last name"] = current_prod["last name"].str.strip()
current_prod["first name"] = current_prod["first name"].str.strip()

In [10]:
all_assignments=pd.read_csv("2_out_CPD_historic_assignments_grouped.csv",
                            parse_dates=["APPOINTED DATE", "END DATE", "EFFECTIVE DATE"],
                           dtype={"YEAR OF BIRTH": "Int64", "STAR NUMBER": "Int64",
                                 "cops_index": "Int64"})

In [11]:
all_assignments["MIDDLE INITIAL"] = all_assignments["MIDDLE INITIAL"].fillna("").str.strip()
all_assignments["LAST NAME"] = all_assignments["LAST NAME"].str.strip()
all_assignments["FIRST NAME"] = all_assignments["FIRST NAME"].str.strip()

In [12]:
all_assignments.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35863 entries, 0 to 35862
Data columns (total 14 columns):
assignment_index    35863 non-null int64
FIRST NAME          35863 non-null object
MIDDLE INITIAL      35863 non-null object
LAST NAME           35863 non-null object
YEAR OF BIRTH       35863 non-null Int64
SEX                 35863 non-null object
RACE                35863 non-null object
APPOINTED DATE      35863 non-null datetime64[ns]
STAR TYPE           35863 non-null object
STAR NUMBER         35863 non-null Int64
EFFECTIVE DATE      35863 non-null datetime64[ns]
END DATE            35863 non-null datetime64[ns]
last_name_key       35863 non-null object
cops_index          35863 non-null Int64
dtypes: Int64(3), datetime64[ns](3), int64(1), object(7)
memory usage: 3.9+ MB


In [13]:
# Collapse the historic assignments to one row per cops_index: personal details are
# taken from the first row of each group, the effective date is the earliest one and
# the end date the latest one.
cops = (
    all_assignments
    .groupby("cops_index")
    .agg({
        **{
            column: "first"
            for column in (
                "FIRST NAME", "MIDDLE INITIAL", "LAST NAME",
                "YEAR OF BIRTH", "SEX", "RACE", "APPOINTED DATE",
            )
        },
        "EFFECTIVE DATE": "min",
        "END DATE": "max",
    })
    .reset_index()
)

In [14]:
cops.head()

Unnamed: 0,cops_index,FIRST NAME,MIDDLE INITIAL,LAST NAME,YEAR OF BIRTH,SEX,RACE,APPOINTED DATE,EFFECTIVE DATE,END DATE
0,0,A RICHARD,,LABRIOLA,1943,M,WHITE,1971-10-25,1987-04-27,2004-03-15 00:00:00.000000000
1,1,AARON,F,ACEVEDO,1982,M,HISPANIC,2009-12-16,2010-06-14,2262-04-11 23:47:16.854775807
2,2,AARON,V,ALLEY,1985,M,BLACK,2016-06-27,2016-12-28,2262-04-11 23:47:16.854775807
3,3,AARON,,ANDERSON,1981,M,WHITE,2017-10-16,2018-04-27,2262-04-11 23:47:16.854775807
4,4,AARON,P,ARMSTEAD,1975,M,BLACK,2005-12-05,2006-04-19,2262-04-11 23:47:16.854775807


In [15]:
def last_name_key_suffix(name):
    """Split a raw last name into a normalised key and a generational suffix.

    The name is upper-cased, every character other than A-Z and space is removed,
    and the remaining words are joined without spaces to form the key. If the last
    word is one of JR, SR, V, II, III or IV it is returned separately as the suffix
    instead of being part of the key.

    Parameters
    ----------
    name : str or missing value
        Raw last name. Non-string input (None, NaN, pd.NA, ...) is treated as empty.

    Returns
    -------
    tuple of (str, str)
        ``(key, suffix)``; ``("", "")`` when there is nothing usable in ``name``.
    """
    # NaN is truthy and has no .upper(); anything that is not text counts as "no name".
    if not isinstance(name, str):
        return ("", "")
    pieces = re.sub(r"[^A-Z ]", "", name.upper()).split()
    # A name made only of digits/punctuation/whitespace leaves no words behind;
    # indexing pieces[-1] would raise IndexError.
    if not pieces:
        return ("", "")
    suffix = ""
    if pieces[-1] in ("JR", "SR", "V", "II", "III", "IV"):
        suffix = pieces[-1]
        pieces = pieces[:-1]
    return ("".join(pieces), suffix)

In [16]:
cops["suffix"]=cops["LAST NAME"].apply(lambda x: last_name_key_suffix(x)[1])

In [17]:
cops["last_name"]=cops["LAST NAME"].apply(lambda x: last_name_key_suffix(x)[0])

In [18]:
merge_prod_cpdp = pd.read_csv("1_out_merge_key.csv")

In [19]:
merge_prod_cpdp = merge_prod_cpdp.rename(columns={"left_idx": "prod_id", "right_idx": "UID"})

In [20]:
merge_prod_cpdp.head()

Unnamed: 0,UID,prod_id,score
0,100007,11958,11.0
1,100009,12301,11.5
2,100010,13227,11.5
3,100011,13436,10.0
4,100012,13642,11.5


In [21]:
merge_hist_cops_cpdp = pd.read_csv("2_out_merge_cops_cpdp.csv")

In [22]:
merge_hist_cops_cpdp.head()

Unnamed: 0,index,UID
0,0,100006
1,1,100007
2,2,100008
3,4,100009
4,5,100010


In [23]:
# The frame was indexed with set_index("UID", drop=False), so "UID" names both the
# index and a column. Blank the index name - presumably so that merging on="UID"
# in the following cells is not ambiguous (TODO confirm).
cpdp_reduced.index.name = ""

In [24]:
# Left-join every CPDP UID to its matched production officer id; prod_id stays
# missing for UIDs without a match in merge_prod_cpdp.
step1=cpdp_reduced[["UID"]].merge(merge_prod_cpdp[["UID", "prod_id"]].astype("Int64"), on="UID", how="left")

In [25]:
# Left-join every historic cops_index to its matched CPDP UID (the merge file keys the
# match on its "index" column); the helper "index" column is dropped afterwards.
step2=cops[["cops_index"]].merge(merge_hist_cops_cpdp.astype("Int64"),how="left",
                           left_on="cops_index", right_on="index").drop("index",axis=1)

In [26]:
# Outer merge on UID: keeps CPDP officers that have no historic record as well as
# historic cops that have no CPDP UID. Resulting columns: UID, prod_id, cops_index.
complete_merge_table = step1.merge(step2.astype("Int64"), on="UID", how="outer")

In [27]:
complete_merge_table

Unnamed: 0,UID,prod_id,cops_index
0,100001,,
1,100002,,
2,100003,,
3,100004,,
4,100005,,
5,100006,,0
6,100007,11958,1
7,100008,,2
8,100009,12301,4
9,100010,13227,5


In [28]:
complete_merge_table.columns

Index(['UID', 'prod_id', 'cops_index'], dtype='object')

In [29]:
complete_merge_table.notna().groupby(['UID', 'prod_id', 'cops_index']).size()

UID    prod_id  cops_index
False  False    True           1367
True   False    False         11367
                True          10359
       True     False           751
                True          11190
dtype: int64

In [30]:
# Flattened count of which identifiers are present per row of the merge table.
# Spelled out explicitly instead of using IPython's `_` (previous output), which only
# works when the preceding cell was the last one executed.
complete_merge_table.notna().groupby(['UID', 'prod_id', 'cops_index']).size().reset_index()

Unnamed: 0,UID,prod_id,cops_index,0
0,False,False,True,1367
1,True,False,False,11367
2,True,False,True,10359
3,True,True,False,751
4,True,True,True,11190


In [31]:
oos_assignments = pd.read_csv("Chicago_Police_Department_Assignments.csv",
                             dtype={"badge number": "Int64", "unit id": "Int64"})

In [32]:
oos_assignments.any()

id                            True
officer id                    True
officer unique identifier    False
badge number                  True
job title                     True
start date                   False
end date                     False
unit id                       True
dtype: bool

In [33]:
oos_assignments.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11949 entries, 0 to 11948
Data columns (total 8 columns):
id                           11949 non-null int64
officer id                   11949 non-null int64
officer unique identifier    0 non-null float64
badge number                 11763 non-null Int64
job title                    11949 non-null object
start date                   0 non-null float64
end date                     0 non-null float64
unit id                      4 non-null Int64
dtypes: Int64(2), float64(3), int64(2), object(1)
memory usage: 770.2+ KB


In [34]:
all_assignments.groupby("STAR TYPE").size()

STAR TYPE
CAPTAIN STAR         286
DETECTIVE STAR      3091
LIEUTENANT STAR     1202
SERGEANT STAR       3814
SWORN-PO STAR      27470
dtype: int64

In [35]:
cpdp_reduced.head()

Unnamed: 0,UID,first_name,last_name,middle_initial,middle_initial2,suffix_name,birth_year,race,gender,appointed_date,resignation_date,current_status,current_star,current_unit,current_rank,start_date,org_hire_date,profile_count,cleaned_rank,link_UID
,,,,,,,,,,,,,,,,,,,,
100001.0,100001.0,A,BRODERSON,F,,,1926.0,WHITE,MALE,1950-06-03,1980-03-10,0.0,,,,NaT,NaT,3.0,,2863.0
100002.0,100002.0,A,DALESSANDRO,A,,,1923.0,WHITE,MALE,1956-09-01,1979-10-18,0.0,,,,NaT,NaT,3.0,,6100.0
100003.0,100003.0,A,KIBURAS,J,,,1948.0,WHITE,MALE,1968-06-24,1980-10-01,0.0,,,,NaT,NaT,3.0,,14560.0
100004.0,100004.0,A,KUSTRA,J,,,1923.0,WHITE,MALE,1953-09-28,1980-10-01,0.0,,,,NaT,NaT,3.0,,15445.0
100005.0,100005.0,A,MC LAUGHLIN,R,,,1941.0,BLACK,MALE,1966-06-20,1986-09-16,0.0,,123.0,POLICE OFFICER,NaT,NaT,3.0,POLICE OFFICER,18373.0


In [36]:
cpdp_reduced.groupby("current_rank").size()

current_rank
ASSISTANT SUPERINTENDENT        1
ASST DEPUTY SUP                25
ASST DIR NEWS AFFAIR            1
CAPTAIN OF POLICE             322
CHIEF                          29
CHIEF SYS PROG IS               1
CMDR                            3
CMDR TAFFIC ENFORCE             1
COMM NEIGHBORHOOD RE            2
COMM OF GENL SUPPORT            1
COMMANDER                     133
COMMANDER SPEC SERV             1
COMP TOOL MARK EXAM             1
COOR /INVESTIGATIONS            1
COOR SPEC EVENTS                2
COORDINATOR                     1
DEP CHIEF                      61
DEPUTY SUPT.                   23
DET                            16
DIR LABOR RELATIONS             1
DIR MGT/LAB AFFAIRS             1
DIR OF MOTOR MAINT              1
DIR PERSONNEL SERV              4
DIRECTOR OF CAPS                1
DIRECTOR OF RECORDS             1
DIST WATCH CO                   5
EXPL TECH 1                    50
EXPL TECH 3                     4
EXPLOSIVES TECH 2               1
E

In [37]:
cpdp_reduced.groupby("cleaned_rank").size()

cleaned_rank
ASSISTANT DEPUTY SUPERINTENDENT       25
ASSISTANT SUPERINTENDENT               1
CAPTAIN                              327
CHIEF                                 29
COMMANDER                            136
DEPUTY CHIEF                          61
DEPUTY SUPERINTENDENT                 23
DETECTIVE                           3067
DIRECTOR OF CAPS                       1
FIELD TRAINING OFFICER               341
FIRST DEPUTY SUPERINTENDENT            8
LIEUTENANT                           960
OTHER                                 24
POLICE OFFICER                     22966
SERGEANT                            3495
SUPERINTENDENT OF POLICE               6
SUPERINTENDENT'S CHIEF OF STAFF        3
dtype: int64

In [38]:
cpdp_profiles[cpdp_profiles["cleaned_rank"]=="OTHER"]

Unnamed: 0_level_0,UID,first_name,last_name,middle_initial,middle_initial2,suffix_name,birth_year,race,gender,appointed_date,resignation_date,current_status,current_star,current_unit,current_rank,start_date,org_hire_date,profile_count,cleaned_rank,link_UID
UID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
100057,100057,ABEL,PRESAS,,,,1946,HISPANIC,MALE,1966-11-14,2005-06-30,0,,161,COMM OF GENL SUPPORT,NaT,1966-11-14,4,OTHER,22970
101858,101858,ARTHUR,PAHOLKE,R,,,1927,WHITE,MALE,1956-10-01,1987-04-01,0,,177,COMP TOOL MARK EXAM,NaT,NaT,3,OTHER,21646
102412,102412,BRADFORD,WOODS,L,,,1949,WHITE,MALE,1970-06-15,2007-05-15,0,221.0,123,DIR PERSONNEL SERV,1970-06-15,1970-06-15,7,OTHER,31184
105500,105500,DAVID,COFFEY,E,,,1930,WHITE,MALE,1955-05-02,1985-09-01,0,,151,CMDR TAFFIC ENFORCE,NaT,NaT,3,OTHER,4913
105536,105536,DAVID,DOUGHERTY,,,,1948,WHITE,MALE,1973-10-22,2003-11-02,0,,140,COOR SPEC EVENTS,NaT,1973-10-22,5,OTHER,7235
106535,106535,DOMINICK,FRIGO,J,,,1926,WHITE,MALE,1959-03-01,1986-06-01,0,,140,COOR SPEC EVENTS,NaT,NaT,3,OTHER,9194
107017,107017,DUWAYNE,HORNUNG,R,,,1929,WHITE,MALE,1957-07-01,1988-08-01,0,,122,MANAGER POL PAYROLL,NaT,NaT,3,OTHER,12548
108349,108349,EUGENE,NOLAN,A,,,1935,WHITE,MALE,1957-07-01,2000-04-16,0,,142,DIR LABOR RELATIONS,NaT,NaT,3,OTHER,20585
110252,110252,HARLEY,SCHINKER,,,,1947,WHITE,MALE,1969-09-08,2002-06-15,0,,173,DIR OF MOTOR MAINT,NaT,NaT,3,OTHER,25645
110336,110336,HARRY,BLACKBURN,A,,,1921,WHITE,MALE,1946-04-01,1984-09-06,0,,132,COMM NEIGHBORHOOD RE,NaT,NaT,3,OTHER,2092


In [39]:
set(oos_assignments["job title"])-set(cpdp_profiles["current_rank"])

{'GENERAL COUNSEL TO THE SUPERINTENDENT'}

In [40]:
clean_rank_dict = dict(cpdp_profiles[["current_rank", "cleaned_rank"]].itertuples(index=False))

In [41]:
set(clean_rank_dict.values())

{'ASSISTANT DEPUTY SUPERINTENDENT',
 'ASSISTANT SUPERINTENDENT',
 'CAPTAIN',
 'CHIEF',
 'COMMANDER',
 'DEPUTY CHIEF',
 'DEPUTY SUPERINTENDENT',
 'DETECTIVE',
 'DIRECTOR OF CAPS',
 'FIELD TRAINING OFFICER',
 'FIRST DEPUTY SUPERINTENDENT',
 'LIEUTENANT',
 'OTHER',
 'POLICE OFFICER',
 'SERGEANT',
 'SUPERINTENDENT OF POLICE',
 "SUPERINTENDENT'S CHIEF OF STAFF",
 nan}

In [42]:
all_assignments.groupby("STAR TYPE").size()

STAR TYPE
CAPTAIN STAR         286
DETECTIVE STAR      3091
LIEUTENANT STAR     1202
SERGEANT STAR       3814
SWORN-PO STAR      27470
dtype: int64

In [43]:
star_type_to_job_title = {"CAPTAIN STAR":'CAPTAIN',
 'DETECTIVE STAR': "DETECTIVE",
 'LIEUTENANT STAR': "LIEUTENANT",
 "SWORN-PO STAR": 'POLICE OFFICER',
 "SERGEANT STAR": 'SERGEANT',
}

In [44]:
sum(all_assignments["STAR TYPE"].isna())

0

In [45]:
all_assignments["cleaned_job"] = all_assignments["STAR TYPE"].apply(star_type_to_job_title.__getitem__)

In [46]:
all_assignments.head()

Unnamed: 0,assignment_index,FIRST NAME,MIDDLE INITIAL,LAST NAME,YEAR OF BIRTH,SEX,RACE,APPOINTED DATE,STAR TYPE,STAR NUMBER,EFFECTIVE DATE,END DATE,last_name_key,cops_index,cleaned_job
0,0,JEFFERY,M,AARON,1971,M,WHITE,2005-09-26,SWORN-PO STAR,13001,2006-02-22,2016-06-01 00:00:00.000000000,AARON,9198,POLICE OFFICER
1,1,JEFFERY,M,AARON,1971,M,WHITE,2005-09-26,SERGEANT STAR,1424,2016-06-01,2262-04-11 23:47:16.854775807,AARON,9198,SERGEANT
2,2,KARINA,,AARON,1980,F,HISPANIC,2005-09-26,SWORN-PO STAR,17545,2006-02-22,2012-09-02 00:00:00.000000000,AARON,11709,POLICE OFFICER
3,3,KARINA,,AARON,1980,F,HISPANIC,2005-09-26,SWORN-PO STAR,13705,2012-09-05,2017-04-14 00:00:00.000000000,AARON,11709,POLICE OFFICER
4,4,KARINA,,AARON,1980,F,HISPANIC,2005-09-26,DETECTIVE STAR,20373,2017-04-14,2262-04-11 23:47:16.854775807,AARON,11709,DETECTIVE


In [47]:
oos_assignments["cleaned_job"] = oos_assignments["job title"].apply(lambda x: clean_rank_dict.get(x,x))

In [48]:
oos_assignments.head()

Unnamed: 0,id,officer id,officer unique identifier,badge number,job title,start date,end date,unit id,cleaned_job
0,5574,17511,,5,COMMANDER,,,,COMMANDER
1,1,11942,,13001,POLICE OFFICER,,,,POLICE OFFICER
2,2,11943,,13705,POLICE OFFICER,,,,POLICE OFFICER
3,3,11944,,19596,POLICE OFFICER,,,,POLICE OFFICER
4,4,11945,,17673,POLICE OFFICER,,,,POLICE OFFICER


In [49]:
current_prod.groupby(["gender"]).size()

gender
F    2655
M    9286
dtype: int64

In [50]:
cpdp_profiles.groupby("gender").size()

gender
FEMALE     5779
MALE      27868
dtype: int64

In [51]:
cops.groupby("SEX").size()

SEX
F     5090
M    17826
dtype: int64

In [52]:
current_prod.groupby(["race"]).size()

race
AMER IND/ALASKAN NATIVE      36
ASIAN/PACIFIC ISLANDER      358
BLACK                      2683
UNKNOWN                       2
WHITE                      6180
WHITE HISPANIC             2682
dtype: int64

In [53]:
cpdp_profiles.groupby("race").size()

race
ASIAN/PACIFIC ISLANDER              539
BLACK                              7630
HISPANIC                           4579
NATIVE AMERICAN/ALASKAN NATIVE       66
WHITE                             20667
dtype: int64

In [54]:
cops.groupby("RACE").size()

RACE
AMER IND/ALASKAN NATIVE       58
ASIAN/PACIFIC ISLANDER       532
BLACK                       5170
BLACK HISPANIC                 7
HISPANIC                    2721
UNKNOWN                      174
WHITE                      12546
WHITE HISPANIC              1708
dtype: int64

In [55]:
(
    set(current_prod.groupby(["race"]).size().index)|
    set(cpdp_profiles.groupby("race").size().index)|
    set(cops.groupby("RACE").size().index)
)

{'AMER IND/ALASKAN NATIVE',
 'ASIAN/PACIFIC ISLANDER',
 'BLACK',
 'BLACK HISPANIC',
 'HISPANIC',
 'NATIVE AMERICAN/ALASKAN NATIVE',
 'UNKNOWN',
 'WHITE',
 'WHITE HISPANIC'}

In [56]:
def assignment(badge_number, job_title, start=None, end=None, id_=None, officer_id=None):
    """Build one assignment record as an OrderedDict.

    Keys, in order: id, officer_id, badge_number, job_title, start, end.
    The values are stored exactly as passed in; no normalisation happens here.
    """
    record = OrderedDict()
    record["id"] = id_
    record["officer_id"] = officer_id
    record["badge_number"] = badge_number
    record["job_title"] = job_title
    record["start"] = start
    record["end"] = end
    return record

def assignment_match(a, b):
    """Tell whether existing assignment ``a`` is compatible with candidate ``b``.

    A missing badge number or job title in ``b`` acts as a wildcard. A field that is
    present in ``b`` must be present in ``a`` and equal to it.

    Parameters
    ----------
    a, b : mapping
        Assignment records with at least "badge_number" and "job_title" keys.

    Returns
    -------
    bool
    """
    def _field_matches(key):
        wanted = b[key]
        if pd.isna(wanted):
            return True
        have = a[key]
        # `value == pd.NA` evaluates to pd.NA, whose truth value is ambiguous and
        # raises TypeError; a missing value in `a` can never equal a concrete one.
        if pd.isna(have):
            return False
        return bool(have == wanted)

    return _field_matches("badge_number") and _field_matches("job_title")

# Maps the gender labels seen in the inputs (MALE/FEMALE and M/F) to one-letter codes.
gender_str = {"MALE": "M", "FEMALE": "F", "M": "M", "F": "F"}

# Maps every race label seen across the three inputs to one common vocabulary;
# UNKNOWN is mapped to None.
race_str = {
    'AMER IND/ALASKAN NATIVE': "NATIVE AMERICAN",
    'ASIAN/PACIFIC ISLANDER': "ASIAN",
    'BLACK': "BLACK",
    'BLACK HISPANIC': "HISPANIC",
    'HISPANIC': "HISPANIC",
    'NATIVE AMERICAN/ALASKAN NATIVE': "NATIVE AMERICAN",
    'UNKNOWN': None,
    'WHITE': "WHITE",
    'WHITE HISPANIC': "HISPANIC",
}

# A dict that was also named `suffix_str` used to be defined here. It was rebound by
# the `suffix_str` function defined right after and never used, so it has been removed.

_RECOGNISED_SUFFIXES = frozenset(["JR", "SR", "II", "III", "IV", "V"])


def suffix_str(s):
    """Return the normalised name suffix, or None if ``s`` is not a recognised one.

    Matching ignores case and surrounding whitespace. The value returned is the
    upper-cased, stripped form, so "jr" and " JR " both give "JR".

    Parameters
    ----------
    s : str or missing value
        Raw suffix. None, NaN, pd.NA and any other non-string yield None.

    Returns
    -------
    str or None
    """
    if not isinstance(s, str):
        return None
    normalised = s.strip().upper()
    if normalised in _RECOGNISED_SUFFIXES:
        return normalised
    return None
    

def cop(first_name, last_name, middle, gender, race,
        employment_date, birth_year, last_employment_date, suffix=None, uid=None, id_=None):
    """Build one officer record as an OrderedDict with a fixed key order.

    Gender and race are translated through ``gender_str`` / ``race_str`` (labels that
    are not in those tables become None) and the suffix goes through ``suffix_str``.
    A ``last_employment_date`` equal to ``pd.Timestamp.max`` is stored as None.
    """
    final_day = None if last_employment_date == pd.Timestamp.max else last_employment_date

    record = OrderedDict()
    record["id"] = id_
    record["first_name"] = first_name
    record["last_name"] = last_name
    record["middle_initial"] = middle
    record["suffix"] = suffix_str(suffix)
    record["gender"] = gender_str.get(gender)
    record["race"] = race_str.get(race)
    record["birth_year"] = birth_year
    record["employment_date"] = employment_date
    record["last_employment_date"] = final_day
    record["unique_internal_identifier"] = uid
    return record

def add_cop(c, d):
    """Fill the missing fields of officer record ``c`` from record ``d``, in place.

    For every key present in either mapping, a value in ``c`` that is absent or
    missing (None/NaN/NaT/pd.NA) is replaced by ``d.get(key)``. Values already
    present in ``c`` win. Returns None.

    Keys are visited in a fixed order: the keys of ``c`` first, then the keys that
    only ``d`` has, in ``d``'s order. Iterating a set of string keys instead would
    make the insertion order of new keys depend on hash randomisation and so vary
    between runs.
    """
    ordered_keys = list(c.keys()) + [k for k in d.keys() if k not in c]
    for k in ordered_keys:
        if pd.isna(c.get(k)):
            c[k] = d.get(k)
            
def fields_in_order(series, fields):
    """Return the values of ``series`` looked up by each name in ``fields``, as a list in that order."""
    picked = []
    for field in fields:
        picked.append(series[field])
    return picked

def cop_and_all_assignments(uid, prod_id, cops_index):
    """Assemble one officer record with its assignments and links.

    Reads the module-level frames ``cpdp_reduced``, ``cops``, ``current_prod``,
    ``all_assignments`` and ``oos_assignments``.

    Parameters
    ----------
    uid :
        Label of the officer's row in ``cpdp_reduced``, or a missing value.
    prod_id :
        Label of the officer's row in ``current_prod``, or a missing value.
    cops_index :
        Label of the officer's row in ``cops`` and the value matched against
        ``all_assignments["cops_index"]``, or a missing value.

    Returns
    -------
    dict
        ``{"cop": OrderedDict, "assignments": list of OrderedDict, "links": list of OrderedDict}``.
        The officer id is ``current_prod``'s "id" when ``prod_id`` is given, otherwise
        ``"#U" + uid``, otherwise ``"#HA" + cops_index``.

    Raises
    ------
    Exception
        With message "No valid index" when all three identifiers are missing.
    """
    # Officer details from CPDP (10 positional values, the last one fills cop()'s `uid`).
    if pd.notna(uid):
        cpdp_cop = cpdp_reduced.loc[uid]
        cop_dict = cop(*fields_in_order(cpdp_cop,["first_name", "last_name", "middle_initial", "gender",
                                          "race", "appointed_date", "birth_year",
                                          "resignation_date", "suffix_name","UID"]))
    else:
        cpdp_cop = None
        cop_dict = OrderedDict()

    # Officer details from the historic assignments (9 positional values, no uid).
    # NOTE(review): `cops` is built with .reset_index(), so this label lookup assumes the
    # row label equals cops_index - confirm cops_index is a gap-free 0..n-1 range.
    if pd.notna(cops_index):
        hist_cop = cops.loc[cops_index]
        hist_cop_dict = cop(*fields_in_order(hist_cop, ["FIRST NAME", "last_name", "MIDDLE INITIAL",
                                           "SEX", "RACE", "APPOINTED DATE", "YEAR OF BIRTH",
                                           "END DATE", "suffix"]))
    else:
        hist_cop = None
        hist_cop_dict = OrderedDict()
        
    # Pick the officer id: the production id when known, otherwise a synthetic
    # "#U<uid>" or "#HA<cops_index>" string.
    if pd.notna(prod_id):
        prod_cop = current_prod.loc[prod_id]
        cop_prod_id = prod_cop["id"]
    else:
        prod_cop = None
        if pd.notna(uid):
            cop_prod_id = "#U"+str(uid)
        elif pd.notna(cops_index):
            cop_prod_id = "#HA"+str(cops_index)
        else:
            raise Exception("No valid index")
    
    
    # CPDP values take precedence; the historic record only fills fields that are missing.
    # Only the id is taken from the production row.
    add_cop(cop_dict, hist_cop_dict)
    cop_dict["id"] = cop_prod_id
    
    
    # Assignments: start from the historic assignment rows of this cops_index.
    assignments_df = all_assignments[all_assignments["cops_index"]==cops_index]
    assignments = [assignment(a["STAR NUMBER"], a["cleaned_job"], a["EFFECTIVE DATE"], a["END DATE"],
                             officer_id = cop_prod_id) for _, a in assignments_df.iterrows()]
    
    # Cleanup: assignments that share badge number and job title and whose date ranges
    # overlap or touch are merged into one.
    # First bucket the assignments by (badge_number, job_title).
    cleaning = {}
    for a in assignments:
        cleaning[(a["badge_number"], a["job_title"])] = []
    for a in assignments:
        cleaning[(a["badge_number"], a["job_title"])].append(a)
    reduced_assignments = []
    for value in cleaning.values():
        # Sweep each bucket in chronological order, extending `cur` while the next
        # assignment starts on or before cur's end.
        value.sort(key=lambda x: (x["start"], x["end"]))
        cur = None
        for a in value:
            if cur is None:
                cur = a
            else:
                if cur["end"]>=a["start"]:
                    cur["end"]=max(a["end"], cur["end"])
                else:
                    reduced_assignments.append(cur)
                    cur = a
        # Buckets are never empty, so `cur` holds the last merged run here.
        reduced_assignments.append(cur)
    assignments = reduced_assignments
    # Done cleaning.
    
    # Undated assignments: the CPDP current star/rank, plus the production
    # assignment rows whose "officer id" equals prod_id.
    add_assignments = []
    if cpdp_cop is not None:
        add_assignments = [assignment(cpdp_cop["current_star"], cpdp_cop["cleaned_rank"],
                                      officer_id=cop_prod_id)]
    if prod_cop is not None:
        add_assignments_df = oos_assignments[oos_assignments["officer id"]==prod_id]
        for _, a in add_assignments_df.iterrows():
            add_assignments.append(assignment(a["badge number"], a["cleaned_job"], id_=a["id"],
                                             officer_id=cop_prod_id))
    for add in add_assignments:
        match = [a for a in assignments if assignment_match(a, add)]
        if len(match)==0:
            # Nothing compatible yet: append it, unless it carries neither badge nor title.
            if pd.notna(add["badge_number"]) or pd.notna(add["job_title"]):
                assignments.append(add)
        elif add["id"] is not None:
            # Already covered: only copy the id onto the first matching assignment.
            match[0]["id"] = add["id"] # update one existing assignment
            
    # An end date equal to pd.Timestamp.max is replaced by None.
    for a in assignments:
        if a["end"] == pd.Timestamp.max:
            a["end"] = None
            
    # One CPDP profile link when the CPDP row has a link_UID.
    links = []
    if (cpdp_cop is not None) and pd.notna(cpdp_cop["link_UID"]):
        link_uid = cpdp_cop["link_UID"]
        link = OrderedDict([("id",""), ("url",f"https://cpdp.co/officer/{link_uid}/"),
                           ("title","Officer's profile on CPDP"),("link_type","Link"),
                           ("author", "Citizens Police Data Project"), ("officer_ids",cop_prod_id)])
        links.append(link)
            
    return {"cop": cop_dict, "assignments": assignments, "links": links}

def create_tables(merge_table, department_name):
    """Build the officers, assignments and links tables from a merge table.

    Each row of ``merge_table`` is unpacked positionally into
    ``cop_and_all_assignments``, so its columns must come in that function's
    argument order.

    Returns
    -------
    tuple of (DataFrame, DataFrame, DataFrame)
        Officers (with ``department_name`` added as a column), assignments (with
        "start"/"end" renamed to "start_date"/"resign_date"), and links.
    """
    def _uid_text(value):
        # Missing identifiers become an empty string; present ones lose any ".0".
        return str(int(float(value))) if pd.notna(value) else ""

    officer_rows, assignment_rows, link_rows = [], [], []
    for row in merge_table.itertuples(index=False):
        bundle = cop_and_all_assignments(*row)
        officer_rows.append(bundle["cop"])
        assignment_rows.extend(bundle["assignments"])
        link_rows.extend(bundle["links"])

    cops_df = pd.DataFrame.from_dict(officer_rows)
    cops_df["id"] = cops_df["id"].astype("str")
    cops_df["unique_internal_identifier"] = cops_df["unique_internal_identifier"].apply(_uid_text)
    cops_df["birth_year"] = cops_df["birth_year"].astype("Int64")
    cops_df["department_name"] = department_name

    assignments_df = pd.DataFrame.from_dict(assignment_rows)
    assignments_df = assignments_df.rename(columns={"start": "start_date", "end": "resign_date"})
    nullable_int_columns = ["id", "badge_number"]
    assignments_df[nullable_int_columns] = assignments_df[nullable_int_columns].astype("Int64")

    links_table = pd.DataFrame.from_dict(link_rows)
    return cops_df, assignments_df, links_table
        

In [57]:
output_cops_df, output_assignments_df, links_df = create_tables(
    complete_merge_table, "Chicago Police Department")

In [58]:
output_cops_df.to_csv("3_out_CPD_officers.csv", index=False)

In [59]:
output_cops_df

Unnamed: 0,id,first_name,last_name,middle_initial,suffix,gender,race,birth_year,employment_date,last_employment_date,unique_internal_identifier,department_name
0,#U100001,A,BRODERSON,F,,M,WHITE,1926,1950-06-03,1980-03-10,100001,Chicago Police Department
1,#U100002,A,DALESSANDRO,A,,M,WHITE,1923,1956-09-01,1979-10-18,100002,Chicago Police Department
2,#U100003,A,KIBURAS,J,,M,WHITE,1948,1968-06-24,1980-10-01,100003,Chicago Police Department
3,#U100004,A,KUSTRA,J,,M,WHITE,1923,1953-09-28,1980-10-01,100004,Chicago Police Department
4,#U100005,A,MC LAUGHLIN,R,,M,BLACK,1941,1966-06-20,1986-09-16,100005,Chicago Police Department
5,#U100006,A RICHARD,LABRIOLA,,,M,WHITE,1943,1971-10-25,2004-03-15,100006,Chicago Police Department
6,11958,AARON,ACEVEDO,F,,M,HISPANIC,1982,2009-12-16,NaT,100007,Chicago Police Department
7,#U100008,AARON,ALLEY,V,,M,BLACK,1985,2016-06-27,NaT,100008,Chicago Police Department
8,12301,AARON,ARMSTEAD,P,,M,BLACK,1975,2005-12-05,NaT,100009,Chicago Police Department
9,13227,AARON,BURNS,C,,M,BLACK,1983,2013-06-03,NaT,100010,Chicago Police Department


In [60]:
output_assignments_df.to_csv("3_out_CPD_assignments.csv", index=False)

In [61]:
output_assignments_df

Unnamed: 0,id,officer_id,badge_number,job_title,start_date,resign_date
0,,#U100005,,POLICE OFFICER,NaT,NaT
1,,#U100006,15869,POLICE OFFICER,1987-04-27,2004-03-15
2,17,11958,7792,POLICE OFFICER,2010-06-14,2018-11-02
3,,11958,20009,DETECTIVE,2018-11-02,NaT
4,,#U100008,9516,POLICE OFFICER,2016-12-28,NaT
5,360,12301,13768,POLICE OFFICER,2006-04-19,NaT
6,1288,13227,4534,POLICE OFFICER,2013-11-12,NaT
7,,13436,13693,POLICE OFFICER,2005-05-24,2016-02-01
8,1497,13436,2349,SERGEANT,2016-02-03,2017-02-24
9,,13436,1140,SERGEANT,2017-02-24,NaT


In [62]:
links_df.to_csv("3_out_CPD_links.csv", index=False)

In [63]:
links_df

Unnamed: 0,id,url,title,link_type,author,officer_ids
0,,https://cpdp.co/officer/2863/,Officer's profile on CPDP,Link,Citizens Police Data Project,#U100001
1,,https://cpdp.co/officer/6100/,Officer's profile on CPDP,Link,Citizens Police Data Project,#U100002
2,,https://cpdp.co/officer/14560/,Officer's profile on CPDP,Link,Citizens Police Data Project,#U100003
3,,https://cpdp.co/officer/15445/,Officer's profile on CPDP,Link,Citizens Police Data Project,#U100004
4,,https://cpdp.co/officer/18373/,Officer's profile on CPDP,Link,Citizens Police Data Project,#U100005
5,,https://cpdp.co/officer/15521/,Officer's profile on CPDP,Link,Citizens Police Data Project,#U100006
6,,https://cpdp.co/officer/56/,Officer's profile on CPDP,Link,Citizens Police Data Project,11958
7,,https://cpdp.co/officer/411/,Officer's profile on CPDP,Link,Citizens Police Data Project,#U100008
8,,https://cpdp.co/officer/834/,Officer's profile on CPDP,Link,Citizens Police Data Project,12301
9,,https://cpdp.co/officer/3412/,Officer's profile on CPDP,Link,Citizens Police Data Project,13227
