## Milestone Object Creation

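Fill logic: infer each milestone's status from its date cell, then mark milestones at or below a product's highest completed milestone as COMPLETED when their status is still missing.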

In [1]:
import pandas as pd
import numpy as np
# Load the source CSV into `mdf`; this frame is later passed to clean_rename_data.
mdf = pd.read_csv('vaccineworkfile1.csv', encoding='utf8')

In [11]:
# Load factory milestones
from reference_milestones import milestones, get_milestone_renaming_schema


### EXISTS IN COMMON ###
from dateutil.parser import parse

def convert_to_datetime(time_string):
    """Parse ``time_string`` into a datetime, or return ``None`` on failure.

    Args:
        time_string: Value handed to ``dateutil.parser.parse``; typically a
            date string such as ``'1/13/2020'``.

    Returns:
        The parsed value, or ``None`` when parsing raises or when the parsed
        value is null according to ``pd.isnull``.
    """
    try:
        date_val = parse(time_string)
    # Best-effort conversion: any parsing failure means "not a date".
    # ``Exception`` (rather than a bare ``except``) lets KeyboardInterrupt
    # and SystemExit propagate instead of being silently swallowed.
    except Exception:
        return None
    return None if pd.isnull(date_val) else date_val
#########################


# Define data preparation and filtering
def fill_product_id(dataframe: pd.DataFrame):
    """Ensure ``dataframe`` has a ``product_id`` column.

    When the column is missing it is added in place with the positional row
    numbers ``0..len(dataframe) - 1``. An existing column is left untouched.

    Args:
        dataframe: Frame to check; it is modified in place when the column
            has to be added.

    Returns:
        The same ``dataframe`` object.
    """
    if 'product_id' not in dataframe.columns:
        # Build the ids straight from the row count. Going through
        # ``reindex`` copied the whole frame just to read back 0..n-1 and
        # raised ValueError when the index held duplicate labels.
        dataframe['product_id'] = list(range(len(dataframe)))
    return dataframe

    
def remove_value(dictionary:dict, values:list):
    """Return the dictionary's values as a list with ``values`` removed.

    Each entry of ``values`` removes its first occurrence from the result
    (``list.remove`` semantics), so a ``ValueError`` is raised when an entry
    is not present. ``dictionary`` itself is not modified.

    Args:
        dictionary: Mapping whose values are copied into the result.
        values: Entries to drop; must be exactly a ``list``.

    Returns:
        A new list of the remaining values, in the dictionary's order.
    """
    assert type(values) is list, 'Send values as list object'
    remaining = [*dictionary.values()]
    for unwanted in values:
        remaining.remove(unwanted)
    return remaining


def infer_status(value):
    """Classify a milestone date cell into a status label.

    Returns:
        ``np.nan`` when ``value`` is ``None``; ``'COMPLETED'`` when
        ``convert_to_datetime`` can parse it; ``'SKIPPED'`` when its stripped
        text is exactly ``'SKIPPED'``; ``'ESTIMATED'`` when the text before
        the first ``':'`` is ``'Target'``; otherwise ``None``.
    """
    # NOTE(review): ``value == np.nan`` is always False because NaN never
    # compares equal to anything, so a NaN cell falls through every branch
    # and yields ``None``. build_status relies on that ``None`` to trigger
    # its fill step, so the comparison is deliberately left as-is here.
    if value is None or value == np.nan:
        return np.nan
    if convert_to_datetime(value) is not None:
        return 'COMPLETED'
    text = str(value)
    if text.strip() == 'SKIPPED':
        return 'SKIPPED'
    if text.split(':')[0] == 'Target':
        return 'ESTIMATED'
    return None


def compare_max_completed(row, lookup):
    """Return ``'COMPLETED'`` if the row's milestone is not past the product's maximum.

    Args:
        row: Object exposing ``milestone_id`` and ``product_id`` attributes.
        lookup: Mapping from ``product_id`` to the highest completed
            milestone id for that product.

    Returns:
        ``'COMPLETED'`` when ``row.milestone_id`` is less than or equal to
        ``lookup[row.product_id]``, otherwise ``None``.
    """
    highest_completed = lookup[row.product_id]
    return 'COMPLETED' if row.milestone_id <= highest_completed else None

    
def clean_rename_data(dataframe: pd.DataFrame, renaming_schema:dict):
    """Rename columns, keep rows whose ``source`` is ``'No'``, and trim columns.

    Args:
        dataframe: Raw input frame; it is not modified (the filtered rows are
            copied before ``product_id`` is filled in).
        renaming_schema: Mapping of original column names to new names. Its
            values are also the columns kept in the result.

    Returns:
        A frame restricted to the renamed columns listed in
        ``renaming_schema``, with ``product_id`` guaranteed by
        ``fill_product_id``.
    """
    prepared = (
        dataframe
        .rename(columns=renaming_schema)
        .query("source == 'No'")
        .copy()  # independent copy so fill_product_id can add a column safely
    )
    prepared = fill_product_id(prepared)
    wanted_columns = list(renaming_schema.values())
    return prepared[wanted_columns]


def melt_join_milestones(dataframe: pd.DataFrame, id_vars:list, value_vars:list):
    """Reshape milestone columns to long form and attach reference milestone data.

    Args:
        dataframe: Wide frame with one column per milestone.
        id_vars: Identifier column(s) to keep. The melted frame is relabelled
            to exactly three columns, so this is expected to hold one name.
        value_vars: Milestone columns to unpivot.

    Returns:
        A frame with ``product_id``, ``name`` and ``date`` columns, left-joined
        on ``name`` with ``pd.DataFrame(milestones)``.
    """
    long_form = dataframe.melt(id_vars=id_vars, value_vars=value_vars)
    # Positional relabel: identifier, melted column name, melted cell value.
    long_form.columns = ['product_id', 'name', 'date']
    reference = pd.DataFrame(milestones)
    return long_form.merge(reference, how='left', left_on='name', right_on='name')


def build_status(dataframe: pd.DataFrame):
    """Add ``status`` and ``id_completed`` columns to ``dataframe`` in place.

    Steps:
      1. ``status`` is inferred from each ``date`` cell via ``infer_status``.
      2. ``id_completed`` holds ``milestone_id`` for COMPLETED rows and 0
         otherwise (the id multiplied by the boolean mask).
      3. For every row whose status is still ``None``, the status becomes
         whatever ``compare_max_completed`` returns for that row and the
         per-product maximum of ``id_completed``.

    Args:
        dataframe: Frame with ``product_id``, ``date`` and ``milestone_id``
            columns; it is mutated.

    Returns:
        The same ``dataframe`` object.
    """
    dataframe['status'] = dataframe.date.apply(infer_status)

    completed_mask = dataframe.status == 'COMPLETED'
    dataframe['id_completed'] = dataframe.milestone_id * completed_mask

    # Highest completed milestone id per product.
    max_completed = (
        dataframe.groupby(by='product_id')['id_completed'].max().to_dict()
    )

    # Only rows whose status is exactly None are filled; NaN and string
    # statuses are carried over unchanged.
    dataframe['status'] = [
        compare_max_completed(row=row, lookup=max_completed)
        if row.status is None
        else row.status
        for _, row in dataframe.iterrows()
    ]
    return dataframe
    
def build_link_id(dataframe: pd.DataFrame):
    """Copy the frame's index labels into a ``link_id`` column, in place.

    Args:
        dataframe: Frame to annotate; it is mutated.

    Returns:
        The same ``dataframe`` object.
    """
    index_labels = dataframe.index.tolist()
    dataframe['link_id'] = index_labels
    return dataframe

In [12]:
# Rename the raw columns with the milestone schema and keep only the rows
# whose `source` column equals 'No'.
clean_data = clean_rename_data(mdf, get_milestone_renaming_schema())
# Unpivot to one row per (product_id, milestone) and join the reference
# milestone table. Every schema value except 'product_id' and 'source' is
# passed as a milestone column to melt.
formatted_data = melt_join_milestones(
    dataframe=clean_data, 
    id_vars=['product_id'], 
    value_vars=remove_value(get_milestone_renaming_schema(), ['product_id', 'source'])
)


In [13]:
# Both helpers mutate formatted_data in place; their return values are unused.
build_status(formatted_data)
build_link_id(formatted_data)
# Show only the rows that ended up with a non-missing status.
formatted_data[formatted_data.status.notna()]

Unnamed: 0,product_id,name,date,milestone_id,category,status,id_completed,link_id
0,0,pre_clinical_studies,SKIPPED,12,pre-clinical,SKIPPED,0,0
72,0,lead_selection,1/13/2020,13,pre-clinical,COMPLETED,13,72
144,0,clinical_batch,2/7/2020,21,manufacturing,COMPLETED,21,144
216,0,ind,3/4/2020,31,regulatory,COMPLETED,31,216
288,0,phase_1,3/16/2020,41,clinical_development,COMPLETED,41,288
576,0,discovery,1/11/2020,11,pre-clinical,COMPLETED,11,576
