In [1]:
import pandas as pd 
import numpy as np
import random

# Seed the stdlib and NumPy global RNGs for reproducibility.
# Note: the data-generation cell re-seeds both generators with 1 before drawing.
random.seed(42)
np.random.seed(42)

In [4]:
def n_size_random_subset(s, length=1):
    """Return a sorted tuple holding a random sample of ``s``.

    The sample size is drawn first, uniformly from ``length`` to ``len(s)``
    (both inclusive), using the stdlib ``random`` module. That many distinct
    elements are then sampled from ``s`` without replacement and sorted.

    Parameters:
    - s: Sequence to sample from (e.g. a ``range`` or a tuple).
    - length (int): Minimum size of the returned subset.

    Returns:
    - tuple: The sampled elements in ascending order.
    """
    sample_size = random.randint(length, len(s))
    picked = random.sample(s, sample_size)
    picked.sort()
    return tuple(picked)

def generate_unique_weights(df, subject_col, units_col, route_col, weight_col='WEIGHTS'):
    """
    Generate one Dirichlet-distributed weight tuple per unique combination of
    (SUBJECT_ID, UNITS, ROUTE) and broadcast it back to every row of ``df``.

    Each weight tuple has one entry per element of the route, is drawn with
    ``np.random.dirichlet`` (all concentration parameters equal to 1) and is
    rounded to two decimals, so the rounded entries may not sum to exactly 1.

    Side effect: ``df[route_col]`` is converted in place to tuples so that it
    is hashable.

    Parameters:
    - df (pd.DataFrame): Input DataFrame
    - subject_col (str): Column name for SUBJECT_ID
    - units_col (str): Column name for UNITS
    - route_col (str): Column name for ROUTE (tuple of ints)
    - weight_col (str): Name given to the returned Series

    Returns:
    - pd.Series: One weight tuple per row of ``df``, named ``weight_col`` and
      carrying the same index as ``df`` so it can be assigned back directly.
    """
    key_cols = [subject_col, units_col, route_col]

    # Ensure ROUTE column is hashable (tuple format)
    df[route_col] = df[route_col].apply(tuple)

    # Only the key columns take part in the lookup. Carrying every other
    # column through the merge would rename a pre-existing weight column to
    # *_x / *_y and make the final selection fail.
    unique_groups = df[key_cols].drop_duplicates().copy()

    # One draw per unique group, in order of first appearance
    unique_groups[weight_col] = unique_groups[route_col].map(
        lambda route: tuple(np.round(np.random.dirichlet([1] * len(route), size=1), 2)[0])
    )

    # A left merge keeps the row order of ``df`` but resets the index;
    # restore the caller's index so assignment aligns row by row.
    weights = df[key_cols].merge(unique_groups, on=key_cols, how='left')[weight_col]
    weights.index = df.index
    return weights
    
def generate_unique_sub_weights(df, subject_col, units_col, route_col, weight_col='SUB_WEIGHTS'):
    """
    Generate, for each unique combination of (SUBJECT_ID, UNITS, ROUTE), one
    pair of Dirichlet-distributed weights per element of ROUTE, and broadcast
    the result back to every row of ``df``.

    Each pair is drawn with ``np.random.dirichlet([1, 1])`` and rounded to two
    decimals, so a rounded pair sums to 1 only up to rounding error.

    Side effect: ``df[route_col]`` is converted in place to tuples so that it
    is hashable.

    Parameters:
    - df (pd.DataFrame): Input DataFrame
    - subject_col (str): Column name for SUBJECT_ID
    - units_col (str): Column name for UNITS
    - route_col (str): Column name for ROUTE (tuple of ints)
    - weight_col (str): Name given to the returned Series

    Returns:
    - pd.Series: One tuple of (w1, w2) pairs per row of ``df``, named
      ``weight_col`` and carrying the same index as ``df``.
    """
    key_cols = [subject_col, units_col, route_col]

    # Ensure ROUTE column is hashable (tuple format)
    df[route_col] = df[route_col].apply(tuple)

    # Restrict the lookup table to the key columns so that a pre-existing
    # column named ``weight_col`` in ``df`` cannot collide during the merge.
    unique_groups = df[key_cols].drop_duplicates().copy()

    def generate_pairs(route):
        # One (w1, w2) pair per route element, returned as a tuple of tuples
        pair_weights = np.round(np.random.dirichlet([1, 1], size=len(route)), 2)
        return tuple(map(tuple, pair_weights))

    unique_groups[weight_col] = unique_groups[route_col].map(generate_pairs)

    # The merge resets the index; restore the caller's index so the result
    # aligns row by row when it is assigned back to ``df``.
    sub_weights = df[key_cols].merge(unique_groups, on=key_cols, how='left')[weight_col]
    sub_weights.index = df.index
    return sub_weights
    
def map_subset_keys(df, keys_col, values_col, subset_col):
    """
    For every row, pick the values that belong to a subset of that row's keys.

    Within a row, ``df[keys_col]`` and ``df[values_col]`` are paired up
    positionally into a key -> value lookup. The row's ``df[subset_col]`` is
    then walked in order, and the value of each key that also appears in the
    row's keys is collected.

    Parameters:
    - df (pd.DataFrame): Input DataFrame.
    - keys_col (str): Column holding the full key sequence of each row.
    - values_col (str): Column holding the values, positionally matching the keys.
    - subset_col (str): Column holding the keys to extract for each row.

    Returns:
    - list[tuple]: One tuple per row of ``df`` with the selected values, in
      the order given by ``subset_col``.
    """
    selected = []
    rows = zip(df[keys_col], df[values_col], df[subset_col])
    for row_keys, row_values, wanted in rows:
        lookup = dict(zip(row_keys, row_values))
        picked = []
        for key in wanted:
            if key in row_keys:
                picked.append(lookup.get(key, {}))
        selected.append(tuple(picked))
    return selected

In [5]:
# Build a synthetic dataset of 100,000 rows.
# Routes are drawn from the exam ids 1..4 (range(1, 5)); every statement below
# that touches `random` or `np.random` consumes the seeded streams in order,
# so reordering them changes the generated values.
random.seed(1)
np.random.seed(1)
data_small_range = []
for _ in range(100000):
    route = n_size_random_subset(range(1,5),2)  # sorted subset of {1, 2, 3, 4} with at least 2 elements
    exams = n_size_random_subset(route, 1)  # non-empty sorted subset of ROUTE
    data_small_range.append((route, exams))

df = pd.DataFrame(data_small_range, columns=['ROUTE', 'EXAMS'])
# Unique ids: a random permutation of 100_000_000 .. 100_000_000 + len(df) - 1
df['ID'] = np.random.permutation(np.arange(100_000_000, 100000000 + len(df)))
df['SUBJECT_ID'] = np.random.choice([16, 35], size=len(df))
df['UNITS'] = np.random.choice([4, 5], size=len(df))
# Within each (SUBJECT_ID, UNITS) group, give every distinct ROUTE an integer code
df['MAPPING'] = df.groupby(['SUBJECT_ID', 'UNITS'])['ROUTE'].transform(lambda x: pd.factorize(x)[0])

# Full-route columns come from the generators; each PARTIAL_* column restricts
# the matching full-route column to the keys listed in EXAMS.
df['WEIGHTS'] = generate_unique_weights(df, subject_col='SUBJECT_ID', units_col='UNITS', route_col='ROUTE')
df['PARTIAL_WEIGHTS'] = map_subset_keys(df, keys_col='ROUTE', values_col='WEIGHTS', subset_col='EXAMS')
df['SUB_WEIGHTS'] = generate_unique_sub_weights(df, subject_col='SUBJECT_ID', units_col='UNITS', route_col='ROUTE')
df['PARTIAL_SUB_WEIGHTS'] = map_subset_keys(df, keys_col='ROUTE', values_col='SUB_WEIGHTS', subset_col='EXAMS')
# One pair of random integer grades in [0, 100] per route element
df['GRADES'] = [
    tuple((np.random.randint(0, 101), np.random.randint(0, 101)) for _ in range(len(t))) 
    for t in df['ROUTE']
]
df['PARTIAL_GRADES'] = map_subset_keys(df, keys_col='ROUTE', values_col='GRADES', subset_col='EXAMS')
# Fix the column order for display and downstream use
df = df[['ID','SUBJECT_ID','UNITS','ROUTE','MAPPING','WEIGHTS','SUB_WEIGHTS','GRADES','EXAMS','PARTIAL_WEIGHTS','PARTIAL_SUB_WEIGHTS','PARTIAL_GRADES']]
df

Unnamed: 0,ID,SUBJECT_ID,UNITS,ROUTE,MAPPING,WEIGHTS,SUB_WEIGHTS,GRADES,EXAMS,PARTIAL_WEIGHTS,PARTIAL_SUB_WEIGHTS,PARTIAL_GRADES
0,100043660,35,5,"(1, 2)",0,"(0.53, 0.47)","((0.4, 0.6), (0.84, 0.16))","((2, 44), (67, 32))","(2,)","(0.47,)","((0.84, 0.16),)","((67, 32),)"
1,100087278,16,4,"(2, 3, 4)",0,"(0.08, 0.57, 0.35)","((0.06, 0.94), (0.66, 0.34), (0.59, 0.41))","((36, 62), (96, 84), (32, 99))","(2,)","(0.08,)","((0.06, 0.94),)","((36, 62),)"
2,100014317,35,4,"(1, 2, 3)",0,"(0.1, 0.33, 0.57)","((0.31, 0.69), (0.07, 0.93), (0.08, 0.92))","((96, 46), (78, 44), (7, 56))","(1, 2, 3)","(0.1, 0.33, 0.57)","((0.31, 0.69), (0.07, 0.93), (0.08, 0.92))","((96, 46), (78, 44), (7, 56))"
3,100081932,16,5,"(1, 2, 3, 4)",0,"(0.11, 0.59, 0.02, 0.28)","((0.2, 0.8), (0.25, 0.75), (0.17, 0.83), (0.83...","((50, 14), (77, 3), (87, 18), (31, 3))","(1,)","(0.11,)","((0.2, 0.8),)","((50, 14),)"
4,100095321,16,5,"(1, 2)",1,"(0.33, 0.67)","((0.21, 0.79), (0.78, 0.22))","((77, 47), (91, 81))","(2,)","(0.67,)","((0.78, 0.22),)","((91, 81),)"
...,...,...,...,...,...,...,...,...,...,...,...,...
99995,100050057,35,4,"(1, 2, 3)",0,"(0.1, 0.33, 0.57)","((0.31, 0.69), (0.07, 0.93), (0.08, 0.92))","((38, 23), (43, 94), (65, 70))","(1, 2, 3)","(0.1, 0.33, 0.57)","((0.31, 0.69), (0.07, 0.93), (0.08, 0.92))","((38, 23), (43, 94), (65, 70))"
99996,100098047,35,5,"(3, 4)",5,"(0.27, 0.73)","((0.06, 0.94), (0.27, 0.73))","((90, 6), (85, 73))","(3, 4)","(0.27, 0.73)","((0.06, 0.94), (0.27, 0.73))","((90, 6), (85, 73))"
99997,100005192,35,5,"(1, 3, 4)",4,"(0.37, 0.4, 0.23)","((0.35, 0.65), (0.3, 0.7), (0.25, 0.75))","((72, 38), (75, 41), (58, 49))","(1, 3, 4)","(0.37, 0.4, 0.23)","((0.35, 0.65), (0.3, 0.7), (0.25, 0.75))","((72, 38), (75, 41), (58, 49))"
99998,100077708,35,5,"(2, 3, 4)",3,"(0.04, 0.06, 0.9)","((0.83, 0.17), (0.31, 0.69), (0.66, 0.34))","((37, 94), (5, 27), (29, 90))","(2, 3, 4)","(0.04, 0.06, 0.9)","((0.83, 0.17), (0.31, 0.69), (0.66, 0.34))","((37, 94), (5, 27), (29, 90))"


In [50]:
import pandas as pd
import numpy as np

def pivot_data_routes_with_routes(
    df: pd.DataFrame,
    id_col: str = 'ID',
    key_col: str = 'ROUTE',
    grades_col: str = 'GRADES',
    weights_col: str = 'WEIGHTS',
    sub_weights_col: str = 'SUB_WEIGHTS',
) -> pd.DataFrame:
    """
    Pivot the per-row tuple columns into one wide row per ID.

    The four tuple columns are exploded in lockstep (one output row per route
    element). Each exploded grade pair is split into EXAM_GRADE / MAGEN_GRADE
    and each sub-weight pair into EXAM_WEIGHT / MAGEN_WEIGHT. Every measure is
    then pivoted so that each route element gets its own column, named
    ``"<element>_<MEASURE>"``.

    Parameters:
    - df (pd.DataFrame): Frame with one row per ID and tuple-valued columns.
    - id_col (str): Column identifying a row; becomes the result index.
    - key_col (str): Column of route elements; its values become column prefixes.
    - grades_col (str): Column of (exam, magen) grade pairs, one per route element.
    - weights_col (str): Column of weights, one per route element.
    - sub_weights_col (str): Column of (exam, magen) weight pairs, one per route element.

    Returns:
    - pd.DataFrame: Indexed by ``id_col`` with columns ``<k>_EXAM_GRADE``,
      ``<k>_MAGEN_GRADE``, ``<k>_EXAM_WEIGHT``, ``<k>_MAGEN_WEIGHT`` and
      ``<k>_WEIGHT`` for every route element ``k``, sorted by column name.
      Cells for elements absent from a row's route are NaN.
    """
    exploded = df.explode([key_col, grades_col, weights_col, sub_weights_col]).reset_index()
    grades = pd.DataFrame(exploded[grades_col].tolist(), columns=['EXAM_GRADE', 'MAGEN_GRADE'])
    sub_weights = pd.DataFrame(exploded[sub_weights_col].tolist(), columns=['EXAM_WEIGHT', 'MAGEN_WEIGHT'])

    # The per-element weight is already a column of `exploded`, so only the
    # split pair columns need to be attached.
    exploded = pd.concat([exploded, grades, sub_weights], axis=1)

    # (source column in `exploded`, suffix used in the output column names)
    measures = [
        ('EXAM_GRADE', 'EXAM_GRADE'),
        ('MAGEN_GRADE', 'MAGEN_GRADE'),
        ('EXAM_WEIGHT', 'EXAM_WEIGHT'),
        ('MAGEN_WEIGHT', 'MAGEN_WEIGHT'),
        (weights_col, 'WEIGHT'),
    ]

    # Pivot each measure on its own so every measure keeps its own dtype.
    pivots = []
    for value_col, suffix in measures:
        pivoted = exploded.pivot(index=id_col, columns=key_col, values=value_col)
        pivoted.columns = [f"{key}_{suffix}" for key in pivoted.columns]
        pivots.append(pivoted)

    return pd.concat(pivots, axis=1).sort_index(axis=1)

# Example usage: pivot the full-route columns of `df` into one wide row per ID.
# NOTE(review): pivot_data_routes_with_routes draws no random numbers, so this
# seed does not influence the result.
np.random.seed(42)

complete_pivot_df = pivot_data_routes_with_routes(df)
display(complete_pivot_df)


Unnamed: 0_level_0,1_EXAM_GRADE,1_EXAM_WEIGHT,1_MAGEN_GRADE,1_MAGEN_WEIGHT,1_WEIGHT,2_EXAM_GRADE,2_EXAM_WEIGHT,2_MAGEN_GRADE,2_MAGEN_WEIGHT,2_WEIGHT,3_EXAM_GRADE,3_EXAM_WEIGHT,3_MAGEN_GRADE,3_MAGEN_WEIGHT,3_WEIGHT,4_EXAM_GRADE,4_EXAM_WEIGHT,4_MAGEN_GRADE,4_MAGEN_WEIGHT,4_WEIGHT
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
100000000,64.0,0.66,10.0,0.34,0.34,,,,,,,,,,,37.0,0.12,21.0,0.88,0.66
100000001,80.0,0.46,100.0,0.54,0.1,54.0,0.83,27.0,0.17,0.47,57.0,0.80,38.0,0.20,0.43,,,,,
100000002,62.0,0.97,21.0,0.03,0.38,0.0,0.04,20.0,0.96,0.05,94.0,0.12,86.0,0.88,0.57,,,,,
100000003,,,,,,,,,,,68.0,0.44,30.0,0.56,0.16,97.0,0.61,15.0,0.39,0.84
100000004,22.0,0.37,50.0,0.63,0.47,,,,,,,,,,,91.0,0.51,24.0,0.49,0.53
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
100099995,,,,,,58.0,0.98,51.0,0.02,0.58,,,,,,11.0,0.86,47.0,0.14,0.42
100099996,61.0,0.67,64.0,0.33,0.04,94.0,0.13,70.0,0.87,0.57,43.0,0.61,39.0,0.39,0.27,54.0,0.18,19.0,0.82,0.12
100099997,13.0,0.61,10.0,0.39,0.06,,,,,,27.0,0.68,70.0,0.32,0.94,,,,,
100099998,96.0,0.58,94.0,0.42,0.41,78.0,0.64,17.0,0.36,0.59,,,,,,49.0,0.71,72.0,0.29,0.0


In [51]:
def pivot_data_routes(
    df: pd.DataFrame,
    id_col: str = 'ID',
    key_col: str = 'EXAMS',
    grades_col: str = 'PARTIAL_GRADES',
    weights_col: str = 'PARTIAL_WEIGHTS',
    sub_weights_col: str = 'PARTIAL_SUB_WEIGHTS',
) -> pd.DataFrame:
    """
    Pivot the per-row partial (EXAMS-subset) tuple columns into one wide row per ID.

    The four tuple columns are exploded in lockstep (one output row per exam).
    Each exploded grade pair is split into EXAM_GRADE / MAGEN_GRADE and each
    sub-weight pair into EXAM_WEIGHT / MAGEN_WEIGHT. Every measure is then
    pivoted so that each exam gets its own column, named ``"<exam>_<MEASURE>"``.

    Parameters:
    - df (pd.DataFrame): Frame with one row per ID and tuple-valued columns.
    - id_col (str): Column identifying a row; becomes the result index.
    - key_col (str): Column of exam ids; its values become column prefixes.
    - grades_col (str): Column of (exam, magen) grade pairs, one per exam.
    - weights_col (str): Column of weights, one per exam.
    - sub_weights_col (str): Column of (exam, magen) weight pairs, one per exam.

    Returns:
    - pd.DataFrame: Indexed by ``id_col`` with columns ``<k>_EXAM_GRADE``,
      ``<k>_MAGEN_GRADE``, ``<k>_EXAM_WEIGHT``, ``<k>_MAGEN_WEIGHT`` and
      ``<k>_WEIGHT`` for every exam ``k``, sorted by column name. Cells for
      exams absent from a row are NaN.
    """
    exploded = df.explode([key_col, grades_col, weights_col, sub_weights_col]).reset_index()
    grades = pd.DataFrame(exploded[grades_col].tolist(), columns=['EXAM_GRADE', 'MAGEN_GRADE'])
    sub_weights = pd.DataFrame(exploded[sub_weights_col].tolist(), columns=['EXAM_WEIGHT', 'MAGEN_WEIGHT'])

    # The per-exam weight is already a column of `exploded`, so only the
    # split pair columns need to be attached.
    exploded = pd.concat([exploded, grades, sub_weights], axis=1)

    # (source column in `exploded`, suffix used in the output column names)
    measures = [
        ('EXAM_GRADE', 'EXAM_GRADE'),
        ('MAGEN_GRADE', 'MAGEN_GRADE'),
        ('EXAM_WEIGHT', 'EXAM_WEIGHT'),
        ('MAGEN_WEIGHT', 'MAGEN_WEIGHT'),
        (weights_col, 'WEIGHT'),
    ]

    # Pivot each measure on its own so every measure keeps its own dtype.
    pivots = []
    for value_col, suffix in measures:
        pivoted = exploded.pivot(index=id_col, columns=key_col, values=value_col)
        pivoted.columns = [f"{key}_{suffix}" for key in pivoted.columns]
        pivots.append(pivoted)

    return pd.concat(pivots, axis=1).sort_index(axis=1)

# Example usage: pivot the EXAMS-subset (PARTIAL_*) columns of `df` into one wide row per ID.
# NOTE(review): pivot_data_routes draws no random numbers, so this seed does
# not influence the result.
np.random.seed(42)

partial_pivot_df = pivot_data_routes(df)
display(partial_pivot_df)


Unnamed: 0_level_0,1_EXAM_GRADE,1_EXAM_WEIGHT,1_MAGEN_GRADE,1_MAGEN_WEIGHT,1_WEIGHT,2_EXAM_GRADE,2_EXAM_WEIGHT,2_MAGEN_GRADE,2_MAGEN_WEIGHT,2_WEIGHT,3_EXAM_GRADE,3_EXAM_WEIGHT,3_MAGEN_GRADE,3_MAGEN_WEIGHT,3_WEIGHT,4_EXAM_GRADE,4_EXAM_WEIGHT,4_MAGEN_GRADE,4_MAGEN_WEIGHT,4_WEIGHT
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
100000000,64.0,0.66,10.0,0.34,0.34,,,,,,,,,,,37.0,0.12,21.0,0.88,0.66
100000001,,,,,,54.0,0.83,27.0,0.17,0.47,,,,,,,,,,
100000002,,,,,,,,,,,94.0,0.12,86.0,0.88,0.57,,,,,
100000003,,,,,,,,,,,68.0,0.44,30.0,0.56,0.16,97.0,0.61,15.0,0.39,0.84
100000004,22.0,0.37,50.0,0.63,0.47,,,,,,,,,,,91.0,0.51,24.0,0.49,0.53
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
100099995,,,,,,,,,,,,,,,,11.0,0.86,47.0,0.14,0.42
100099996,61.0,0.67,64.0,0.33,0.04,94.0,0.13,70.0,0.87,0.57,,,,,,54.0,0.18,19.0,0.82,0.12
100099997,13.0,0.61,10.0,0.39,0.06,,,,,,27.0,0.68,70.0,0.32,0.94,,,,,
100099998,,,,,,78.0,0.64,17.0,0.36,0.59,,,,,,,,,,


In [53]:
def compute_final_grades(pivot_df: pd.DataFrame) -> pd.DataFrame:
    """
    Add per-exam and overall final grades to a pivoted grades table.

    For every exam id ``k`` found in the column names (the part before the
    first underscore of any column containing ``_EXAM``):

    - ``k_FINAL_GRADE = k_EXAM_GRADE * k_EXAM_WEIGHT + k_MAGEN_GRADE * k_MAGEN_WEIGHT``.
      A component with a missing factor is ignored; the result is NaN only
      when both components are missing, so a genuine grade of 0 stays 0.
    - ``FINAL_GRADE`` is the sum over exams of ``k_FINAL_GRADE * k_WEIGHT``,
      with missing values counted as 0.

    Exams lacking any of the four grade/weight columns are skipped. Exams
    without a ``k_WEIGHT`` column get a per-exam grade but do not contribute
    to ``FINAL_GRADE``.

    Parameters:
    - pivot_df (pd.DataFrame): Wide table with ``<k>_EXAM_GRADE``,
      ``<k>_MAGEN_GRADE``, ``<k>_EXAM_WEIGHT``, ``<k>_MAGEN_WEIGHT`` and
      ``<k>_WEIGHT`` columns. It is not modified.

    Returns:
    - pd.DataFrame: A copy of ``pivot_df`` with the added columns, sorted by
      name so that each ``<k>_FINAL_GRADE`` follows its exam's other columns
      and ``FINAL_GRADE`` comes last.
    """
    # Work on a copy so the caller's frame does not pick up helper columns.
    result = pivot_df.copy()

    def _numeric(column_name):
        # Pivoted weight columns can be object dtype; coerce before arithmetic.
        return pd.to_numeric(result[column_name], errors='coerce')

    # Sorted so that the floating-point summation order is the same on every run.
    exam_ids = sorted({col.split('_')[0] for col in result.columns if '_EXAM' in col})

    overall = pd.Series(0.0, index=result.index)
    for exam_id in exam_ids:
        exam_col = f"{exam_id}_EXAM_GRADE"
        magen_col = f"{exam_id}_MAGEN_GRADE"
        exam_weight_col = f"{exam_id}_EXAM_WEIGHT"
        magen_weight_col = f"{exam_id}_MAGEN_WEIGHT"
        final_weight_col = f"{exam_id}_WEIGHT"

        required = [exam_col, magen_col, exam_weight_col, magen_weight_col]
        if not all(col in result.columns for col in required):
            continue

        components = pd.concat(
            [
                _numeric(exam_col) * _numeric(exam_weight_col),
                _numeric(magen_col) * _numeric(magen_weight_col),
            ],
            axis=1,
        )
        # min_count=1: NaN only when neither component is available.
        exam_final = components.sum(axis=1, min_count=1)

        # The '~' prefix sorts after letters, placing this column after the
        # exam's other columns; it is stripped again below.
        result[f"{exam_id}_~FINAL_GRADE"] = exam_final

        if final_weight_col in result.columns:
            overall = overall + exam_final.fillna(0) * _numeric(final_weight_col).fillna(0)

    result['~FINAL_GRADE'] = overall
    result = result.sort_index(axis=1)
    result.columns = [col.replace('~', '') for col in result.columns]
    return result

# Compute the per-exam and overall FINAL_GRADE columns for both pivots:
# the full-route pivot and the EXAMS-subset (partial) pivot.
final_compute_pivot_df = compute_final_grades(complete_pivot_df)
final_partial_pivot_df = compute_final_grades(partial_pivot_df)
# `final_pivot_df` was never defined in this notebook; the rendered table
# matches the partial pivot, so display that frame.
display(final_partial_pivot_df)

Unnamed: 0_level_0,1_EXAM_GRADE,1_EXAM_WEIGHT,1_MAGEN_GRADE,1_MAGEN_WEIGHT,1_WEIGHT,1_FINAL_GRADE,2_EXAM_GRADE,2_EXAM_WEIGHT,2_MAGEN_GRADE,2_MAGEN_WEIGHT,...,3_MAGEN_WEIGHT,3_WEIGHT,3_FINAL_GRADE,4_EXAM_GRADE,4_EXAM_WEIGHT,4_MAGEN_GRADE,4_MAGEN_WEIGHT,4_WEIGHT,4_FINAL_GRADE,FINAL_GRADE
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100000000,64.0,0.66,10.0,0.34,0.34,45.64,,,,,...,,,,37.0,0.12,21.0,0.88,0.66,22.92,30.6448
100000001,,,,,,,54.0,0.83,27.0,0.17,...,,,,,,,,,,23.2227
100000002,,,,,,,,,,,...,0.88,0.57,86.96,,,,,,,49.5672
100000003,,,,,,,,,,,...,0.56,0.16,46.72,97.0,0.61,15.0,0.39,0.84,65.02,62.0920
100000004,22.0,0.37,50.0,0.63,0.47,39.64,,,,,...,,,,91.0,0.51,24.0,0.49,0.53,58.17,49.4609
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
100099995,,,,,,,,,,,...,,,,11.0,0.86,47.0,0.14,0.42,16.04,6.7368
100099996,61.0,0.67,64.0,0.33,0.04,61.99,94.0,0.13,70.0,0.87,...,,,,54.0,0.18,19.0,0.82,0.12,25.30,47.1940
100099997,13.0,0.61,10.0,0.39,0.06,11.83,,,,,...,0.32,0.94,40.76,,,,,,,39.0242
100099998,,,,,,,78.0,0.64,17.0,0.36,...,,,,,,,,,,33.0636


In [54]:
# Per-ID gap between the FINAL_GRADE computed from the full route and the one
# computed from the EXAMS subset only. Both frames are indexed by ID, so the
# subtraction aligns on ID.
target = final_compute_pivot_df['FINAL_GRADE'] - final_partial_pivot_df['FINAL_GRADE']

In [42]:
def get_filtered_data(df:pd.DataFrame, subject_id, units):
    """Return the rows of ``df`` whose SUBJECT_ID and UNITS both match the given values."""
    subject_matches = df['SUBJECT_ID'].eq(subject_id)
    units_match = df['UNITS'].eq(units)
    return df.loc[subject_matches & units_match]

def create_target(df:pd.DataFrame):
    """
    Add a TARGET column to ``df``: for each row, the overall FINAL_GRADE computed
    from the full route minus the overall FINAL_GRADE computed from the EXAMS subset.

    Both grades come from pivot tables indexed by ID, so the difference is
    looked up through the row's ID rather than assigned positionally.
    ``df`` is modified in place and also returned.

    Parameters:
    - df (pd.DataFrame): Frame with an ID column plus the full-route and
      PARTIAL_* tuple columns consumed by the two pivot functions.

    Returns:
    - pd.DataFrame: ``df`` with the TARGET column added.
    """
    complete_pivot_df = pivot_data_routes_with_routes(df)
    final_compute_pivot_df = compute_final_grades(complete_pivot_df)
    # this is just for testing in the actual data we do need to get actual final grade with other methods
    final_grades = final_compute_pivot_df['FINAL_GRADE']

    partial_pivot_df = pivot_data_routes(df)
    final_partial_pivot_df = compute_final_grades(partial_pivot_df)
    partial_grades = final_partial_pivot_df['FINAL_GRADE']

    # The difference is indexed by ID while `df` has its own row index.
    # Assigning it directly would align on that row index and give NaN or
    # wrong rows, so look each row up by its ID instead.
    target_by_id = final_grades - partial_grades
    df['TARGET'] = df['ID'].map(target_by_id)
    return df

def create_model(data: pd.DataFrame):
    """
    Build the modelling table for one population: the EXAMS-subset pivot with
    its final-grade columns, plus a TARGET column.

    TARGET is the overall FINAL_GRADE computed from the full route minus the
    overall FINAL_GRADE computed from the EXAMS subset, aligned on ID.

    No estimator is fitted here yet; the function currently only prepares and
    returns the table.

    Parameters:
    - data (pd.DataFrame): Rows of the population to model, with an ID column
      plus the full-route and PARTIAL_* tuple columns consumed by the pivots.

    Returns:
    - pd.DataFrame: Indexed by ID, the partial pivot with final-grade columns
      and TARGET.
    """
    # Use the `data` argument, not the notebook-level `df`, so that the
    # population filtered by the caller is the one actually processed.
    complete_pivot_df = pivot_data_routes_with_routes(data)
    final_compute_pivot_df = compute_final_grades(complete_pivot_df)
    # this is just for testing in the actual data we do need to get actual final grade with other methods
    final_grades = final_compute_pivot_df['FINAL_GRADE']

    partial_pivot_df = pivot_data_routes(data)
    final_partial_pivot_df = compute_final_grades(partial_pivot_df)
    partial_grades = final_partial_pivot_df['FINAL_GRADE']

    # Attach the target to the frame returned by compute_final_grades so the
    # column names are the final ones; both Series are indexed by ID.
    training_frame = final_partial_pivot_df.copy()
    training_frame['TARGET'] = final_grades - partial_grades
    return training_frame

# for converting dicts
#  df['tuples_col'] = df['dict_col'].apply(lambda d: tuple(d.values()))

def main(complete_data_routes, subject_id, units):
    """
    Call ``create_model`` once per distinct MAPPING value within one
    (SUBJECT_ID, UNITS) population.

    Parameters:
    - complete_data_routes (pd.DataFrame): Frame with SUBJECT_ID, UNITS and
      MAPPING columns (plus whatever ``create_model`` needs).
    - subject_id: SUBJECT_ID value to keep.
    - units: UNITS value to keep.

    NOTE(review): in the lines shown here ``model`` is assigned but not used
    or returned — confirm whether results should be collected.
    """
    # filter people to SUBJECT_ID and UNITS
    pop = complete_data_routes[(complete_data_routes['SUBJECT_ID']==subject_id)&(complete_data_routes['UNITS']==units)]
    for mapping in pop.MAPPING.unique():
        # Cast to a plain int so the log line below shows a bare number
        mapping = int(mapping)
        print(f'creating model for ({subject_id=}, {units=}, {mapping=})')
        pop_filtered = pop[pop['MAPPING']==mapping]
        model = create_model(pop_filtered)