Cole (2024), "Are there too many traits in our selection indices?"
=
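This notebook reads the August 2024 format 38 (bulls) and CTeval files from the Council on Dairy Cattle Breeding and the conformation traits file from Holstein Association USA, and merges them on a combined animal ID (`id17`: breed code + country code + ID number). The merges are inner joins, so only bulls present in all three files are kept. Two derived columns (`pta_rfi` and `rel_pta_rfi`) are then added, and the results are saved in a pickled file named "PTA/2408/all_traits.pkl". This results file is used to create the graphs in "Cole Selection Index Graphics.ipynb".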

In [702]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

Creating Datasets
====
Run this code to merge the individual files together and create the pickled "master" file.

In [71]:
# Fixed-width layout used to read the format 38 (bull evaluation) file: each key is a
# column name and each value is that field's width in characters, listed in record order.
# The dict's insertion order is what defines the field order passed to pd.read_fwf.
# NOTE(review): the key 'genomic indicator' contains a space, unlike the
# 'genomic_indicator' column name used in read_format_CT -- confirm whether that is
# intentional before renaming, because renaming would change the merged column names.
format38contents = {'species_code': 1, 'breed_eval': 2, 'breed_code': 2, 'country_code': 3, 'id_number': 12,
                    'sire_breed_code': 2, 'sire_country_code': 3, 'sire_id_number': 12, 'dam_breed_code': 2,
                    'dam_country_code': 3, 'dam_id_number': 12, 'mgs_breed_code': 2, 'mgs_country_code': 3,
                    'mgs_id_number': 12, 'dual_breed_code': 2, 'dual_country_code': 3, 'dual_id_number': 12,
                    'birth_date': 8, 'registry_status': 2, 'name': 30, 'short_name': 20, 'date_entered_ai': 6,
                    'sampling_status': 1, 'sampling_controller': 4, 'current_status': 1, 'naab_controller': 4,
                    'num_naab_sire_codes': 1, 'naab_code': 10, 'secondary_naab_codes': 30, 'herd_state_code': 2,
                    'herd_county_code': 2, 'herd_code': 4, 'most_daus_count': 4, 'state_most_daus': 2,
                    'age_first_calving': 2, 'pct_daus_first': 3, 'inbreeding': 3, 'avg_dau_inbreeding': 3,
                    'efi': 3, 'rel_yld': 2, 'rel_dpr': 2, 'pta_milk': 5, 'rel_mf': 2, 'pta_fat': 4,
                    'pta_fat_pct': 3, 'rel_prot': 2, 'pta_prot': 4, 'pta_prot_pct': 3, 'rel_pl': 2, 'pta_pl': 3,
                    'rel_scs': 2, 'pta_scs': 3, 'rel_nm$': 2, 'fm$': 5, 'nm$': 5, 'cm$': 5, 'net_merit_pct': 2,
                    'pta_dpr': 3, 'dpr_ib_usability': 1, 'avg_first_lac_dim_mf': 3, 'avg_first_lac_dim_prot': 3,
                    'avg_pl_age_wgt': 3, 'pedigree_completeness': 3, 'pct_mf_rip': 3, 'pct_prot_rip': 3,
                    'num_herds_dpr': 5, 'num_herds_mf': 5, 'num_herds_prot': 5, 'num_herds_pl': 5,
                    'num_herds_scs': 5, 'num_daus_dpr': 5, 'num_daus_mf': 5, 'num_daus_prot': 5,
                    'num_daus_pl': 5, 'num_daus_scs': 5, 'scs_ib_usability': 1, 'ib_pref_id_code': 1,
                    'ib_usability_pl': 1, 'mf_lac_per_dau': 3, 'prot_lac_per_dau': 3, 'heterosis': 3,
                    'mf_lac_mgmt_grp': 3, 'prot_lac_mgmt_grp': 3, 'predominant_breed_for_xbred': 2,
                    'avg_std_dpr': 3, 'avg_std_milk': 5, 'avg_std_fat': 4, 'avg_std_fat_pct': 2,
                    'avg_std_milk_prot': 5, 'avg_std_prot': 4, 'avg_std_prot_pct': 2, 'avg_dau_pl': 3,
                    'avg_std_scs': 3, 'num_countries_eval': 2, 'country_most_daus': 3, 'dyd_milk': 5,
                    'dyd_fat': 4, 'dyd_fat_pct': 3, 'dyd_milk_protein': 5, 'dyd_prot': 4, 'dyd_prot_pct': 3,
                    'dyd_pl': 4, 'dyd_scs': 4, 'pct_predominant_breed_for_xbred': 2, 'pa_milk': 5,
                    'rel_pa_mf': 2, 'pa_fat': 4, 'rel_pa_prot': 2, 'pa_prot': 4, 'rel_pa_pl': 2, 'pa_pl': 3,
                    'rel_pa_scs': 2, 'pa_scs': 3, 'pct_us_daus': 3, 'ib_usability_yield': 1, 'herdbook': 2,
                    'eval_restriction_code': 1, 'zeroes_future_use': 15, 'dyd_dpr': 4, 'pa_dpr': 3,
                    'rel_pa_dpr': 2, 'pa_nm': 5, 'rel_pa_nm': 2, 'scr': 4, 'rel_scr': 2, 'scr_breedings': 7,
                    'breed_code_clone': 2, 'country_code_clone': 3, 'id_number_clone': 12, 'genomic indicator': 1,
                    'secondary_naab_codes_contd': 20, 'pta_hcr': 4, 'rel_pta_hcr': 2, 'num_herds_hcr': 5,
                    'num_daus_hcr': 6, 'ib_usability_hcr': 1, 'pta_ccr': 4, 'rel_pta_ccr': 2, 'num_herds_ccr': 5,
                    'num_daus_ccr': 6, 'ib_usability_ccr': 1, 'pa_hcr': 4, 'rel_pa_hcr': 2, 'pa_ccr': 4,
                    'rel_pa_ccr': 2, 'chip_type': 2, 'genomic_inbreeding': 4, 'gfi': 4, 'gm$': 5, 'pta_liv': 4,
                    'rel_pta_liv': 2, 'num_herds_liv': 5, 'num_daus_liv': 6, 'pa_liv': 4, 'rel_pa_liv': 2,
                    'pta_gl': 3, 'rel_pta_gl': 2, 'num_herds_gl': 5, 'num_daus_gl': 6, 'pa_gl': 3, 'rel_pa_gl': 2,
                    'pta_mfev': 4, 'rel_pta_mfev': 2, 'num_herds_mfev': 5, 'num_daus_mfev': 6, 'pa_mfev': 4,
                    'rel_pa_mfev': 2, 'pta_dsab': 4, 'rel_pta_dsab': 2, 'num_herds_dsab': 5, 'num_daus_dsab': 6,
                    'pa_dsab': 4, 'rel_pa_dsab': 2, 'pta_keto': 4, 'rel_pta_keto': 2, 'num_herds_keto': 5,
                    'num_daus_keto': 6, 'pa_keto': 4, 'rel_pa_keto': 2, 'pta_mast': 4, 'rel_pta_mast': 2,
                    'num_herds_mast': 5, 'num_daus_mast': 6, 'pa_mast': 4, 'rel_pa_mast': 2, 'pta_metr': 4,
                    'rel_pta_metr': 2, 'num_herds_metr': 5, 'num_daus_metr': 6, 'pa_metr': 4, 'rel_pa_metr': 2,
                    'pta_retp': 4, 'rel_pta_retp': 2, 'num_herds_retp': 5, 'num_daus_retp': 6, 'pa_retp': 4,
                    'rel_pa_retp': 2, 'pta_efc': 4, 'rel_pta_efc': 2, 'num_herds_efc': 5, 'num_daus_efc': 6,
                    'pa_efc': 4, 'rel_pa_efc': 2, 'pta_hliv': 4, 'rel_pta_hliv': 2, 'num_herds_hliv': 5,
                    'num_daus_hliv': 6, 'pa_hliv': 4, 'rel_pa_hliv': 2,'pta_fsav': 5, 'rel_pta_fsav': 2,
                    'num_herds_fsav': 5, 'num_daus_fsav': 6, 'pa_fsav': 5, 'rel_pa_fsav': 2,}

# Parallel lists consumed by read_format_38 as the `names` and `widths` arguments of
# pd.read_fwf; they stay aligned because both come from the same dict.
names = list(format38contents.keys())
widths = list(format38contents.values())
# Echo the widths so they can be checked by eye against the file layout.
print(widths)

[1, 2, 2, 3, 12, 2, 3, 12, 2, 3, 12, 2, 3, 12, 2, 3, 12, 8, 2, 30, 20, 6, 1, 4, 1, 4, 1, 10, 30, 2, 2, 4, 4, 2, 2, 3, 3, 3, 3, 2, 2, 5, 2, 4, 3, 2, 4, 3, 2, 3, 2, 3, 2, 5, 5, 5, 2, 3, 1, 3, 3, 3, 3, 3, 3, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 1, 1, 1, 3, 3, 3, 3, 3, 2, 3, 5, 4, 2, 5, 4, 2, 3, 3, 2, 3, 5, 4, 3, 5, 4, 3, 4, 4, 2, 5, 2, 4, 2, 4, 2, 3, 2, 3, 3, 1, 2, 1, 15, 4, 3, 2, 5, 2, 4, 2, 7, 2, 3, 12, 1, 20, 4, 2, 5, 6, 1, 4, 2, 5, 6, 1, 4, 2, 4, 2, 2, 4, 4, 5, 4, 2, 5, 6, 4, 2, 3, 2, 5, 6, 3, 2, 4, 2, 5, 6, 4, 2, 4, 2, 5, 6, 4, 2, 4, 2, 5, 6, 4, 2, 4, 2, 5, 6, 4, 2, 4, 2, 5, 6, 4, 2, 4, 2, 5, 6, 4, 2, 4, 2, 5, 6, 4, 2, 4, 2, 5, 6, 4, 2, 5, 2, 5, 6, 5, 2]


In [8]:
# Divisor applied to each format 38 column that is stored as an integer with an implied
# decimal point (e.g. the inbreeding coefficient, XX.X%, is stored in 3 bytes and must be
# divided by 10). Each column appears exactly once so that no field can be rescaled twice.
# NOTE(review): 'pa_dpr' is divided by 100 while 'pta_dpr' and 'dyd_dpr' are divided by 10;
# the original value is kept here -- confirm against the format 38 specification.
_FORMAT38_DIVISORS = {
    'inbreeding': 10.,
    'avg_dau_inbreeding': 10.,
    'efi': 10.,
    'pta_fat_pct': 100.,
    'pta_prot_pct': 100.,
    'pta_pl': 10.,
    'pta_scs': 100.,
    'pta_dpr': 10.,
    'avg_pl_age_wgt': 100.,
    'mf_lac_per_dau': 100.,
    'prot_lac_per_dau': 100.,
    'avg_std_dpr': 10.,
    'avg_std_fat_pct': 100.,
    'avg_std_prot_pct': 100.,
    'avg_dau_pl': 10.,
    'avg_std_scs': 100.,
    'dyd_fat_pct': 100.,
    'dyd_prot_pct': 100.,
    'dyd_pl': 10.,
    'dyd_scs': 100.,
    'pa_pl': 10.,
    'pa_scs': 100.,
    'dyd_dpr': 10.,
    'pa_dpr': 100.,
    'scr': 10.,
    'pta_hcr': 10.,
    'pta_ccr': 10.,
    'pa_hcr': 10.,
    'pa_ccr': 10.,
    'genomic_inbreeding': 10.,
    'gfi': 100.,
    'pta_liv': 10.,
    'pa_liv': 10.,
    'pta_gl': 100.,
    'pa_gl': 100.,
    'pta_mfev': 100.,
    'pa_mfev': 100.,
    'pta_dsab': 100.,
    'pa_dsab': 100.,
    'pta_keto': 100.,
    'pa_keto': 100.,
    'pta_mast': 100.,
    'pa_mast': 100.,
    'pta_metr': 100.,
    'pa_metr': 100.,
    'pta_retp': 100.,
    'pa_retp': 100.,
    'pta_efc': 100.,
    'pa_efc': 100.,
    'pta_hliv': 100.,
    'pa_hliv': 100.,
}


def read_format_38(infile):
    """Read a fixed-width format 38 file into a DataFrame and rescale implied-decimal fields.

    Parameters
    ----------
    infile : str, path object or file-like
        Passed straight to ``pd.read_fwf``. The field layout comes from the notebook-level
        ``names`` and ``widths`` lists, which must be defined before this is called.

    Returns
    -------
    pandas.DataFrame
        One row per record. Every column listed in ``_FORMAT38_DIVISORS`` is cast to float
        and divided by its divisor; an ``id17`` column (breed code + country code + ID
        number, concatenated as strings) is appended for use as a merge key.

    Notes
    -----
    Columns that are not rescaled keep whatever dtype pandas inferred, so some numeric
    columns are read as integers and others as floats.
    """
    fmt38 = pd.read_fwf(
        infile,
        names=names,
        widths=widths,
        header=None,
        index_col=False,
        na_filter=True,
        na_values=[' '],
    )

    # Restore the implied decimal point. Each column is scaled exactly once; the earlier
    # copy-pasted version divided pta_efc and pa_efc by 100 twice.
    for column, divisor in _FORMAT38_DIVISORS.items():
        fmt38[column] = fmt38[column].astype('float') / divisor

    fmt38['id17'] = fmt38['breed_code'].map(str) + fmt38['country_code'].map(str) + fmt38['id_number'].map(str)

    return fmt38

In [9]:
def read_format_CT(infile):
    """Read a fixed-width CTeval file into a DataFrame and append an ``id17`` merge key.

    Parameters
    ----------
    infile : str, path object or file-like
        Passed straight to ``pd.read_fwf``.

    Returns
    -------
    pandas.DataFrame
        One column per entry in the layout table below, plus ``id17`` (breed code +
        country code + ID number, concatenated as strings).
    """
    # (column name, (start, stop)) pairs; spans are 0-based and half-open, as read_fwf expects.
    layout = [
        ('breed_eval', (0, 2)),
        ('breed_code', (2, 4)),
        ('country_code', (4, 7)),
        ('id_number', (7, 19)),
        ('genomic_indicator', (20, 21)),
        ('naab_code', (22, 32)),
        ('short_name', (33, 53)),
        ('naab_controlled', (54, 58)),
        ('num_calvings_sce', (59, 64)),
        ('num_herds_sce', (65, 70)),
        ('rel_sce', (71, 73)),
        ('sol_sce', (74, 81)),
        ('pta_sce', (82, 86)),
        ('pct_domestic_sce', (87, 90)),
        ('country_most_sce', (91, 94)),
        ('num_countries_sce', (95, 97)),
        ('eval_source_sce', (98, 99)),
        ('officiality_sce', (100, 101)),
        ('num_calvings_dce', (102, 107)),
        ('num_daus_dce', (108, 113)),
        ('num_herds_dce', (114, 119)),
        ('rel_dce', (120, 122)),
        ('sol_dce', (123, 130)),
        ('pta_dce', (131, 135)),
        ('pct_domestic_dce', (136, 139)),
        ('country_most_dce', (140, 143)),
        ('num_countries_dce', (144, 146)),
        ('eval_source_dce', (147, 148)),
        ('officiality_dce', (149, 150)),
        ('num_calvings_ssb', (151, 156)),
        ('num_herds_ssb', (157, 162)),
        ('rel_ssb', (163, 165)),
        ('sol_ssb', (166, 173)),
        ('pta_ssb', (174, 178)),
        ('pct_domestic_ssb', (179, 182)),
        ('country_most_ssb', (183, 186)),
        ('num_countries_ssb', (187, 189)),
        ('eval_source_ssb', (190, 191)),
        ('officiality_ssb', (192, 193)),
        ('num_calvings_dsb', (194, 199)),
        ('num_daus_dsb', (200, 205)),
        ('num_herds_dsb', (206, 211)),
        ('rel_dsb', (212, 214)),
        ('sol_dsb', (215, 222)),
        ('pta_dsb', (223, 227)),
        ('pct_domestic_dsb', (228, 231)),
        ('country_most_dsb', (232, 235)),
        # NOTE(review): this span is 3 wide and abuts the next field, whereas the other
        # num_countries_* spans are 2 wide followed by a one-character gap -- confirm
        # whether (236, 238) was intended.
        ('num_countries_dsb', (236, 239)),
        ('eval_source_dsb', (239, 240)),
        ('officiality_dsb', (241, 242)),
    ]
    column_names = [label for label, _ in layout]
    column_spans = [span for _, span in layout]

    fmtCT = pd.read_fwf(
        infile,
        names=column_names,
        colspecs=column_spans,
        header=None,
        index_col=False,
    )

    # Build the merge key from the three identifier parts, each rendered as a string.
    breed, country, animal = (fmtCT[part].map(str) for part in ('breed_code', 'country_code', 'id_number'))
    fmtCT['id17'] = breed + country + animal
    return fmtCT

The *typeall* file is distributed by Holstein Association USA and contains the conformation traits and composites for all Holstein bulls. The file format is here: https://www.holsteinusa.com/dhs/us/ownnot.html.

In [10]:
# Remove pandas' cap on displayed columns so wide frames are shown in full.
pd.set_option('display.max_columns', None)

In [11]:
def read_typeall(infile):
    """Read a fixed-width *typeall* conformation file into a typed DataFrame.

    Parameters
    ----------
    infile : str, path object or file-like
        Passed straight to ``pd.read_fwf``; decoded as ISO-8859-1.

    Returns
    -------
    pandas.DataFrame
        One column per field in the layout below, plus ``id17`` (breed code + country
        code + ID number, concatenated as strings). Missing values in the count,
        reliability and RHA-status columns are replaced by -1; the integer and float
        columns listed below are cast explicitly.

    Raises
    ------
    ValueError
        Propagated from ``astype`` when a column to be cast holds a value that cannot be
        converted (for example a non-numeric string in a float column).
    """
    # Linear traits: each contributes three consecutive fields (STA, daughters' average
    # age-adjusted score, daughters' SD of age-adjusted score), in this order.
    linear_traits = [
        'stature', 'strength', 'body_depth', 'dairy_form', 'rump_angle', 'thurl_width',
        'rear_legs_side', 'rear_legs_rear', 'foot_angle', 'feet_legs_score',
        'fore_udder_attachment', 'rear_udder_height', 'rear_udder_width', 'udder_cleft',
        'udder_depth', 'front_teat_placement', 'rear_teat_placement', 'teat_length',
    ]
    linear_trait_columns = []
    for trait in linear_traits:
        linear_trait_columns.extend([
            'sta_' + trait,
            'daus_linear_avg_age_adj_score_' + trait,
            'daus_linear_sd_age_adj_score_' + trait,
        ])

    column_names = [
        'breed_code', 'country_code', 'id_number', 'rha_status', 'rha_indicator', 'registered_name', 'recessive_codes',
        'sire_breed_code', 'sire_country_code', 'sire_id_number', 'sire_rha_status', 'sire_rha_indicator',
        'sire_registered_name', 'sire_recessive_codes', 'dam_breed_code', 'dam_country_code', 'dam_id_number',
        'dam_rha_status', 'dam_rha_indicator', 'dam_registered_name', 'dam_recessive_codes', 'birth_date',
        'naab_code', 'naab_semen_status', 'ptat_summary_date', 'sta_ptat', 'ptat', 'rel_ptat', 'num_recs_ptat',
        'num_daus', 'num_grade_daus', 'num_herds', 'num_herd_year_class', 'eff_daus_herd', 'avg_daus_final_score',
        'daus_avg_age_adj_score', 'daus_sd_age_adj_score', 'daus_num_states_class', 'daus_num_years_class', 'linear_proof_ind',
        'linear_date', 'linear_rel', 'num_linear_recs', 'num_linear_daus', 'num_linear_herds', 'num_linear_herd_year_class',
        'linear_eff_daus_herd',
    ] + linear_trait_columns + [
        'pta_udd', 'pta_fl', 'pta_size', 'dairy_composite', 'origin_of_evaluation',
    ]

    # 0-based, half-open (start, stop) spans, one per entry in column_names.
    column_spans = [
        (0,2), (2,5), (5,17), (17,20), (20,22), (22,52), (52,68), (68,70), (70,73), (73,85), (85,88), (88,90),
        (90,120), (120,136), (136,138), (138,141), (141,153), (153,156), (156,158), (158,188), (188,204),
        (204,212), (212,222), (222,223), (223,229), (229,234), (234,239), (239,241), (241,247), (247,253),
        (253,258), (258,263), (263,268), (268,273), (273,277), (277,281), (281,285), (285,287), (287,289),
        (289,290), (290,296), (296,298), (298,304), (304,310), (310,315), (315,320), (320,325), (325,330),
        (330,334), (334,338), (338,343), (343,347), (347,351), (351,356), (356,360), (360,364), (364,369),
        (369,373), (373,377), (377,382), (382,386), (386,390), (390,395), (395,399), (399,403), (403,408),
        (408,412), (412,416), (416,421), (421,425), (425,429), (429,434), (434,438), (438,442), (442,447),
        (447,451), (451,455), (455,460), (460,464), (464,468), (468,473), (473,477), (477,481), (481,486),
        (486,490), (490,494), (494,499), (499,503), (503,507), (507,512), (512,516), (516,520), (520,525),
        (525,529), (529,533), (533,538), (538,542), (542,546), (546,551), (551,555), (555,559), (559,564),
        (564,569), (569,574), (574,579), (579,580),
    ]

    typeall = pd.read_fwf(
        infile,
        names=column_names,
        colspecs=column_spans,
        header=None,
        index_col=False,
        # The file may include non-ASCII characters, such as Â (A with a circumflex).
        encoding='ISO-8859-1',
    )

    # Replace missing values with a -1 sentinel so the integer casts below cannot hit NaN.
    # The frame-level fillna result is assigned back: calling fillna(..., inplace=True) on
    # a selected column is chained assignment and does not update the frame under pandas
    # copy-on-write.
    sentinel_columns = [
        'rha_status', 'sire_rha_status', 'dam_rha_status', 'rel_ptat', 'num_recs_ptat',
        'num_daus', 'num_grade_daus', 'num_herds', 'num_herd_year_class',
        'daus_num_states_class', 'daus_num_years_class', 'linear_rel', 'num_linear_recs',
        'num_linear_daus', 'num_linear_herds', 'num_linear_herd_year_class',
    ]
    typeall = typeall.fillna({column: -1 for column in sentinel_columns})

    # Merge key shared with the other evaluation files.
    typeall['id17'] = typeall['breed_code'].map(str) + typeall['country_code'].map(str) + typeall['id_number'].map(str)

    # Explicit casts. 'rha_status' receives the sentinel above but, as before, is not cast.
    int_columns = [
        'sire_rha_status', 'dam_rha_status', 'rel_ptat', 'num_recs_ptat', 'num_daus',
        'num_grade_daus', 'num_herds', 'num_herd_year_class', 'daus_num_states_class',
        'daus_num_years_class', 'linear_rel', 'num_linear_recs', 'num_linear_daus',
        'num_linear_herds', 'num_linear_herd_year_class',
    ]
    float_columns = [
        'sta_ptat', 'ptat', 'eff_daus_herd', 'avg_daus_final_score', 'daus_avg_age_adj_score',
        'daus_sd_age_adj_score', 'linear_eff_daus_herd',
    ] + linear_trait_columns + [
        'pta_udd', 'pta_fl', 'pta_size', 'dairy_composite',
    ]
    dtype_map = {column: 'int' for column in int_columns}
    dtype_map.update({column: 'float' for column in float_columns})

    return typeall.astype(dtype_map)

In [12]:
# Load the conformation (type) file for this run. The two commented-out calls below are
# earlier alternatives kept for reference; read_typecomp is not defined in the part of
# the notebook shown here.
# typecomp = read_typecomp('PTA/typecomp.all')
# typecomp = read_typeall('PTA/typeall.original')
typecomp = read_typeall('PTA/2408/typeall')

In [13]:
# Load the format 38 file; requires the `names`/`widths` cell to have been run first.
fmt38 = read_format_38('PTA/2408/38alloff.824')

In [22]:
# Load the CTeval file.
fmtCT = read_format_CT('PTA/2408/2408.CTeval.itb')

In [24]:
#pd.set_option('display.max_columns', None)
fmt38.head()

Unexpected exception formatting exception. Falling back to standard exception


Traceback (most recent call last):
  File "/opt/homebrew/anaconda3/lib/python3.11/site-packages/IPython/core/formatters.py", line 223, in catch_format_error
    r = method(self, *args, **kwargs)
        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/homebrew/anaconda3/lib/python3.11/site-packages/IPython/core/formatters.py", line 344, in __call__
    return method()
           ^^^^^^^^
  File "/opt/homebrew/anaconda3/lib/python3.11/site-packages/pandas/core/frame.py", line 1175, in _repr_html_
    # check whether repr fits horizontal by actually checking
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/homebrew/anaconda3/lib/python3.11/site-packages/pandas/io/formats/format.py", line 1074, in to_html
  File "/opt/homebrew/anaconda3/lib/python3.11/site-packages/pandas/io/formats/html.py", line 88, in to_string
    lines = self.render()
            ^^^^^^^^^^^^^
  File "/opt/homebrew/anaconda3/lib/python3.11/site-packages/pandas/io/formats/html.py", line 644, in re

   species_code breed_eval breed_code country_code     id_number  \
0             0         AY         AY          840  003002012784   
1             0         AY         AY          840  003006323920   
2             0         AY         AY          840  003006323924   
3             0         AY         AY          840  003007525611   
4             0         AY         AY          840  003007525668   

  sire_breed_code sire_country_code sire_id_number dam_breed_code  \
0              AY               USA   000100695362             AY   
1              AY               USA   000000144802             AY   
2              AY               USA   000000145313             AY   
3              AY               CAN   000101034046             AY   
4              AY               CAN   000101034046             AY   

  dam_country_code dam_id_number mgs_breed_code mgs_country_code  \
0              USA  000100579596             AY              USA   
1              840  003000295965        

In [25]:
# Inner join on id17: only animals present in both frames are kept. Column names that
# occur in both inputs are disambiguated with pandas' default _x (left) / _y (right) suffixes.
all_traits = fmt38.merge(typecomp, left_on='id17', right_on='id17', how='inner')

In [26]:
# Second inner join on id17. This cell rebinds all_traits, so re-running it without first
# re-running the previous merge cell would merge fmtCT in a second time.
# NOTE(review): columns already suffixed by the first merge no longer collide with fmtCT's
# unsuffixed names, so the suffixes in the result do not consistently identify the source
# file -- check the column names before relying on them.
all_traits = all_traits.merge(fmtCT, left_on='id17', right_on='id17', how='inner')

In [27]:
all_traits.head()

Unexpected exception formatting exception. Falling back to standard exception


Traceback (most recent call last):
  File "/opt/homebrew/anaconda3/lib/python3.11/site-packages/IPython/core/formatters.py", line 223, in catch_format_error
    r = method(self, *args, **kwargs)
        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/homebrew/anaconda3/lib/python3.11/site-packages/IPython/core/formatters.py", line 344, in __call__
    return method()
           ^^^^^^^^
  File "/opt/homebrew/anaconda3/lib/python3.11/site-packages/pandas/core/frame.py", line 1175, in _repr_html_
    # check whether repr fits horizontal by actually checking
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/homebrew/anaconda3/lib/python3.11/site-packages/pandas/io/formats/format.py", line 1074, in to_html
  File "/opt/homebrew/anaconda3/lib/python3.11/site-packages/pandas/io/formats/html.py", line 88, in to_string
    lines = self.render()
            ^^^^^^^^^^^^^
  File "/opt/homebrew/anaconda3/lib/python3.11/site-packages/pandas/io/formats/html.py", line 644, in re

   species_code breed_eval_x breed_code_x country_code_x   id_number_x  \
0             0           HO           HO            840  003000247244   
1             0           HO           HO            840  003000336265   
2             0           HO           HO            840  003000336289   
3             0           HO           HO            840  003000336293   
4             0           HO           HO            840  003000336295   

  sire_breed_code_x sire_country_code_x sire_id_number_x dam_breed_code_x  \
0                HO                 USA     000131688542               HO   
1                HO                 USA     000130312341               HO   
2                HO                 USA     000130312341               HO   
3                HO                 USA     000130588960               HO   
4                HO                 USA     000129800008               HO   

  dam_country_code_x dam_id_number_x mgs_breed_code mgs_country_code  \
0                USA

In [28]:
# Work on a copy so that adding derived columns leaves all_traits unchanged.
new_traits = all_traits.copy()

In [29]:
# Derive pta_rfi from pta_fsav and pta_size, and rel_pta_rfi from rel_pta_fsav and
# linear_rel (the two weights 0.367 and 0.633 sum to 1).
# NOTE(review): the constants 151.8, 0.367 and 0.633 are not explained anywhere in this
# notebook -- document their source.
# NOTE(review): read_typeall replaces missing linear_rel values with -1, so any such row
# gets a rel_pta_rfi computed from that sentinel -- confirm such rows cannot occur or
# mask them first.
new_traits['pta_rfi'] = -1. * ( new_traits['pta_fsav'] + 151.8 * new_traits['pta_size'] )
new_traits['rel_pta_rfi'] = ( new_traits['rel_pta_fsav'] - ( 0.367 * new_traits['linear_rel'] ) ) / 0.633

In [30]:
new_traits.head()

Unexpected exception formatting exception. Falling back to standard exception


Traceback (most recent call last):
  File "/opt/homebrew/anaconda3/lib/python3.11/site-packages/IPython/core/formatters.py", line 223, in catch_format_error
    r = method(self, *args, **kwargs)
        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/homebrew/anaconda3/lib/python3.11/site-packages/IPython/core/formatters.py", line 344, in __call__
    return method()
           ^^^^^^^^
  File "/opt/homebrew/anaconda3/lib/python3.11/site-packages/pandas/core/frame.py", line 1175, in _repr_html_
    # check whether repr fits horizontal by actually checking
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/homebrew/anaconda3/lib/python3.11/site-packages/pandas/io/formats/format.py", line 1074, in to_html
  File "/opt/homebrew/anaconda3/lib/python3.11/site-packages/pandas/io/formats/html.py", line 88, in to_string
    lines = self.render()
            ^^^^^^^^^^^^^
  File "/opt/homebrew/anaconda3/lib/python3.11/site-packages/pandas/io/formats/html.py", line 644, in re

   species_code breed_eval_x breed_code_x country_code_x   id_number_x  \
0             0           HO           HO            840  003000247244   
1             0           HO           HO            840  003000336265   
2             0           HO           HO            840  003000336289   
3             0           HO           HO            840  003000336293   
4             0           HO           HO            840  003000336295   

  sire_breed_code_x sire_country_code_x sire_id_number_x dam_breed_code_x  \
0                HO                 USA     000131688542               HO   
1                HO                 USA     000130312341               HO   
2                HO                 USA     000130312341               HO   
3                HO                 USA     000130588960               HO   
4                HO                 USA     000129800008               HO   

  dam_country_code_x dam_id_number_x mgs_breed_code mgs_country_code  \
0                USA

In [32]:
# Persist the merged frame, including the derived columns, for the graphics notebook.
new_traits.to_pickle('PTA/2408/all_traits.pkl')