Skip to content

Commit

Permalink
Ensure consistent MH frequencies (#150)
Browse files Browse the repository at this point in the history
  • Loading branch information
standage committed Oct 23, 2023
1 parent 4daf19d commit 668fdb3
Show file tree
Hide file tree
Showing 23 changed files with 646 additions and 564 deletions.
3 changes: 3 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,9 @@ This project adheres to [Semantic Versioning](http://semver.org/).
### Changed
- Merged RSIDs resolved during database build now propagated to the final marker definition (see #149).

### Fixed
- Added manual and automated fixes to ensure frequencies are formatted correctly and matched to the correct marker definition (see #150).


## [0.10.1] 2023-10-13

Expand Down
2 changes: 1 addition & 1 deletion dbbuild/build-summary.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3034,7 +3034,7 @@ Marker mh11KK-040 as defined in Staadig2021 was defined previously and is redund
- 59 marker definitions based on 658 SNPs
[Byrska-Bishop2022]
- 31 population samples
- frequencies for 59232 distinct haplotypes in 31 populations; 856064 total frequencies
- frequencies for 59233 distinct haplotypes in 31 populations; 856067 total frequencies
[Fan2022]
- 22 marker definitions based on 338 SNPs and 25 indels
[Yu2022G1]
Expand Down
22 changes: 19 additions & 3 deletions dbbuild/build.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,9 +30,9 @@ def main(
index.update_marker_names()
index.markers.to_csv("marker.csv", index=False)
index.indels.to_csv("indels.csv", index=False)
index.frequencies.to_csv(
"frequency.csv.gz", index=False, float_format="%.5f", compression="gzip"
)
frequencies = index.frequencies
frequencies = cleanup_frequencies(frequencies)
frequencies.to_csv("frequency.csv.gz", index=False, float_format="%.5f", compression="gzip")
index.populations.to_csv("population.csv", index=False)
index.merges.to_csv("merged.csv", index=False)
repeats = flag_repeats(Path(rmsk_path) / "rmsk.txt.gz", "marker.csv", delta=25)
Expand Down Expand Up @@ -60,6 +60,22 @@ def validate_paths(dbsnp_path, rmsk_path, chain_path):
raise FileNotFoundError(",".join(missing))


# Ordered marker-name corrections applied by cleanup_frequencies().
# Each rule is:
#   (marker names, exact match?, source, variant count or None, corrected name)
# - "marker names" is a tuple; with exact match the Marker must equal one of
#   them, otherwise the Marker must start with one of them.
# - "variant count" is the number of "|"-separated entries in the Allele
#   string; None means the rule applies regardless of the count.
# Rules run top to bottom against the progressively updated Marker column, so
# their order is significant and must be preserved.
_FREQUENCY_MARKER_FIXES = (
    (("mh01NK-001",), True, "Kidd2018", None, "mh01NH-01.v2"),
    (("mh01NK-001",), True, "Turchi2019", None, "mh01NH-01.v2"),
    (("mh01NK-001",), True, "Gandotra2020", None, "mh01NH-01.v2"),
    (("mh01NK-001",), True, "Staadig2021", None, "mh01NH-01.v1"),
    (("mh05KK-023", "mh05KK-020"), False, "Kidd2018", 3, "mh05KK-023.v1"),
    (("mh05KK-023", "mh05KK-020"), False, "Kidd2018", 4, "mh05KK-023.v2"),
    (("mh05KK-023", "mh05KK-020"), False, "Turchi2019", None, "mh05KK-023.v1"),
    (("mh05KK-023", "mh05KK-020"), False, "Gandotra2020", None, "mh05KK-023.v3"),
    (("mh05KK-120",), False, "Kidd2018", 3, "mh05KK-120.v1"),
    (("mh05KK-120",), False, "Kidd2018", 4, "mh05KK-120.v2"),
)


def cleanup_frequencies(freq):
    """Reassign frequency records to the corrected marker names.

    Rows are selected by their ``Marker`` name, their ``Source`` and, for some
    rules, the number of ``|``-separated variants in their ``Allele`` string;
    the selected rows get the ``Marker`` value listed in
    ``_FREQUENCY_MARKER_FIXES``.

    Args:
        freq: pandas DataFrame with at least the string columns ``Marker``,
            ``Source`` and ``Allele``.

    Returns:
        A new DataFrame with the same columns, column order and index as
        ``freq`` and an updated ``Marker`` column. ``freq`` itself is left
        untouched: no helper column is added to it and its ``Marker`` values
        are not rewritten.
    """
    # Work on a copy so the caller's frame is neither relabelled in place nor
    # left carrying a temporary bookkeeping column.
    freq = freq.copy()

    # Number of variants per allele string, e.g. "A|C|G" -> 3. The pattern is
    # a regular expression, hence the escaped pipe.
    # NOTE(review): a missing Allele yields NaN here and therefore never
    # matches a count-based rule -- confirm that missing alleles cannot occur.
    num_vars = freq["Allele"].str.count(r"\|") + 1

    for names, exact, source, variant_count, corrected in _FREQUENCY_MARKER_FIXES:
        # Re-read the column on every pass: earlier rules may have renamed rows.
        markers = freq["Marker"]
        if exact:
            selected = markers.isin(names)
        else:
            selected = markers.str.startswith(names)
        selected = selected & (freq["Source"] == source)
        if variant_count is not None:
            selected = selected & (num_vars == variant_count)
        freq.loc[selected, "Marker"] = corrected

    return freq


def get_parser():
parser = ArgumentParser(description="MicroHapDB database build procedure")
parser.add_argument("dbsnp_path")
Expand Down
Binary file modified dbbuild/frequency.csv.gz
Binary file not shown.
4 changes: 2 additions & 2 deletions dbbuild/marker-aes.csv
Original file line number Diff line number Diff line change
Expand Up @@ -36048,8 +36048,8 @@ mh06WL-017.v1,MXL,6.983
mh06WL-017.v1,PEL,7.292
mh06WL-017.v1,PJL,6.547
mh06WL-017.v1,PUR,7.729
mh06WL-017.v1,SAS,8.805
mh06WL-017.v1,STU,10.080
mh06WL-017.v1,SAS,8.806
mh06WL-017.v1,STU,10.085
mh06WL-017.v1,TSI,8.339
mh06WL-017.v1,YRI,6.634
mh06WL-017.v2,1KGP,8.679
Expand Down
2 changes: 1 addition & 1 deletion dbbuild/marker.csv
Original file line number Diff line number Diff line change
Expand Up @@ -1109,7 +1109,7 @@ mh06SCUZJ-0278537,4,130,chr6,32588674,32588803,32588674;32588684;32588765;325888
mh06SHY-003,12,91,chr6,32605762,32605852,32605762;32605766;32605777;32605783;32605785;32605788;32605797;32605821;32605825;32605844;32605851;32605852,32573539;32573543;32573554;32573560;32573562;32573565;32573574;32573598;32573602;32573621;32573628;32573629,rs602427;rs9270967;rs9270968;rs9270969;rs602457;rs573809619;rs9270970;rs9270971;rs2760993;rs9270972;rs144123929;rs602875,Wu2021
mh06WL-050,27,300,chr6,32661208,32661507,32661208;32661214;32661215;32661219;32661221;32661238;32661245;32661250;32661255;32661269;32661287;32661289;32661297;32661310;32661314;32661332;32661333;32661334;32661335;32661462;32661463;32661482;32661490;32661492;32661504;32661505;32661507,32628985;32628991;32628992;32628996;32628998;32629015;32629022;32629027;32629032;32629046;32629064;32629066;32629074;32629087;32629091;32629109;32629110;32629111;32629112;32629239;32629240;32629259;32629267;32629269;32629281;32629282;32629284,rs28724236;rs117369774;rs117656997;rs9273560;rs9273561;rs281864341;rs281864337;rs41270893;rs73729446;rs28724237;rs9273584;rs28724238;rs9273588;rs117626117;rs9273595;rs281864289;rs9273608;rs281864287;rs9273609;rs559673115;rs281864275;rs3828786;rs199987570;rs9273685;rs9273695;rs9273696;rs114262759,Yu2022G3
mh06WL-017.v2,16,100,chr6,32663167,32663266,32663167;32663168;32663169;32663172;32663190;32663191;32663214;32663218;32663219;32663222;32663223;32663252;32663261;32663262;32663265;32663266,32630944;32630945;32630946;32630949;32630967;32630968;32630991;32630995;32630996;32630999;32631000;32631029;32631038;32631039;32631042;32631043,rs71542446;rs41270931;rs28724261;rs71542447;rs2854267;rs9274216;rs35986240;rs201930518;rs17613599;rs71542448;rs9274217;rs58770498;rs17613606;rs9274218;rs115495316;rs9274219,Yu2022G4
mh06WL-017.v1,32,199,chr6,32663167,32663365,32663167;32663168;32663169;32663172;32663190;32663191;32663214;32663218;32663219;32663222;32663223;32663252;32663252;32663261;32663262;32663265;32663266;32663267;32663281;32663288;32663293;32663298;32663302;32663303;32663327;32663332;32663336;32663342;32663352;32663353;32663356;32663365,32630944;32630945;32630946;32630949;32630967;32630968;32630991;32630995;32630996;32630999;32631000;32631029;32631029;32631038;32631039;32631042;32631043;32631044;32631058;32631065;32631070;32631075;32631079;32631080;32631104;32631109;32631113;32631119;32631129;32631130;32631133;32631142,rs71542446;rs41270931;rs28724261;rs71542447;rs2854267;rs9274216;rs35986240;rs201930518;rs17613599;rs71542448;rs9274217;rs58770498;rs58770498;rs17613606;rs9274218;rs115495316;rs9274219;rs9274220;rs2856703;rs17843723;rs41270932;rs9274222;rs72844333;rs200098349;rs74222206;rs281863378;rs9274225;rs41270933;rs17613629;rs17613636;rs281863362;rs9274227,Yu2022G1;Yu2022G2
mh06WL-017.v1,32,199,chr6,32663167,32663365,32663167;32663168;32663169;32663172;32663190;32663191;32663214;32663218;32663219;32663222;32663223;32663252;32663261;32663262;32663265;32663266;32663267;32663281;32663293;32663298;32663302;32663303;32663309;32663327;32663332;32663336;32663342;32663345;32663352;32663353;32663356;32663365,32630944;32630945;32630946;32630949;32630967;32630968;32630991;32630995;32630996;32630999;32631000;32631029;32631038;32631039;32631042;32631043;32631044;32631058;32631070;32631075;32631079;32631080;32631086;32631104;32631109;32631113;32631119;32631122;32631129;32631130;32631133;32631142,rs71542446;rs41270931;rs28724261;rs71542447;rs2854267;rs9274216;rs35986240;rs201930518;rs17613599;rs71542448;rs9274217;rs58770498;rs17613606;rs9274218;rs115495316;rs9274219;rs9274220;rs2856703;rs41270932;rs9274222;rs72844333;rs200098349;rs17843724;rs74222206;rs281863378;rs9274225;rs41270933;rs17613622;rs17613629;rs17613636;rs281863362;rs9274227,Yu2022G1;Yu2022G2
mh06LV-09,14,193,chr6,33056355,33056547,33056355;33056360;33056389;33056401;33056412;33056447;33056452;33056458;33056481;33056496;33056522;33056532;33056540;33056547,33024132;33024137;33024166;33024178;33024189;33024224;33024229;33024235;33024258;33024273;33024299;33024309;33024317;33024324,rs412735;rs570318069;rs449635;rs114484337;rs150884041;rs6457707;rs396638;rs6457708;rs12200756;rs435549;rs3130593;rs3130184;rs3130185;rs3130594,Voskoboinik2018
mh06WL-070,5,85,chr6,33893893,33893977,33893893;33893902;33893916;33893934;33893977,33861670;33861679;33861693;33861711;33861754,rs3800333;rs2031377;rs3800335;rs2031378;rs3800337,Yu2022G1
mh06WL-075,5,134,chr6,36734497,36734630,36734497;36734507;36734602;36734610;36734630,36702274;36702284;36702379;36702387;36702407,rs236453;rs236452;rs236451;rs77394127;rs236450,Yu2022G1
Expand Down
Loading

0 comments on commit 668fdb3

Please sign in to comment.