In [1]:
import argparse, requests, re
import html
import pandas as pd
from os.path import join, isdir
import numpy as np

## Combine the fetched per-locus AFND data

The `AFND_data_locus_*.csv` files were fetched with the `AFND_get_data_per_locus.ipynb` notebook. Here we combine this clean data into a single table.

1. We load the per-locus data
2. We clean this data, keeping only the populations that have information for each of the selected loci.

**Output:** The final allele-frequency data is saved to the `AFND_data_locus_all.csv` file. 

In [2]:
# Loci whose per-locus CSV exports are combined into one table.
loci = ["A", "B", "C"]

# Read every per-locus file first and concatenate once, instead of re-copying
# the accumulated frame on each loop iteration. Row labels from the individual
# files are kept as-is (no ignore_index); renumbering is done in a later cell.
per_locus_frames = [
    pd.read_csv("AFND_data_locus_{}.csv".format(locus)) for locus in loci
]
full_df = pd.concat(per_locus_frames)

In [3]:
# Renumber the rows 0..n-1 (the concatenated frame repeats each file's own row
# labels) and discard the first data column.
# NOTE(review): the discarded column is presumably the unnamed index column
# stored in the per-locus CSV files -- confirm. The selection is positional, so
# re-running this cell removes one more column each time.
full_df = full_df.reset_index(drop=True).iloc[:, 1:]

In [4]:
# Tag each row with its locus, taken as the first character of the allele name
# (e.g. "A*02:01" -> "A").
full_df["Locus"] = [allele_name[0] for allele_name in full_df["Allele"]]

In [5]:
# Show the combined table for all loci, including the derived "Locus" column.
full_df

Unnamed: 0,Population,Allele,% of individuals that have the allele,Allele Frequency,Sample Size,Location,Allele - original,country,continent,Locus
0,American Samoa,A*02:01,,0.130,51,14_18_S_170_42_W,A*02:01,American Samoa,Oceania,A
1,American Samoa,A*02:06,,0.130,51,14_18_S_170_42_W,A*02:06,American Samoa,Oceania,A
2,American Samoa,A*03:01,,0.020,51,14_18_S_170_42_W,A*03:01,American Samoa,Oceania,A
3,American Samoa,A*11:01,,0.160,51,14_18_S_170_42_W,A*11:01,American Samoa,Oceania,A
4,American Samoa,A*11:04,,0.010,51,14_18_S_170_42_W,A*11:04,American Samoa,Oceania,A
...,...,...,...,...,...,...,...,...,...,...
66752,Zimbabwe Harare Shona,C*14:03,,0.004,230,17_51_S_31_1_E,C*14:03,Zimbabwe,Africa,C
66753,Zimbabwe Harare Shona,C*15:05,,0.002,230,17_51_S_31_1_E,C*15:05,Zimbabwe,Africa,C
66754,Zimbabwe Harare Shona,C*16:01,,0.080,230,17_51_S_31_1_E,C*16:01,Zimbabwe,Africa,C
66755,Zimbabwe Harare Shona,C*17:01,,0.082,230,17_51_S_31_1_E,C*17:01,Zimbabwe,Africa,C


### Keep only the populations that have data for all 3 loci

In [6]:
# Collect every population that lacks data for at least one of the selected
# loci (the `loci` list defined when the per-locus files were loaded), so the
# check follows the selection instead of a hard-coded A/B/C test.
# The per-locus sums of allele frequencies are printed for manual inspection.
required_loci = set(loci)
uncovered_populations = []
for population in full_df.Population.unique():
    print(population)
    freq_sum_by_locus = (
        full_df[full_df.Population == population]
        .groupby(["Locus"])["Allele Frequency"]
        .sum()
    )
    print(freq_sum_by_locus)
    # The group index holds exactly the loci this population has rows for.
    covered_loci = list(freq_sum_by_locus.index)
    print(covered_loci)
    if not required_loci.issubset(covered_loci):
        uncovered_populations.append(population)

American Samoa
Locus
A    1.00
B    1.00
C    0.97
Name: Allele Frequency, dtype: float64
['A', 'B', 'C']
Argentina Gran Chaco Eastern Toba
Locus
A    0.971
B    0.952
Name: Allele Frequency, dtype: float64
['A', 'B']
Argentina Gran Chaco Mataco Wichi
Locus
A    1.00
B    0.99
Name: Allele Frequency, dtype: float64
['A', 'B']
Argentina Gran Chaco Western Toba Pilaga
Locus
A    0.950
B    0.964
Name: Allele Frequency, dtype: float64
['A', 'B']
Argentina Rosario Toba
Locus
A    0.982
B    1.002
Name: Allele Frequency, dtype: float64
['A', 'B']
Armenia combined Regions
Locus
A    1.00
B    0.98
Name: Allele Frequency, dtype: float64
['A', 'B']
Australia Cape York Peninsula Aborigine
Locus
A    1.000
B    1.000
C    1.001
Name: Allele Frequency, dtype: float64
['A', 'B', 'C']
Australia Groote Eylandt Aborigine
Locus
A    1.001
B    1.002
C    1.001
Name: Allele Frequency, dtype: float64
['A', 'B', 'C']
Australia Kimberly Aborigine
Locus
A    1.000
B    0.999
C    1.001
Name: Allele Frequen

Locus
A    0.903
Name: Allele Frequency, dtype: float64
['A']
Czech Republic NMDR
Locus
A    1.0002
B    1.0007
C    0.9996
Name: Allele Frequency, dtype: float64
['A', 'B', 'C']
Ecuador Amerindians
Locus
A    0.9994
Name: Allele Frequency, dtype: float64
['A']
England Blood Donors of Mixed Ethnicity
Locus
A    0.9927
B    0.9946
C    0.9951
Name: Allele Frequency, dtype: float64
['A', 'B', 'C']
England North West
Locus
A    1.001
B    1.000
C    0.999
Name: Allele Frequency, dtype: float64
['A', 'B', 'C']
Finland
Locus
A    0.995
B    1.003
C    0.984
Name: Allele Frequency, dtype: float64
['A', 'B', 'C']
France French Bone Marrow Donor Registry
Locus
A    0.9822
B    0.9121
Name: Allele Frequency, dtype: float64
['A', 'B']
France Southeast
Locus
A    1.020
B    0.968
C    0.959
Name: Allele Frequency, dtype: float64
['A', 'B', 'C']
Gaza
Locus
A    0.9997
B    0.9996
C    1.0001
Name: Allele Frequency, dtype: float64
['A', 'B', 'C']
Georgia Svaneti Region Svan
Locus
A    1.005
B    0.

Locus
A    1.0
B    1.0
Name: Allele Frequency, dtype: float64
['A', 'B']
Malaysia Peninsular Chinese
Locus
A    1.0003
B    1.0010
C    1.0008
Name: Allele Frequency, dtype: float64
['A', 'B', 'C']
Malaysia Peninsular Indian
Locus
A    0.9987
B    0.9992
C    0.9995
Name: Allele Frequency, dtype: float64
['A', 'B', 'C']
Malaysia Peninsular Malay
Locus
A    1.0002
B    1.0001
C    1.0000
Name: Allele Frequency, dtype: float64
['A', 'B', 'C']
Malaysia Sarawak Bau Bidayuh
Locus
A    1.0
B    1.0
Name: Allele Frequency, dtype: float64
['A', 'B']
Mali Bandiagara
Locus
A    0.995
B    0.994
C    1.001
Name: Allele Frequency, dtype: float64
['A', 'B', 'C']
Mexico Chiapas Lacandon Mayans
Locus
A    1.0000
B    1.0004
C    1.0001
Name: Allele Frequency, dtype: float64
['A', 'B', 'C']
Mexico Chihuahua Chihuahua City Pop 2
Locus
A    0.9773
B    0.9888
Name: Allele Frequency, dtype: float64
['A', 'B']
Mexico Chihuahua Tarahumara
Locus
A    0.998
B    1.000
C    0.999
Name: Allele Frequency, dtyp

Locus
A    1.0
B    1.0
Name: Allele Frequency, dtype: float64
['A', 'B']
Sweden Southern Sami
Locus
A    1.0
B    1.0
Name: Allele Frequency, dtype: float64
['A', 'B']
Taiwan Ami
Locus
A    1.001
B    1.000
C    0.999
Name: Allele Frequency, dtype: float64
['A', 'B', 'C']
Taiwan Atayal
Locus
A    1.0
B    1.0
C    1.0
Name: Allele Frequency, dtype: float64
['A', 'B', 'C']
Taiwan Bunun
Locus
A    1.000
B    1.001
C    1.000
Name: Allele Frequency, dtype: float64
['A', 'B', 'C']
Taiwan Hakka
Locus
A    0.999
B    0.995
C    0.998
Name: Allele Frequency, dtype: float64
['A', 'B', 'C']
Taiwan Han Chinese
Locus
A    1.000
B    1.001
C    1.000
Name: Allele Frequency, dtype: float64
['A', 'B', 'C']
Taiwan Minnan pop 1
Locus
A    1.000
B    1.003
C    1.000
Name: Allele Frequency, dtype: float64
['A', 'B', 'C']
Taiwan Paiwan
Locus
A    1.000
B    1.000
C    1.001
Name: Allele Frequency, dtype: float64
['A', 'B', 'C']
Taiwan Pazeh
Locus
A    0.999
B    0.997
C    0.998
Name: Allele Frequency,

Locus
B    0.9844
C    0.9961
Name: Allele Frequency, dtype: float64
['B', 'C']
Thailand
Locus
B    0.960
C    1.001
Name: Allele Frequency, dtype: float64
['B', 'C']
Thailand pop 3
Locus
B    0.985
Name: Allele Frequency, dtype: float64
['B']
USA African American pop 8
Locus
B    1.008
Name: Allele Frequency, dtype: float64
['B']
USA European American pop 2
Locus
B    0.99
Name: Allele Frequency, dtype: float64
['B']
USA NMDP Black South or Central American
Locus
B    1.0273
Name: Allele Frequency, dtype: float64
['B']
China North Han pop 2
Locus
C    1.0003
Name: Allele Frequency, dtype: float64
['C']
China South Han pop 2
Locus
C    1.0007
Name: Allele Frequency, dtype: float64
['C']
Italy North
Locus
C    1.0
Name: Allele Frequency, dtype: float64
['C']
Italy South
Locus
C    1.002
Name: Allele Frequency, dtype: float64
['C']
Japan Kyoto and Osaka
Locus
C    0.984
Name: Allele Frequency, dtype: float64
['C']
Japan pop 1
Locus
C    0.999
Name: Allele Frequency, dtype: float64
['C']


In [7]:
# Keep only rows whose population was not flagged as missing a locus, then
# renumber the rows; reset_index() keeps the previous row labels as a new
# leading column.
has_all_loci = ~full_df["Population"].isin(uncovered_populations)
clean_full_data = full_df.loc[has_all_loci].reset_index()

In [8]:
# Remove the helper "index" column that reset_index() added when the filtered
# frame was built. Dropping it by name (rather than "everything but the first
# column") keeps this cell idempotent: re-running it no longer strips a real
# data column such as Population.
clean_full_data = clean_full_data.drop(columns=["index"], errors="ignore")
clean_full_data

Unnamed: 0,Population,Allele,% of individuals that have the allele,Allele Frequency,Sample Size,Location,Allele - original,country,continent,Locus
0,American Samoa,A*02:01,,0.130,51,14_18_S_170_42_W,A*02:01,American Samoa,Oceania,A
1,American Samoa,A*02:06,,0.130,51,14_18_S_170_42_W,A*02:06,American Samoa,Oceania,A
2,American Samoa,A*03:01,,0.020,51,14_18_S_170_42_W,A*03:01,American Samoa,Oceania,A
3,American Samoa,A*11:01,,0.160,51,14_18_S_170_42_W,A*11:01,American Samoa,Oceania,A
4,American Samoa,A*11:04,,0.010,51,14_18_S_170_42_W,A*11:04,American Samoa,Oceania,A
...,...,...,...,...,...,...,...,...,...,...
50555,Zimbabwe Harare Shona,C*14:03,,0.004,230,17_51_S_31_1_E,C*14:03,Zimbabwe,Africa,C
50556,Zimbabwe Harare Shona,C*15:05,,0.002,230,17_51_S_31_1_E,C*15:05,Zimbabwe,Africa,C
50557,Zimbabwe Harare Shona,C*16:01,,0.080,230,17_51_S_31_1_E,C*16:01,Zimbabwe,Africa,C
50558,Zimbabwe Harare Shona,C*17:01,,0.082,230,17_51_S_31_1_E,C*17:01,Zimbabwe,Africa,C


In [9]:
# Write the cleaned allele-frequency table to disk. The row index is written
# too (pandas default), so it appears as the first, unnamed column of the file.
output_csv = "AFND_data_locus_all.csv"
clean_full_data.to_csv(output_csv, index=True)