In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from itertools import chain
%matplotlib inline

In [None]:
from IPython.display import display

In [None]:
# Load the data dictionary, the phenotype matrix and the sample list, then
# restrict the phenotype matrix to the individuals present in both.
phenostructure = pd.read_csv(
    '/psych/genetics_data/ccarey/UKBB/ukb_files/Data_Dictionary_Showcase.csv',
    index_col="FieldID",
)
phenos = pd.read_table(
    '/stanley/genetics/analysis/ukbb_sexdiff/family_based/ukb31063_fullsample_phesant_icd10_phenotypes_bothsexes.tsv',
    dtype=object,  # no dtype inference: values are kept as read
    index_col="userId",
)
sample = pd.read_table(
    '/psych/genetics_data/ccarey/UKBB/ukb_files/ukb_whitebritish.txt',
    sep=" ",
    index_col="IID",
)
# np.intersect1d yields the sorted, unique IDs found in both indexes
phenos = phenos.loc[np.intersect1d(sample.index, phenos.index)]

In [None]:
# get shape of phenotype matrix
# (rows are the userIds kept by the sample intersection, columns the phenotype columns)
phenos.shape

In [None]:
# boolean mask with the same shape as `phenos`: True where a value is present
phenos_notnull = phenos.notna()

In [None]:
# Column labels look like "<field>_<suffix>"; keep only the first column seen
# for each field prefix.
phenos_notnull_nodupes = phenos_notnull.loc[
    :,
    ~phenos_notnull.columns.str.split('_').str.get(0).duplicated(),
]

In [None]:
# plot sample-level completeness (fraction of non-missing items per individual)
# Vectorised row mean: same values as apply(np.mean, axis=1) without calling
# a Python function once per row.
plt.hist(phenos_notnull_nodupes.mean(axis=1),bins=50)
plt.show()

In [None]:
# plot item-level N's (number of individuals with a non-missing value per item)
# Vectorised column sum: same counts as apply(sum, axis=0), which iterates
# every column element by element with the Python builtin.
plt.hist(phenos_notnull_nodupes.sum(axis=0),bins=50)
plt.show()

In [None]:
# items/phenotypes become the row index, individuals become the columns
phenos_notnull_nodupes_itemindexed = phenos_notnull_nodupes.transpose()

In [None]:
# Look up each item's category path in the data dictionary, keyed by the
# integer field ID that precedes the first "_" in the item label.
phenos_notnull_nodupes_itemindexed['path'] = phenostructure.loc[
    phenos_notnull_nodupes_itemindexed.index.str.split('_').str[0].values.astype(int),
    'Path',
].values

In [None]:
# questionnaire dict: category path -> list of item labels filed under it
category_dict = (
    phenos_notnull_nodupes_itemindexed
    .reset_index()
    .groupby('path')['index']
    .apply(list)
    .to_dict()
)

In [None]:
# empty individuals x questionnaires frame; its columns are filled in below
unit_completeness = pd.DataFrame(
    index=phenos_notnull_nodupes.index,
    columns=category_dict.keys(),
)

In [None]:
# questionnaire path -> number of items it contains (filled in below)
item_ns = {}

In [None]:
# determine whether each individual attempted a questionnaire
# (i.e. has at least one non-missing item in it)
# also create dict of items per questionnaire
for key in category_dict.keys():
    item_ns[key]=len(category_dict[key])
    # vectorised row-wise "any": same result as apply(np.any, axis=1) without a
    # Python call per individual
    unit_completeness[key] = phenos_notnull_nodupes[category_dict[key]].any(axis=1)

In [None]:
# heatmap of the correlations between questionnaire "attempted" indicators
f, ax = plt.subplots(figsize=(15, 15))
sns.set(font_scale=0.75)
ax = sns.heatmap(
    unit_completeness.corr(),
    ax=ax,
    square=True,
    cbar_kws=dict(shrink=0.5),
)

In [None]:
# working copy that the merge step below modifies; `unit_completeness` stays intact
unit_completeness_merge = unit_completeness.copy(deep=True)

In [None]:
# create dict of lowest-level categories and their parent categories
# (parent = everything before the last ">" of the path, whitespace-stripped)
# `n` must be passed by keyword: recent pandas versions reject it positionally.
mykeys = list(unit_completeness_merge.columns.str.rsplit(">",n=1).str[0].str.strip())
myvals = list(unit_completeness_merge.columns)
d = dict()
for i in range(len(mykeys)):
    # parent path -> list of the full paths sharing that parent
    d.setdefault(mykeys[i], []).append(myvals[i])

In [None]:
# for each parent category, see if can merge some lowest-level categories with high cross-questionnaire correlations
# NOTE: this cell modifies `unit_completeness_merge` and `item_ns`; re-create
# both before running it a second time.
for key in d.keys():

    if len(d[key]) <= 1:
        continue

    tempcorr = unit_completeness[d[key]].corr()
    indices = np.where(tempcorr > 0.95)
    # keep each unordered pair once (upper triangle), in row-major order
    indices = [(tempcorr.index[x], tempcorr.columns[y]) for x, y in zip(*indices)
                                    if x < y]
    if len(indices) == 0:
        continue

    # create dict of highly correlated questionnaires: anchor -> [members].
    # `assigned` keeps the groups disjoint. Without it a questionnaire that is
    # correlated with two different anchors ends up in both groups, and the
    # second merge fails on the column / item count the first one removed.
    tempd = dict()
    assigned = set()
    for x,y in indices:
        if y in assigned:
            # already merged into an earlier group
            continue
        if x in tempd:
            tempd[x].append(y)
        elif x in assigned:
            # x is itself a member of an earlier group: do not start a new one
            continue
        else:
            tempd[x] = [y]
            assigned.add(x)
        assigned.add(y)

    # merge highly correlated questionnaires
    counter = 1
    for tempkey in tempd.keys():
        addlist = tempd[tempkey]
        addlist.append(tempkey)
        newcol = " > ".join([key,"Group_"+str(counter)])
        print(newcol,addlist)
        # merged unit counts as attempted if any of its members was attempted
        unit_completeness_merge[newcol] = unit_completeness_merge.loc[:,addlist].any(axis=1)
        unit_completeness_merge = unit_completeness_merge.drop(addlist, axis=1)
        # the merged unit inherits the summed item counts of its members
        item_ns[newcol] = sum(item_ns.pop(item) for item in addlist)
        counter = counter+1

In [None]:
# same correlation heatmap as before, now over the merged questionnaire units
f, ax = plt.subplots(figsize=(15, 15))
sns.set(font_scale=0.75)
ax = sns.heatmap(
    unit_completeness_merge.corr(),
    ax=ax,
    square=True,
    cbar_kws=dict(shrink=0.5),
)

In [None]:
# plot questionnaire completeness per individual (number of units attempted)
# vectorised row sum instead of a per-row Python sum()
plt.hist(unit_completeness_merge.sum(axis=1),bins=50)
plt.show()

In [None]:
# plot questionnaire N's (number of individuals who attempted each unit)
# vectorised column sum instead of a per-column Python sum()
plt.hist(unit_completeness_merge.sum(axis=0),bins=50)
plt.show()

In [None]:
### SPECIFY FILTERING PARAMS
# a questionnaire is kept only if strictly more than Nmin and strictly fewer
# than Nmax individuals attempted it
Nmin = 75000
Nmax = 250000
# minimum number of items a questionnaire must contain to be kept
Nitems = 5
# category paths excluded as sex-specific or not-missing-at-random
nmar_and_sexspecific = [
    'UK Biobank Assessment Centre > Touchscreen > Sex-specific factors > Female-specific factors',
    'Health-related outcomes > Cancer register',
    'Health-related outcomes > Death register',
    'Health-related outcomes > Hospital in-patient > Maternity > Summary Information (maternity)',
]
# maximum number of retained questionnaires a "core" individual may be missing
Qmiss = 1

In [None]:
# only consider questionnaires completed by between Nmin and Nmax people
# (both bounds exclusive); the per-questionnaire N is computed once, vectorised
questionnaire_ns = unit_completeness_merge.sum(axis=0)
unit_completeness_merge_Nmin_Nmax = unit_completeness_merge.loc[
    :, (questionnaire_ns > Nmin) & (questionnaire_ns < Nmax)
]

In [None]:
# keep only the questionnaires that contain at least Nitems items
unit_completeness_merge_Nmin_Nmax_Nitems = unit_completeness_merge_Nmin_Nmax.loc[
    :,
    [
        item_ns[questionnaire] >= Nitems
        for questionnaire in unit_completeness_merge_Nmin_Nmax.columns.values
    ],
]

In [None]:
# drop the sex-specific / MNAR questionnaires that survived the filters above
# (the intersection avoids asking drop() for columns that are already gone)
unit_completeness_merge_Nmin_Nmax_Nitems_dropped = unit_completeness_merge_Nmin_Nmax_Nitems.drop(
    columns=np.intersect1d(
        nmar_and_sexspecific,
        unit_completeness_merge_Nmin_Nmax_Nitems.columns.values,
    )
)

In [None]:
# correlation heatmap of the "attempted" indicators for the retained questionnaires
f, ax = plt.subplots(figsize=(15, 15))
ax = sns.heatmap(
    unit_completeness_merge_Nmin_Nmax_Nitems_dropped.corr(),
    ax=ax,
    square=True,
    cbar_kws=dict(shrink=0.5),
)

In [None]:
# get questionnaire-level completeness patterns and their frequencies
# One row per distinct pattern; "N" is the number of individuals showing it.
# reset_index(name="N") names the count column explicitly. The previous
# as_index=False + rename({0: "N"}) form only worked on pandas versions where
# size() returned an unnamed Series; newer versions return a "size" column, so
# "N" was never created.
missingness_patterns = (
    unit_completeness_merge_Nmin_Nmax_Nitems_dropped
    .groupby(list(unit_completeness_merge_Nmin_Nmax_Nitems_dropped.columns))
    .size()
    .reset_index(name="N")
)

In [None]:
# calculate number of questionnaires included in each pattern
# (every column except the last one, which is expected to be the count "N");
# vectorised row sum instead of a per-row Python sum()
missingness_patterns["n_questionnaires"] = missingness_patterns.iloc[:,:-1].sum(axis=1)

In [None]:
# order patterns by number of questionnaires, then by frequency, both descending
missingness_patterns_toplot = missingness_patterns.sort_values(
    by=["n_questionnaires", "N"],
    ascending=False,
)

In [None]:
# x tick labels: "<n_questionnaires>_<N individuals>" for each pattern, in plot order
xlabs = [
    "_".join([str(n_quest), str(n_inds)])
    for n_quest, n_inds in zip(
        missingness_patterns_toplot["n_questionnaires"],
        missingness_patterns_toplot["N"],
    )
]

In [None]:
# define y-labels to be the category/path, plus N individuals, plus n_items
# The item count used to be prefixed with a second "N=", which made it look
# like another sample size; it is now labelled "n_items=".
ylabs = [
    "{}, N={}, n_items={}".format(path, n_inds, item_ns[path])
    for path, n_inds in zip(
        unit_completeness_merge_Nmin_Nmax_Nitems_dropped.columns.values,
        unit_completeness_merge_Nmin_Nmax_Nitems_dropped.sum(axis=0).values,
    )
]

In [None]:
# plot completeness patterns (rows = questionnaires, columns = patterns)
f, ax = plt.subplots(figsize=(70, 70))
# Invert the boolean indicators BEFORE casting to int so the plotted values
# are 0 (attempted) / 1 (missing). Applying ~ after the cast is a bitwise NOT
# on integers and produced -2 / -1 instead; the colour ordering is unchanged.
sns.heatmap((~missingness_patterns_toplot.iloc[:,:-2]).astype(int).T,square=True,cbar=False, xticklabels=xlabs,yticklabels=ylabs,ax=ax)

In [None]:
# plot questionnaire completeness per individual
# One bin per retained questionnaire; `qhist` keeps the return value of
# plt.hist because later cells read the bin counts from it.
# Vectorised row sum instead of a per-row Python sum().
qhist = plt.hist(unit_completeness_merge_Nmin_Nmax_Nitems_dropped.sum(axis=1),bins=missingness_patterns_toplot.iloc[:,:-2].shape[1])
plt.show()

In [None]:
# first element of the plt.hist return value: the per-bin counts
qhist[0]

In [None]:
# counts in the last Qmiss bins of the histogram
qhist[0][len(qhist[0])-Qmiss:]

In [None]:
# get list of individuals missing Qmiss questionnaires or less
# len(qhist[0]) is the number of histogram bins, which was set to the number
# of retained questionnaires. Vectorised row sum instead of a per-row sum().
inds_Qmiss = unit_completeness_merge_Nmin_Nmax_Nitems_dropped[
    unit_completeness_merge_Nmin_Nmax_Nitems_dropped.sum(axis=1) >= (len(qhist[0])-Qmiss)
].index.values

In [None]:
# rows of the phenotype matrix for the "core" individuals, all columns
phenos_Qmiss = phenos.loc[inds_Qmiss, :]

In [None]:
# (number of core individuals, number of phenotype columns)
phenos_Qmiss.shape

In [None]:
# boolean mask of the core-sample matrix: True where a value is present
phenos_Qmiss_notnull = phenos_Qmiss.notna()

In [None]:
# remove items that are mnar or sex-specific
remove_cats = [category_dict[x] for x in nmar_and_sexspecific]
flat_remove_cats = [val for sublist in remove_cats for val in sublist]
# `category_dict` was built after de-duplicating on the field prefix, so it
# lists only ONE column per field. Dropping just those labels left the other
# "<field>_<suffix>" columns of the same field in place; remove every column
# that shares a field prefix with an excluded item instead.
remove_fields = set(col.split('_')[0] for col in flat_remove_cats)
phenos_Qmiss_notnull_removed = phenos_Qmiss_notnull.loc[
    :, ~phenos_Qmiss_notnull.columns.str.split('_').str[0].isin(remove_fields)
]

In [None]:
# phenotype summary table; every column is read without dtype inference
phenosummary = pd.read_table(
    '/stanley/genetics/analysis/ukbb_sexdiff/family_based/ukb31063_gwas_phesant_icd10_phenotypes_summary.tsv',
    dtype=object,
)

In [None]:
# keep only the summary rows whose `sex` field is "both"
phenosummary_bothsexes = phenosummary.loc[phenosummary["sex"] == "both"]

In [None]:
# keep only the columns that also have a both-sexes row in the summary table
phenos_Qmiss_notnull_removed_bothsexes = phenos_Qmiss_notnull_removed.loc[
    :,
    np.intersect1d(
        phenos_Qmiss_notnull_removed.columns.values,
        phenosummary_bothsexes["id"].values,
    ),
]

In [None]:
# difference between 2772 and the number of retained columns
# NOTE(review): 2772 is hard-coded and its origin is not shown in this notebook — confirm
2772-phenos_Qmiss_notnull_removed_bothsexes.shape[1]

In [None]:
# items/phenotypes become the row index, individuals become the columns
phenos_Qmiss_notnull_removed_bothsexes_itemindexed = phenos_Qmiss_notnull_removed_bothsexes.transpose()

In [None]:
# add Ns to items (number of core individuals with a non-missing value)
# vectorised row sum instead of a per-row Python sum()
phenos_Qmiss_notnull_removed_bothsexes_itemindexed['N']=phenos_Qmiss_notnull_removed_bothsexes_itemindexed.sum(axis=1)

In [None]:
# add the data-dictionary `Field` text of each item as a 'name' column,
# looked up by the integer field ID that precedes the first "_" in the label
phenos_Qmiss_notnull_removed_bothsexes_itemindexed['name'] = phenostructure.loc[
    phenos_Qmiss_notnull_removed_bothsexes_itemindexed.index.str.split('_').str[0].values.astype(int),
    'Field',
].values

In [None]:
# keep only the items observed in at least 30,000 core individuals
phenos_Qmiss_notnull_removed_bothsexes_itemindexed_minN = phenos_Qmiss_notnull_removed_bothsexes_itemindexed.loc[
    phenos_Qmiss_notnull_removed_bothsexes_itemindexed['N'] >= 30000
]

In [None]:
# shape before the N >= 30000 filter (rows = items)
phenos_Qmiss_notnull_removed_bothsexes_itemindexed.shape

In [None]:
# shape after the N >= 30000 filter (rows = items)
phenos_Qmiss_notnull_removed_bothsexes_itemindexed_minN.shape

In [None]:
# back to individuals-as-rows; copy first so the filtered frame is left untouched
phenos_inds_items = phenos_Qmiss_notnull_removed_bothsexes_itemindexed_minN.copy().transpose()

In [None]:
# "N" and "name" became rows after the transpose; remove them again
phenos_inds_items = phenos_inds_items.drop(index=["N", "name"])

In [None]:
# subset to binary phenotypes (rows that have a GWAS case count)
# .copy() makes this an independent frame: columns are added to it below, and
# assigning into a filtered slice triggers pandas' SettingWithCopyWarning.
phenosummary_bothsexes_binary = phenosummary_bothsexes[~pd.isnull(phenosummary_bothsexes.GWAS_n_cases)].copy()

In [None]:
# calculate Neff = 4 / (1/n_cases + 1/n_controls)
# Vectorised over the whole column instead of a Python call per row; the
# counts are stored as text, so they are converted to numbers first.
phenosummary_bothsexes_binary['Neff'] = 4.0 / (
    (1.0 / pd.to_numeric(phenosummary_bothsexes_binary.GWAS_n_cases).astype(float))
    + (1.0 / pd.to_numeric(phenosummary_bothsexes_binary.GWAS_n_controls).astype(float))
)

In [None]:
# calculate GWAS prevalence = n_cases / n_nonmissing
# Vectorised over the whole column instead of a Python call per row; the
# counts are stored as text, so they are converted to numbers first.
phenosummary_bothsexes_binary['prevalence'] = (
    pd.to_numeric(phenosummary_bothsexes_binary.GWAS_n_cases).astype(float)
    / pd.to_numeric(phenosummary_bothsexes_binary.GWAS_n_nonmiss).astype(float)
)

In [None]:
# inspect the binary-summary row(s) whose id is "1767"
phenosummary_bothsexes_binary[phenosummary_bothsexes_binary.id=="1767"]

In [None]:
# scratch arithmetic: complement of 0.01431
# NOTE(review): presumably the prevalence displayed by the previous cell — confirm
1-0.01431

In [None]:
# Drop binary phenotypes whose GWAS-sample prevalence is below 1% or above 99%.
phenosummary_underPrev = phenosummary_bothsexes_binary.loc[
    (phenosummary_bothsexes_binary.prevalence < 0.01)
    | (phenosummary_bothsexes_binary.prevalence > 0.99),
    'id',
].values
# only ask drop() for the flagged ids that are actually present as columns
phenos_inds_items_bothsexes_prev = phenos_inds_items.drop(
    columns=np.intersect1d(phenos_inds_items.columns.values, phenosummary_underPrev)
)

In [None]:
# shape after the GWAS-prevalence filter
phenos_inds_items_bothsexes_prev.shape

In [None]:
def sampprev(x):
    """Return True when column `x` passes the sample-level filter (debug version).

    Columns whose name is not among `phenosummary_bothsexes_binary.id` always
    pass. For the others the column passes when `x.sum()` equals 42325, or
    when the count stored under key 1 of `x.value_counts()` is less than 99%
    of all counted values. The value counts and that ratio are printed.

    NOTE(review): 42325 is hard-coded — presumably the number of rows in the
    core sample; confirm. `value_counts()[1]` raises when nothing is stored
    under key 1, and whether 1 is matched as a label or as a position depends
    on the dtype of `x`, which is not visible here.
    This function is redefined twice further down; the last definition run wins.
    """
    #print(x.name)
    if x.name in phenosummary_bothsexes_binary.id.values:
        if x.sum()==42325:
            return True
        else:
            #print((float(x.value_counts()[1])/x.value_counts().sum()))
            print(x.value_counts())
            print((float(x.value_counts()[1])/x.value_counts().sum()))
            return ((float(x.value_counts()[1])/x.value_counts().sum()))<0.99
    else:
        return True

In [None]:
# number of columns for which sampprev returns True
phenos_inds_items_bothsexes_prev.apply(lambda x: sampprev(x)).sum()

In [None]:
# subset to phenotypes with prevalence >=1% in core sample

def sampprev(x):
    """Return True when column `x` passes the sample-level filter.

    Same rule as the earlier definition, without the debug printing: columns
    not listed in `phenosummary_bothsexes_binary.id` always pass; listed ones
    pass when `x.sum()` equals 42325 or when the count under key 1 of
    `x.value_counts()` is below 99% of all counted values.

    NOTE(review): 42325 is hard-coded — presumably the number of rows in the
    core sample; confirm. The `value_counts()[1]` lookup raises when nothing
    is stored under key 1.
    """
    #print(x.name)
    if x.name in phenosummary_bothsexes_binary.id.values:
        if x.sum()==42325:
            return True
        else:
            return ((float(x.value_counts()[1])/x.value_counts().sum()))<0.99
    else:
        return True

# keep only the columns for which sampprev returns True
phenos_inds_items_bothsexes_prev_sampprev = phenos_inds_items_bothsexes_prev.loc[:,phenos_inds_items_bothsexes_prev.apply(lambda x: sampprev(x))]

In [None]:
# column labels kept by this pass; compared with `old` further down via np.setdiff1d
new = phenos_inds_items_bothsexes_prev_sampprev.columns

In [None]:
# number of columns kept by this pass
len(new)

In [None]:
# shape before the sample-level filter, for comparison with len(new)
phenos_inds_items_bothsexes_prev.shape

In [None]:
# rows of the phenotype matrix for the "core" individuals, all columns
phenos_Qmiss = phenos.loc[inds_Qmiss, :]

In [None]:
# boolean mask of the core-sample matrix: True where a value is present
phenos_Qmiss_notnull = phenos_Qmiss.notna()

In [None]:
# remove items that are mnar or sex-specific
remove_cats = [category_dict[x] for x in nmar_and_sexspecific]
flat_remove_cats = [val for sublist in remove_cats for val in sublist]
# `category_dict` was built after de-duplicating on the field prefix, so it
# lists only ONE column per field. Dropping just those labels leaves the other
# "<field>_<suffix>" columns in place; the de-duplication below then keeps one
# of them and the later "reintroduce all cat-mult dummies" step re-selects the
# whole field. Remove every column that shares a field prefix with an excluded
# item instead.
remove_fields = set(col.split('_')[0] for col in flat_remove_cats)
phenos_Qmiss_notnull_removed = phenos_Qmiss_notnull.loc[
    :, ~phenos_Qmiss_notnull.columns.str.split('_').str[0].isin(remove_fields)
]

In [None]:
# keep only the first column seen for each "<field>" prefix
phenos_Qmiss_notnull_removed_nodupes = phenos_Qmiss_notnull_removed.loc[
    :,
    ~phenos_Qmiss_notnull_removed.columns.str.split('_').str.get(0).duplicated(),
]

In [None]:
# items/phenotypes become the row index, individuals become the columns
phenos_Qmiss_notnull_removed_nodupes_itemindexed = phenos_Qmiss_notnull_removed_nodupes.transpose()

In [None]:
# add Ns to items (number of core individuals with a non-missing value)
# vectorised row sum instead of a per-row Python sum()
phenos_Qmiss_notnull_removed_nodupes_itemindexed['N']=phenos_Qmiss_notnull_removed_nodupes_itemindexed.sum(axis=1)

In [None]:
# add the data-dictionary `Field` text of each item as a 'name' column,
# looked up by the integer field ID that precedes the first "_" in the label
phenos_Qmiss_notnull_removed_nodupes_itemindexed['name'] = phenostructure.loc[
    phenos_Qmiss_notnull_removed_nodupes_itemindexed.index.str.split('_').str[0].values.astype(int),
    'Field',
].values

In [None]:
# keep only the items observed in at least 30,000 core individuals
phenos_Qmiss_notnull_removed_nodupes_itemindexed_minN = phenos_Qmiss_notnull_removed_nodupes_itemindexed.loc[
    phenos_Qmiss_notnull_removed_nodupes_itemindexed['N'] >= 30000
]

In [None]:
# reintroduce all cat-mult dummies
# Select, for the core individuals, every column of `phenos` whose field
# prefix belongs to a retained item. A set-membership test replaces the
# items x columns comparison matrix and also works when no item is retained.
kept_fields = set(
    x.split("_")[0]
    for x in phenos_Qmiss_notnull_removed_nodupes_itemindexed_minN.index.values
)
phenos_inds_items = phenos.loc[
    inds_Qmiss,
    phenos.columns.str.split("_").str[0].isin(kept_fields),
]

In [None]:
# phenotype summary table; every column is read without dtype inference
phenosummary = pd.read_table(
    '/stanley/genetics/analysis/ukbb_sexdiff/family_based/ukb31063_gwas_phesant_icd10_phenotypes_summary.tsv',
    dtype=object,
)

In [None]:
# keep only the summary rows whose `sex` field is "both"
phenosummary_bothsexes = phenosummary.loc[phenosummary["sex"] == "both"]

In [None]:
# subset to binary phenotypes (rows that have a GWAS case count)
# .copy() makes this an independent frame: columns are added to it below, and
# assigning into a filtered slice triggers pandas' SettingWithCopyWarning.
phenosummary_bothsexes_binary = phenosummary_bothsexes[~pd.isnull(phenosummary_bothsexes.GWAS_n_cases)].copy()

In [None]:
# calculate Neff = 4 / (1/n_cases + 1/n_controls)
# Vectorised over the whole column instead of a Python call per row; the
# counts are stored as text, so they are converted to numbers first.
phenosummary_bothsexes_binary['Neff'] = 4.0 / (
    (1.0 / pd.to_numeric(phenosummary_bothsexes_binary.GWAS_n_cases).astype(float))
    + (1.0 / pd.to_numeric(phenosummary_bothsexes_binary.GWAS_n_controls).astype(float))
)

In [None]:
# calculate GWAS prevalence = n_cases / n_nonmissing
# Vectorised over the whole column instead of a Python call per row; the
# counts are stored as text, so they are converted to numbers first.
phenosummary_bothsexes_binary['prevalence'] = (
    pd.to_numeric(phenosummary_bothsexes_binary.GWAS_n_cases).astype(float)
    / pd.to_numeric(phenosummary_bothsexes_binary.GWAS_n_nonmiss).astype(float)
)

In [None]:
# "manually" remove phenotypes not applicable to both sexes:
# keep only the columns that have a both-sexes row in the summary table
phenos_inds_items_bothsexes = phenos_inds_items.loc[
    :,
    np.intersect1d(
        phenos_inds_items.columns.values,
        phenosummary_bothsexes["id"].values,
    ),
]

In [None]:
# shape after restricting to both-sexes phenotypes
phenos_inds_items_bothsexes.shape

In [None]:
# Drop binary phenotypes whose GWAS-sample prevalence is below 1% or above 99%.
phenosummary_underPrev = phenosummary_bothsexes_binary.loc[
    (phenosummary_bothsexes_binary.prevalence < 0.01)
    | (phenosummary_bothsexes_binary.prevalence > 0.99),
    'id',
].values
# only ask drop() for the flagged ids that are actually present as columns
phenos_inds_items_bothsexes_prev = phenos_inds_items_bothsexes.drop(
    columns=np.intersect1d(phenos_inds_items_bothsexes.columns.values, phenosummary_underPrev)
)

In [None]:
# shape after the GWAS-prevalence filter
phenos_inds_items_bothsexes_prev.shape

In [None]:
# subset to phenotypes with prevalence >=1% in core sample

def sampprev(x):
    """Return True when column `x` should be kept by the core-sample filter.

    Columns whose name is not among `phenosummary_bothsexes_binary.id` are
    always kept. A listed (binary) column is kept only when its most frequent
    value accounts for less than 99% of its non-missing entries.

    The phenotype matrix is loaded with dtype=object, so the value counts are
    labelled by strings; the leading count is therefore taken by position with
    `.iloc[0]`. Indexing such a Series with the integer key 0 relied on
    pandas' positional fallback, which is deprecated.
    """
    if x.name not in phenosummary_bothsexes_binary.id.values:
        return True
    counts = x.value_counts()  # sorted by descending frequency, NaN excluded
    return (float(counts.iloc[0])/counts.sum())<0.99
    
# keep only the columns for which sampprev returns True
phenos_inds_items_bothsexes_prev_sampprev = phenos_inds_items_bothsexes_prev.loc[
    :, phenos_inds_items_bothsexes_prev.apply(sampprev)
]

In [None]:
# column labels kept by this second pass; compared with `new` in the next cell
old = phenos_inds_items_bothsexes_prev_sampprev.columns

In [None]:
# labels present in `new` but absent from `old`
np.setdiff1d(new,old)

In [None]:
# working copy from which the loop below drops columns one at a time
remove_dependencies = phenos_inds_items_bothsexes_prev_sampprev.copy(deep=True)

In [None]:
# Repeatedly drop one phenotype until the correlation matrix has no NaN.
# Each pass ranks the phenotypes involved in NaN correlations by
# (number of NaN partners desc, sampprev desc, std asc) and drops the first.

# id -> GWAS prevalence for the binary phenotypes; built once, not per key
binary_prevalence = phenosummary_bothsexes_binary.set_index('id')['prevalence']

while True:
    corrmat = remove_dependencies.astype(float).corr()
    display(corrmat.shape)

    indices = np.where(np.isnan(corrmat))

    if len(indices[0])==0:
        break

    indices = [(corrmat.index[x], corrmat.columns[y]) for x, y in zip(*indices)]

    # phenotype -> list of phenotypes with which its correlation is NaN
    tempd = dict()
    for y,x in indices:
        tempd.setdefault(y, []).append(x)

    keypairs = pd.DataFrame(columns=['values','len','prev','sampprev',"n_case",'std'],index=tempd.keys())

    for key in tempd.keys():
        keypairs.loc[key,'values'] = tempd[key]
        keypairs.loc[key,'len'] = len(tempd[key])
        try:
            keypairs.loc[key,'prev'] = binary_prevalence.loc[str(key)]
            # Values are text (the matrix was loaded with dtype=object), so the
            # counts are labelled by strings: read them by position explicitly
            # rather than through integer keys.
            counts = remove_dependencies[key].value_counts()
            keypairs.loc[key,'sampprev'] = float(counts.iloc[0])/counts.sum()
            keypairs.loc[key,'n_case'] = counts.iloc[1]
        except (KeyError, IndexError):
            # KeyError: `key` is not a binary phenotype.
            # IndexError: fewer than two distinct values were observed.
            # In both cases fall back to the standard deviation. Catching only
            # these keeps the loop interruptible and lets real errors surface.
            keypairs.loc[key,'std'] = np.std(remove_dependencies[key].astype(float))

    ranked = keypairs.sort_values(['len','sampprev','std'],ascending=[False,False,True])
    display(ranked)

    firstix = ranked.index[0]

    remove_dependencies = remove_dependencies.drop(firstix,axis=1)

In [None]:
# boolean mask of the final matrix: True where a value is present
remove_dependencies_notnull = remove_dependencies.notna()

In [None]:
# average, over phenotypes, of the per-phenotype fraction of non-missing values
remove_dependencies_notnull.mean(axis=0).mean()

In [None]:
# keep only the first column seen for each "<field>" prefix
remove_dependencies_notnull_nodupes = remove_dependencies_notnull.loc[
    :,
    ~remove_dependencies_notnull.columns.str.split('_').str.get(0).duplicated(),
]

In [None]:
# plot sample-level completeness (fraction of non-missing items per individual)
# vectorised row mean instead of a Python call per row
plt.hist(remove_dependencies_notnull_nodupes.mean(axis=1),bins=50)
plt.show()

In [None]:
# plot item-level N's (number of individuals with a non-missing value per item)
# vectorised column sum instead of a per-column Python sum()
plt.hist(remove_dependencies_notnull_nodupes.sum(axis=0),bins=50)
plt.show()

In [None]:
# rank of the correlation matrix of the retained phenotypes
np.linalg.matrix_rank(remove_dependencies.astype(float).corr().values)

In [None]:
# write the phenotype matrix left after dependency removal to disk
remove_dependencies.to_csv("/psych/genetics_data/ccarey/UKBB/factor_gwas/core_data_group/FA_core.csv")

In [None]:
# one-column table: phenotype id (index) -> `name` from the both-sexes summary
phenonames = pd.DataFrame(
    {'name': phenosummary_bothsexes.set_index('id').loc[remove_dependencies.columns, 'name'].values},
    index=remove_dependencies.columns,
)

In [None]:
# write the id -> name lookup for the retained phenotypes to disk
phenonames.to_csv("/psych/genetics_data/ccarey/UKBB/factor_gwas/core_data_group/FA_core_FieldNames.csv")

In [None]:
# Names of the retained phenotypes, indexed by phenotype id.
# The previous expression referenced an undefined `x` (at best a stale loop
# variable) and passed a string, not a function, to apply().
phenosummary_bothsexes.set_index('id').loc[remove_dependencies.columns, 'name']

In [None]:
# print "<id> <name>" for every retained phenotype
for item in remove_dependencies.columns:
    matching_names = phenosummary_bothsexes.loc[phenosummary_bothsexes['id'] == item, 'name']
    print(item, matching_names.values[0])

In [None]:
# display the pairwise correlation matrix of the retained phenotypes
remove_dependencies.astype(float).corr()

In [None]:
# number of infinite entries in the most recently computed `corrmat`
int(np.isinf(corrmat).values.sum())

In [None]:
# labels of the phenotypes that survived dependency removal
remove_dependencies.columns

In [None]:
# correlation matrix BEFORE dependency removal
# (overwrites the `corrmat` left behind by the while-loop above)
corrmat = phenos_inds_items_bothsexes_prev_sampprev.astype(float).corr()

In [None]:
# square: (number of phenotypes, number of phenotypes)
corrmat.shape

In [None]:
# Collect, for every phenotype, the phenotypes with which its correlation is NaN.
indices = np.where(np.isnan(corrmat))
indices = [
    (corrmat.index[row_pos], corrmat.columns[col_pos])
    for row_pos, col_pos in zip(*indices)
]
tempd = dict()
for y,x in indices:
    # first sighting of y starts its list, later ones extend it
    tempd.setdefault(y, []).append(x)

In [None]:
# one row per phenotype involved in a NaN correlation; filled in by the next cell
keypairs = pd.DataFrame(
    index=tempd.keys(),
    columns=['values','len','prev','sampprev',"n_case"],
)

In [None]:
# id -> GWAS prevalence for the binary phenotypes; built once, not per key
binary_prevalence = phenosummary_bothsexes_binary.set_index('id')['prevalence']
for key in tempd.keys():
    keypairs.loc[key,'values'] = tempd[key]
    keypairs.loc[key,'len'] = len(tempd[key])
    try:
        keypairs.loc[key,'prev'] = binary_prevalence.loc[str(key)]
        # Values are text (the matrix was loaded with dtype=object), so the
        # counts are labelled by strings: read them by position explicitly
        # rather than through integer keys.
        counts = phenos_inds_items_bothsexes_prev_sampprev[key].value_counts()
        keypairs.loc[key,'sampprev'] = float(counts.iloc[0])/counts.sum()
        keypairs.loc[key,'n_case'] = counts.iloc[1]
    except (KeyError, IndexError):
        # KeyError: `key` is not a binary phenotype.
        # IndexError: fewer than two distinct values were observed.
        # The remaining fields stay NaN; any other error is a real problem
        # and is allowed to surface.
        pass

In [None]:
# label of the phenotype ranked first: most NaN correlations, ties broken by highest sampprev
keypairs.sort_values(['len','sampprev'],ascending=[False,False]).index[0]

In [None]:
# NOTE: adds a 'scot_urban' column IN PLACE to the questionnaire-completeness
# frame (taken from phenotype column '20118_11'). Earlier cells group by and
# sum over ALL columns of this frame, so they must not be re-run after this
# cell without rebuilding the frame first.
unit_completeness_merge_Nmin_Nmax_Nitems_dropped['scot_urban'] = phenos.loc[unit_completeness_merge_Nmin_Nmax_Nitems_dropped.index.values,'20118_11']

In [None]:
# per-questionnaire number of individuals who attempted it, split by the
# value of 'scot_urban' (requires the cell that adds that column to have run)
unit_completeness_merge_Nmin_Nmax_Nitems_dropped.groupby('scot_urban').sum()
#.groupby('scot_urban').count()

In [None]:
# display the full path -> item-list mapping (prints every category)
category_dict

In [None]:
# frequency of each observed value of phenotype column '20118_11'
phenos['20118_11'].value_counts()

In [None]:
# phenotypes ordered by how many NaN correlations they are involved in
keypairs.sort_values('len',ascending=False)

In [None]:
# table of the phenotype pairs whose correlation is NaN; each unordered pair
# is listed once (strict upper triangle of the matrix)
indices = np.where(np.isnan(corrmat))
indices = pd.DataFrame(
    [
        (corrmat.index[row_pos], corrmat.columns[col_pos])
        for row_pos, col_pos in zip(*indices)
        if row_pos < col_pos
    ]
)

In [None]:
# display the NaN-correlation pairs built in the previous cell
indices

In [None]:
# display the most recently computed correlation matrix
corrmat

In [None]:
# inspect the twelve columns labelled '1920', '1930', ..., '2030'
phenos_inds_items_bothsexes_prev[[str(field) for field in range(1920, 2031, 10)]]

In [None]:
# rank of the phenotype data matrix itself (values cast to float)
np.linalg.matrix_rank(phenos_inds_items_bothsexes_prev.values.astype(float))

In [None]:
# print "<id> <name>" for every phenotype kept by the GWAS-prevalence filter
# Fixes: the frame is `..._prev` (lower-case p; the capitalised name does not
# exist), and `.values[0]` is already the name, so indexing it again with [0]
# printed only its first character.
for item in phenos_inds_items_bothsexes_prev:
    print(item, phenosummary_bothsexes[phenosummary_bothsexes.id==item].name.values[0])

In [None]:
# number of non-missing items per individual
# vectorised row sum instead of a per-row Python sum()
phenos_notnull_nodupes.sum(axis=1)