In [1]:
import pandas as pd
import numpy as np
import requests
from Bio import Entrez
import re
import math 
import warnings
import configparser
import json
import datetime
from functools import reduce
warnings.filterwarnings("ignore")

In [4]:
#keyword:
def get_keywords(json_key):
    """Flatten the first keyword group into one '; '-separated string.

    Only the first element of ``json_key`` is read; any further groups are
    ignored. Returns the literal string 'none' when ``json_key`` is empty.
    """
    if len(json_key) > 0:
        first_group = list(json_key[0])
        return '; '.join(first_group)
    return 'none'

# Function to parse the author list
def get_first_last_email(json_name):
    """Split an author list into parallel first-name, last-name and e-mail lists.

    Args:
        json_name: iterable of author mappings. The keys read are 'ForeName',
            'LastName' and 'AffiliationInfo' (a list whose first item carries
            an 'Affiliation' string).

    Returns:
        A tuple ``(firstname, lastname, email)`` of three lists. Each list has
        exactly one entry per author, in input order, so the i-th entries
        always describe the same author. A missing value is the string 'none'.
        E-mail addresses are extracted from the first affiliation string,
        stripped of surrounding punctuation, and joined with '; ' when more
        than one is found.
    """
    email_pattern = re.compile(r'\S+@\S+')
    firstname = []
    lastname = []
    email = []

    for author in json_name:
        # Every field falls back on its own so that one missing key never
        # leaves the three output lists with different lengths.
        try:
            first = author['ForeName']
        except KeyError:
            first = 'none'

        try:
            last = author['LastName']
        except KeyError:
            last = 'none'

        try:
            affiliation = author['AffiliationInfo'][0]['Affiliation']
        except (KeyError, IndexError):
            affiliation = ''

        # The pattern grabs whole whitespace-delimited tokens, so sentence
        # punctuation around the address ("... x@y.org.") must be trimmed.
        hits = [hit.strip('.,;:()[]<>') for hit in email_pattern.findall(affiliation)]
        hits = [hit for hit in hits if hit]

        firstname.append(first)
        lastname.append(last)
        email.append('; '.join(hits) if hits else 'none')

    return firstname, lastname, email

def fetch_pubs(year, query_arg, retmax='1'):
    """Search PubMed per year and return one row per (article, author).

    Args:
        year: iterable of year strings; each is appended to ``query_arg`` as
            ``<query_arg><year>"[DP]`` to form one esearch query.
        query_arg: query prefix, expected to end with an opening double quote
            (the closing quote is added together with the year).
        retmax: value passed to ``Entrez.esearch`` as ``retmax``. The default
            '1' keeps the previous hard-coded behaviour of one ID per query.

    Returns:
        A tuple ``(pubmed_df, pubmed_df_HCP_List)``:
        * ``pubmed_df`` has columns pmid, doi, title, abstract, year, month,
          day, jabbrv, journal, keywords, lastname, firstname, address, email.
          Any field missing from a record is the string 'none'.
        * ``pubmed_df_HCP_List`` holds the de-duplicated pmid / firstname /
          lastname rows whose pmid is not 'none', plus 'fullName'
          (first + last) and 'name' (first token of the first name + last).
        Both frames are empty (with the same columns) when no IDs are found.
    """
    columns = ['pmid', 'doi', 'title', 'abstract', 'year', 'month', 'day', 'jabbrv', 'journal',
               'keywords', 'lastname', 'firstname', 'address', 'email']

    def pick(getter):
        """Return getter(), or 'none' when the record lacks that field."""
        try:
            return getter()
        except (KeyError, IndexError, TypeError):
            return 'none'

    def article_rows(record):
        """Build the output rows (one per author) for a single article record."""
        if record is None:
            # No record came back at this position: keep an all-'none' row.
            return [['none'] * len(columns)]

        # Each field is looked up independently so that one missing element
        # (e.g. no abstract) cannot blank out, or leak stale values into,
        # the other fields of the same article.
        medline = pick(lambda: record['MedlineCitation'])
        article = pick(lambda: medline['Article'])

        title = pick(lambda: article['ArticleTitle'])
        jabbrv = pick(lambda: article['Journal']['ISOAbbreviation'])
        journal = pick(lambda: article['Journal']['Title'])
        pmid = pick(lambda: medline['PMID'][0:])
        abstract = pick(lambda: article['Abstract']['AbstractText'][0][0:])
        pub_year = pick(lambda: article['ArticleDate'][0]['Year'])
        month = pick(lambda: article['ArticleDate'][0]['Month'])
        day = pick(lambda: article['ArticleDate'][0]['Day'])
        doi = pick(lambda: article['ELocationID'][0][0:])
        # Keep only the text up to the first period of the first author's affiliation.
        address = pick(lambda: re.sub(
            r'\..*', '', article['AuthorList'][0]['AffiliationInfo'][0]['Affiliation']))
        keywords = pick(lambda: get_keywords(medline['KeywordList']))
        authors = pick(lambda: get_first_last_email(article['AuthorList'][0:]))

        shared = [pmid, doi, title, abstract, pub_year, month, day, jabbrv, journal, keywords]

        if isinstance(authors, str):
            # No author list available: emit a single row without author details.
            return [shared + ['none', 'none', address, 'none']]

        firstname, lastname, email = authors
        return [shared + [last, first, address, em]
                for last, first, em in zip(lastname, firstname, email)]

    print('fetching PubMed data...\n')

    # NOTE(review): Biopython documents Entrez.email as a contact e-mail
    # address; this value is a URL. Confirm and replace with a real address.
    Entrez.email = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi'

    # One esearch query per year; collect every returned ID.
    found_ids = []
    for i in year:
        query = query_arg+i+'"[DP]'
        print('Query -',query)
        handle = Entrez.esearch(db='pubmed',
                                sort='relevance',
                                retmax=retmax,
                                retmode='xml',
                                term=query)
        result = Entrez.read(handle)
        handle.close()
        found_ids.extend(str(pmid) for pmid in result['IdList'])

    # De-duplicate while keeping first-seen order.
    id_list = list(dict.fromkeys(found_ids))

    rows = []
    if id_list:
        handle = Entrez.efetch(db='pubmed',
                               retmode='xml',
                               id=','.join(id_list))
        results = Entrez.read(handle)
        handle.close()

        articles = results['PubmedArticle']
        # Records are read by position, one per requested ID.
        for position in range(len(id_list)):
            try:
                record = articles[position]
            except IndexError:
                record = None
            rows.extend(article_rows(record))

    pubmed_df = pd.DataFrame(rows, columns=columns)

    pubmed_df_HCP_List = pubmed_df.loc[pubmed_df['pmid'] != 'none',
                                       ['pmid', 'firstname', 'lastname']].copy()
    pubmed_df_HCP_List['fullName'] = (
        pubmed_df_HCP_List['firstname'] + ' ' + pubmed_df_HCP_List['lastname']).str.strip()
    # 'name' drops everything after the first space of the first name.
    pubmed_df_HCP_List['name'] = (
        pubmed_df_HCP_List['firstname'].str.replace('( ).*', '', regex=True)
        + ' ' + pubmed_df_HCP_List['lastname'])
    pubmed_df_HCP_List.drop_duplicates(inplace=True)

    return pubmed_df,pubmed_df_HCP_List


# Driver cell: run one PubMed query and write both result tables to disk.
year = ['2021']
# mesh_terms = '(vasomotor symptoms[TIAB] | menopausal symptom[TIAB] | hot flashes[TIAB] | hot flushes[TIAB]) & "'
mesh_terms = '(gene therapy[TIAB]) & "'

pubs = fetch_pubs(year, mesh_terms)

# pubs[0] is the per-author publication table, pubs[1] the author list.
for _frame, _target in zip(
        pubs,
        (r"C:\Users\A4023862\OneDrive - Astellas Pharma Inc\MA\influencer_profiling\influencer_profiling_v1.0\outputs\pubs.csv",
         r"C:\Users\A4023862\OneDrive - Astellas Pharma Inc\MA\influencer_profiling\influencer_profiling_v1.0\outputs\pub_authors.csv")):
    _frame.to_csv(_target)

fetching PubMed data...

Query - (gene therapy[TIAB]) & "2021"[DP]


In [7]:
# Show the first element returned by fetch_pubs (the per-author publication table).
pubs[0]

Unnamed: 0,pmid,doi,title,abstract,year,month,day,jabbrv,journal,keywords,lastname,firstname,address,email
0,34629464,10.1038/s41434-021-00299-x,A PCR-amplified transgene fragment flanked by ...,The application of recombinant adeno-associate...,2021,10,11,Gene Ther,Gene therapy,none,Adachi,Kumi,Department of Biochemistry and Molecular Biolo...,none
1,34629464,10.1038/s41434-021-00299-x,A PCR-amplified transgene fragment flanked by ...,The application of recombinant adeno-associate...,2021,10,11,Gene Ther,Gene therapy,none,Tomono,Taro,Department of Biochemistry and Molecular Biolo...,none
2,34629464,10.1038/s41434-021-00299-x,A PCR-amplified transgene fragment flanked by ...,The application of recombinant adeno-associate...,2021,10,11,Gene Ther,Gene therapy,none,Okada,Hironori,Department of Biochemistry and Molecular Biolo...,none
3,34629464,10.1038/s41434-021-00299-x,A PCR-amplified transgene fragment flanked by ...,The application of recombinant adeno-associate...,2021,10,11,Gene Ther,Gene therapy,none,Shiozawa,Yusuke,Department of Biochemistry and Molecular Biolo...,none
4,34629464,10.1038/s41434-021-00299-x,A PCR-amplified transgene fragment flanked by ...,The application of recombinant adeno-associate...,2021,10,11,Gene Ther,Gene therapy,none,Yamamoto,Motoko,Department of Biochemistry and Molecular Biolo...,none
5,34629464,10.1038/s41434-021-00299-x,A PCR-amplified transgene fragment flanked by ...,The application of recombinant adeno-associate...,2021,10,11,Gene Ther,Gene therapy,none,Miyagawa,Yoshitaka,Department of Biochemistry and Molecular Biolo...,yoshitaka-miyagawa@nms.ac.jp.
6,34629464,10.1038/s41434-021-00299-x,A PCR-amplified transgene fragment flanked by ...,The application of recombinant adeno-associate...,2021,10,11,Gene Ther,Gene therapy,none,Okada,Takashi,Department of Biochemistry and Molecular Biolo...,t-okada@ims.u-tokyo.ac.jp.


In [None]:
def fetch_pubmed_npi(arg):
    """Look up NPI registry records for PubMed authors and join them by name.

    Args:
        arg: DataFrame with the columns read here: 'pmid', 'firstname',
            'lastname', 'fullName' and 'name'. It is copied, not modified.

    Returns:
        A tuple ``(pubmed_HCP, npi_df)``:
        * ``pubmed_HCP`` has columns pmid, firstname, lastname, fullName,
          name, NPI and key. Matched rows come first (full-name match, then
          first+last match); unmatched rows follow with key = 0. For matched
          rows, key is the number of matched rows sharing the same join key.
        * ``npi_df`` is the flattened registry table, one row per NPI, with
          a 'joinkey' column holding the upper-cased "FIRST LAST" key.

    Raises:
        KeyError: if no registry lookup returned any result, because the
            expected 'number' / 'basic' / 'addresses' / 'taxonomies' columns
            are then absent.
    """
    df = arg.copy()

    # Query the registry with the first token of the first name only.
    df['firstname_only'] = df['firstname'].str.replace('( ).*', '', regex=True)
    df['list'] = df.apply(lambda x: (x['firstname_only'], x['lastname']), axis=1)

    frames = []
    for x in df['list'].tolist():
        try:
            # Passing params lets requests URL-encode names containing
            # spaces, '&' or other reserved characters.
            r = requests.get('https://npiregistry.cms.hhs.gov/api/',
                             params={'enumeration_type': 'NPI-1',
                                     'version': '2.1',
                                     'first_name': str(x[0]),
                                     'last_name': str(x[1]),
                                     'skip': '',
                                     'limit': ''})
            data = r.json()
            d = pd.DataFrame.from_dict(data['results'])
        except (requests.RequestException, ValueError, KeyError, TypeError):
            # Best effort: a failed request or a response without 'results'
            # only skips this one name.
            continue
        if not d.empty:
            frames.append(d)

    df_new = pd.concat(frames) if frames else pd.DataFrame()

    basic_col =['ein', 'organization_name', 'last_name', 'first_name', 'middle_name', 'name_prefix', 'name_suffix', 'credential',
                'last_updated', 'deactivation_reason_code', 'deactivation_date', 'reactivation_date', 'gender',
                'authorized_official_telephone_number']

    address_col = ['address_1', 'address_2', 'city', 'state', 'postal_code', 'country_code', 'telephone_number']

    taxo_col = ['license', 'state']

    d = df_new[['number', 'basic', 'addresses', 'taxonomies']]

    # Expand the 'basic' dict into columns; the empty template frame makes
    # sure every expected column exists, with 'none' for missing values.
    basic = d.basic.apply(pd.Series)
    basic_df = pd.DataFrame(columns=basic_col)
    basic_df = pd.concat([basic_df,basic]).fillna('none')
    basic_df = basic_df[basic_col]

    d = pd.concat([d,basic_df], axis=1)

    # One row per address, then expand each address dict the same way.
    d = d.explode('addresses')

    addresses = d.addresses.apply(pd.Series)

    address_df = pd.DataFrame(columns=address_col)
    address_df = pd.concat([address_df,addresses]).fillna('none')
    address_df = address_df[address_col]
    address_df = address_df.rename(columns={'state':'address_state'})

    d = pd.concat([d,address_df], axis=1)

    # One row per taxonomy entry, expanded likewise.
    d = d.explode('taxonomies')
    taxonomies = d.taxonomies.apply(pd.Series)

    taxo_df = pd.DataFrame(columns=taxo_col)
    taxo_df = pd.concat([taxo_df,taxonomies]).fillna('none')
    taxo_df = taxo_df[taxo_col]
    taxo_df = taxo_df.rename(columns={'state':'taxo_state'})

    d = pd.concat([d,taxo_df], axis=1)
    d = d.drop(['basic', 'addresses','taxonomies'], axis=1)
    d = d.drop_duplicates()

    d = d.rename(
    columns={'number':'NPI', 'ein': 'Employer_Identification_Number_(EIN)', 'organization_name':'Provider_Organization_Name_(Legal_Business_Name)',
             'last_name':'Provider_Last_Name_(Legal_Name)', 'first_name': 'Provider_First_Name', 'middle_name':'Provider_Middle_Name',
             'name_prefix': 'Provider_Name_Prefix_Text', 'name_suffix': 'Provider_Name_Suffix_Text',
             'credential': 'Provider_Credential_Text', 'last_updated': 'Last_Update_Date',
             'deactivation_reason_code':'NPI_Deactivation_Reason_Code', 'deactivation_date': 'NPI_Deactivation_Date',
             'reactivation_date': 'NPI_Reactivation_Date', 'gender':'Provider_Gender_Code',
             'authorized_official_telephone_number':'Authorized_Official_Telephone_Number',
             'address_1': 'Provider_First_Line_Business_Mailing_Address', 'address_2': 'Provider_Second_Line_Business_Mailing_Address',
             'city': 'Provider_Business_Mailing_Address_City_Name',
             'address_state': 'Provider_Business_Mailing_Address_State_Name',
             'postal_code' : 'Provider_Business_Mailing_Address_Postal_Code', 'country_code' : 'Provider_Business_Mailing_Address_Country_Code_(If_outside_US)',
             'telephone_number': 'Provider_Business_Mailing_Address_Telephone_Number', 'license': 'Provider_License_Number_1',
             'taxo_state':'Provider_License_Number_State_Code_1'})

    # Keep a single row (the first address/taxonomy combination) per NPI.
    npi_df = d.drop_duplicates(subset='NPI', keep='first').copy()

    # Pass 1: join on "FIRST M LAST" (middle initial omitted when absent).
    df['joinkey'] = df['fullName']
    df['joinkey'] = df['joinkey'].str.strip()
    df['joinkey'] = df['joinkey'].str.upper()

    npi_df['joinkey'] = npi_df['Provider_First_Name']+' '+npi_df['Provider_Middle_Name'].replace('none','-').str[0]+' '+npi_df['Provider_Last_Name_(Legal_Name)']
    npi_df['joinkey'] = npi_df['joinkey'].str.replace(' - ',' ', regex=False)
    npi_df['joinkey'] = npi_df['joinkey'].str.strip()
    npi_df['joinkey'] = npi_df['joinkey'].str.upper()

    pubmed_HCP = df.merge(npi_df,how='left',on='joinkey')

    pubmed_HCP_Not_Joined = pubmed_HCP[pubmed_HCP['NPI'].isnull()].filter(df.columns).copy()
    pubmed_HCP = pubmed_HCP[pubmed_HCP['NPI'].notnull()].copy()

    # 'key' = number of matched rows sharing the same join key.
    pubmed_HCP['key'] = 1
    duplicates = pubmed_HCP[['joinkey','key']].groupby(['joinkey']).sum()
    pubmed_HCP.drop(['key'],inplace=True,axis=1)
    pubmed_HCP = pubmed_HCP.merge(duplicates,on='joinkey',how='inner')

    # Pass 2: retry the unmatched authors on "FIRST LAST" only.
    npi_df['joinkey'] = npi_df['Provider_First_Name']+' '+npi_df['Provider_Last_Name_(Legal_Name)']
    npi_df['joinkey'] = npi_df['joinkey'].str.strip()
    npi_df['joinkey'] = npi_df['joinkey'].str.upper()

    pubmed_HCP_Not_Joined['joinkey'] = pubmed_HCP_Not_Joined['name']
    pubmed_HCP_Not_Joined['joinkey'] = pubmed_HCP_Not_Joined['joinkey'].str.strip()
    pubmed_HCP_Not_Joined['joinkey'] = pubmed_HCP_Not_Joined['joinkey'].str.upper()

    pubmed_HCP_1 = pubmed_HCP_Not_Joined.merge(npi_df,how='left',on='joinkey')

    pubmed_HCP_Not_Joined = pubmed_HCP_1[pubmed_HCP_1['NPI'].isnull()].filter(df.columns).copy()
    pubmed_HCP_1 = pubmed_HCP_1[pubmed_HCP_1['NPI'].notnull()].copy()

    pubmed_HCP_1['key'] = 1
    duplicates = pubmed_HCP_1[['joinkey','key']].groupby(['joinkey']).sum()
    pubmed_HCP_1.drop(['key'],inplace=True,axis=1)
    pubmed_HCP_1 = pubmed_HCP_1.merge(duplicates,on='joinkey',how='inner')

    pubmed_HCP = pd.concat([pubmed_HCP, pubmed_HCP_1])

    pubmed_HCP = pubmed_HCP.filter(['pmid','firstname','lastname','fullName','name','NPI','key'])
    pubmed_HCP_Not_Joined = pubmed_HCP_Not_Joined.filter(['pmid','firstname','lastname','fullName','name'])
    # Authors with no registry match in either pass get key = 0.
    pubmed_HCP_Not_Joined['key'] = 0

    pubmed_HCP = pd.concat([pubmed_HCP, pubmed_HCP_Not_Joined])
    pubmed_HCP.reset_index(drop=True,inplace=True)
    return pubmed_HCP,npi_df

In [None]:
def Clinical_Trials(exp):
    """Download study fields from ClinicalTrials.gov and extract contact names.

    The study fields are requested in three groups (each including 'NCTId'),
    in pages of 1000 ranks, and merged on 'NCTId'. Only studies whose
    'LocationCountry' mentions 'United States' and that have a central
    contact, a location contact and an overall official are kept.

    Args:
        exp: search expression sent as the ``expr`` query parameter.

    Returns:
        A tuple of four DataFrames, each keyed by 'NCTId' with one value per
        row: ``df_CentralContacts`` ('CentralContactName'),
        ``df_LocationContacts`` ('LocationContactName'),
        ``df_OtherInv`` ('OtherInvName', built from the responsible-party
        investigator and the overall official) and ``df_RefPMID``
        ('ReferencePMID'). In the name columns, anything after the first
        comma of a name is removed.

    Raises:
        requests.HTTPError: if any request returns an error status.
    """
    def de_list(input_field):
        """Collapse a list value to None, its only item, or a '; '-joined string."""
        if isinstance(input_field, list):
            if len(input_field) == 0:
                return None
            elif len(input_field) == 1:
                return input_field[0]
            else:
                return '; '.join(input_field)
        else:
            return input_field

    extract_fields = [
        "NCTId",
        "BriefSummary",
        "BriefTitle",
        "CentralContactEMail",
        "CentralContactName",
        "CentralContactPhone",
        "CentralContactPhoneExt",
        "CentralContactRole",
        "CollaboratorClass",
        "CollaboratorName",
        "CompletionDate",
        "CompletionDateType",
        "IsFDARegulatedDevice",
        "IsFDARegulatedDrug",
        "LeadSponsorClass",
        "LeadSponsorName",
        "LimitationsAndCaveatsDescription",
        "LocationCity",
        "LocationContactEMail",
        "LocationContactName"]

    extract_fields2 = [
        "NCTId",
        "LocationContactPhone",
        "LocationContactRole",
        "LocationContactPhoneExt",
        "LocationCountry",
        "LocationFacility",
        "LocationState",
        "LocationStatus",
        "LocationZip",
        "NCTIdAlias",
        "OfficialTitle",
        "OrgClass",
        "OrgFullName",
        "OrgStudyId",
        "OrgStudyIdDomain",
        "OrgStudyIdLink",
        "OrgStudyIdType",
        "OversightHasDMC",
        ]

    extract_fields3 = [
        "NCTId",
        "PatientRegistry",
        "Phase",
        "PointOfContactEMail",
        "PointOfContactOrganization",
        "PointOfContactPhone",
        "PointOfContactPhoneExt",
        "PointOfContactTitle",
        "ReferencePMID",
        "ResponsiblePartyInvestigatorAffiliation",
        "ResponsiblePartyInvestigatorFullName",
        "ResponsiblePartyInvestigatorTitle",
        "OverallOfficialName"
        ]

    page_size = 1000
    # NOTE(review): this is the classic ClinicalTrials.gov query API; confirm
    # the endpoint is still served before relying on it.
    base_url = 'https://clinicaltrials.gov/api/query/study_fields'

    def fetch_page(fields, minrnk, maxrnk):
        """Fetch one rank window for one field group; return (frame, studies found)."""
        # params lets requests URL-encode the expression and the field list.
        r = requests.get(base_url, params={'expr': exp,
                                           'min_rnk': minrnk,
                                           'max_rnk': maxrnk,
                                           'fmt': 'json',
                                           'fields': ','.join(fields)})
        r.raise_for_status()
        response = r.json()['StudyFieldsResponse']
        # Restricting to the requested fields keeps the column set stable
        # (and present) even when a page contains no studies.
        df = pd.DataFrame(response.get('StudyFields', []), columns=fields)
        for c in df.columns:
            df[c] = df[c].apply(de_list)
        return df, response['NStudiesFound']

    def explode_names(frame, column):
        """Split a '; '-joined name column into one stripped name per row."""
        # Turn commas into '/', so that everything after the first comma of
        # a name can be cut off once the names have been separated.
        frame[column] = frame[column].str.replace(",", "/", regex=False)
        frame = frame.assign(**{column: frame[column].str.split(';')}).explode(column)
        frame[column] = frame[column].str.replace("(/).*", "", regex=True)
        frame[column] = frame[column].str.strip()
        return frame

    print('fetching Clinical Trials data...\n')
    print('Query -',exp)

    pages, pages1, pages2 = [], [], []
    minrnk = 1
    while True:
        maxrnk = minrnk + page_size - 1

        df, _ = fetch_page(extract_fields, minrnk, maxrnk)
        df['CompletionDate'] = pd.to_datetime(df['CompletionDate'])
        pages.append(df.sort_values(by='CompletionDate', ascending=False))

        df, _ = fetch_page(extract_fields2, minrnk, maxrnk)
        pages1.append(df)

        df, n_found = fetch_page(extract_fields3, minrnk, maxrnk)
        pages2.append(df)

        # Stop once the next window would start past the last study.
        minrnk += page_size
        if n_found < minrnk:
            break

    data = pd.concat(pages)
    data1 = pd.concat(pages1)
    data2 = pd.concat(pages2)

    final_data = data.merge(data1,on='NCTId').merge(data2,on='NCTId')

    del data,data1,data2

    in_us = final_data['LocationCountry'].str.contains('United States') == True
    # NOTE(review): the previous expression concatenated the three name
    # columns, which is non-null only when all three are present; that
    # "all three present" reading is kept here. Confirm it is the intent.
    has_all_names = final_data[['CentralContactName', 'LocationContactName',
                                'OverallOfficialName']].notna().all(axis=1)
    final_data_NotNAN_US = final_data[in_us & has_all_names].reset_index(drop=True).copy()

    df_CentralContacts = final_data_NotNAN_US[['NCTId','CentralContactName']].reset_index(drop=True)
    df_CentralContacts = explode_names(df_CentralContacts, 'CentralContactName')
    df_CentralContacts.reset_index(drop=True, inplace = True)
    df_CentralContacts.drop_duplicates(inplace=True)

    df_LocationContacts = final_data_NotNAN_US[['NCTId','LocationContactName']].reset_index(drop=True)
    df_LocationContacts = explode_names(df_LocationContacts, 'LocationContactName')
    df_LocationContacts.drop_duplicates(inplace=True)
    df_LocationContacts = df_LocationContacts.reset_index(drop=True)

    # Responsible-party investigator and overall official(s) are combined
    # into a single ';'-separated list before being split per name.
    df_OtherInv = final_data_NotNAN_US[['NCTId','ResponsiblePartyInvestigatorFullName','OverallOfficialName']].reset_index(drop=True)
    df_OtherInv = df_OtherInv.fillna('')
    df_OtherInv['OtherInvName'] = df_OtherInv['ResponsiblePartyInvestigatorFullName'].str.cat(df_OtherInv['OverallOfficialName'],sep=';')
    df_OtherInv = df_OtherInv[['NCTId','OtherInvName']]
    df_OtherInv = explode_names(df_OtherInv, 'OtherInvName')
    df_OtherInv.drop_duplicates(inplace=True)
    df_OtherInv = df_OtherInv[df_OtherInv['OtherInvName'] != ''].reset_index(drop=True)

    df_RefPMID = final_data_NotNAN_US[['NCTId','ReferencePMID']].reset_index(drop=True)
    df_RefPMID = df_RefPMID.assign(ReferencePMID=df_RefPMID['ReferencePMID'].str.split(';')).explode('ReferencePMID')
    df_RefPMID['ReferencePMID'] = df_RefPMID['ReferencePMID'].str.strip()
    df_RefPMID.drop_duplicates(inplace=True)
    df_RefPMID = df_RefPMID[df_RefPMID['ReferencePMID'].notnull()].reset_index(drop=True)

    return df_CentralContacts,df_LocationContacts,df_OtherInv,df_RefPMID



def fetch_CT_npi(df_CentralContacts,df_LocationContacts,df_OtherInv):
    """Look up NPI registry records for clinical-trial contacts and join by name.

    Args:
        df_CentralContacts: DataFrame with 'NCTId' and 'CentralContactName'.
        df_LocationContacts: DataFrame with 'NCTId' and 'LocationContactName'.
        df_OtherInv: DataFrame with 'NCTId' and 'OtherInvName'.

        Side effect: the name column of each input frame is renamed to
        'Name' in place.

    Returns:
        A tuple ``(CT_HCP, npi_df, df_CT)``:
        * ``CT_HCP`` has columns NCTId, firstname, lastname, fullName, Name,
          NPI and key. Matched rows come first (full-name match, then
          first+last match); unmatched rows follow with key = 0. For matched
          rows, key is the number of matched rows sharing the same join key.
        * ``npi_df`` is the flattened registry table, one row per NPI, with
          a 'joinkey' column holding the upper-cased "FIRST LAST" key.
        * ``df_CT`` is the stacked contact table with 'firstname' (first
          token of 'Name') and 'lastname' (last token) added.

    Raises:
        KeyError: if no registry lookup returned any result, because the
            expected 'number' / 'basic' / 'addresses' / 'taxonomies' columns
            are then absent.
    """
    df_CentralContacts.rename(columns = {'CentralContactName':'Name'}, inplace = True)
    df_LocationContacts.rename(columns = {'LocationContactName':'Name'}, inplace = True)
    df_OtherInv.rename(columns = {'OtherInvName':'Name'}, inplace = True)

    df_CT = pd.concat([df_CentralContacts, df_LocationContacts, df_OtherInv]).reset_index(drop=True)

    df_CT['firstname'] = df_CT['Name'].str.split(' ', n=1).str[0]
    df_CT['lastname'] = df_CT['Name'].str.split(' ').str[-1]

    # Query the registry once per distinct (first, last) pair.
    df = df_CT[['firstname','lastname']].drop_duplicates().reset_index(drop=True)
    df['list'] = df.apply(lambda x: (x['firstname'], x['lastname']), axis=1)

    frames = []
    for x in df['list'].tolist():
        try:
            # Passing params lets requests URL-encode names containing
            # spaces, '&' or other reserved characters.
            r = requests.get('https://npiregistry.cms.hhs.gov/api/',
                             params={'enumeration_type': 'NPI-1',
                                     'version': '2.1',
                                     'first_name': str(x[0]),
                                     'last_name': str(x[1]),
                                     'skip': '',
                                     'limit': ''})
            data = r.json()
            # The builtin object replaces the np.object alias, which recent
            # NumPy releases no longer provide.
            d = pd.DataFrame.from_dict(data['results'], dtype=object)
        except (requests.RequestException, ValueError, KeyError, TypeError):
            # Best effort: a failed request or a response without 'results'
            # only skips this one name.
            continue
        if not d.empty:
            frames.append(d)

    df_new = pd.concat(frames) if frames else pd.DataFrame()

    basic_col =['ein', 'organization_name', 'last_name', 'first_name', 'middle_name', 'name_prefix', 'name_suffix', 'credential',
                'last_updated', 'deactivation_reason_code', 'deactivation_date', 'reactivation_date', 'gender',
                'authorized_official_telephone_number']

    address_col = ['address_1', 'address_2', 'city', 'state', 'postal_code', 'country_code', 'telephone_number']

    taxo_col = ['license', 'state']

    d = df_new[['number', 'basic', 'addresses', 'taxonomies']]

    # Expand the 'basic' dict into columns; the empty template frame makes
    # sure every expected column exists, with 'none' for missing values.
    basic = d.basic.apply(pd.Series)
    basic_df = pd.DataFrame(columns=basic_col)
    basic_df = pd.concat([basic_df,basic]).fillna('none')
    basic_df = basic_df[basic_col]

    d = pd.concat([d,basic_df], axis=1)

    # One row per address, then expand each address dict the same way.
    d = d.explode('addresses')

    addresses = d.addresses.apply(pd.Series)

    address_df = pd.DataFrame(columns=address_col)
    address_df = pd.concat([address_df,addresses]).fillna('none')
    address_df = address_df[address_col]
    address_df = address_df.rename(columns={'state':'address_state'})

    d = pd.concat([d,address_df], axis=1)

    # One row per taxonomy entry, expanded likewise.
    d = d.explode('taxonomies')
    taxonomies = d.taxonomies.apply(pd.Series)

    taxo_df = pd.DataFrame(columns=taxo_col)
    taxo_df = pd.concat([taxo_df,taxonomies]).fillna('none')
    taxo_df = taxo_df[taxo_col]
    taxo_df = taxo_df.rename(columns={'state':'taxo_state'})

    d = pd.concat([d,taxo_df], axis=1)
    d = d.drop(['basic', 'addresses','taxonomies'], axis=1)
    d = d.drop_duplicates()

    d = d.rename(
    columns={'number':'NPI', 'ein': 'Employer_Identification_Number_(EIN)', 'organization_name':'Provider_Organization_Name_(Legal_Business_Name)',
             'last_name':'Provider_Last_Name_(Legal_Name)', 'first_name': 'Provider_First_Name', 'middle_name':'Provider_Middle_Name',
             'name_prefix': 'Provider_Name_Prefix_Text', 'name_suffix': 'Provider_Name_Suffix_Text',
             'credential': 'Provider_Credential_Text', 'last_updated': 'Last_Update_Date',
             'deactivation_reason_code':'NPI_Deactivation_Reason_Code', 'deactivation_date': 'NPI_Deactivation_Date',
             'reactivation_date': 'NPI_Reactivation_Date', 'gender':'Provider_Gender_Code',
             'authorized_official_telephone_number':'Authorized_Official_Telephone_Number',
             'address_1': 'Provider_First_Line_Business_Mailing_Address', 'address_2': 'Provider_Second_Line_Business_Mailing_Address',
             'city': 'Provider_Business_Mailing_Address_City_Name',
             'address_state': 'Provider_Business_Mailing_Address_State_Name',
             'postal_code' : 'Provider_Business_Mailing_Address_Postal_Code', 'country_code' : 'Provider_Business_Mailing_Address_Country_Code_(If_outside_US)',
             'telephone_number': 'Provider_Business_Mailing_Address_Telephone_Number', 'license': 'Provider_License_Number_1',
             'taxo_state':'Provider_License_Number_State_Code_1'})

    # Keep a single row (the first address/taxonomy combination) per NPI.
    npi_df = d.drop_duplicates(subset='NPI', keep='first').copy()

    # Pass 1: join the full contact name on "FIRST M LAST"
    # (middle initial omitted when absent).
    df = df_CT.copy()
    df.drop_duplicates(['NCTId','firstname','lastname'],inplace=True)
    df['fullName'] = df['firstname']+' '+df['lastname']
    df['joinkey'] = df['Name']
    df['joinkey'] = df['joinkey'].str.strip()
    df['joinkey'] = df['joinkey'].str.upper()

    npi_df['joinkey'] = npi_df['Provider_First_Name']+' '+npi_df['Provider_Middle_Name'].replace('none','-').str[0]+' '+npi_df['Provider_Last_Name_(Legal_Name)']
    npi_df['joinkey'] = npi_df['joinkey'].str.replace(' - ',' ', regex=False)
    npi_df['joinkey'] = npi_df['joinkey'].str.strip()
    npi_df['joinkey'] = npi_df['joinkey'].str.upper()

    CT_HCP = df.merge(npi_df,how='left',on='joinkey')

    CT_HCP_Not_Joined = CT_HCP[CT_HCP['NPI'].isnull()].filter(df.columns).copy()
    CT_HCP = CT_HCP[CT_HCP['NPI'].notnull()].copy()

    # 'key' = number of matched rows sharing the same join key.
    CT_HCP['key'] = 1
    duplicates = CT_HCP[['joinkey','key']].groupby(['joinkey']).sum()
    CT_HCP.drop(['key'],inplace=True,axis=1)
    CT_HCP = CT_HCP.merge(duplicates,on='joinkey',how='inner')

    # Pass 2: retry the unmatched contacts on "FIRST LAST" only.
    npi_df['joinkey'] = npi_df['Provider_First_Name']+' '+npi_df['Provider_Last_Name_(Legal_Name)']
    npi_df['joinkey'] = npi_df['joinkey'].str.strip()
    npi_df['joinkey'] = npi_df['joinkey'].str.upper()

    CT_HCP_Not_Joined['joinkey'] = CT_HCP_Not_Joined['fullName']
    CT_HCP_Not_Joined['joinkey'] = CT_HCP_Not_Joined['joinkey'].str.strip()
    CT_HCP_Not_Joined['joinkey'] = CT_HCP_Not_Joined['joinkey'].str.upper()

    CT_HCP_1 = CT_HCP_Not_Joined.merge(npi_df,how='left',on='joinkey')

    CT_HCP_Not_Joined = CT_HCP_1[CT_HCP_1['NPI'].isnull()].filter(df.columns).copy()
    CT_HCP_1 = CT_HCP_1[CT_HCP_1['NPI'].notnull()].copy()

    CT_HCP_1['key'] = 1
    duplicates = CT_HCP_1[['joinkey','key']].groupby(['joinkey']).sum()
    CT_HCP_1.drop(['key'],inplace=True,axis=1)
    CT_HCP_1 = CT_HCP_1.merge(duplicates,on='joinkey',how='inner')

    CT_HCP = pd.concat([CT_HCP, CT_HCP_1])

    CT_HCP = CT_HCP.filter(['NCTId','firstname','lastname','fullName','Name','NPI','key'])
    CT_HCP_Not_Joined = CT_HCP_Not_Joined.filter(['NCTId','firstname','lastname','fullName','Name'])
    # Contacts with no registry match in either pass get key = 0.
    CT_HCP_Not_Joined['key'] = 0

    CT_HCP = pd.concat([CT_HCP, CT_HCP_Not_Joined])
    CT_HCP.reset_index(drop=True,inplace=True)

    return CT_HCP,npi_df,df_CT

In [None]:
def parse_properties_file(file):
    """Load an INI-style properties file into a RawConfigParser and return it.

    ``file`` is handed straight to ``RawConfigParser.read``.
    """
    parser = configparser.RawConfigParser()
    parser.read(file)
    return parser


def initialize_vars(file,op_dir):
    """Read the run configuration and build the PubMed / ClinicalTrials queries.

    Reads 'years' and 'search Terms' from the 'Publication Filters' section
    and 'search Terms' from the 'Clinical Filters' section of ``file``.

    Returns a tuple ``(year, query_arg, exp, path)``:
    * ``year``      - the comma-separated years as a list of strings
    * ``query_arg`` - PubMed query prefix, e.g. '(a[TIAB] | b[TIAB]) & "'
    * ``exp``       - clinical-trials expression, e.g. '("a")OR("b")'
    * ``path``      - ``op_dir`` with a trailing backslash appended
    """
    config = parse_properties_file(file)

    publication_filters = config['Publication Filters']
    years_raw = publication_filters['years'].strip()
    pubmed_terms_raw = publication_filters['search Terms'].strip()
    clinical_terms_raw = config['Clinical Filters']['search Terms'].strip()

    path = op_dir + '\\'

    year = years_raw.split(',')

    # Every comma-separated term gets a [TIAB] tag; terms are OR-ed with ' | '.
    tagged_terms = '[TIAB] | '.join(pubmed_terms_raw.split(','))
    query_arg = '(' + tagged_terms + '[TIAB]) & "'

    # Every comma-separated term becomes a quoted, parenthesised OR operand.
    quoted_terms = '")OR("'.join(clinical_terms_raw.split(','))
    exp = '("' + quoted_terms + '")'

    print('Initialized Variables')
    return year,query_arg,exp,path


def process_PM_CT(year,query_arg,exp,path):
    """Fetch PubMed and ClinicalTrials data and aggregate them per person.

    Writes three CSV files under ``path + "intermediate/"``:
    pubmed_hcp_list.csv, pubmed_data.csv and clinicaltrials_hcp_list.csv.

    Args:
        year: list of year strings for the PubMed search.
        query_arg: PubMed query prefix.
        exp: ClinicalTrials search expression.
        path: output directory prefix; file names are appended to it directly.

    Returns:
        A tuple ``(PM_CT_agg_Final, CT_HCP_List, pubmed_df)``:
        * ``PM_CT_agg_Final`` - one row per upper-cased 'name' with the number
          of distinct 'pmid' and 'NCTId' values and the upper-cased
          'fullname_PM' / 'fullname_CT'.
        * ``CT_HCP_List`` - the clinical-trial contact table with upper-cased
          column names.
        * ``pubmed_df`` - the PubMed table as returned by the fetch step.
    """
    #Fetching Data
    # NOTE(review): `pubmed` is not defined in the visible part of this file;
    # fetch_pubs returns the same two-frame tuple - confirm which is intended.
    pubmed_df,pubmed_df_HCP_List = pubmed(year,query_arg)

    pubmed_df_HCP_List1 = pubmed_df_HCP_List.copy()
    pubmed_df_HCP_List1.columns = pubmed_df_HCP_List1.columns.str.upper()

    pubmed_df_HCP_List1.to_csv(path+"intermediate/pubmed_hcp_list.csv")

    # NOTE(review): unlike the two HCP lists, this file is written with the
    # original (not upper-cased) column names - confirm that is intended.
    pubmed_df.to_csv(path+"intermediate/pubmed_data.csv")
    print("\n")
    #Fetching Data
    df_CentralContacts,df_LocationContacts,df_OtherInv,df_RefPMID = Clinical_Trials(exp)

    # Give the three contact tables a common name column and stack them.
    df_CentralContacts = df_CentralContacts.rename(columns = {'CentralContactName':'fullname'})
    df_LocationContacts = df_LocationContacts.rename(columns = {'LocationContactName':'fullname'})
    df_OtherInv = df_OtherInv.rename(columns = {'OtherInvName':'fullname'})

    df_CT = pd.concat([df_CentralContacts, df_LocationContacts, df_OtherInv]).reset_index(drop=True)
    print("\n")
    # 'name' = first token + last token of the full name, upper-cased.
    df_CT['firstname'] = df_CT['fullname'].str.split(' ', n=1).str[0]
    df_CT['lastname'] = df_CT['fullname'].str.split(' ').str[-1]
    df_CT['name'] = df_CT['firstname'] +' '+ df_CT['lastname']

    df_CT['name'] = df_CT['name'].str.upper()
    df_CT['name'] = df_CT['name'].str.strip()

    df_CT.drop_duplicates(['NCTId','name'],inplace=True)

    CT_HCP_List = df_CT.copy()

    CT_HCP_List.columns = CT_HCP_List.columns.str.upper()

    CT_HCP_List.to_csv(path+"intermediate/clinicaltrials_hcp_list.csv")

    pubmed_df_HCP_List.rename(columns = {'fullName':'fullname'}, inplace = True)
    CT = df_CT.filter(['NCTId','firstname','lastname','fullname','name']).drop_duplicates()

    pubmed_df_HCP_List['name'] = pubmed_df_HCP_List['name'].str.upper()

    # Outer join keeps people that appear in only one of the two sources.
    PM_CT_data = pubmed_df_HCP_List.merge(CT,how='outer',on='name',suffixes=('_PM', '_CT'))

    PM_CT_agg = PM_CT_data[['pmid','NCTId','name','fullname_PM','fullname_CT']].groupby('name').agg(
        {"pmid": "nunique", "NCTId": "nunique", "fullname_PM": "max", "fullname_CT": "max"})

    PM_CT_agg.reset_index(inplace=True)
    PM_CT_agg['fullname_PM'] = PM_CT_agg['fullname_PM'].str.upper()
    PM_CT_agg['fullname_CT'] = PM_CT_agg['fullname_CT'].str.upper()

    PM_CT_agg_Final = PM_CT_agg.copy()

    print("fetched ",len(PM_CT_agg_Final), " HCPs")

    return PM_CT_agg_Final, CT_HCP_List, pubmed_df

def _query_npi_registry(first_name, last_name):
    """Return the 'results' list of one NPI registry lookup, or [] on failure."""
    params = {
        'enumeration_type': 'NPI-1',
        'version': '2.1',
        'first_name': first_name,
        'last_name': last_name,
        # Sent empty, exactly as the hand-built URL did before.
        'skip': '',
        'limit': '',
    }
    try:
        # `params=` lets requests URL-encode names containing spaces,
        # ampersands, accents, etc.; the timeout keeps one stalled call from
        # hanging the whole run.
        response = requests.get('https://npiregistry.cms.hhs.gov/api/',
                                params=params, timeout=30)
        response.raise_for_status()
        results = response.json()['results']
    except (requests.RequestException, ValueError, KeyError, TypeError) as err:
        # Best effort: a failed lookup is reported and skipped, not fatal.
        print('NPI lookup failed for', first_name, last_name, '-', err)
        return []
    return results if isinstance(results, list) else []


def _expand_records(series, columns, renames=None):
    """Spread a column of dicts into `columns`, using 'none' for missing values."""
    # Cells that are not dicts (e.g. NaN left by explode on an empty list)
    # contribute an all-'none' row.
    records = [value if isinstance(value, dict) else {} for value in series]
    expanded = pd.DataFrame(records, index=series.index)
    expanded = expanded.reindex(columns=columns).fillna('none')
    return expanded.rename(columns=renames or {})


def fetch_npi(Final_df):
    """Look up NPI registry records for the names in `Final_df` and match them back.

    For every string in Final_df['name'] the first and last space-separated
    tokens are sent to the NPI registry API as first_name / last_name. The
    returned records are flattened (basic info, addresses, taxonomies), reduced
    to one row per NPI, and joined to the input on an upper-cased, stripped
    "FIRST LAST" key.

    Parameters
    ----------
    Final_df : pandas.DataFrame
        Must contain a 'name' column. The frame is not modified.

    Returns
    -------
    HCP : pandas.DataFrame
        Columns 'name', 'NPI', 'key'. Matched names come first, one row per
        matching NPI, with 'key' holding the number of rows matched for that
        join key. Unmatched names follow with a null NPI and key 0.
    npi_df : pandas.DataFrame
        Flattened registry records, one row per NPI, plus a 'joinkey' column.
        Empty when no lookup returned any record.
    """
    # Work on a copy so the caller's frame does not gain helper columns.
    df = Final_df.copy()

    # Ordered, de-duplicated (first token, last token) pairs: identical pairs
    # would only repeat the same request and be dropped again later.
    name_pairs = {}
    for full_name in df['name']:
        if not isinstance(full_name, str):
            continue
        tokens = full_name.split(' ')
        if tokens[0] or tokens[-1]:
            name_pairs[(tokens[0], tokens[-1])] = None

    frames = []
    for first_name, last_name in name_pairs:
        results = _query_npi_registry(first_name, last_name)
        if results:
            frames.append(pd.DataFrame(results, dtype=object))

    if not frames:
        # Nothing came back from the registry: every name is unmatched.
        HCP = pd.DataFrame({'name': df['name'], 'NPI': np.nan, 'key': 0})
        return HCP.reset_index(drop=True), pd.DataFrame()

    basic_col =['ein', 'organization_name', 'last_name', 'first_name', 'middle_name', 'name_prefix', 'name_suffix', 'credential',
                'last_updated', 'deactivation_reason_code', 'deactivation_date', 'reactivation_date', 'gender',
                'authorized_official_telephone_number']

    address_col = ['address_1', 'address_2', 'city', 'state', 'postal_code', 'country_code', 'telephone_number']

    taxo_col = ['license', 'state']

    # reindex (rather than [[...]]) tolerates a response lacking one of the keys.
    d = pd.concat(frames).reindex(columns=['number', 'basic', 'addresses', 'taxonomies'])

    d = pd.concat([d, _expand_records(d['basic'], basic_col)], axis=1)

    # One row per address, then one row per taxonomy entry.
    d = d.explode('addresses')
    d = pd.concat([d, _expand_records(d['addresses'], address_col, {'state': 'address_state'})], axis=1)

    d = d.explode('taxonomies')
    d = pd.concat([d, _expand_records(d['taxonomies'], taxo_col, {'state': 'taxo_state'})], axis=1)

    d = d.drop(['basic', 'addresses','taxonomies'], axis=1)
    d = d.drop_duplicates()

    d = d.rename(
    columns={'number':'NPI', 'ein': 'Employer_Identification_Number_(EIN)', 'organization_name':'Provider_Organization_Name_(Legal_Business_Name)',
             'last_name':'Provider_Last_Name_(Legal_Name)', 'first_name': 'Provider_First_Name', 'middle_name':'Provider_Middle_Name',
             'name_prefix': 'Provider_Name_Prefix_Text', 'name_suffix': 'Provider_Name_Suffix_Text',
             'credential': 'Provider_Credential_Text', 'last_updated': 'Last_Update_Date',
             'deactivation_reason_code':'NPI_Deactivation_Reason_Code', 'deactivation_date': 'NPI_Deactivation_Date',
             'reactivation_date': 'NPI_Reactivation_Date', 'gender':'Provider_Gender_Code',
             'authorized_official_telephone_number':'Authorized_Official_Telephone_Number',
             'address_1': 'Provider_First_Line_Business_Mailing_Address', 'address_2': 'Provider_Second_Line_Business_Mailing_Address',
             'city': 'Provider_Business_Mailing_Address_City_Name',
             'address_state': 'Provider_Business_Mailing_Address_State_Name',
             'postal_code' : 'Provider_Business_Mailing_Address_Postal_Code', 'country_code' : 'Provider_Business_Mailing_Address_Country_Code_(If_outside_US)',
             'telephone_number': 'Provider_Business_Mailing_Address_Telephone_Number', 'license': 'Provider_License_Number_1',
             'taxo_state':'Provider_License_Number_State_Code_1'})

    # Keep the first flattened row per NPI; copy so adding 'joinkey' below
    # writes to an independent frame.
    npi_df = d.drop_duplicates('NPI', keep='first').copy()

    df['joinkey'] = df['name'].str.strip().str.upper()

    registry_name = npi_df['Provider_First_Name']+' '+npi_df['Provider_Last_Name_(Legal_Name)']
    npi_df['joinkey'] = registry_name.str.replace(' - ',' ', regex=False).str.strip().str.upper()

    merged = df.merge(npi_df,how='left',on='joinkey')
    has_npi = merged['NPI'].notnull()

    # 'key' = how many registry rows share the name's join key.
    matched = merged[has_npi].copy()
    matched['key'] = matched['joinkey'].map(matched['joinkey'].value_counts())

    unmatched = merged[~has_npi].filter(df.columns).copy()
    unmatched['key'] = 0

    HCP = pd.concat([matched, unmatched]).filter(['name','NPI','key'])
    HCP.reset_index(drop=True,inplace=True)
    return HCP,npi_df


def final_process(path,PM_CT_agg_Final,batch_size=100):
    """Fetch NPIs for every HCP in batches and write the result CSVs.

    Rows of `PM_CT_agg_Final` are passed to `fetch_npi` in consecutive,
    non-overlapping batches. The combined results are written to three files
    under ``path + "intermediate/"``:

    * ``hcptonpi.csv`` - name-to-NPI rows (columns renamed to 'HCP_Name', 'NPI',
      'Count of NPIs').
    * ``hcp_list.csv`` - the input rows plus the number of distinct NPIs found
      per name (0 when none).
    * ``npi_df.csv``   - the de-duplicated registry records.

    Parameters
    ----------
    path : str
        Prefix prepended verbatim to "intermediate/<file>.csv"; it therefore
        needs its own trailing separator and the directory must exist.
    PM_CT_agg_Final : pandas.DataFrame
        Aggregated HCP table with a 'name' column (and, for the output list,
        'pmid', 'NCTId', 'fullname_PM', 'fullname_CT').
    batch_size : int, optional
        Number of rows handed to `fetch_npi` per call (default 100).

    Returns
    -------
    pandas.DataFrame
        The combined, de-duplicated registry records.
    """
    hcp_batches = []
    npi_batches = []
    print("\n")
    # Positional slicing: each row is processed exactly once, whatever the
    # frame's index labels are (label-based .loc slices include both ends, which
    # made neighbouring batches overlap).
    for batch_no, start in enumerate(range(0, len(PM_CT_agg_Final), batch_size), start=1):
        print(batch_no,' fetching Npi for HCPs ',start+1,' to ',start+batch_size)
        batch = PM_CT_agg_Final.iloc[start:start+batch_size].copy()
        hcp_part, npi_part = fetch_npi(batch)
        hcp_batches.append(hcp_part)
        npi_batches.append(npi_part)

    if hcp_batches:
        HCP = pd.concat(hcp_batches).drop_duplicates()
        npi_df = pd.concat(npi_batches).drop_duplicates()
    else:
        # No input rows: keep the expected columns so the steps below still run.
        HCP = pd.DataFrame(columns=['name','NPI','key'])
        npi_df = pd.DataFrame()

    # Distinct NPIs found per name.
    HCP_agg = HCP.groupby('name')['NPI'].nunique().reset_index()

    HCP_List = PM_CT_agg_Final.merge(HCP_agg,on='name',how='left')

    HCP_List = HCP_List.filter(['name', 'pmid', 'NCTId', 'NPI', 'fullname_PM', 'fullname_CT'])

    # Names absent from HCP_agg have no NPI count after the left merge.
    HCP_List['NPI'] = HCP_List['NPI'].fillna(0)

    HCP_List.rename(columns = {'name':'HCP_Name','pmid':'Count of Publications','NCTId':'Count of Clinical Trials','NPI':'Count of NPIs','fullname_PM':'Name(PubMed)','fullname_CT':'Name(ClinicalTrials)'}, inplace = True)
    HCP.rename(columns = {'name':'HCP_Name','key':'Count of NPIs'}, inplace = True)

    HCP.to_csv(path+"intermediate/hcptonpi.csv")
    HCP_List.to_csv(path+"intermediate/hcp_list.csv")
    npi_df.to_csv(path+"intermediate/npi_df.csv")
    print("\n")
    print('Done Processing')

    return npi_df