In [1]:
import pandas as pd
import numpy as np

In [2]:
# specify the input file name and output file name
habitat_matrix = f'../data/raw_data/larval_habitat.csv'
species_file = f'../data/raw_data/species_data.csv'
habitat_classification_key = f'../data/raw_data/habitat_classification_key.csv'

# reads in the input file as a .csv
habitat_df = pd.read_csv(habitat_matrix)
species_df = pd.read_csv(species_file)

In [3]:
# removes any hidden carriage returns and spaces
def CarriageRemover( pandas_datafram ):
    pandas_datafram = pandas_datafram.replace({r'\r\n': ''}, regex=True)
    pandas_datafram = pandas_datafram.replace({r'\s': ''}, regex=True)
    pandas_datafram = pandas_datafram.replace({r'\n': ''}, regex=True)
    return pandas_datafram

In [4]:
habitat_df = CarriageRemover(habitat_df)
species_df = CarriageRemover(species_df)

In [5]:
# merge both datasets
habitat_species_df = pd.merge(species_df, habitat_df, left_on=['Record_ID'], right_on=['Record_ID'])

In [6]:
# remove columns without genus and species level data
sp_col = ['Pub_ID', 'Subfamily', 'Tribe', 'Subspecies', '"Form"', 'Record_ID', 'Notes_x', 'Notes_y']
habitat_species_df = habitat_species_df.drop(columns=sp_col)

In [7]:
# changes the NaN designation in the 'Subgenus' column to 'None'
#subgenus_column = habitat_species_df['Subgenus'].where(pd.notnull(habitat_species_df['Subgenus']), 'None')
habitat_species_df['Subgenus'] = habitat_species_df['Subgenus'].where(pd.notnull(habitat_species_df['Subgenus']), 'None')

habitat_species_df = habitat_species_df[habitat_species_df['Species'] != 'sp.']
habitat_species_df = habitat_species_df[habitat_species_df['Species'] != 'Sp.']

habitat_species_df = habitat_species_df[habitat_species_df['Presence-Absence'] == 'Presence']

habitat_species_df = habitat_species_df.reset_index(drop=True)

print(habitat_species_df, habitat_species_df.columns)
habitat_species_df['Genus_Species'] = habitat_species_df['Genus'] + '_' + habitat_species_df['Species']

#habitat_species_df.to_csv('../data/intermediate_data/intermed_hab_sp.csv', index=False)

          Genus            Subgenus      Species Presence-Absence  \
0     Anopheles              Cellia      farauti         Presence   
1     Anopheles              Cellia  punctulatus         Presence   
2     Anopheles              Cellia    koliensis         Presence   
3     Anopheles              Cellia       lungae         Presence   
4     Anopheles              Cellia     nataliae         Presence   
...         ...                 ...          ...              ...   
3541      Culex  SubgenusUncertain2      romeroi         Presence   
3542   Mansonia            Mansonia     flaveola         Presence   
3543      Aedes        Ochlerotatus     tortilis         Presence   
3544      Aedes           Stegomyia      aegypti         Presence   
3545      Culex               Culex   bahamensis         Presence   

     Shaded (Shaded/Semishaded/Unshaded) Saltwater (Yes/No)  \
0                                    NaN                NaN   
1                                    NaN     

In [8]:
# map habitat descriptions to abbreviation

habitat_abbv = pd.read_csv(habitat_classification_key)
habitat_abbv = CarriageRemover(habitat_abbv)
habitat_abbv = habitat_abbv.dropna()

habitat_columns = [f'Habitat_{a}' for a in range(1, 51)]

for i in habitat_columns:
    curr_hab = habitat_species_df[i].tolist()
    hab_abbv = []
    #print(curr_hab)
    for j in curr_hab:
        #print(habitat_abbv.loc[habitat_abbv['Unique_Habitat'] == j, 'Habitat_Classifier'].iloc[0])
        try:
            hab_abbv.append(habitat_abbv.loc[habitat_abbv['Unique_Habitat'] == j, 'Habitat_Classifier'].iloc[0])
        except:
            hab_abbv.append('None')

    habitat_species_df[i] = hab_abbv

habitat_species_df = habitat_species_df.dropna(axis=1, how='all')

In [9]:
# make relationship matrix

unique_hab_list = habitat_abbv.Habitat_Classifier.unique().tolist()
habitat_species_relationship = pd.DataFrame(columns=['Genus_Species'] + unique_hab_list)
#print(habitat_species_relationship)
total_habitat_types = len(unique_hab_list)

species_list = []

for i, j in enumerate(habitat_columns):
    curr_hab = habitat_species_df[j].tolist()

    for g, c in enumerate(curr_hab):

        if c == 'None':
            pass
        else:
            prop_spp = habitat_species_df.loc[g, 'Genus_Species']

            if prop_spp not in species_list:
                habs = [0] * total_habitat_types

                habs[unique_hab_list.index(c)] = 1

                habs.insert(0, prop_spp)

                new_row = pd.Series({key: value for key, value in zip(habitat_species_relationship.columns, habs)})
                habitat_species_relationship = pd.concat([habitat_species_relationship, new_row.to_frame().T], ignore_index=True)
                species_list.append(prop_spp)

            else:
                try:
                    indx = habitat_species_relationship.loc[habitat_species_relationship['Genus_Species'] == prop_spp].index[0]
                    #print(indx)
                    habitat_species_relationship.loc[indx, c] += 1
                except:
                    print(prop_spp)
                    print(habitat_species_relationship['Genus_Species'].tolist())


nan
['Anopheles_farauti', 'Anopheles_punctulatus', 'Anopheles_koliensis', 'Anopheles_nataliae', 'Anopheles_solomonis', 'Uranotaenia_painei', 'Uranotaenia_wysockii', 'Uranotaenia_sexaueri', 'Uranotaenia_civinskii', 'Uranotaenia_lateralis', 'Uranotaenia_solomonis', 'Culex_pervigilans', 'Culex_pacificus', 'Culex_iyengari', 'Culex_australicus', 'Culex_atriceps', 'Culex_kesseli', 'Culex_roseni', 'Culex_omani', 'Anopheles_atropos', 'Anopheles_barberi', 'Anopheles_crucians', 'Anopheles_earlei', 'Anopheles_freeborni', 'Anopheles_georgianus', 'Anopheles_occidentalis', 'Anopheles_franciscanus', 'Anopheles_punctipennis', 'Anopheles_walkeri', 'Toxorhynchites_rutilus', 'Wyeomyia_mitchellii', 'Wyeomyia_smithii', 'Wyeomyia_vanduzeei', 'Uranotaenia_anhydor', 'Uranotaenia_lowii', 'Uranotaenia_sapphirina', 'Culiseta_alaskaensis', 'Culiseta_impatiens', 'Culiseta_incidens', 'Culiseta_inornata', 'Culiseta_particeps', 'Culiseta_morsitans', 'Culiseta_melanura', 'Orthopodomyia_alba', 'Orthopodomyia_signifera'

In [10]:
# calculate some summary stats
habitat_species_relationship['SUM'] = np.sum(habitat_species_relationship.drop(columns=['Genus_Species']).values, axis=1)
habitat_species_relationship['UNIQUE_SUM'] = np.count_nonzero(habitat_species_relationship.drop(columns=['Genus_Species', 'SUM']).values, axis=1)
habitat_species_relationship['PROPORTION'] = habitat_species_relationship['UNIQUE_SUM'] / total_habitat_types

habitat_species_relationship = habitat_species_relationship.dropna()
print(habitat_species_relationship)

                   Genus_Species AC ALG ATR BAM BOG BP BRK BWP CEM  ... SWB  \
0              Anopheles_farauti  3   0   0   0   0  0   0   1   0  ...   0   
1          Anopheles_punctulatus  0   0   2   0   0  0   0   0   0  ...   0   
2            Anopheles_koliensis  0   0   1   0   0  0   0   0   0  ...   0   
3             Anopheles_nataliae  0   0   0   0   0  0   0   0   0  ...   0   
4            Anopheles_solomonis  0   0   0   0   0  0   1   0   0  ...   0   
...                          ... ..  ..  ..  ..  .. ..  ..  ..  ..  ...  ..   
1994  Trichoprosopon_vonplesseni  0   0   0   1   0  0   0   0   0  ...   0   
1995    Shannoniana_schedocyclia  0   0   0   1   0  0   0   0   0  ...   0   
1996               Culex_marksae  0   0   0   0   0  0   0   0   0  ...   0   
1997           Aedes_formosensis  0   0   0   1   0  0   0   0   0  ...   0   
1998              Culex_usquatus  1   0   0   0   0  0   0   0   0  ...   0   

     SWG TH TSTM VEG VLC VTR SUM UNIQUE_SUM PROPORT

In [12]:
# output to .csv
habitat_species_relationship.to_csv('../data/cleaned_data/expert_habitat_species_matrix.csv', index=False)