In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

Getting the WoS data for Amazon-specific data

In [3]:
# get the new WoS data that was found using keywords "brazilian amazon"
deforestation_df = pd.read_excel('data/Amazon_research/raw_data/savedrecs.xls')

In [4]:
for i in range(1, 12):
    temp_df = pd.read_excel(f'data/Amazon_research/raw_data/savedrecs ({i}).xls')
    deforestation_df = pd.concat([deforestation_df, temp_df])

In [5]:
deforestation_df = deforestation_df.reset_index(drop=True)

In [6]:
len(deforestation_df)

11867

In [7]:
deforestation_df.iloc[2]['Addresses']

nan

In [8]:
deforestation_df.iloc[5]['Article Title']

'Potential land availability for agricultural expansion in the Brazilian Amazon'

In [9]:
deforestation_df.columns

Index(['Publication Type', 'Authors', 'Book Authors', 'Book Editors',
       'Book Group Authors', 'Author Full Names', 'Book Author Full Names',
       'Group Authors', 'Article Title', 'Source Title', 'Book Series Title',
       'Book Series Subtitle', 'Language', 'Document Type', 'Conference Title',
       'Conference Date', 'Conference Location', 'Conference Sponsor',
       'Conference Host', 'Author Keywords', 'Keywords Plus', 'Abstract',
       'Addresses', 'Affiliations', 'Reprint Addresses', 'Email Addresses',
       'Researcher Ids', 'ORCIDs', 'Funding Orgs', 'Funding Text',
       'Cited References', 'Cited Reference Count', 'Times Cited, WoS Core',
       'Times Cited, All Databases', '180 Day Usage Count',
       'Since 2013 Usage Count', 'Publisher', 'Publisher City',
       'Publisher Address', 'ISSN', 'eISSN', 'ISBN', 'Journal Abbreviation',
       'Journal ISO Abbreviation', 'Publication Date', 'Publication Year',
       'Volume', 'Issue', 'Part Number', 'Supplement', 

In [10]:
deforestation_df = deforestation_df.drop(columns=['Unnamed: 69', 'Hot Paper Status', 'Date of Export', 'Pubmed Id', 'Highly Cited Status', 'Special Issue'])

In [11]:
# drop rows without a addresses, publication year and, citations
deforestation_df = deforestation_df.dropna(subset=['Addresses', 'Publication Year', 'Times Cited, All Databases'])

In [12]:
len(deforestation_df)

11453

In [12]:
deforestation_df.to_csv('data/Amazon_research/cleaned_data/Brazil_focused_research.csv')

In [13]:
# reasearch location assignment
country_name_df = pd.read_csv('data/country_names.csv')
country_names = np.array(country_name_df['name'].unique())
country_names = np.append(country_names, ['USA', 'England', 'Ireland', 'Korea', 'Moldova', 'Micronesia', 
                                          'Saint Martin', 'Sint Maarten', 'Tanzania', 'United Kingdom', 'UK',
                                         'United States', 'Virgin Islands'])

country_brazil = np.array(['Brazil'])
country_names_no_brazil = np.setdiff1d(country_names, country_brazil)

deforestation_df = pd.read_csv('data/Amazon_research/cleaned_data/Brazil_focused_research.csv')

domestic_brazil_df = deforestation_df[deforestation_df['Addresses'].str.contains('|'.join(country_names_no_brazil), case=False) == False]
domestic_brazil_df = domestic_brazil_df[domestic_brazil_df['Addresses'].str.contains('Brazil', case=False) == True]
domestic_brazil_df = domestic_brazil_df.reset_index(drop=True)

international_brazil_df = deforestation_df[deforestation_df['Addresses'].str.contains('Brazil', case=False) == False]

# sure that addresses at least contain some country
international_brazil_df = international_brazil_df[international_brazil_df['Addresses'].str.contains('|'.join(country_names_no_brazil), case=False) == True]
international_brazil_df = international_brazil_df.reset_index(drop=True)

  interactivity=interactivity, compiler=compiler, result=result)
  return func(self, *args, **kwargs)


In [14]:
len(domestic_brazil_df)

5218

In [15]:
collaboration_brazil_df = pd.concat([deforestation_df, domestic_brazil_df, international_brazil_df]).drop_duplicates(keep=False)

# make sure that addresses at least contain some country
collaboration_brazil_df = collaboration_brazil_df[collaboration_brazil_df['Addresses'].str.contains('|'.join(country_names), case=False) == True]
collaboration_brazil_df = collaboration_brazil_df.reset_index(drop=True)

In [16]:
international_brazil_df['Publication Year'].min()

1970.0

In [17]:
domestic_brazil_df['Addresses'][1]

'Fiocruz MS, Inst Oswaldo Cruz, Dept Virol, BR-21045900 Rio De Janeiro, Brazil; Cent Publ Hlth Lab, Manaus, Amazonas, Brazil'

In [18]:
len(domestic_brazil_df) + len(international_brazil_df) + len(collaboration_brazil_df)

11384

In [19]:
domestic_brazil_df.to_csv('data/Amazon_research/cleaned_data/domestic_brazil_research.csv')
international_brazil_df.to_csv('data/Amazon_research/cleaned_data/international_brazil_research.csv')
collaboration_brazil_df.to_csv('data/Amazon_research/cleaned_data/collaboration_brazil_research.csv')

Getting the WoS data for Brazil-wide data

In [15]:
deforestation_df = pd.read_excel('data/Brazil_research/raw_data/savedrecs 1_1000.xls')

In [16]:
for i in range(1000, 43000, 1000):
    temp_df = pd.read_excel(f'data/Brazil_research/raw_data/savedrecs {i+1}_{i+1000}.xls')
    deforestation_df = pd.concat([deforestation_df, temp_df])
    
temp_df = pd.read_excel('data/Brazil_research/raw_data/savedrecs 43001_43248.xls')
deforestation_df = pd.concat([deforestation_df, temp_df])

In [17]:
len(deforestation_df)

43248

In [18]:
deforestation_df.head(2)

Unnamed: 0,Publication Type,Authors,Book Authors,Book Editors,Book Group Authors,Author Full Names,Book Author Full Names,Group Authors,Article Title,Source Title,...,WoS Categories,Web of Science Index,Research Areas,IDS Number,Pubmed Id,Open Access Designations,Highly Cited Status,Hot Paper Status,Date of Export,UT (Unique WOS ID)
0,J,"McLean, RC",,,,"McLean, RC",,,Studies in the ecology of tropic al rain-fores...,JOURNAL OF ECOLOGY,...,,,,,,,,,2022-05-05,WOS:000200136600002
1,J,"McLean, RC",,,,"McLean, RC",,,Studies in the ecology of tropical rain-forest...,JOURNAL OF ECOLOGY,...,,,,,,,,,2022-05-05,WOS:000200136600007


In [19]:
deforestation_df.iloc[4]['Addresses']

'Univ Calif Berkeley, Dept Anthropol, Berkeley, CA USA; Fac Filosofia & Letras, Museo Etnograf, Buenos Aires, DF, Argentina; Museo Nacl, Rio De Janeiro, Brazil; Duke Univ, Dept Sociol & Anthropol, Durham, NC USA; Inter Amer Affairs, Washington, DC USA; Rubber Dev Corp, Washington, DC USA; Columbia Broadcasting Systm Televis, New York, NY USA; Escuela Nacl Antropol, Inst Nacl Antropol, Mexico City, DF, Mexico; New Sch Social Res, Ecole Libre Hautes Etudes, New York, NY USA; Off War Informat, Washington, DC USA; Univ Michigan, Museum Anthropol, Ann Arbor, MI USA; Smithsonian Inst, Washington, DC USA; Museu Paraense Emilio Goeldi, Belem, Brazil; Smithsonian Inst, Inst Social Anthropol, Washington, DC USA; Columbia Univ, Dept Anthropol, New York, NY USA'

In [20]:
deforestation_df.iloc[5]['Article Title']

'DISPERSION OF FOREST MOSQUITOES IN BRAZIL - FURTHER STUDIES'

In [21]:
deforestation_df = deforestation_df.drop(columns=['Hot Paper Status', 'Date of Export', 'Pubmed Id', 'Highly Cited Status', 'Special Issue'])

In [22]:
# drop rows without a title, year, authors, locations, citations
# deforestation_df = deforestation_df.dropna(subset=['Authors', 'Addresses', 'Publication Year', 'Article Title', 'Times Cited, All Databases'])
deforestation_df = deforestation_df.dropna(subset=['Addresses', 'Publication Year', 'Times Cited, WoS Core'])

In [23]:
len(deforestation_df)

42070

In [24]:
deforestation_df.to_csv('data/Brazil_research/cleaned_data/Brazil_focused_research.csv')

In [25]:
# reasearch location assignment
country_name_df = pd.read_csv('data/country_names.csv')
country_names = np.array(country_name_df['name'].unique())
country_names = np.append(country_names, ['USA', 'England', 'Ireland', 'Korea', 'Moldova', 'Micronesia', 
                                          'Saint Martin', 'Sint Maarten', 'Tanzania', 'United Kingdom', 'UK',
                                         'United States', 'Virgin Islands'])

country_brazil = np.array(['Brazil'])
country_names_no_brazil = np.setdiff1d(country_names, country_brazil)

deforestation_df = pd.read_csv('data/Brazil_research/cleaned_data/Brazil_focused_research.csv')

domestic_brazil_df = deforestation_df[deforestation_df['Addresses'].str.contains('|'.join(country_names_no_brazil), case=False) == False]
domestic_brazil_df = domestic_brazil_df[domestic_brazil_df['Addresses'].str.contains('Brazil', case=False) == True]
domestic_brazil_df = domestic_brazil_df.reset_index(drop=True)

international_brazil_df = deforestation_df[deforestation_df['Addresses'].str.contains('Brazil', case=False) == False]

# make sure that addresses at least contain some country
international_brazil_df = international_brazil_df[international_brazil_df['Addresses'].str.contains('|'.join(country_names_no_brazil), case=False) == True]
international_brazil_df = international_brazil_df.reset_index(drop=True)

  interactivity=interactivity, compiler=compiler, result=result)
  return func(self, *args, **kwargs)


In [26]:
collaboration_brazil_df = pd.concat([deforestation_df, domestic_brazil_df, international_brazil_df]).drop_duplicates(keep=False)

# make sure that addresses at least contain some country
collaboration_brazil_df = collaboration_brazil_df[collaboration_brazil_df['Addresses'].str.contains('|'.join(country_names), case=False) == True]
collaboration_brazil_df = collaboration_brazil_df.reset_index(drop=True)

In [27]:
collaboration_brazil_df['Publication Year'].min()

1948.0

In [28]:
domestic_brazil_df.to_csv('data/Brazil_research/cleaned_data/domestic_brazil_research.csv')
international_brazil_df.to_csv('data/Brazil_research/cleaned_data/international_brazil_research.csv')
collaboration_brazil_df.to_csv('data/Brazil_research/cleaned_data/collaboration_brazil_research.csv')