In [1]:
import sqlite3 as sql
from io import StringIO
from random import random
from time import sleep

import html5lib
import lxml
import pandas as pd
import requests
from bs4 import BeautifulSoup

### get country names

In [2]:
# Build the list of UN member-state names. It is used further down to keep
# only the UNCTAD country links that belong to UN members.
# (Open question from the original author: use Wikipedia as the source instead?)
URL = "https://www.un.org/about-us/member-states"
# A timeout keeps a stalled connection from hanging the notebook forever.
result = requests.get(URL, timeout=30)
# Fail loudly on an HTTP error instead of silently parsing an error page.
result.raise_for_status()
src = result.content
# Name the parser explicitly: without it bs4 guesses one and emits a warning,
# and the guess depends on what is installed. lxml is imported by this notebook.
soup = BeautifulSoup(src, "lxml")

# Every element carrying the CSS class "mb-0"; per the original note these
# are the h2 elements whose text is a country name.
countries = soup.find_all(class_="mb-0")

In [3]:
# Pull the text out of each element collected in `countries`.
ctnames = [element.text for element in countries]

### BITS

In [4]:
# Download the UNCTAD "international investment agreements by economy" page.
URL = "https://investmentpolicy.unctad.org/international-investment-agreements/by-economy"
# Make the HTTPS request; the timeout keeps a stalled connection from hanging.
result = requests.get(URL, timeout=30)
# Fail loudly on an HTTP error instead of silently parsing an error page.
result.raise_for_status()
# Raw response body (bytes), without the response metadata.
src = result.content
# Parse the body into a navigable tree. The parser is named explicitly so bs4
# does not have to guess one (which also triggers a warning).
soup = BeautifulSoup(src, "lxml")

In [5]:
# Collect every <table> element on the by-economy page.
table = soup.find_all('table')
# pandas >= 2.1 deprecates passing literal HTML to read_html, so the markup is
# wrapped in a file-like StringIO. The first parsed table is kept.
economy_df = pd.read_html(StringIO(str(table)))[0]
display(economy_df.head())

Unnamed: 0,No.,Name,* TOTAL BITs,* TOTAL TIPs
0,1,Afghanistan,4 (3 in force),5 (4 in force)
1,2,Albania,45 (40 in force),6 (6 in force)
2,3,Algeria,45 (29 in force),7 (5 in force)
3,4,Andorra,1 (1 in force),0
4,5,Angola,18 (6 in force),6 (5 in force)


In [6]:
# Each '* TOTAL BITs' cell is text: either a bare count ("0") or
# "<total> (<n> in force)". One regex extracts both numbers for the whole
# column. The "(n in force)" part is optional, so a bare non-zero count no
# longer raises IndexError as the manual split did; it yields enforced = 0.
bit_counts = (
    economy_df['* TOTAL BITs']
    .astype(str)
    .str.extract(r'^\s*(?P<total>\d+)(?:\s*\(\s*(?P<enforced>\d+))?')
)
# A missing "in force" figure means none are in force.
economy_df['enforced'] = (
    pd.to_numeric(bit_counts['enforced']).fillna(0).astype('int64')
)
# A cell with no leading number leaves NaN here and astype raises, so an
# unexpected format is reported instead of being silently stored.
economy_df['total BIT'] = pd.to_numeric(bit_counts['total']).astype('int64')


In [7]:
# Remove the raw columns that are no longer needed.
unneeded_columns = ['No.', '* TOTAL TIPs', '* TOTAL BITs']
economy_df = economy_df.drop(columns=unneeded_columns)
economy_df

Unnamed: 0,Name,enforced,total BIT
0,Afghanistan,3,4
1,Albania,40,45
2,Algeria,29,45
3,Andorra,1,1
4,Angola,6,18
...,...,...,...
229,Wallis and Futuna Islands,0,0
230,Yemen,22,37
231,Yugoslavia (former),1,1
232,Zambia,8,16


### Country Specific Data

In [8]:
# Follow the link of every UN member state on the by-economy page and keep
# each response together with the anchor it came from.
pages = []
# href=True skips anchors that have no href attribute. For those,
# link.get('href') returns None and the substring test below would raise
# TypeError.
for link in soup.find_all('a', href=True):
    mylink = link['href']
    # Country pages are identified by this path fragment, which is followed
    # by the country number and name.
    if '/international-investment-agreements/countries/' not in mylink:
        continue
    # Only request pages for names that appear in the UN member list.
    if link.text not in ctnames:
        continue
    mylink = 'https://investmentpolicy.unctad.org' + mylink
    # Pause for a random 1 to 5 seconds between requests.
    sleep(random()*4+1)
    # The timeout keeps one stalled connection from hanging the whole crawl.
    result = requests.get(mylink, timeout=30)
    # Store the response and its anchor as a tuple.
    pages.append((result, link))

Countries that have no BITs produce a "no tables found" error when their page is parsed. Possible solutions:
- Use the previous dataframe (`economy_df`) as a lookup: a total of 0 selects the skip branch of an if/else statement.
    - Gather the names of those countries into a list, then check whether `link1.text` is in that list.
- Add an if/else check for missing (NA) values.


In [9]:
# Names of the countries whose BIT total is 0. Per the original note, their
# pages cause a "no table" error, so later cells skip any link1.text found in
# this list.
has_no_bits = economy_df['total BIT'] == 0
skiplist = economy_df.loc[has_no_bits, 'Name'].tolist()

In [10]:
# Turn each downloaded country page into a DataFrame of its treaties.
countrydf = []
# Countries that were expected to have a treaty table but whose page had
# none; inspect this list after the loop to spot failed downloads.
missing_tables = []
for page, link1 in pages:
    country = link1.text
    # Do the cheap name checks first so skipped pages are never parsed:
    # keep UN members only, and skip countries known to have zero BITs.
    if country not in ctnames or country in skiplist:
        continue
    # Parser named explicitly so bs4 does not have to guess one.
    soup2 = BeautifulSoup(page.content, 'lxml')
    # Scrape the elements whose class attribute is exactly 'table ajax'.
    tble = soup2.find_all(class_='table ajax')
    if not tble:
        # read_html raises ValueError when given no table; record the
        # country and carry on instead of aborting the whole loop.
        missing_tables.append(country)
        continue
    # pandas >= 2.1 deprecates literal HTML input, hence the StringIO wrapper.
    holder = pd.read_html(StringIO(str(tble)))[0]
    # Tag every row with the country whose page it came from.
    holder['country'] = country
    countrydf.append(holder)

In [11]:
# Stack the per-country frames into one table.
countrycondf = pd.concat(countrydf)
# Keep only the rows whose Type is 'BITs'. The result must be assigned back:
# .loc returns a new frame, so without the assignment the filter is only
# displayed and the unfiltered frame is what later cells would use.
countrycondf = countrycondf.loc[countrycondf['Type'] == 'BITs']
countrycondf

Unnamed: 0,No.,Full title,Short title,Type,Status,Parties,Date of signature,Date of entry into force,Termination date,Text,country
0,1,,Afghanistan - Azerbaijan BIT (2017),BITs,Signed (not in force),Azerbaijan,01/12/2017,,,,Afghanistan
1,2,,"Afghanistan - Iran, Islamic Republic of BIT (2...",BITs,In force,"Iran, Islamic Republic of",28/05/2006,02/02/2008,,,Afghanistan
2,3,,Afghanistan - Germany BIT (2005),BITs,In force,Germany,20/04/2005,12/10/2007,,Full text: en,Afghanistan
3,4,,Afghanistan - Turkey BIT (2004),BITs,In force,Türkiye,10/07/2004,19/07/2005,,Full text: en,Afghanistan
0,1,,Albania - United Arab Emirates BIT (2015),BITs,In force,United Arab Emirates,15/10/2015,17/02/2017,,Full text: al,Albania
...,...,...,...,...,...,...,...,...,...,...,...
31,32,,Germany - Zimbabwe BIT (1995),BITs,In force,Germany,29/09/1995,14/04/2000,,Full text: en,Zimbabwe
32,33,,United Kingdom - Zimbabwe BIT (1995),BITs,Signed (not in force),United Kingdom,01/03/1995,,,Full text: en,Zimbabwe
33,34,,Portugal - Zimbabwe BIT (1994),BITs,Signed (not in force),Portugal,05/05/1994,,,Full text: pt,Zimbabwe
34,35,,Malaysia - Zimbabwe BIT (1994),BITs,Signed (not in force),Malaysia,28/04/1994,,,,Zimbabwe


## SQL database

In [12]:
# Write both data frames to the SQLite database file.
conn = sql.connect("BIT.sqlite")
try:
    # if_exists='replace' makes this cell re-runnable: the default ('fail')
    # raises ValueError once the tables exist from an earlier run.
    countrycondf.to_sql(name='countries_table', con=conn, if_exists='replace')
    economy_df.to_sql(name='Bits_table', con=conn, if_exists='replace')
finally:
    # Close the connection even if one of the writes fails.
    conn.close()

  sql.to_sql(
