In [21]:
import matplotlib.pyplot as plt

In [1]:
import os

import pandas as pd

# Accessing remote data via URL requests

In [None]:
import requests

In [None]:
# Fetch the Italian Wikipedia page for Berlin. The timeout bounds how long the
# cell can block: requests applies no timeout unless one is given.
r = requests.get('https://it.wikipedia.org/wiki/Berlino', timeout=30)
print(r.status_code)
# Show only the beginning of the raw bytes; the complete body stays in r.content.
print(r.content[:500])

### Do something with the data

In [None]:
from bs4 import BeautifulSoup

In [None]:
# Name the parser explicitly: otherwise BeautifulSoup picks whichever parser
# happens to be installed, which triggers a warning and can give different
# trees on different machines.
soup = BeautifulSoup(r.content, 'html.parser')

In [None]:
# Print the parsed document as re-indented text (long for a full web page).
print(soup.prettify())

In [None]:
# List every <a> tag: its href value(s) followed by its text content.
for anchor in soup.find_all('a'):
    hrefs = anchor.get_attribute_list('href')
    label = anchor.get_text()
    print(hrefs, label)

## High-level API: An example with geographic data

In [None]:
import geocoder

**Example**: [Italy](https://www.geonames.org/3175395/italian-republic.html), code `3175395`

In [None]:
# GeoNames identifier of Italy, as given in the example above.
italy_id = 3175395
g = geocoder.geonames(italy_id, method='details', key='sednolimodo')
# Address, Wikipedia reference, latitude and longitude exposed by the result.
print(g.address, g.wikipedia, g.lat, g.lng)

# Collect country and city data and build a database
We use the [GeoNames web service](https://www.geonames.org/export/web-services.html).

In [2]:
import requests

In [4]:
url = 'http://api.geonames.org/countryInfoJSON'
# Take the GeoNames account from the environment when it is set, so a personal
# username does not have to be written into the notebook.
geonames_username = os.environ.get('GEONAMES_USERNAME', 'sednolimodo')
params = {
    'username': geonames_username,
    'style': 'full',  # most verbose record style
}
# Bound the wait: without a timeout the call can block on a silent server.
response = requests.get(url, params=params, timeout=30)

In [11]:
# Peek at the first country record only.
for c in response.json()['geonames'][:1]:
    print(c)

{'continent': 'EU', 'capital': 'Andorra la Vella', 'languages': 'ca', 'geonameId': 3041565, 'south': 42.4287475, 'isoAlpha3': 'AND', 'north': 42.655887500000006, 'fipsCode': 'AN', 'population': '77006', 'east': 1.7866939, 'isoNumeric': '020', 'areaInSqKm': '468.0', 'countryCode': 'AD', 'west': 1.4135734, 'countryName': 'Andorra', 'postalCodeFormat': 'AD###', 'continentName': 'Europe', 'currencyCode': 'EUR'}


In [12]:
# Tabular view of the same payload: one row per record, one column per JSON field.
pd.DataFrame(response.json()['geonames'])

Unnamed: 0,continent,capital,languages,geonameId,south,isoAlpha3,north,fipsCode,population,east,isoNumeric,areaInSqKm,countryCode,west,countryName,postalCodeFormat,continentName,currencyCode
0,EU,Andorra la Vella,ca,3041565,42.428748,AND,42.655888,AN,77006,1.786694,020,468.0,AD,1.413573,Andorra,AD###,Europe,EUR
1,AS,Abu Dhabi,"ar-AE,fa,en,hi,ur",290557,22.631512,ARE,26.069392,AE,9630959,56.381222,784,82880.0,AE,51.590409,United Arab Emirates,,Asia,AED
2,AS,Kabul,"fa-AF,ps,uz-AF,tk",1149361,29.377065,AFG,38.490792,AF,37172386,74.889451,004,647500.0,AF,60.472083,Afghanistan,,Asia,AFN
3,,St John's,en-AG,3576396,16.997853,ATG,17.729483,AC,96286,-61.673634,028,443.0,AG,-61.906387,Antigua and Barbuda,,North America,XCD
4,,The Valley,en-AI,3573511,18.159619,AIA,18.276289,AV,13254,-62.965783,660,102.0,AI,-63.172970,Anguilla,,North America,XCD
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
245,AS,Sanaa,ar-YE,69543,12.108116,YEM,18.999633,YM,28498687,54.534397,887,527970.0,YE,42.534005,Yemen,,Asia,YER
246,AF,Mamoudzou,fr-YT,1024031,-13.000090,MYT,-12.650726,MF,279471,45.299985,175,374.0,YT,45.039814,Mayotte,#####,Africa,EUR
247,AF,Pretoria,"zu,xh,af,nso,en-ZA,tn,st,ts,ss,ve,nr",953987,-34.834170,ZAF,-22.125030,SF,57779622,32.944985,710,1219912.0,ZA,16.451890,South Africa,####,Africa,ZAR
248,AF,Lusaka,"en-ZM,bem,loz,lun,lue,ny,toi",895949,-18.077418,ZMB,-8.203284,ZA,17351822,33.709030,894,752614.0,ZM,21.999351,Zambia,#####,Africa,ZMW


**Get country data**

In [13]:
# GeoNames account name: set the GEONAMES_USERNAME environment variable to use
# your own ID; the value below is only the fallback.
geonames_username = os.environ.get('GEONAMES_USERNAME', 'sednolimodo')

def get_countries(timeout=30):
    """Download the country list from the GeoNames ``countryInfoJSON`` endpoint.

    The request is made with the module-level ``geonames_username`` and the
    ``full`` record style.

    Args:
        timeout: Seconds to wait for the server, passed to ``requests.get``.
            Without a timeout the call could block indefinitely.

    Returns:
        The decoded JSON payload; the notebook reads the country records
        from its ``'geonames'`` key.

    Raises:
        Exception: If the HTTP status is not 200, or if the JSON body is an
            error report (contains a ``'status'`` entry) instead of data.
    """
    url = 'http://api.geonames.org/countryInfoJSON'
    params = {
        'username': geonames_username,
        'style': 'full',
    }
    response = requests.get(url, params=params, timeout=timeout)

    if response.status_code != 200:
        raise Exception("Connection error {}".format(response.status_code))

    country_info = response.json()
    # Surface an API-level error report here, with its message, instead of
    # letting callers fail later with a bare KeyError on 'geonames'.
    if isinstance(country_info, dict) and 'status' in country_info:
        status = country_info['status']
        message = status.get('message', status) if isinstance(status, dict) else status
        raise Exception("GeoNames error: {}".format(message))
    return country_info


In [14]:
# Download the payload once and keep it in `countries` for the cells below.
countries = get_countries()
print(countries)

{'geonames': [{'continent': 'EU', 'capital': 'Andorra la Vella', 'languages': 'ca', 'geonameId': 3041565, 'south': 42.4287475, 'isoAlpha3': 'AND', 'north': 42.655887500000006, 'fipsCode': 'AN', 'population': '77006', 'east': 1.7866939, 'isoNumeric': '020', 'areaInSqKm': '468.0', 'countryCode': 'AD', 'west': 1.4135734, 'countryName': 'Andorra', 'postalCodeFormat': 'AD###', 'continentName': 'Europe', 'currencyCode': 'EUR'}, {'continent': 'AS', 'capital': 'Abu Dhabi', 'languages': 'ar-AE,fa,en,hi,ur', 'geonameId': 290557, 'south': 22.6315119400001, 'isoAlpha3': 'ARE', 'north': 26.0693916590001, 'fipsCode': 'AE', 'population': '9630959', 'east': 56.381222289, 'isoNumeric': '784', 'areaInSqKm': '82880.0', 'countryCode': 'AE', 'west': 51.5904085340001, 'countryName': 'United Arab Emirates', 'postalCodeFormat': '', 'continentName': 'Asia', 'currencyCode': 'AED'}, {'continent': 'AS', 'capital': 'Kabul', 'languages': 'fa-AF,ps,uz-AF,tk', 'geonameId': 1149361, 'south': 29.3770645357176, 'isoAlph

In [15]:
# The payload carries population and area as strings; cast both columns in a
# single vectorised step instead of rebuilding them with Python-level loops.
C = pd.DataFrame(countries['geonames']).astype(
    {'population': 'int64', 'areaInSqKm': 'float64'}
)

In [16]:
# First rows of the country table after the numeric conversion.
C.head()

Unnamed: 0,continent,capital,languages,geonameId,south,isoAlpha3,north,fipsCode,population,east,isoNumeric,areaInSqKm,countryCode,west,countryName,postalCodeFormat,continentName,currencyCode
0,EU,Andorra la Vella,ca,3041565,42.428748,AND,42.655888,AN,77006,1.786694,20,468.0,AD,1.413573,Andorra,AD###,Europe,EUR
1,AS,Abu Dhabi,"ar-AE,fa,en,hi,ur",290557,22.631512,ARE,26.069392,AE,9630959,56.381222,784,82880.0,AE,51.590409,United Arab Emirates,,Asia,AED
2,AS,Kabul,"fa-AF,ps,uz-AF,tk",1149361,29.377065,AFG,38.490792,AF,37172386,74.889451,4,647500.0,AF,60.472083,Afghanistan,,Asia,AFN
3,,St John's,en-AG,3576396,16.997853,ATG,17.729483,AC,96286,-61.673634,28,443.0,AG,-61.906387,Antigua and Barbuda,,North America,XCD
4,,The Valley,en-AI,3573511,18.159619,AIA,18.276289,AV,13254,-62.965783,660,102.0,AI,-63.17297,Anguilla,,North America,XCD


**Get cities and city data**

In [17]:
def get_city(north, south, east, west, timeout=30):
    """Query the GeoNames ``citiesJSON`` endpoint for a bounding box.

    The request is made with the module-level ``geonames_username`` and the
    ``full`` record style.

    Args:
        north: Northern edge of the bounding box.
        south: Southern edge of the bounding box.
        east: Eastern edge of the bounding box.
        west: Western edge of the bounding box.
        timeout: Seconds to wait for the server, passed to ``requests.get``.
            Without a timeout the call could block indefinitely.

    Returns:
        The decoded JSON payload, returned unchanged. The notebook reads the
        city records from its ``'geonames'`` key; a payload without that key
        is passed through so the caller can decide how to handle it.

    Raises:
        Exception: If the HTTP status is not 200.
    """
    url = 'http://api.geonames.org/citiesJSON'
    params = {
        'north': north,
        'south': south,
        'east': east,
        'west': west,
        'username': geonames_username,
        'style': 'full',
    }
    response = requests.get(url, params=params, timeout=timeout)

    if response.status_code != 200:
        raise Exception("Connection error {}".format(response.status_code))
    return response.json()


In [18]:
# First two country rows, including the north/south/east/west bounding-box columns.
C.head(2)

Unnamed: 0,continent,capital,languages,geonameId,south,isoAlpha3,north,fipsCode,population,east,isoNumeric,areaInSqKm,countryCode,west,countryName,postalCodeFormat,continentName,currencyCode
0,EU,Andorra la Vella,ca,3041565,42.428748,AND,42.655888,AN,77006,1.786694,20,468.0,AD,1.413573,Andorra,AD###,Europe,EUR
1,AS,Abu Dhabi,"ar-AE,fa,en,hi,ur",290557,22.631512,ARE,26.069392,AE,9630959,56.381222,784,82880.0,AE,51.590409,United Arab Emirates,,Asia,AED


In [20]:
# Bounding-box values of the country stored under index label 2.
C.loc[2][['north', 'south', 'east', 'west']]

north    38.490792
south    29.377065
east     74.889451
west     60.472083
Name: 2, dtype: object

In [23]:
# Query the cities inside the bounding box of the country at index label 2.
# The dict holds exactly the four keyword arguments get_city expects.
coordinates = dict(C.loc[2][['north', 'south', 'east', 'west']])
city = get_city(**coordinates)

In [24]:
# Turn the returned city records into a DataFrame and inspect the column types.
K = pd.DataFrame(city['geonames'])
K.dtypes

lng            float64
geonameId        int64
countrycode     object
name            object
fclName         object
toponymName     object
fcodeName       object
wikipedia       object
lat            float64
fcl             object
population       int64
fcode           object
dtype: object

In [25]:
# Cities returned for the bounding box queried above.
K

Unnamed: 0,lng,geonameId,countrycode,name,fclName,toponymName,fcodeName,wikipedia,lat,fcl,population,fcode
0,69.172325,1138958,AF,Kabul,"city, village,...",Kabul,capital of a political entity,en.wikipedia.org/wiki/Kabul,34.528126,P,4434550,PPLC
1,73.043289,1176615,PK,Islamabad,"city, village,...",Islamabad,capital of a political entity,en.wikipedia.org/wiki/Islamabad,33.721484,P,601600,PPLC
2,74.350713,1172451,PK,Lahore,"city, village,...",Lahore,seat of a first-order administrative division,en.wikipedia.org/wiki/Lahore,31.557996,P,6310888,PPLA
3,73.089693,1179400,PK,Faisalabad,"city, village,...",Faisalabad,seat of a second-order administrative division,en.wikipedia.org/wiki/Faisalabad,31.415536,P,2506595,PPLA2
4,71.478241,1169825,PK,Multan,"city, village,...",Multan,seat of a second-order administrative division,en.wikipedia.org/wiki/Multan,30.196789,P,1437230,PPLA2
5,74.187052,1177662,PK,Gujranwala,"city, village,...",Gujranwala,seat of a second-order administrative division,en.wikipedia.org/wiki/Gujranwala,32.155667,P,1384471,PPLA2
6,71.578488,1168197,PK,Peshawar,"city, village,...",Peshawar,seat of a first-order administrative division,en.wikipedia.org/wiki/Peshawar,34.008,P,1218773,PPLA
7,67.00141,1167528,PK,Quetta,"city, village,...",Quetta,seat of a first-order administrative division,en.wikipedia.org/wiki/Quetta,30.184138,P,733675,PPLA
8,73.47082,1169607,PK,Muzaffarabad,"city, village,...",Muzaffarābād,seat of a first-order administrative division,en.wikipedia.org/wiki/Muzaffarabad,34.37002,P,725000,PPLA
9,73.023291,1183105,PK,Battagram,"city, village,...",Battagram,seat of a second-order administrative division,en.wikipedia.org/wiki/Battagram,34.677194,P,700000,PPLA2


## Create a database
- For every city, we add the ID of its country (`country_id`) to serve as a foreign key

In [None]:
from tqdm.notebook import tqdm

In [26]:
# Show what iterrows() yields for one row: the index label, then the row as a Series.
for i, c in C.head(1).iterrows():
    print(i)
    print(c)

0
continent                         EU
capital             Andorra la Vella
languages                         ca
geonameId                    3041565
south                      42.428748
isoAlpha3                        AND
north                      42.655888
fipsCode                          AN
population                     77006
east                        1.786694
isoNumeric                       020
areaInSqKm                     468.0
countryCode                       AD
west                        1.413573
countryName                  Andorra
postalCodeFormat               AD###
continentName                 Europe
currencyCode                     EUR
Name: 0, dtype: object


In [None]:
# Collect the cities of every country, tagging each one with the country's geonameId.
run = list(C.iterrows())
k_cities = []
skipped_countries = []  # country codes whose response had no 'geonames' list
for i, c in tqdm(run):
    cities = get_city(north=c.north, south=c.south, east=c.east, west=c.west)
    if 'geonames' not in cities:
        # Keep going, but remember the gap instead of hiding it silently.
        skipped_countries.append(c['countryCode'])
        continue
    for k in cities['geonames']:
        # A bounding box can also return cities of neighbouring countries;
        # keep only the ones whose country code matches. A record without a
        # 'countrycode' is skipped without discarding the rest of the list.
        if k.get('countrycode') == c['countryCode']:
            k_cities.append({**k, 'country_id': c['geonameId']})
Kc = pd.DataFrame(k_cities)
if skipped_countries:
    print("No city list returned for {} countries: {}".format(
        len(skipped_countries), skipped_countries))

In [None]:
# First two collected cities, including the added country_id column.
Kc.head(2)

In [None]:
# (number of collected cities, number of columns)
Kc.shape 

## Create a SQLite database

In [None]:
from sqlalchemy import create_engine, text

In [None]:
# SQLAlchemy engine for a SQLite database kept in the file cities.db.
engine = create_engine("sqlite:///cities.db")

In [None]:
# Write the country table, replacing any previous version. The context
# manager closes the connection even if to_sql raises.
with engine.connect() as connection:
    C.to_sql(name='country', con=connection, if_exists='replace', index=False)

In [None]:
# Write the city table, replacing any previous version. The context
# manager closes the connection even if to_sql raises.
with engine.connect() as connection:
    Kc.to_sql(name='city', con=connection, if_exists='replace', index=False)

In [None]:
# Read the country table back to check the write. The connection is closed
# on leaving the with-block, also when read_sql raises.
sql = text("SELECT * FROM country")
with engine.connect() as connection:
    test = pd.read_sql(sql=sql, con=connection)

In [None]:
# Country rows as read back from the database.
test