In [1]:
import pandas as pd 

## Accessing remote data by URL request

In [4]:
import requests

In [5]:
# Download the Italian Wikipedia page about Berlin. The timeout prevents the
# cell from hanging forever when the server does not answer.
r = requests.get('https://it.wikipedia.org/wiki/Berlino', timeout=30)
print(r.status_code)
# Show only the beginning of the raw bytes: the complete page is very large.
print(r.content[:1000])

200
b'<!DOCTYPE html>\n<html class="client-nojs" lang="it" dir="ltr">\n<head>\n<meta charset="UTF-8">\n<title>Berlino - Wikipedia</title>\n<script>document.documentElement.className="client-js";RLCONF={"wgBreakFrames":false,"wgSeparatorTransformTable":[",\\t.","\xc2\xa0\\t,"],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","gennaio","febbraio","marzo","aprile","maggio","giugno","luglio","agosto","settembre","ottobre","novembre","dicembre"],"wgRequestId":"b82fb140-b45f-4007-9001-3c4b0e6b8983","wgCanonicalNamespace":"","wgCanonicalSpecialPageName":false,"wgNamespaceNumber":0,"wgPageName":"Berlino","wgTitle":"Berlino","wgCurRevisionId":136812977,"wgRevisionId":136812977,"wgArticleId":808,"wgIsArticle":true,"wgIsRedirect":false,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["Pagine che utilizzano Phonos","Errori del modulo citazione - citazioni con URL nudi","Pagine con collegamenti non funzionanti","Template Webarchive - collegament

### Do something with data

In [6]:
from bs4 import BeautifulSoup

In [7]:
# Name the parser explicitly so the parse result does not depend on which
# optional parser libraries happen to be installed.
soup = BeautifulSoup(r.content, 'html.parser')

In [8]:
# Print the parsed document re-indented, to inspect its tag structure.
print(soup.prettify())

<!DOCTYPE html>
<html class="client-nojs" dir="ltr" lang="it">
 <head>
  <meta charset="utf-8"/>
  <title>
   Berlino - Wikipedia
  </title>
  <script>
   document.documentElement.className="client-js";RLCONF={"wgBreakFrames":false,"wgSeparatorTransformTable":[",\t."," \t,"],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","gennaio","febbraio","marzo","aprile","maggio","giugno","luglio","agosto","settembre","ottobre","novembre","dicembre"],"wgRequestId":"b82fb140-b45f-4007-9001-3c4b0e6b8983","wgCanonicalNamespace":"","wgCanonicalSpecialPageName":false,"wgNamespaceNumber":0,"wgPageName":"Berlino","wgTitle":"Berlino","wgCurRevisionId":136812977,"wgRevisionId":136812977,"wgArticleId":808,"wgIsArticle":true,"wgIsRedirect":false,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["Pagine che utilizzano Phonos","Errori del modulo citazione - citazioni con URL nudi","Pagine con collegamenti non funzionanti","Template Webarchive - collegamenti

In [9]:
# List every anchor of the page: the values of its href attribute
# (as a list, [None] when the attribute is absent) followed by the link text.
for anchor in soup.find_all('a'):
    href_values = anchor.get_attribute_list('href')
    link_text = anchor.get_text()
    print(href_values, link_text)

[None] 
['#mw-head'] Vai alla navigazione
['#searchInput'] Vai alla ricerca
['/wiki/File:Crystal128-file-broken.svg'] 
['/wiki/Wikipedia:Modello_di_voce/Centro_abitato'] Wikipedia:Modello di voce/Centro abitato
['/wiki/Discussione:Berlino'] discussione
['/wiki/Progetto:Geografia'] progetto di riferimento
['/wiki/Stati_federati_della_Germania'] città-land
['/wiki/File:Coat_of_arms_of_Berlin.svg'] 
['/wiki/File:Flag_of_Berlin.svg'] 
['/wiki/File:Museumsinsel_Berlin_Juli_2021_1_(cropped).jpg'] 
['/wiki/Stato'] Stato
['/wiki/File:Flag_of_Germany.svg'] 
['/wiki/Germania'] Germania
['/wiki/Ministro_presidente'] Ministro presidente
['/wiki/Sindaci_di_Berlino'] Sindaco governatore
['/wiki/Kai_Wegner'] Kai Wegner
['/wiki/Unione_Cristiano-Democratica_di_Germania'] CDU
['#cite_note-1'] [1]
['/wiki/Coordinate_geografiche'] Coordinate
['https://tools.wmflabs.org/geohack/geohack.php?language=it&pagename=Berlino&params=52.516667_N_13.383333_E_type:adm1st_scale:5000000'] 52°31′N 13°23′E﻿ / ﻿52.516667°

## High-level API: An example with geographic data

In [10]:
import geocoder

**Example**: [Italy](https://www.geonames.org/3175395/italian-republic.html), code `3175395`

In [11]:
# Request the detail record of geonameId 3175395 and show some of its fields.
g = geocoder.geonames(
    3175395,
    key='sednolimodo',
    method='details',
)
print(g.address, g.wikipedia, g.lat, g.lng)

Italy en.wikipedia.org/wiki/Italy 42.83333 12.83333


# Collect country data and cities data and build a database
We use the [geonames web service](https://www.geonames.org/export/web-services.html)

In [12]:
import requests

**Get country data**

In [13]:
import os

# GeoNames account name sent with every API request. Set the GEONAMES_USERNAME
# environment variable to use your own ID; the literal is only a fallback.
geonames_username = os.environ.get('GEONAMES_USERNAME', 'sednolimodo')

def get_countries(timeout=30):
    """Download the country list from the GeoNames ``countryInfoJSON`` service.

    Parameters
    ----------
    timeout : float, optional
        Seconds to wait for the server before ``requests`` gives up.

    Returns
    -------
    dict
        The decoded JSON payload; the country records are stored under the
        ``'geonames'`` key.

    Raises
    ------
    Exception
        If the HTTP status is not 200, or if the payload does not contain the
        ``'geonames'`` list.
    """
    url = 'http://api.geonames.org/countryInfoJSON'
    params = {
        'username': geonames_username,
        'style': 'full'  # Include economic features
    }
    response = requests.get(url, params=params, timeout=timeout)

    if response.status_code != 200:
        raise Exception("Connection error {}".format(response.status_code))

    country_info = response.json()
    if 'geonames' not in country_info:
        # A payload without the country list is unusable for callers. GeoNames
        # appears to describe API-level problems under a 'status' key in that
        # case (TODO confirm); fall back to the whole payload otherwise.
        raise Exception("GeoNames error {}".format(country_info.get('status', country_info)))
    return country_info


In [14]:
# Fetch the country payload once; the following cells build the country table from it.
countries = get_countries()
print(countries)

{'geonames': [{'continent': 'EU', 'capital': 'Andorra la Vella', 'languages': 'ca', 'geonameId': 3041565, 'south': 42.4287475, 'isoAlpha3': 'AND', 'north': 42.655887500000006, 'fipsCode': 'AN', 'population': '77006', 'east': 1.7866939, 'isoNumeric': '020', 'areaInSqKm': '468.0', 'countryCode': 'AD', 'west': 1.4135734, 'countryName': 'Andorra', 'postalCodeFormat': 'AD###', 'continentName': 'Europe', 'currencyCode': 'EUR'}, {'continent': 'AS', 'capital': 'Abu Dhabi', 'languages': 'ar-AE,fa,en,hi,ur', 'geonameId': 290557, 'south': 22.6315119400001, 'isoAlpha3': 'ARE', 'north': 26.0693916590001, 'fipsCode': 'AE', 'population': '9630959', 'east': 56.381222289, 'isoNumeric': '784', 'areaInSqKm': '82880.0', 'countryCode': 'AE', 'west': 51.5904085340001, 'countryName': 'United Arab Emirates', 'postalCodeFormat': '', 'continentName': 'Asia', 'currencyCode': 'AED'}, {'continent': 'AS', 'capital': 'Kabul', 'languages': 'fa-AF,ps,uz-AF,tk', 'geonameId': 1149361, 'south': 29.3770645357176, 'isoAlph

In [15]:
# Build the country table. The service delivers population and area as
# strings, so both columns are converted to numeric dtypes.
C = pd.DataFrame(countries['geonames']).astype(
    {'population': 'int64', 'areaInSqKm': 'float64'}
)

In [16]:
# Preview the first rows of the country table.
C.head()

Unnamed: 0,continent,capital,languages,geonameId,south,isoAlpha3,north,fipsCode,population,east,isoNumeric,areaInSqKm,countryCode,west,countryName,postalCodeFormat,continentName,currencyCode
0,EU,Andorra la Vella,ca,3041565,42.428748,AND,42.655888,AN,77006,1.786694,20,468.0,AD,1.413573,Andorra,AD###,Europe,EUR
1,AS,Abu Dhabi,"ar-AE,fa,en,hi,ur",290557,22.631512,ARE,26.069392,AE,9630959,56.381222,784,82880.0,AE,51.590409,United Arab Emirates,,Asia,AED
2,AS,Kabul,"fa-AF,ps,uz-AF,tk",1149361,29.377065,AFG,38.490792,AF,37172386,74.889451,4,647500.0,AF,60.472083,Afghanistan,,Asia,AFN
3,,St John's,en-AG,3576396,16.997853,ATG,17.729483,AC,96286,-61.673634,28,443.0,AG,-61.906387,Antigua and Barbuda,,North America,XCD
4,,The Valley,en-AI,3573511,18.159619,AIA,18.276289,AV,13254,-62.965783,660,102.0,AI,-63.17297,Anguilla,,North America,XCD


**Get cities and city data**

In [17]:
def get_city(north, south, east, west, timeout=30):
    """Query the GeoNames ``citiesJSON`` service for cities inside a bounding box.

    Parameters
    ----------
    north, south, east, west : float
        Coordinates delimiting the bounding box, forwarded unchanged as
        query parameters.
    timeout : float, optional
        Seconds to wait for the server before ``requests`` gives up.

    Returns
    -------
    dict
        The decoded JSON payload, returned unchanged. Callers read the city
        records from its ``'geonames'`` key and must cope with that key
        being absent.

    Raises
    ------
    Exception
        If the HTTP status is not 200.
    """
    url = 'http://api.geonames.org/citiesJSON'
    params = {
        'north': north,
        'south': south,
        'east': east,
        'west': west,
        'username': geonames_username,
        'style': 'full'  # Include economic features
    }
    response = requests.get(url, params=params, timeout=timeout)

    if response.status_code != 200:
        raise Exception("Connection error {}".format(response.status_code))
    return response.json()


In [18]:
# Take the bounding box of the country in row 2 for a single test request.
coordinates = dict(C.loc[2][['north', 'south', 'east', 'west']])
# The dictionary keys match get_city's parameter names, so unpack them directly.
city = get_city(**coordinates)

In [19]:
# Turn the city records of the test request into a table and inspect the column types.
K = pd.DataFrame(city['geonames'])
K.dtypes

lng            float64
geonameId        int64
countrycode     object
name            object
fclName         object
toponymName     object
fcodeName       object
wikipedia       object
lat            float64
fcl             object
population       int64
fcode           object
dtype: object

## Create a database
- For all the cities, we add the country ID to create a foreign key

In [20]:
from tqdm.notebook import tqdm

In [21]:
# Query the cities inside the bounding box of every country and keep those
# that belong to the country itself (a bounding box also covers neighbours).
run = list(C.iterrows())
k_cities = []
skipped_countries = []  # country codes whose response carried no city list
for _, country in tqdm(run):
    cities = get_city(
        north=country.north, south=country.south,
        east=country.east, west=country.west,
    )
    if 'geonames' not in cities:
        # Best effort: remember the country and go on instead of failing silently.
        skipped_countries.append(country['countryCode'])
        continue
    for k in cities['geonames']:
        # .get() skips only a record without 'countrycode', not the rest of the country.
        if k.get('countrycode') == country['countryCode']:
            # country_id acts as the foreign key towards the country table.
            k_cities.append({**k, 'country_id': country['geonameId']})
Kc = pd.DataFrame(k_cities)
if skipped_countries:
    print('No city list returned for:', skipped_countries)

  0%|          | 0/250 [00:00<?, ?it/s]

In [22]:
# Preview the first two collected cities, including the added country_id column.
Kc.head(2)

Unnamed: 0,lng,geonameId,countrycode,name,fclName,toponymName,fcodeName,wikipedia,lat,fcl,population,fcode,country_id
0,1.521091,3041563,AD,Andorra la Vella,"city, village,...",Andorra la Vella,capital of a political entity,en.wikipedia.org/wiki/Andorra_la_Vella,42.507793,P,20430.0,PPLC,3041565
1,1.580143,3040686,AD,Encamp,"city, village,...",Encamp,seat of a first-order administrative division,en.wikipedia.org/wiki/Encamp,42.534742,P,11223.0,PPLA,3041565


In [23]:
# Number of collected cities (rows) and of columns.
Kc.shape 

(1531, 13)

## Create a SQLite database

In [24]:
from sqlalchemy import create_engine, text

In [25]:
# Engine for the SQLite file cities.db; the relative URL resolves against the working directory.
engine = create_engine("sqlite:///cities.db")

In [26]:
# Write the country table, replacing any previous version. The context
# manager closes the connection even if the write fails.
with engine.connect() as connection:
    C.to_sql(name='country', con=connection, if_exists='replace', index=False)

In [27]:
# Write the city table, replacing any previous version. The context
# manager closes the connection even if the write fails.
with engine.connect() as connection:
    Kc.to_sql(name='city', con=connection, if_exists='replace', index=False)

In [28]:
# Read the country table back to check the round trip through the database.
sql = text("SELECT * FROM country")
# The context manager closes the connection even if the query fails.
with engine.connect() as c:
    test = pd.read_sql(sql=sql, con=c)

In [29]:
# Display the table that was read back from the database.
test

Unnamed: 0,continent,capital,languages,geonameId,south,isoAlpha3,north,fipsCode,population,east,isoNumeric,areaInSqKm,countryCode,west,countryName,postalCodeFormat,continentName,currencyCode
0,EU,Andorra la Vella,ca,3041565,42.428748,AND,42.655888,AN,77006,1.786694,020,468.0,AD,1.413573,Andorra,AD###,Europe,EUR
1,AS,Abu Dhabi,"ar-AE,fa,en,hi,ur",290557,22.631512,ARE,26.069392,AE,9630959,56.381222,784,82880.0,AE,51.590409,United Arab Emirates,,Asia,AED
2,AS,Kabul,"fa-AF,ps,uz-AF,tk",1149361,29.377065,AFG,38.490792,AF,37172386,74.889451,004,647500.0,AF,60.472083,Afghanistan,,Asia,AFN
3,,St John's,en-AG,3576396,16.997853,ATG,17.729483,AC,96286,-61.673634,028,443.0,AG,-61.906387,Antigua and Barbuda,,North America,XCD
4,,The Valley,en-AI,3573511,18.159619,AIA,18.276289,AV,13254,-62.965783,660,102.0,AI,-63.172970,Anguilla,,North America,XCD
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
245,AS,Sanaa,ar-YE,69543,12.108116,YEM,18.999633,YM,28498687,54.534397,887,527970.0,YE,42.534005,Yemen,,Asia,YER
246,AF,Mamoudzou,fr-YT,1024031,-13.000090,MYT,-12.650726,MF,279471,45.299985,175,374.0,YT,45.039814,Mayotte,#####,Africa,EUR
247,AF,Pretoria,"zu,xh,af,nso,en-ZA,tn,st,ts,ss,ve,nr",953987,-34.834170,ZAF,-22.125030,SF,57779622,32.944985,710,1219912.0,ZA,16.451890,South Africa,####,Africa,ZAR
248,AF,Lusaka,"en-ZM,bem,loz,lun,lue,ny,toi",895949,-18.077418,ZMB,-8.203284,ZA,17351822,33.709030,894,752614.0,ZM,21.999351,Zambia,#####,Africa,ZMW
