### Get all the cities that have an airport and create airport_city_dict

In [1]:
import pandas as pd

# Replace 'your_file.csv' with the path to your CSV file
file_path = 'airports.csv'
data = pd.read_csv(file_path)

# Display the first few rows of the data
cities = set(data['City'])
print(len(cities))

# Create dictionary with IATA as key and City as value
airport_city_dict = data.set_index('IATA')['City'].to_dict()

# Display the dictionary
print(len(airport_city_dict.keys()))

6956
6073


In [2]:
filter = data[data['City'] == "Petersburg"]
print(filter)

                                    Name        City        Country IATA  \
3919  Petersburg James A Johnson Airport  Petersburg  United States  PSG   
6470            Dinwiddie County Airport  Petersburg  United States  PTB   

      ICAO   Latitude   Longitude  Altitude Timezone DST         Timezone.1  
3919  PAPG  56.801701 -132.945007       111       -9   A  America/Anchorage  
6470  KPTB  37.183800  -77.507401       193       -5   A   America/New_York  


### Get all the connections, and create a graph

In [3]:
# Replace 'your_file.csv' with the path to your CSV file
file_path = 'routes.csv'
routes = pd.read_csv(file_path)

# Filter rows where stops == 0
direct_flights = routes[routes['Stops'] == 0]

# just making sure that we are extracting the right data
source_airports_set = set(direct_flights['Source Airport'])
dest_airports_set = set(direct_flights['Destination Airport'])
airports_set = set(airport_city_dict.keys())
combined_airports = dest_airports_set.intersection(airports_set)
print(len(combined_airports))
print(combined_airports)

3255
{'BWT', 'BUN', 'NRA', 'SJL', 'STI', 'MCT', 'RPR', 'NVK', 'IAN', 'ADD', 'LBB', 'MBA', 'TRR', 'ANF', 'TAY', 'SAT', 'NKC', 'LPS', 'XCH', 'GBT', 'TFS', 'FNC', 'WNR', 'PSE', 'HYD', 'JOI', 'DIU', 'IAD', 'BFD', 'GSO', 'SFL', 'DOH', 'CEN', 'KLO', 'BSR', 'CPV', 'PNP', 'JOL', 'SUB', 'AXR', 'LPM', 'ALS', 'BIM', 'HAA', 'PGU', 'AKN', 'DEN', 'ANG', 'EWR', 'GFF', 'KGT', 'GDV', 'YIO', 'BEB', 'NYK', 'GRQ', 'SMF', 'AAR', 'CMX', 'OAG', 'GYS', 'ILD', 'GHT', 'MIA', 'RMT', 'PLQ', 'VLS', 'IQT', 'TTT', 'IFJ', 'RMI', 'EMA', 'AGU', 'SGC', 'KLR', 'KRN', 'EJH', 'SLM', 'AAX', 'SJP', 'CLV', 'LHR', 'HRK', 'HOE', 'CAG', 'AMM', 'NWI', 'TSN', 'BEG', 'GIL', 'SFJ', 'YCY', 'DAV', 'HKK', 'LCK', 'GRP', 'DYG', 'WRO', 'SAW', 'MQH', 'FAV', 'ATL', 'RLK', 'CGO', 'ROI', 'LZC', 'CYX', 'BDJ', 'MID', 'CRM', 'MOF', 'ERL', 'MXL', 'KME', 'YTZ', 'MQT', 'YUS', 'EGX', 'MRU', 'KUT', 'LRH', 'PEW', 'MGZ', 'KRL', 'GLA', 'TMJ', 'XTG', 'GCM', 'AAY', 'TNA', 'TPS', 'VPS', 'GRK', 'PSA', 'DVO', 'RAI', 'PLS', 'LMM', 'NAJ', 'LSC', 'MCK', 'AAN', 

In [4]:
# create a graph with attributes such as airline id and airline
import networkx as nx

G = nx.DiGraph()

for index, row in direct_flights.iterrows():
   # get airport code
   source_airport = row['Source Airport']
   destination_airport = row['Destination Airport']
   airline = row['Airline']
   airline_id = row['Airline ID']
   
   # Check if the airport codes exist in the dictionary
   if source_airport in airport_city_dict and destination_airport in airport_city_dict:
      # Get city names based on airport codes
      source_city = airport_city_dict[source_airport]
      destination_city = airport_city_dict[destination_airport]
      
      # Add an edge from Source City to Destination City with airline as an attribute
      G.add_edge(source_city, destination_city, Airline=airline, Airline_ID=airline_id)

### Saving and loading to pickle to make sure that is is ok 

In [5]:
# save to pickle
import pickle

# Save the graph to a file using pickle
with open('flight_network_graph.pickle', 'wb') as f:
   pickle.dump(G, f)

In [6]:
# Load the graph from the pickle file
with open('flight_network_graph.pickle', 'rb') as f:
   G_loaded = pickle.load(f)
print(len(G_loaded.edges()))

34614


### Scrape the info on cities

In [16]:
import urllib
import json

def get_wikivoyage_page(name):

   baseurl = "https://en.wikipedia.org/w/api.php?"
   action = "action=query"
   urltitle = f"titles={name}"
   content = "prop=revisions&rvprop=content&rvslots=main"
   dataformat ="format=json"

   query = f"{baseurl}{action}&{urltitle}&{content}&{dataformat}"
   wikiresponse = urllib.request.urlopen(query)
   wikidata = wikiresponse.read()
   wikitext = wikidata.decode('utf-8')
   data = json.loads(wikitext)
   return data

def get_wikitext_content(data):
   if isinstance(data, dict):
      for key, value in data.items():
         if key == '*':
               return value
         result = get_wikitext_content(value)
         if result:
               return result
   elif isinstance(data, list):
      for item in data:
         result = get_wikitext_content(item)
         if result:
               return result
   return None

In [74]:
import re

node = "Legazpi, Albay"
city = str(node).replace(" ", "_")
city = urllib.parse.quote(city)

data = get_wikivoyage_page(city)
content = get_wikitext_content(data)
# print(content)
real_name = city
if "#REDIRECT" in content.upper():
   print("bio je redirect")
   matches = re.findall(r'\[\[(.*?)\]\]', content)
   real_name = matches[0]

   real_name = str(real_name).replace(" ", "_")
   print("real name:", real_name)
   real_name = urllib.parse.quote(real_name)
data = get_wikivoyage_page(real_name)
content = get_wikitext_content(data)
   
matches = re.findall(r'\[\[(.*?)\]\](?=.*city)', content)
last_match_before_city = matches[-1] if matches else None
print("last match", last_match_before_city)

print(type(content))
print(len(content))
print(content)



last match 1936 Summer Olympics
<class 'str'>
127763
{{Short description|Capital and largest city of Albay, Philippines}}
{{Use mdy dates|date=July 2022}}
{{Infobox settlement
| name                     = {{PH wikidata|name}}
| image_skyline            = Legazpi City Montage.jpg
| image_size                = 250px
| image_caption            = Clockwise from top right: JCI Legazpi Tourism Marker, View from The Oriental Legazpi, [[Legazpi Cathedral|Cathedral of St. Gregory the Great]], Battle of Legazpi Monument, Legazpi City Hall, Zip-line at [[Ligñon Hill]], Old Legazpi Airport
| image_flag               = Flag_of_Legazpi,_Albay.png
| flag_size                = 120x80px
| image_seal               = Ph seal legazpicity.png
| seal_size                = 100x80px
| image_map                = {{PH wikidata|image_map}}
| map_caption              = {{PH wikidata|map_caption}}
| image_map1               = {{hidden begin|title=OpenStreetMap|ta1=center}}{{Infobox mapframe|frame-width=250}}{{hidd

In [None]:
import re

node = "Chalkyitsik"
city = str(node).replace(" ", "_")
city = urllib.parse.quote(city)

data = get_wikivoyage_page(city)
content = get_wikitext_content(data)
print(content)
real_name = ""
if "#REDIRECT" in content:
   matches = re.findall(r'\[\[(.*?)\]\]', content)
   real_name = matches[0]

real_name = str(real_name).replace(" ", "_")
real_name = urllib.parse.quote(real_name)

data = get_wikivoyage_page(real_name)
print(data)
content = get_wikitext_content(data)
   
print(type(content))
print(len(content))
print(content)


#redirect [[Chalkyitsik, Alaska]]
{'batchcomplete': ''}
<class 'NoneType'>


TypeError: object of type 'NoneType' has no len()

In [9]:
import urllib.parse

# URL-encode the city name
city_name = 'Chita'
encoded_city_name = urllib.parse.quote(city_name)
print(encoded_city_name)

# Use the encoded city name in the API request
data = get_wikivoyage_page(encoded_city_name) 
content = get_wikitext_content(data)
print(len(content))

Chita
1576


In [43]:
print(G.nodes())

['Sochi', 'Kazan', 'Astrakhan', 'Mineralnye Vody', 'Chelyabinsk', 'Novosibirsk', 'Moscow', 'Nizhnekamsk', 'Taganrog', 'Bugulma', 'Belgorod', 'Kaliningrad', 'Baku', 'St. Petersburg', 'Yekaterinburg', 'Nizhnevartovsk', 'Novy Urengoy', 'Bratsk', 'Irkutsk', 'Chita', 'Kirensk', 'Bodaibo', 'Ust-Kut', 'Lensk', 'Yakutsk', 'Mirnyj', 'Ayacucho', 'Lima', 'Cuzco', 'Puerto Maldonado', 'Huánuco', 'Iquitos', 'Pucallpa', 'Tarapoto', 'Abidjan', 'Bobo-dioulasso', 'Ouagadougou', 'Accra', 'Bamako', 'Dakar', 'Cotonou', 'Lome', 'Niamey', 'Bogota', 'Guayaquil', 'Quito', 'Cali', 'San Cristóbal', 'Coca', 'Brindisi', 'Zurich', 'Bordeaux', 'Bristol', 'Geneva', 'Gran Canaria', 'Larnaca', 'Marsa Alam', 'Tenerife', 'Arvidsjaur', 'Lycksele', 'Stockholm', 'Gallivare', 'Halmstad', 'Joenkoeping', 'Kramfors', 'Karlstad', 'Mariehamn', 'Ornskoldsvik', 'Pori', 'Sandefjord', 'Visby', 'Vilhelmina', 'Vaxjo', 'Hemavan', 'Kruunupyy', 'Turku', 'Oslo', 'Kodiak', 'Larsen Bay', 'Karluk', 'Basco', 'Manila', 'Butuan', 'Cotabato', 'Ca

### Populate the context of the city in the graph

In [62]:
from tqdm import tqdm  # Import tqdm

nodes_to_remove = [] 
possible_redirects = []

# Iterate through the nodes with a progress bar
for node in tqdm(G.nodes(), desc="Processing nodes", unit="node"):
    city = str(node).replace(" ", "_")
    
    try:
        # Try to fetch the Wikivoyage page and get the content
        data = get_wikivoyage_page(city)
        content = get_wikitext_content(data)
        
        if content is None:
            nodes_to_remove.append(node)
        else:
            G.nodes[node]['Content'] = content
            if len(content) < 6000:
                possible_redirects.append(node)
    except Exception as e:
        # If an error occurs, print the error message and the node
        print(f"Failed to get content for node {node}: {e}")
        continue  # Continue to the next node if there's an error


Processing nodes:   1%|          | 31/3136 [00:20<30:47,  1.68node/s]

Failed to get content for node Huánuco: 'ascii' codec can't encode character '\xe1' in position 37: ordinal not in range(128)


Processing nodes:   2%|▏         | 48/3136 [00:32<30:45,  1.67node/s]

Failed to get content for node San Cristóbal: 'ascii' codec can't encode character '\xf3' in position 44: ordinal not in range(128)


Processing nodes:   3%|▎         | 89/3136 [00:59<33:56,  1.50node/s]


KeyboardInterrupt: 