### Get all the cities that have an airport and create airport_city_dict

In [1]:
import pandas as pd

# Replace 'your_file.csv' with the path to your CSV file
file_path = 'airports.csv'
data = pd.read_csv(file_path)

# Display the first few rows of the data
cities = set(data['City'])
print(len(cities))

# Create dictionary with IATA as key and City as value
airport_city_dict = data.set_index('IATA')['City'].to_dict()

# Display the dictionary
print(len(airport_city_dict.keys()))

6956
6073


### Get all the connections, and create a graph

In [2]:
# Replace 'your_file.csv' with the path to your CSV file
file_path = 'routes.csv'
routes = pd.read_csv(file_path)

# Filter rows where stops == 0
direct_flights = routes[routes['Stops'] == 0]

# just making sure that we are extracting the right data
source_airports_set = set(direct_flights['Source Airport'])
dest_airports_set = set(direct_flights['Destination Airport'])
airports_set = set(airport_city_dict.keys())
combined_airports = dest_airports_set.intersection(airports_set)
print(len(combined_airports))
print(combined_airports)

3255
{'NAN', 'MVT', 'KGD', 'BRC', 'EYK', 'KWA', 'AOK', 'WIC', 'BNI', 'MAJ', 'RPR', 'CIH', 'CLL', 'TSR', 'DOY', 'LLB', 'AJU', 'NPL', 'ANC', 'DAL', 'OAK', 'LSE', 'STC', 'KVX', 'TDD', 'SMF', 'AAX', 'ZGU', 'ODO', 'YKL', 'PFO', 'PEI', 'YWB', 'FUE', 'TME', 'ANM', 'SDK', 'HVR', 'LSI', 'CRD', 'NGS', 'FAY', 'YCG', 'OAG', 'SVG', 'MSJ', 'ABJ', 'MDL', 'SBA', 'ERM', 'GET', 'KLV', 'LAR', 'SSR', 'TMI', 'ILG', 'KLU', 'DLZ', 'AOG', 'PDG', 'ZEM', 'HEH', 'LGK', 'FUN', 'STN', 'JIJ', 'GXF', 'GZP', 'PIE', 'MCZ', 'JNX', 'USU', 'MUB', 'YAX', 'QOW', 'LRM', 'ESB', 'INL', 'LUQ', 'CLJ', 'MMJ', 'PDT', 'YDF', 'CTL', 'HRG', 'FMO', 'YKS', 'GGW', 'TEB', 'OKA', 'HER', 'DSN', 'OND', 'HRO', 'RRS', 'ACH', 'LKO', 'HUY', 'MKM', 'BEW', 'MJZ', 'HDG', 'LYA', 'TEE', 'DNK', 'MZH', 'FNJ', 'LPB', 'PIT', 'AZI', 'BGA', 'ROR', 'DCY', 'JIQ', 'ISB', 'POG', 'SJL', 'KHI', 'GRU', 'IXM', 'OGL', 'TUP', 'HLA', 'HMI', 'PBJ', 'FSD', 'RMA', 'ALH', 'LOD', 'MIS', 'ZRH', 'SUG', 'BQS', 'EMA', 'KRT', 'BOS', 'EQS', 'PJG', 'FUO', 'MIA', 'QSF', 'SVU', 

In [3]:
# create a graph with attributes such as airline id and airline
import networkx as nx

G = nx.DiGraph()

for index, row in direct_flights.iterrows():
   # get airport code
   source_airport = row['Source Airport']
   destination_airport = row['Destination Airport']
   airline = row['Airline']
   airline_id = row['Airline ID']

   # Check if the airport codes exist in the dictionary
   if source_airport in airport_city_dict and destination_airport in airport_city_dict:
      # Get city names based on airport codes
      source_city = airport_city_dict[source_airport]
      destination_city = airport_city_dict[destination_airport]

      # Add an edge from Source City to Destination City with airline as an attribute
      G.add_edge(source_city, destination_city, Airline=airline, Airline_ID=airline_id)

### Saving and loading to pickle to make sure that is is ok

In [4]:
# save to pickle
import pickle

# Save the graph to a file using pickle
with open('flight_network_graph.pickle', 'wb') as f:
   pickle.dump(G, f)

In [5]:
# Load the graph from the pickle file
with open('flight_network_graph.pickle', 'rb') as f:
   G_loaded = pickle.load(f)
print(len(G_loaded.edges()))

34614


### Scrape the info on cities

In [6]:
print(G.nodes())

['Sochi', 'Kazan', 'Astrakhan', 'Mineralnye Vody', 'Chelyabinsk', 'Novosibirsk', 'Moscow', 'Nizhnekamsk', 'Taganrog', 'Bugulma', 'Belgorod', 'Kaliningrad', 'Baku', 'St. Petersburg', 'Yekaterinburg', 'Nizhnevartovsk', 'Novy Urengoy', 'Bratsk', 'Irkutsk', 'Chita', 'Kirensk', 'Bodaibo', 'Ust-Kut', 'Lensk', 'Yakutsk', 'Mirnyj', 'Ayacucho', 'Lima', 'Cuzco', 'Puerto Maldonado', 'Huánuco', 'Iquitos', 'Pucallpa', 'Tarapoto', 'Abidjan', 'Bobo-dioulasso', 'Ouagadougou', 'Accra', 'Bamako', 'Dakar', 'Cotonou', 'Lome', 'Niamey', 'Bogota', 'Guayaquil', 'Quito', 'Cali', 'San Cristóbal', 'Coca', 'Brindisi', 'Zurich', 'Bordeaux', 'Bristol', 'Geneva', 'Gran Canaria', 'Larnaca', 'Marsa Alam', 'Tenerife', 'Arvidsjaur', 'Lycksele', 'Stockholm', 'Gallivare', 'Halmstad', 'Joenkoeping', 'Kramfors', 'Karlstad', 'Mariehamn', 'Ornskoldsvik', 'Pori', 'Sandefjord', 'Visby', 'Vilhelmina', 'Vaxjo', 'Hemavan', 'Kruunupyy', 'Turku', 'Oslo', 'Kodiak', 'Larsen Bay', 'Karluk', 'Basco', 'Manila', 'Butuan', 'Cotabato', 'Ca

In [7]:
import urllib
import json

def get_wikivoyage_page(name):

   baseurl = "https://en.wikipedia.org/w/api.php?"
   action = "action=query"
   urltitle = f"titles={name}"
   content = "prop=revisions&rvprop=content&rvslots=main"
   dataformat ="format=json"

   query = f"{baseurl}{action}&{urltitle}&{content}&{dataformat}"

   wikiresponse = urllib.request.urlopen(query)
   wikidata = wikiresponse.read()
   wikitext = wikidata.decode('utf-8')
   data = json.loads(wikitext)
   return data

def get_wikitext_content(data):
   if isinstance(data, dict):
      for key, value in data.items():
         if key == '*':
               return value
         result = get_wikitext_content(value)
         if result:
               return result
   elif isinstance(data, list):
      for item in data:
         result = get_wikitext_content(item)
         if result:
               return result
   return None

In [8]:
data = get_wikivoyage_page("ana")
content = get_wikitext_content(data)
print(type(content))

<class 'str'>


### Populate the context of the city in the graph

In [14]:
from tqdm import tqdm  # Import tqdm
import re

nodes_to_remove = []
possible_problems = []
for_sure_problem = []

# Iterate through the nodes with a progress bar
for node in tqdm(G.nodes(), desc="Processing nodes", unit="node"):
    city = str(node).replace(" ", "_")

    try:
        # Try to fetch the Wikivoyage page and get the content
        city = urllib.parse.quote(city)
        data = get_wikivoyage_page(city)
        content = get_wikitext_content(data)

        if content is not None and "#REDIRECT" in content.upper():
          matches = re.findall(r'\[\[(.*?)\]\]', content)
          redirect_name = matches[0]

          redirect_name = str(redirect_name).replace(" ", "_")
          redirect_name = urllib.parse.quote(redirect_name)

          data = get_wikivoyage_page(redirect_name)
          content = get_wikitext_content(data)

        if content is None:
            nodes_to_remove.append(node)
        else:
            G.nodes[node]['Content'] = content
            if len(content) < 6000:
                #print("Possible problem", node)
                possible_problems.append(node)
            if len(content) < 100:
                print("For sure problem", node)
                for_sure_problem.append(node)
    except Exception as e:
        nodes_to_remove.append(node)
        # If an error occurs, print the error message and the node
        print(f"Failed to get content for node {node}: {e}")
        continue  # Continue to the next node if there's an error


Processing nodes: 100%|██████████| 3136/3136 [14:58<00:00,  3.49node/s]


In [15]:
print(len(nodes_to_remove))
print(len(possible_problems))
print(for_sure_problem)

189
695
[]


In [16]:
G.remove_nodes_from(nodes_to_remove)

In [18]:
print(len(G.nodes()))
print(len(G.edges()))

2947
32935


In [19]:
# save to pickle
import pickle

# Save the graph to a file using pickle
with open('flight_with_content_could_be_better.pickle', 'wb') as f:
   pickle.dump(G, f)

In [33]:
count_less_than_1000 = 0
may_refer_to = []
not_a_problem = []
cities = []
no_city = []
mached_city = []
for node in tqdm(possible_problems, desc="Processing nodes", unit="node"):
  content = G.nodes[node]['Content']

  # check if it refers to something else
  if "may refer to" in content.lower() or "refers to" in content.lower():
    may_refer_to.append(node)
    if "city" in content.lower():
      cities.append(node)

      # find the city it refers to by looking at the link that is right before the name city
      matches = re.findall(r'\[\[(.*?)\]\](?=.*city)', content)
      last_match_before_city = matches[-1] if matches else None
      if last_match_before_city is not None:
        city = str(node).replace(" ", "_")
        city = urllib.parse.quote(city)
        data = get_wikivoyage_page(city)
        content = get_wikitext_content(data)
        if content is not None:
          G.nodes[node]['Content'] = content
          mached_city.append(node)

    else:
      no_city.append(node)
  else:
      not_a_problem.append(node)
print(len(may_refer_to))
print(may_refer_to)

print(len(cities))
print(cities)


Processing nodes: 100%|██████████| 695/695 [00:50<00:00, 13.80node/s]

520
['Chita', 'Mirnyj', 'San Cristóbal', 'Kodiak', 'Karluk', 'Basco', 'Legazpi', 'Zamboanga', 'Palmas', 'Salvador', 'Burlington', 'Decatur', 'Darwin', 'Macon', 'Tupelo', 'Freeport', 'Orlando', 'Oranjestad', 'Khanabad', 'Philipsburg', 'Gustavia', 'Qianjiang', 'Luxi', 'Jiayuguan', 'Kashi', 'Yulin', 'Nikolai', 'Beloyarsky', 'Saidpur', 'Resolute', 'New York', 'Chihuahua', 'Santa Ana', 'Merida', 'Manzanillo', 'Calvi', 'Faro', 'Split', 'Villafranca', 'Bettles', 'Eek', 'Marshall', 'Davao', 'Virac', 'Naga', 'Charlestown', 'Baker Lake', 'Clyde River', 'Thompson', 'Broughton Island', 'Myeik', 'Westerland', 'Hay River', 'Ambler', 'Wainwright', 'Barrow', 'Aniak', 'Cordova', "St Mary's", 'Valdez', 'Holy Cross', 'Kalskag', 'Buckland', 'Deering', 'Elim', 'Koyuk', 'Nome', 'Golovin', 'White Mountain', 'Koyukuk', 'Kiana', 'Mountain Village', 'Teller', 'Kobuk', 'Selawik', 'Noatak', 'La Romana', 'Charlotte', 'Kingston', 'Rio Negro', 'Kirov', 'Dali', 'Simao', 'Yichun', 'Campbell River', 'Comox', 'Cranbrook




In [38]:
print(len(mached_city))
print(mached_city)


256
['Chita', 'San Cristóbal', 'Kodiak', 'Legazpi', 'Zamboanga', 'Palmas', 'Salvador', 'Burlington', 'Decatur', 'Darwin', 'Macon', 'Tupelo', 'Orlando', 'Khanabad', 'Qianjiang', 'Jiayuguan', 'Kashi', 'Yulin', 'Nikolai', 'Saidpur', 'New York', 'Chihuahua', 'Merida', 'Manzanillo', 'Calvi', 'Faro', 'Split', 'Marshall', 'Davao', 'Naga', 'Charlestown', 'Myeik', 'Ambler', 'Aniak', 'Cordova', 'Buckland', 'Deering', 'Elim', 'Golovin', 'Mountain Village', 'La Romana', 'Charlotte', 'Rio Negro', 'Kirov', 'Dali', 'Yichun', 'Campbell River', 'Cranbrook', 'Powell River', 'Mendoza', 'Tripoli', 'Ibaraki', 'Charleston', 'Memphis', 'Albany', 'Augusta', 'Rockland', 'Marion', 'Prince George', 'Quesnel', 'San Pedro', 'Corozal', 'Florencia', 'Leticia', 'Barinas', 'Diu', 'Newark', 'Potosi', 'Pau', 'Allentown', 'Waterloo', 'Amarillo', 'Watertown', 'Austin', 'Beaumont', 'Burbank', 'Columbus', 'Fayetteville', 'Georgetown', 'Jackson', 'Lynchburg', 'Mobile', 'Providence', 'Halifax', 'Garden City', 'Longview', 'Gra

In [40]:
# save to pickle
import pickle

# Save the graph to a file using pickle
with open('flight_with_content_pretty_good.pickle', 'wb') as f:
   pickle.dump(G, f)
