## African universities
This notebook is to obtain a comprehensive list of African universities form two different sources of data:
    _local csv file containing university information, which includes details such as university name, country, longitude and latitude.
    _universities library, which provides programmatic access to a vast collection of university information.
By combining data from these sources, the notebook aims to create a more complete and accurate list of African universities.

The resulting dataset can be used for various purposes, such as research, analysis, or generating insights into higher education across the African continent.

## local parquet file

In [1]:
import pandas as pd 
import warnings

# Suppress FutureWarning and UserWarning messages
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=UserWarning)
 
pd.set_option('display.max_colwidth', None)

program_df = df = pd.read_parquet('parquet_data_universities.parquet')
program_df.head(2)


Unnamed: 0,name,country,longitude,latitude
0,University of Cape Town,South Africa,18.4719,-33.9628
1,Stellenbosch University,South Africa,18.8679,-33.9321


In [2]:
import geopandas as gpd
import matplotlib.pyplot as plt
from scripts import map_plot
import os

# Get the current directory (path) of the notebook
notebook_path = os.getcwd()

# Specify the name of the child directory or file
child_name = "afr_g2014_2013_0/afr_g2014_2013_0.shp"  # Replace with the desired child directory or file name

# Construct the path to the child directory or file
child_path = os.path.join(notebook_path, child_name)
 
 
africa_shapefile = child_path # Replace with the actual file path

map_plot(africa_shapefile,program_df)

TypeError: map_plot() missing 1 required positional argument: 'title'

## Get daat using universities library

In [None]:
import pandas as pd
import universities

uni = universities.API()

try:
    df = pd.read_parquet('data_universities.parquet')
except FileNotFoundError:
    all_data = uni.get_all()
    df = pd.DataFrame(all_data)
    df.rename(columns={0: 'data'}, inplace=True)

    df.to_parquet('data_universities.parquet')

 

df


##  get a list of countries in Africa using the "awoc" library

In [None]:

import awoc
# Initialize the AWOC class.
my_world = awoc.AWOC()
# Let's fetch the list of nations of Africa.
nations_of_Africa = my_world.get_countries_list_of('Africa')
# nations_of_Africa

### Convert country name to its corresponding continent using pycountry_convert

Convert a given country name to its corresponding continent. We uses the pycountry_convert library to perform the conversion. The function, named country_to_continent, takes a country name as input and returns the continent name as output.

In [None]:
import pandas as pd
import pycountry_convert as pc

def country_to_continent(country_name):
    """
    This function is or categorizing countries into their respective continents based on their names. 
        input: a valid  country name
        output:  th corresponding continent name for the given country
    """
    try:
        if country_name == 'Vatican City':
            return 'Europe'
        else:
            country_alpha2 = pc.country_name_to_country_alpha2(country_name)
            country_continent_code = pc.country_alpha2_to_continent_code(country_alpha2)
            country_continent_name = pc.convert_continent_code_to_continent_name(country_continent_code)
            return country_continent_name
    except KeyError:
        return 'Unknown'

# Apply the country_to_continent function to the 'Country' column and store the result in a new 'Continent' column
df['continent'] = df['Country (English)'].apply(country_to_continent)
df

## filter African Univeristies

In [None]:
df_africa = df[df['continent'].str.contains('Africa')]
# Reindex the DataFrame
df_africa = df_africa.reset_index(drop=True)

df_africa

## get location coordnates

In [None]:
from geopy.geocoders import Nominatim
from geopy.exc import GeocoderTimedOut

def add_coordinates(df, uni_address_column, coun_address_column):
    geolocator = Nominatim(user_agent="university-locator")
    df['Latitude'] = None
    df['Longitude'] = None

    def geocode_with_timeout(address):
        try:
            location = geolocator.geocode(address)
            return location
        except GeocoderTimedOut:
            return geocode_with_timeout(address)
        except Exception as e:
            # print(f"Geocoding error for address '{address}': {str(e)}")
            return None

    for index, row in df.iterrows():
        uni_address = row[uni_address_column]
        coun_address = row[coun_address_column]
        location = geocode_with_timeout(uni_address)

        if location is None:
            location = geocode_with_timeout(coun_address)

        if location:
            df.at[index, 'latitude'] = location.latitude
            df.at[index, 'longitude'] = location.longitude

    return df

try:
    df_with_coordinates = pd.read_parquet('df_with_coordinates.parquet')
except FileNotFoundError:
    df_with_coordinates = add_coordinates(df_africa, 'Name', 'Country')

    df_with_coordinates.to_parquet('df_with_coordinates.parquet')

 

df_with_coordinates = df_with_coordinates.drop("Country", axis=1)
df_with_coordinates = df_with_coordinates.rename(columns={'Country (English)': 'country','Name': 'name','Latitude': 'latitude','Longitude': 'longitude','Continent': 'continent'})
df_with_coordinates.head()

## find none values if there are any

In [None]:
def find_none_values(df):
    null_rows = df[df['latitude'].isnull() & df['longitude'].isnull()]
    return null_rows
none_values_df = find_none_values(df_with_coordinates)
none_values_df

## concatenate the two DataFrames together

In [None]:
dff_loc = pd.read_parquet('add_universities_data.parquet')
dff_loc = dff_loc.rename(columns={'Latitude': 'latitude','Longitude': 'longitude'})
 
concatenated_df = pd.concat([df_with_coordinates, program_df,dff_loc], ignore_index=True)
concatenated_df

## plot

In [None]:
import geopandas as gpd
import matplotlib.pyplot as plt
from scripts import map_plot


map_plot(africa_shapefile,concatenated_df,"Quantitative Statistical Ecology Programs in Africa")