# Setup

In [None]:
# Installs
# %pip install geopandas
# %pip install folium  
# %pip install plotly==5.11.0
# % pip install dash
# % pip install plotly.express

In [3]:
# Imports
import pandas as pd
import plotly.express as px
import streamlit as st


In [4]:
# Pandas options
pd.set_option('display.max_rows', 10) # Display all rows from data

# Plotly examples

In [None]:
df = px.data.gapminder().query("continent=='Oceania'")
fig = px.line(df, x="year", y="lifeExp", color='country')
fig.show()

In [None]:
fig = px.line(x=["a","b","c"], y=[1,3,2], title="sample figure")
print(fig)
fig.show()

Figure({
    'data': [{'hovertemplate': 'x=%{x}<br>y=%{y}<extra></extra>',
              'legendgroup': '',
              'line': {'color': '#636efa', 'dash': 'solid'},
              'marker': {'symbol': 'circle'},
              'mode': 'lines',
              'name': '',
              'orientation': 'v',
              'showlegend': False,
              'type': 'scatter',
              'x': array(['a', 'b', 'c'], dtype=object),
              'xaxis': 'x',
              'y': array([1, 3, 2]),
              'yaxis': 'y'}],
    'layout': {'legend': {'tracegroupgap': 0},
               'template': '...',
               'title': {'text': 'sample figure'},
               'xaxis': {'anchor': 'y', 'domain': [0.0, 1.0], 'title': {'text': 'x'}},
               'yaxis': {'anchor': 'x', 'domain': [0.0, 1.0], 'title': {'text': 'y'}}}
})


In [None]:
df = px.data.gapminder()
fig = px.scatter_geo(df, locations="iso_alpha", color="continent",
                     hover_name="country", size="pop",
                     animation_frame="year",
                     projection="natural earth")
fig.show()

# Data from world bank

## Import data

In [None]:
# DATA WORD BANK
# https://data.worldbank.org/indicator/IT.NET.USER.ZS

df_users_raw = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/internet/individuals_using_the_Internet_percentage_of_population.csv', skiprows=4) # Individuals using the Internet (% of population) = IT.NET.USER.ZS
df_metadata_country_raw = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/internet/Metadata_Country_individuals_using_the_Internet_percentage_of_population.csv')
df_metadata_indicator_raw= pd.read_csv('/content/drive/MyDrive/Colab Notebooks/internet/Metadata_Indicator_individuals_using_the_Internet_percentage_of_population.csv')

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
df_users = df_users_raw.copy() # Individuals using the Internet (% of population)
df_metadata_country = df_metadata_country_raw.copy()
df_metadata_indicator = df_metadata_indicator_raw.copy()

## Clean data

### Users

In [None]:
df_users.head(3)

Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,1960,1961,1962,1963,1964,1965,...,2013,2014,2015,2016,2017,2018,2019,2020,2021,Unnamed: 66
0,Aruba,ABW,Individuals using the Internet (% of population),IT.NET.USER.ZS,,,,,,,...,78.9,83.78,88.661227,93.542454,97.17,,,,,
1,Africa Eastern and Southern,AFE,Individuals using the Internet (% of population),IT.NET.USER.ZS,,,,,,,...,10.126945,12.237716,14.593148,16.378025,18.09806,21.653313,24.2475,26.200657,,
2,Afghanistan,AFG,Individuals using the Internet (% of population),IT.NET.USER.ZS,,,,,,,...,5.9,7.0,8.26,11.0,13.5,16.8,17.6,18.4,,


In [None]:
# Drop unnecessary columns

users_columns_drop = ['Indicator Name', 'Indicator Code', 'Unnamed: 66']
df_users.drop(columns = users_columns_drop, inplace = True)
df_users.head(3)

Unnamed: 0,Country Name,Country Code,1960,1961,1962,1963,1964,1965,1966,1967,...,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021
0,Aruba,ABW,,,,,,,,,...,74.0,78.9,83.78,88.661227,93.542454,97.17,,,,
1,Africa Eastern and Southern,AFE,,,,,,,,,...,8.53475,10.126945,12.237716,14.593148,16.378025,18.09806,21.653313,24.2475,26.200657,
2,Afghanistan,AFG,,,,,,,,,...,5.454545,5.9,7.0,8.26,11.0,13.5,16.8,17.6,18.4,


In [None]:
# Wide to long
years = ['1960', '1961', '1962', '1963', '1964',
       '1965', '1966', '1967', '1968', '1969', '1970', '1971', '1972', '1973',
       '1974', '1975', '1976', '1977', '1978', '1979', '1980', '1981', '1982',
       '1983', '1984', '1985', '1986', '1987', '1988', '1989', '1990', '1991',
       '1992', '1993', '1994', '1995', '1996', '1997', '1998', '1999', '2000',
       '2001', '2002', '2003', '2004', '2005', '2006', '2007', '2008', '2009',
       '2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017', '2018',
       '2019', '2020', '2021']



# Use melt to unpivot the DataFrame
df_users = df_users.melt(id_vars=['Country Name', 'Country Code'], value_vars=years, var_name='Year', value_name='IT.NET.USER.ZS')


In [None]:
# Sort by country and year
df_users.sort_values(by=['Country Name', 'Year'], inplace=True)
df_users



Unnamed: 0,Country Name,Country Code,Year,IT.NET.USER.ZS
2,Afghanistan,AFG,1960,
268,Afghanistan,AFG,1961,
534,Afghanistan,AFG,1962,
800,Afghanistan,AFG,1963,
1066,Afghanistan,AFG,1964,
...,...,...,...,...
15427,Zimbabwe,ZWE,2017,24.400000
15693,Zimbabwe,ZWE,2018,25.000000
15959,Zimbabwe,ZWE,2019,25.100000
16225,Zimbabwe,ZWE,2020,29.298566


In [None]:
# Save me for the future
# Example of melting wide into long

'''

# Create example DataFrame
df = pd.DataFrame({
    'Country': ['US', 'Canada', 'Mexico'],
    '2010': [1000, 2000, 3000],
    '2011': [2000, 3000, 4000],
    '2012': [3000, 4000, 5000]
})

# Use melt to unpivot the DataFrame
df = df.melt(id_vars='Country', value_vars=['2010', '2011', '2012'], var_name='Year', value_name='Sales')

print(df)

'''

"\n\n# Create example DataFrame\ndf = pd.DataFrame({\n    'Country': ['US', 'Canada', 'Mexico'],\n    '2010': [1000, 2000, 3000],\n    '2011': [2000, 3000, 4000],\n    '2012': [3000, 4000, 5000]\n})\n\n# Use melt to unpivot the DataFrame\ndf = df.melt(id_vars='Country', value_vars=['2010', '2011', '2012'], var_name='Year', value_name='Sales')\n\nprint(df)\n\n"

### metadata_country

In [None]:
df_metadata_country.head(3)

Unnamed: 0,Country Code,Region,IncomeGroup,SpecialNotes,TableName,Unnamed: 5
0,ABW,Latin America & Caribbean,High income,,Aruba,
1,AFE,,,"26 countries, stretching from the Red Sea in t...",Africa Eastern and Southern,
2,AFG,South Asia,Low income,The reporting period for national accounts dat...,Afghanistan,


In [None]:
# Drop unnecessary columns
df_metadata_country.drop(columns = ['TableName', 'Unnamed: 5'], inplace=True)


### metadata_indicator

In [None]:
df_metadata_indicator.head()

Unnamed: 0,INDICATOR_CODE,INDICATOR_NAME,SOURCE_NOTE,SOURCE_ORGANIZATION,Unnamed: 4
0,IT.NET.USER.ZS,Individuals using the Internet (% of population),Internet users are individuals who have used t...,International Telecommunication Union (ITU) Wo...,


### Merge

In [None]:
# Merge users and metadata
users_metadata = df_users.merge(df_metadata_country, how='inner', on='Country Code')
users_metadata.head(3)


Unnamed: 0,Country Name,Country Code,Year,IT.NET.USER.ZS,Region,IncomeGroup,SpecialNotes
0,Afghanistan,AFG,1960,,South Asia,Low income,The reporting period for national accounts dat...
1,Afghanistan,AFG,1961,,South Asia,Low income,The reporting period for national accounts dat...
2,Afghanistan,AFG,1962,,South Asia,Low income,The reporting period for national accounts dat...


In [None]:
# Correct dadatypes
# I cn't make this work correctly
# users_metadata['Year'] = pd.to_datetime(users_metadata['Year'])
# users_metadata['Year'] = pd.to_datetime(users_metadata['Year'].dt.strftime('%Y'))


In [None]:
# Extract year
users_metadata.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 16430 entries, 0 to 16429
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Country Name    16430 non-null  object 
 1   Country Code    16430 non-null  object 
 2   Year            16430 non-null  object 
 3   IT.NET.USER.ZS  7749 non-null   float64
 4   Region          13454 non-null  object 
 5   IncomeGroup     13392 non-null  object 
 6   SpecialNotes    7812 non-null   object 
dtypes: float64(1), object(6)
memory usage: 1.0+ MB


In [None]:
# Save to file and re-import
users_metadata.to_csv('users_metadata.csv', index=False)

In [None]:
# Usable dataframe
df_users = pd.read_csv('users_metadata.csv')

In [None]:
df_users

Unnamed: 0,Country Name,Country Code,Year,IT.NET.USER.ZS,Region,IncomeGroup,SpecialNotes
0,Afghanistan,AFG,1960,,South Asia,Low income,The reporting period for national accounts dat...
1,Afghanistan,AFG,1961,,South Asia,Low income,The reporting period for national accounts dat...
2,Afghanistan,AFG,1962,,South Asia,Low income,The reporting period for national accounts dat...
3,Afghanistan,AFG,1963,,South Asia,Low income,The reporting period for national accounts dat...
4,Afghanistan,AFG,1964,,South Asia,Low income,The reporting period for national accounts dat...
...,...,...,...,...,...,...,...
16425,Zimbabwe,ZWE,2017,24.400000,Sub-Saharan Africa,Lower middle income,National Accounts data are reported in Zimbabw...
16426,Zimbabwe,ZWE,2018,25.000000,Sub-Saharan Africa,Lower middle income,National Accounts data are reported in Zimbabw...
16427,Zimbabwe,ZWE,2019,25.100000,Sub-Saharan Africa,Lower middle income,National Accounts data are reported in Zimbabw...
16428,Zimbabwe,ZWE,2020,29.298566,Sub-Saharan Africa,Lower middle income,National Accounts data are reported in Zimbabw...


In [None]:
df_users.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16430 entries, 0 to 16429
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Country Name    16430 non-null  object 
 1   Country Code    16430 non-null  object 
 2   Year            16430 non-null  int64  
 3   IT.NET.USER.ZS  7749 non-null   float64
 4   Region          13454 non-null  object 
 5   IncomeGroup     13392 non-null  object 
 6   SpecialNotes    7812 non-null   object 
dtypes: float64(1), int64(1), object(5)
memory usage: 898.6+ KB


# Share of the population using the internet

In [None]:
# Share of the population using the internet

# https://plotly.com/python-api-reference/generated/plotly.express.line

title_ind = "Individuals using the Internet (percentage of the population)"

note = "Internet users are individuals who have used the Internet (from any location) in the last 3 months.  The Internet can be used via a computer, mobile phone, personal digital assistant, games machine, digital TV etc."

source = "International Telecommunication Union (ITU) World Telecommunication/ICT Indicators Database"

y_max = df_users['IT.NET.USER.ZS'].max() + 30


fig = px.line(df_users,
              x='Year',
              y='IT.NET.USER.ZS',
              color='Country Code',
              line_group='Region',
              hover_name ='Country Name',
              title = title_ind,
#              animation_frame = 'Region',
#              range_y = [0, y_max],
              template="simple_white"
              )
fig.show()

# What are the top 5 countries with the highest internet use (by population share)?

## Top 5 2021

In [None]:
year_max = df_users['Year'].max()
year_max

2021

In [None]:
# Top 5 countries with the highest internet use by population share
df_top_five_2021 = df_users.query(f'Year == {year_max}')

df_top_five_2021 = df_users.groupby(['Country Name', 'Country Code','Year' ])['IT.NET.USER.ZS'].sum().to_frame().reset_index()
df_top_five_2021 = df_top_five_2021.sort_values(by=['IT.NET.USER.ZS'], ascending=False)[:5]
df_top_five_2021

Unnamed: 0,Country Name,Country Code,Year,IT.NET.USER.ZS
1115,Bahrain,BHR,2021,100.0
12833,Saudi Arabia,SAU,2021,100.0
15560,United Arab Emirates,ARE,2020,100.0
15561,United Arab Emirates,ARE,2021,100.0
1113,Bahrain,BHR,2019,99.701493


In [None]:
# This thing is not showing all the countries =(
fig = px.scatter_geo(df_top_five_2021, 
                     locations = "Country Code", 
                     color = "Country Name",
                     hover_name = "Country Name",
                     size = "IT.NET.USER.ZS",
                     projection = "natural earth")
fig.show()

## Top 5 over time

In [None]:
# Top 5 countries with the highest internet use by population share

def top_5(my_df, col_year):
    years = my_df[col_year].unique()

    df_top_5 = pd.DataFrame(columns=['Country Name', 'Country Code',  'Year',  'IT.NET.USER.ZS'])

    for i in years:
        ds = my_df.query(f'Year == {i}')
        ds = ds.groupby(['Country Name', 'Country Code','Year' ])['IT.NET.USER.ZS'].sum().to_frame().reset_index()
        ds = ds.sort_values(by=['IT.NET.USER.ZS'], ascending=False)[:5]
        df_top_5 = pd.concat([df_top_5, ds])
    return df_top_5

In [None]:
df_top_5 = top_5(df_users, 'Year')
df_top_5

Unnamed: 0,Country Name,Country Code,Year,IT.NET.USER.ZS
0,Afghanistan,AFG,1960,0.000000
182,Norway,NOR,1960,0.000000
168,Mozambique,MOZ,1960,0.000000
169,Myanmar,MMR,1960,0.000000
170,Namibia,NAM,1960,0.000000
...,...,...,...,...
250,United Arab Emirates,ARE,2021,100.000000
206,Saudi Arabia,SAU,2021,100.000000
17,Bahrain,BHR,2021,100.000000
128,Kuwait,KWT,2021,99.700007


In [None]:
# This thing is not showing all the countries =(
fig = px.scatter_geo(df_top_5, 
                     locations = "Country Code", 
                     color = "Country Name",
                     hover_name = "Country Name",
                     size = "IT.NET.USER.ZS",
                     projection = "natural earth",
                     animation_frame = "Year"
                     )
fig.show()