# Analyze Internet Use with Python

## Import and Inspect

In [2]:
import pandas as pd
# Read the internet-usage table from the working directory and preview
# its first five rows.
internet = pd.read_csv(filepath_or_buffer='internet.csv')
internet.head(5)

Unnamed: 0,entity,code,year,internet_users_per_100
0,Afghanistan,AFG,1990,0.0
1,Afghanistan,AFG,1991,0.0
2,Afghanistan,AFG,1992,0.0
3,Afghanistan,AFG,1993,0.0
4,Afghanistan,AFG,1994,0.0


In [3]:
# Give the usage column a shorter name, then list dtypes and non-null counts.
column_renames = {'internet_users_per_100': 'percent_online'}
internet = internet.rename(columns=column_renames)
internet.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6367 entries, 0 to 6366
Data columns (total 4 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   entity          6367 non-null   object 
 1   code            6367 non-null   object 
 2   year            6367 non-null   int64  
 3   percent_online  6367 non-null   float64
dtypes: float64(1), int64(1), object(2)
memory usage: 199.1+ KB


## Years to Reach Mainstream Use

In [4]:
def amount(row):
    """Classify a row's ``percent_online`` value into a usage tier.

    Parameters
    ----------
    row : mapping-like
        Anything indexable by the key ``'percent_online'`` (for example a
        DataFrame row passed in by ``DataFrame.apply(..., axis=1)``).

    Returns
    -------
    str
        ``'none'`` when the value equals 0, ``'few'`` when it is below 25,
        ``'some'`` when it is below 50, and ``'most'`` otherwise.
    """
    pct = row['percent_online']
    # Checks run from the lowest tier upward; the first match wins.
    if pct == 0:
        return 'none'
    if pct < 25:
        return 'few'
    if pct < 50:
        return 'some'
    return 'most'

In [7]:
# Label every row with its usage tier by running `amount` across the rows.
internet['amount'] = internet.apply(amount, axis='columns')
internet.head()

Unnamed: 0,entity,code,year,percent_online,amount
0,Afghanistan,AFG,1990,0.0,none
1,Afghanistan,AFG,1991,0.0,none
2,Afghanistan,AFG,1992,0.0,none
3,Afghanistan,AFG,1993,0.0,none
4,Afghanistan,AFG,1994,0.0,none


In [22]:
# One row per (entity, code); one column per usage tier holding the earliest
# (minimum) year in which that entity was recorded in the tier.
years = (
    internet
    .pivot_table(values='year',
                 index=['entity', 'code'],
                 columns='amount',
                 aggfunc='min')
    .reset_index()
)
years.head()

amount,entity,code,few,most,none,some
0,Afghanistan,AFG,2001.0,,1990.0,
1,Albania,ALB,1995.0,2013.0,1990.0,2009.0
2,Algeria,DZA,1994.0,2019.0,1990.0,2014.0
3,American Samoa,ASM,,,1990.0,
4,Andorra,AND,1996.0,2007.0,1990.0,2004.0


In [23]:
# Years elapsed between the first year in one tier and the first year in the
# next tier up (NaN where either tier was never recorded).
years['few2some'] = years['some'].sub(years['few'])
years['some2most'] = years['most'].sub(years['some'])

In [24]:
# Average number of years for each tier-to-tier transition, as a 2-tuple.
tuple(years[transition].mean() for transition in ('few2some', 'some2most'))

(14.065573770491802, 4.929078014184397)

In [25]:
# Keep only the rows coded 'CAT' and order them from the fastest to the
# slowest few-to-some transition.
years = years.loc[years['code'].eq('CAT')].sort_values('few2some')
years.head()

amount,entity,code,few,most,none,some,few2some,some2most
151,North America,CAT,1990.0,2001.0,,1998.0,8.0,3.0
87,High income,CAT,1990.0,2004.0,,2000.0,10.0,4.0
66,European Union,CAT,1990.0,2006.0,,2002.0,12.0,4.0
65,Europe and Central Asia,CAT,1990.0,2009.0,,2003.0,13.0,6.0
109,Latin America and Caribbean,CAT,1991.0,2015.0,1990.0,2008.0,17.0,7.0


## Growth by Decade

In [26]:
# Restrict the analysis to the year 2000 onward.
# The explicit .copy() makes the filtered result an independent DataFrame, so
# adding a column to it afterwards does not raise pandas'
# SettingWithCopyWarning (assignment to a slice of another frame).
internet = internet[internet['year'] > 1999].copy()
internet.head()

Unnamed: 0,entity,code,year,percent_online,amount
6,Afghanistan,AFG,2001,0.004723,few
7,Afghanistan,AFG,2002,0.004561,few
8,Afghanistan,AFG,2003,0.087891,few
9,Afghanistan,AFG,2004,0.105809,few
10,Afghanistan,AFG,2005,1.224148,few


In [27]:
def decader(row):
    """Build a decade label such as ``'2000s'`` from a row's ``year``.

    The label is the first three characters of ``str(row['year'])`` followed
    by ``'0s'``.

    Parameters
    ----------
    row : mapping-like
        Anything indexable by the key ``'year'``.

    Returns
    -------
    str
        The decade label.
    """
    year_text = str(row['year'])
    return year_text[:3] + '0s'

In [29]:
# Tag every row with its decade label by running `decader` across the rows.
internet['decade'] = internet.apply(decader, axis='columns')
internet.head()

Unnamed: 0,entity,code,year,percent_online,amount,decade
6,Afghanistan,AFG,2001,0.004723,few,2000s
7,Afghanistan,AFG,2002,0.004561,few,2000s
8,Afghanistan,AFG,2003,0.087891,few,2000s
9,Afghanistan,AFG,2004,0.105809,few,2000s
10,Afghanistan,AFG,2005,1.224148,few,2000s


In [30]:
def change(column):
    """Return the difference between the last and first values of ``column``.

    When ``column`` holds exactly one value, that value itself is returned.

    Parameters
    ----------
    column : pandas.Series
        Values in the order they should be compared; positional access via
        ``.iloc`` is used, so the caller controls which value is "first".

    Returns
    -------
    scalar
        ``column.iloc[-1] - column.iloc[0]``, or ``column.iloc[0]`` for a
        single-element input.
    """
    first = column.iloc[0]
    if len(column) == 1:
        return first
    return column.iloc[-1] - first

In [31]:
# Put rows in chronological order so that positional first/last lookups
# within a group correspond to the earliest and latest year.
internet = internet.sort_values('year')
internet.head()

Unnamed: 0,entity,code,year,percent_online,amount,decade
4993,Saudi Arabia,SAU,2000,2.210692,few,2000s
5935,Ukraine,UKR,2000,0.716184,few,2000s
1149,Colombia,COL,2000,2.207533,few,2000s
4943,San Marino,SMR,2000,48.799496,some,2000s
3805,Middle income,CAT,2000,1.546455,few,2000s


In [46]:
# Per entity and decade: the change in percent_online (via `change`) plus the
# first and last year observed in that decade.
aggregations = {'percent_online': change, 'year': ['min', 'max']}
decade_growth = (
    internet
    .groupby(['entity', 'decade'])
    .agg(aggregations)
    .reset_index()
)
decade_growth.head()

Unnamed: 0_level_0,entity,decade,percent_online,year,year
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,change,min,max
0,Afghanistan,2000s,3.545277,2001,2009
1,Afghanistan,2010s,13.6,2010,2019
2,Albania,2000s,41.085903,2000,2009
3,Albania,2010s,23.550392,2010,2019
4,Algeria,2000s,10.738294,2000,2009


In [47]:
# Replace the two-level column labels produced by the aggregation with flat,
# single-level names (same left-to-right order as the existing columns).
flat_labels = ['entity', 'decade', 'change', 'min', 'max']
decade_growth.columns = flat_labels
decade_growth.head()

Unnamed: 0,entity,decade,change,min,max
0,Afghanistan,2000s,3.545277,2001,2009
1,Afghanistan,2010s,13.6,2010,2019
2,Albania,2000s,41.085903,2000,2009
3,Albania,2010s,23.550392,2010,2019
4,Algeria,2000s,10.738294,2000,2009


In [48]:
# Average change per year within the decade: total change divided by the
# number of years between the first and last observation.
# A group observed in a single year has a zero-length span; mask that span to
# NaN so the rate is reported as missing rather than dividing by zero (which
# would yield inf and distort later averages and ratios).
span_years = decade_growth['max'] - decade_growth['min']
decade_growth['annual'] = decade_growth['change'] / span_years.where(span_years > 0)
decade_growth

Unnamed: 0,entity,decade,change,min,max,annual
0,Afghanistan,2000s,3.545277,2001,2009,0.443160
1,Afghanistan,2010s,13.600000,2010,2019,1.511111
2,Albania,2000s,41.085903,2000,2009,4.565100
3,Albania,2010s,23.550392,2010,2019,2.616710
4,Algeria,2000s,10.738294,2000,2009,1.193144
...,...,...,...,...,...,...
441,Yemen,2010s,14.368355,2010,2017,2.052622
442,Zambia,2000s,2.308928,2000,2009,0.256548
443,Zambia,2010s,13.799999,2010,2019,1.533333
444,Zimbabwe,2000s,3.598566,2000,2009,0.399841


In [49]:
# Reshape to one row per entity with one column per decade holding the annual
# rate. No aggfunc is passed, so pandas' default aggregation is applied to
# each entity/decade cell.
decade_growth = decade_growth.pivot_table(
    values='annual',
    index='entity',
    columns='decade',
)
decade_growth.head()

decade,2000s,2010s
entity,Unnamed: 1_level_1,Unnamed: 2_level_1
Afghanistan,0.44316,1.511111
Albania,4.5651,2.61671
Algeria,1.193144,5.044445
Andorra,7.554574,1.509638
Angola,0.243884,3.288889


In [54]:
# How many times faster (or slower) annual growth was in the 2010s than in the
# 2000s; values above 1 mean growth accelerated.
# NOTE(review): the saved output for this cell also lists a 'ration' column
# that this code does not create -- presumably left over from an earlier run;
# confirm it disappears after Restart & Run All.
decade_growth['ratio'] = decade_growth['2010s'].div(decade_growth['2000s'])
decade_growth.head()

decade,2000s,2010s,ration,ratio
entity,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Afghanistan,0.44316,1.511111,3.409857,3.409857
Albania,4.5651,2.61671,0.573199,0.573199
Algeria,1.193144,5.044445,4.22786,4.22786
Andorra,7.554574,1.509638,0.199831,0.199831
Angola,0.243884,3.288889,13.485475,13.485475


In [55]:
# Summary statistics (count, mean, spread, quartiles) of the 2010s/2000s ratio.
decade_growth.loc[:, 'ratio'].describe()

count    220.000000
mean       5.119733
std       14.289239
min        0.000000
25%        0.718154
50%        1.579371
75%        4.226106
max      140.184335
Name: ratio, dtype: float64