In [1]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

The `%matplotlib inline` magic below renders figures directly in the notebook, so an explicit `plt.show()` call is not required.

In [2]:
# IPython magic: display matplotlib figures in the cell output without calling plt.show().
%matplotlib inline

Storing the system-dependent data directory in a variable so that it can be reused in the pandas `read_csv` calls below.

In [3]:
# Directory that holds the input CSV files. The location is machine-specific,
# so it can be overridden with the GDP_DATA_DIR environment variable; the
# original hard-coded location is kept as the default.
# os.path.join(..., "") guarantees a trailing separator, which matters because
# later cells build file paths by plain string concatenation
# (working_dir + "file.csv").
working_dir = os.path.join(
    os.environ.get(
        "GDP_DATA_DIR",
        "/Users/motoroff/Dropbox/DataScience/SoftwareSchool/data-question-1-amerus/data/",
    ),
    "",
)

In [4]:
# Load the GDP-per-capita table from the data directory.
gdp_csv_path = working_dir + "gdp_percapita.csv"
gdp_df = pd.read_csv(gdp_csv_path)

Using `del` instead of `gdp_df.drop(...)`: `drop` returns a new DataFrame by default, whereas `del` removes the column from the original DataFrame in place.

In [5]:
# Remove the 'Value Footnotes' column from gdp_df in place.
# The membership guard keeps this cell safe to re-run: an unguarded `del`
# raises KeyError once the column has already been removed.
if 'Value Footnotes' in gdp_df.columns:
    del gdp_df['Value Footnotes']

In [6]:
# Preview the first six rows of the GDP table.
gdp_df.head(6)

Unnamed: 0,Country or Area,Year,Value
0,Afghanistan,2016,1802.695566
1,Afghanistan,2015,1809.016488
2,Afghanistan,2014,1838.960244
3,Afghanistan,2013,1848.700026
4,Afghanistan,2012,1839.273579
5,Afghanistan,2011,1660.739856


In [7]:
# (number of rows, number of columns) of the GDP table.
gdp_df.shape

(6206, 3)

In [8]:
# Data type of each column in the GDP table.
gdp_df.dtypes

Country or Area     object
Year                 int64
Value              float64
dtype: object

In [9]:
# Preview the last ten rows of the GDP table.
gdp_df.tail(10)

Unnamed: 0,Country or Area,Year,Value
6196,Zimbabwe,1999,2699.857521
6197,Zimbabwe,1998,2761.318537
6198,Zimbabwe,1997,2725.888701
6199,Zimbabwe,1996,2698.9173
6200,Zimbabwe,1995,2488.298028
6201,Zimbabwe,1994,2529.826671
6202,Zimbabwe,1993,2360.793284
6203,Zimbabwe,1992,2384.972026
6204,Zimbabwe,1991,2681.495089
6205,Zimbabwe,1990,2605.794944


Renaming columns to be more descriptive, and doing so in place so that the original DataFrame is updated.

In [10]:
# Old column label -> new, more descriptive label.
gdp_column_names = {
    "Country or Area": "Country",
    "Value": "GDP_Per_Capita",
}
# inplace=True modifies gdp_df itself rather than returning a renamed copy.
gdp_df.rename(columns=gdp_column_names, inplace=True)

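Creating a new index from country and year, so that each row of the two DataFrames can be identified by the same Country+Year key. Year is converted to a string so that it can be concatenated with the country name.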

In [11]:
# Row key: the country name followed by the year rendered as text
# (e.g. "Afghanistan2016").
gdp_year_text = gdp_df["Year"].astype(str)
gdp_new_index = gdp_df["Country"] + gdp_year_text

Checking if the new index is unique

In [12]:
# True when no Country+Year key occurs more than once.
gdp_new_index.is_unique

True

Copying the original DataFrame, setting new index to Country+Year

In [13]:
# set_index returns a new DataFrame labelled by the Country+Year keys,
# so gdp_df itself keeps its default integer index.
gdp_newindex_df = gdp_df.set_index(gdp_new_index)
gdp_newindex_df.head(10)

Unnamed: 0,Country,Year,GDP_Per_Capita
Afghanistan2016,Afghanistan,2016,1802.695566
Afghanistan2015,Afghanistan,2015,1809.016488
Afghanistan2014,Afghanistan,2014,1838.960244
Afghanistan2013,Afghanistan,2013,1848.700026
Afghanistan2012,Afghanistan,2012,1839.273579
Afghanistan2011,Afghanistan,2011,1660.739856
Afghanistan2010,Afghanistan,2010,1614.255001
Afghanistan2009,Afghanistan,2009,1531.173993
Afghanistan2008,Afghanistan,2008,1298.143159
Afghanistan2007,Afghanistan,2007,1284.775213


In [14]:
# Load the internet-usage table from the data directory.
internet_csv_path = working_dir + "internet_use.csv"
internet_df = pd.read_csv(internet_csv_path)

In [15]:
# Remove the 'Value Footnotes' column from internet_df in place.
# The membership guard keeps this cell safe to re-run: an unguarded `del`
# raises KeyError once the column has already been removed.
if 'Value Footnotes' in internet_df.columns:
    del internet_df['Value Footnotes']

In [16]:
# Preview the first six rows of the internet-usage table.
internet_df.head(6)

Unnamed: 0,Country or Area,Year,Value
0,Afghanistan,2014,6.39
1,Afghanistan,2013,5.9
2,Afghanistan,2012,5.454545
3,Afghanistan,2011,5.0
4,Afghanistan,2010,4.0
5,Afghanistan,2009,3.55


In [17]:
# (number of rows, number of columns) of the internet-usage table.
internet_df.shape

(4495, 3)

In [18]:
# Data type of each column in the internet-usage table.
internet_df.dtypes

Country or Area     object
Year                 int64
Value              float64
dtype: object

In [19]:
# Preview the last ten rows of the internet-usage table.
internet_df.tail(10)

Unnamed: 0,Country or Area,Year,Value
4485,Zimbabwe,2002,3.994356
4486,Zimbabwe,2001,0.799846
4487,Zimbabwe,2000,0.401434
4488,Zimbabwe,1999,0.161676
4489,Zimbabwe,1998,0.081648
4490,Zimbabwe,1997,0.03308
4491,Zimbabwe,1996,0.01679
4492,Zimbabwe,1995,0.007684
4493,Zimbabwe,1994,0.001739
4494,Zimbabwe,1990,0.0


Setting `inplace=True` so that the original DataFrame is updated, instead of a renamed copy being returned.

In [20]:
# Old column label -> new, more descriptive label.
internet_column_names = {
    "Country or Area": "Country",
    "Value": "Internet_Users_Pct",
}
# inplace=True modifies internet_df itself rather than returning a renamed copy.
internet_df.rename(columns=internet_column_names, inplace=True)

In [21]:
# Row key: the country name followed by the year rendered as text.
internet_year_text = internet_df["Year"].astype(str)
internet_newindex = internet_df["Country"] + internet_year_text
# True when no Country+Year key occurs more than once.
internet_newindex.is_unique

True

In [22]:
# set_index returns a new DataFrame labelled by the Country+Year keys,
# so internet_df itself keeps its default integer index.
internet_newindex_df = internet_df.set_index(internet_newindex)
internet_newindex_df.head(10)

Unnamed: 0,Country,Year,Internet_Users_Pct
Afghanistan2014,Afghanistan,2014,6.39
Afghanistan2013,Afghanistan,2013,5.9
Afghanistan2012,Afghanistan,2012,5.454545
Afghanistan2011,Afghanistan,2011,5.0
Afghanistan2010,Afghanistan,2010,4.0
Afghanistan2009,Afghanistan,2009,3.55
Afghanistan2008,Afghanistan,2008,1.84
Afghanistan2007,Afghanistan,2007,1.9
Afghanistan2006,Afghanistan,2006,2.107124
Afghanistan2005,Afghanistan,2005,1.224148


Performing an outer merge to keep rows from both tables; values missing from either table are reported as NaN. The join keys are the Country and Year columns that the two DataFrames share.

In [27]:
# Outer join: keep every Country/Year row found in either table; a value that
# is missing from one side is filled with NaN.
# The join keys are named explicitly. Without `on=`, pd.merge joins on whatever
# columns the two frames happen to share, and a column-based merge neither uses
# nor keeps the Country+Year index labels set on the inputs.
# validate="one_to_one" makes pandas raise MergeError if a Country/Year pair is
# duplicated in either table, instead of silently multiplying rows.
gdp_and_internet_use = pd.merge(
    gdp_newindex_df,
    internet_newindex_df,
    how="outer",
    on=["Country", "Year"],
    validate="one_to_one",
)

In [28]:
# (number of rows, number of columns) of the merged table.
gdp_and_internet_use.shape

(7157, 4)

Using `.isin()` to select the rows for several years (2004, 2009 and 2014) at once.

In [66]:
# Boolean mask marking the rows whose Year is one of the selected years.
selected_years = [2004, 2009, 2014]
in_selected_years = gdp_and_internet_use["Year"].isin(selected_years)
gdp_and_internet_use.loc[in_selected_years].head(10)

Unnamed: 0,Country,Year,GDP_Per_Capita,Internet_Users_Pct
2,Afghanistan,2014,1838.960244,6.39
7,Afghanistan,2009,1531.173993,3.55
12,Afghanistan,2004,1062.24936,0.105809
17,Albania,2014,10701.120786,60.1
22,Albania,2009,9524.649303,41.2
27,Albania,2004,7277.214908,2.420388
44,Algeria,2014,13483.337862,18.09
49,Algeria,2009,12647.540304,11.23
54,Algeria,2004,11797.184849,4.634475
71,Angola,2014,6260.132681,21.26


In [106]:
# Keep only the 2004 rows of the merged table. Boolean-mask selection already
# returns a new DataFrame, so no empty pd.DataFrame() placeholder is needed
# before the assignment.
gdp_and_internet_use_2004 = gdp_and_internet_use[gdp_and_internet_use.Year == 2004]
gdp_and_internet_use_2004.head(5)

Unnamed: 0,Country,Year,GDP_Per_Capita,Internet_Users_Pct
12,Afghanistan,2004,1062.24936,0.105809
27,Albania,2004,7277.214908,2.420388
54,Algeria,2004,11797.184849,4.634475
81,Angola,2004,3461.018084,0.464815
108,Antigua and Barbuda,2004,19282.229791,24.266544


In [107]:
# Keep only the 2009 rows of the merged table. Boolean-mask selection already
# returns a new DataFrame, so no empty pd.DataFrame() placeholder is needed
# before the assignment.
gdp_and_internet_use_2009 = gdp_and_internet_use[gdp_and_internet_use.Year == 2009]
gdp_and_internet_use_2009.head(5)

Unnamed: 0,Country,Year,GDP_Per_Capita,Internet_Users_Pct
7,Afghanistan,2009,1531.173993,3.55
22,Albania,2009,9524.649303,41.2
49,Algeria,2009,12647.540304,11.23
76,Angola,2009,5908.051427,6.0
103,Antigua and Barbuda,2009,20942.30118,42.0


In [109]:
# Keep only the 2014 rows of the merged table. Boolean-mask selection already
# returns a new DataFrame, so no empty pd.DataFrame() placeholder is needed
# before the assignment.
gdp_and_internet_use_2014 = gdp_and_internet_use[gdp_and_internet_use.Year == 2014]
gdp_and_internet_use_2014.head(5)

Unnamed: 0,Country,Year,GDP_Per_Capita,Internet_Users_Pct
2,Afghanistan,2014,1838.960244,6.39
17,Albania,2014,10701.120786,60.1
44,Algeria,2014,13483.337862,18.09
71,Angola,2014,6260.132681,21.26
98,Antigua and Barbuda,2014,19573.834118,64.0
