# Pie Charts, Box Plots, Scatter Plots, and Bubble Plots


In [1]:
import numpy as np  # useful for many scientific computing in Python
import pandas as pd # primary data structure library

In [11]:
# Read the 'Canada by Citizenship' sheet, skipping the first 20 rows and the
# last 2 rows of the sheet.
df = pd.read_excel(
    'datasets/Canada.xlsx',
    engine='openpyxl',
    sheet_name='Canada by Citizenship',
    skiprows=range(20),
    skipfooter=2,
)

In [12]:
# preview the last rows of the freshly loaded sheet
df.tail()

Unnamed: 0,Type,Coverage,OdName,AREA,AreaName,REG,RegName,DEV,DevName,1980,...,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013
190,Immigrants,Foreigners,Viet Nam,935,Asia,920,South-Eastern Asia,902,Developing regions,1191,...,1816,1852,3153,2574,1784,2171,1942,1723,1731,2112
191,Immigrants,Foreigners,Western Sahara,903,Africa,912,Northern Africa,902,Developing regions,0,...,0,0,1,0,0,0,0,0,0,0
192,Immigrants,Foreigners,Yemen,935,Asia,922,Western Asia,902,Developing regions,1,...,124,161,140,122,133,128,211,160,174,217
193,Immigrants,Foreigners,Zambia,903,Africa,910,Eastern Africa,902,Developing regions,11,...,56,91,77,71,64,60,102,69,46,59
194,Immigrants,Foreigners,Zimbabwe,903,Africa,910,Eastern Africa,902,Developing regions,72,...,1450,615,454,663,611,508,494,434,437,407


In [5]:
# Remove the columns that are not needed for the visualizations and give the
# remaining descriptive columns friendlier names.
# Rebinding `df` (rather than inplace=True) together with errors='ignore'
# keeps this cell safe to re-run: a second run is a no-op instead of a KeyError.
df = (
    df.drop(columns=['Type', 'Coverage', 'AREA', 'REG', 'DEV'], errors='ignore')
      .rename(columns={'OdName': 'Country',
                       'AreaName': 'Continent',
                       'RegName': 'Region'})
)



In [6]:
# make all column labels of type string so the year columns can be selected
# with the string labels in `years`
df.columns = [str(label) for label in df.columns]

# labels of the year columns, '1980' .. '2013'
years = [str(year) for year in range(1980, 2014)]

# set the country name as index (skipped when a previous run already did it)
if 'Country' in df.columns:
    df = df.set_index('Country')

# add total column: sum the year columns by label rather than by position, so
# the result does not depend on column order and an existing 'Total' column is
# never added into itself when the cell is re-run
df['Total'] = df[years].sum(axis=1)

df.head()

Unnamed: 0_level_0,Continent,Region,DevName,1980,1981,1982,1983,1984,1985,1986,...,2005,2006,2007,2008,2009,2010,2011,2012,2013,Total
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Afghanistan,Asia,Southern Asia,Developing regions,16,39,39,47,71,340,496,...,3436,3009,2652,2111,1746,1758,2203,2635,2004,58639
Albania,Europe,Southern Europe,Developed regions,1,0,0,0,0,0,1,...,1223,856,702,560,716,561,539,620,603,15699
Algeria,Africa,Northern Africa,Developing regions,80,67,71,69,63,44,69,...,3626,4807,3623,4005,5393,4752,4325,3774,4331,69439
American Samoa,Oceania,Polynesia,Developing regions,0,1,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,6
Andorra,Europe,Southern Europe,Developed regions,0,0,0,0,0,0,2,...,0,1,1,0,0,0,0,1,1,15


### Visualizing Data using Matplotlib 


Import `Matplotlib`

In [7]:
# IPython magic selecting the inline Matplotlib backend
%matplotlib inline

import matplotlib as mpl
import matplotlib.pyplot as plt

mpl.style.use('ggplot') # optional: for ggplot-like style

# print the installed Matplotlib version
print('Matplotlib version: ', mpl.__version__) # >= 2.0.0

Matplotlib version:  3.4.2


## Pie Charts  

* A `pie chart` is a circular graphic that displays numeric proportions by dividing a circle (or pie) into proportional slices. 


Step 1: Gather data.

* We will use *pandas* `groupby` method to summarize the immigration data by `Continent`. 
* The general process of `groupby` involves the following steps:

   1.  **Split:** Splitting the data into groups based on some criteria.
   2.  **Apply:** Applying a function to each group independently:
        *  sum()
        *  count()
        *  mean()
        *  std()
        *  aggregate()
        *  apply()
        *  etc..
   3.  **Combine:** Combining the results into a data structure.


<img src="https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/IBMDeveloperSkillsNetwork-DV0101EN-SkillsNetwork/labs/Module%203/images/Mod3Fig4SplitApplyCombine.png" height=400 align="center">


In [8]:
# keep only the year columns listed in `years`
dfC = df.loc[:, years]
dfC

Unnamed: 0_level_0,1980,1981,1982,1983,1984,1985,1986,1987,1988,1989,...,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Afghanistan,16,39,39,47,71,340,496,741,828,1076,...,2978,3436,3009,2652,2111,1746,1758,2203,2635,2004
Albania,1,0,0,0,0,0,1,2,2,3,...,1450,1223,856,702,560,716,561,539,620,603
Algeria,80,67,71,69,63,44,69,132,242,434,...,3616,3626,4807,3623,4005,5393,4752,4325,3774,4331
American Samoa,0,1,0,0,0,0,0,1,0,1,...,0,0,1,0,0,0,0,0,0,0
Andorra,0,0,0,0,0,0,2,0,0,0,...,0,0,1,1,0,0,0,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Viet Nam,1191,1829,2162,3404,7583,5907,2741,1406,1411,3004,...,1816,1852,3153,2574,1784,2171,1942,1723,1731,2112
Western Sahara,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
Yemen,1,2,1,6,0,18,7,12,7,18,...,124,161,140,122,133,128,211,160,174,217
Zambia,11,17,11,7,16,9,15,23,44,68,...,56,91,77,71,64,60,102,69,46,59


In [9]:
# Group countries by continent and sum the numeric columns (the year columns
# and 'Total') within each group.
# The result is stored as `df_continents`, the name the pie-chart cells below
# read. numeric_only=True keeps the text columns ('Region', 'DevName') out of
# the aggregation.
df_continents = df.groupby('Continent').sum(numeric_only=True)

# note: the output of the groupby method is a `groupby` object.
# we cannot use it further until we apply a function (e.g. .sum())
print(type(df.groupby('Continent')))

# order the continents from the largest to the smallest total
df_continents = df_continents.sort_values('Total', ascending=False)
df_continents


<class 'pandas.core.groupby.generic.DataFrameGroupBy'>


Unnamed: 0_level_0,1980,1981,1982,1983,1984,1985,1986,1987,1988,1989,...,2005,2006,2007,2008,2009,2010,2011,2012,2013,Total
Continent,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Asia,31025,34314,30214,24696,27274,23850,28739,43203,47454,60256,...,159253,149054,133459,139894,141434,163845,146894,152218,155075,3317794
Europe,39760,44802,42720,24638,22287,20844,24370,46698,54726,60893,...,35955,33053,33495,34692,35078,33425,26778,29177,28691,1410947
Latin America and the Caribbean,13081,15215,16769,15427,13678,15171,21179,28471,21924,25060,...,24747,24676,26011,26547,26867,28818,27856,27173,24950,765148
Africa,3951,4363,3819,2671,2639,2650,3782,7494,7552,9894,...,27523,29188,28284,29890,34534,40892,35441,38083,38543,618948
Northern America,9378,10030,9074,7100,6661,6543,7074,7705,6469,6790,...,8394,9613,9463,10190,8995,8142,7677,7892,8503,241142
Oceania,1942,1839,1675,1018,878,920,904,1200,1181,1539,...,1585,1473,1693,1834,1860,1834,1548,1679,1775,55174


Step 2: Plot the data. We will pass in `kind = 'pie'` keyword, along with the following additional parameters:

*   `autopct` - used to label the wedges with their numeric value. 
     * The label will be placed inside the wedge. 
     * If it is a format string, the label will be `fmt%pct`.
*   `startangle` - rotates the start of the pie chart by angle degrees counterclockwise from the x-axis.
*   `shadow` - draws a shadow beneath the pie (to give a 3D feel).


In [None]:
# autopct formats the percentage label of each wedge; startangle rotates the
# start of the pie
df_continents['Total'].plot(kind='pie',
                            figsize=(5, 6),
                            autopct='%1.1f%%',  # percentage with one decimal, no width padding
                            startangle=90,      # first wedge starts at 90° from the x-axis
                            shadow=True,        # add shadow
                            )

plt.title('Immigration to Canada by Continent [1980 - 2013]')
plt.axis('equal') # Sets the pie chart to look like a circle.

plt.show()

To improve the visuals:

*   Remove the text labels on the pie chart by passing in `labels=None`, and show them in a separate legend using `plt.legend()`.
*   Push out the percentages to sit just outside the pie chart by passing in `pctdistance` parameter.
*   Pass in a custom set of colors for continents by passing in `colors` parameter.
*   **Explode** the pie chart to emphasize the lowest three continents (Africa, Northern America, and Oceania) by passing in `explode` parameter.


In [None]:
colors_list = ['gold', 'yellowgreen', 'lightcoral', 'lightskyblue', 'lightgreen', 'pink']

# Offset ("explode") the three continents with the smallest totals; the
# smallest one gets the largest offset. Deriving the list from the data keeps
# it aligned with the wedges whatever order the rows of df_continents are in.
totals = df_continents['Total']
offsets = dict(zip(totals.nsmallest(3).index, (0.3, 0.2, 0.1)))
explode_list = [offsets.get(continent, 0) for continent in totals.index]

totals.plot(kind='pie',
            figsize=(15, 6),
            autopct='%1.1f%%',
            startangle=90,
            shadow=True,
            labels=None,          # turn off labels on pie chart
            pctdistance=1.2,      # ratio between the center of each slice and the start of the autopct text
            colors=colors_list,   # add custom colors
            explode=explode_list  # 'explode' lowest 3 continents
            )

# place the title at y=1.12 so it sits above the percentages drawn outside the pie
plt.title('Immigration to Canada by Continent [1980 - 2013]', y=1.12)

plt.axis('equal')

# add legend
plt.legend(labels=df_continents.index, loc='upper left')

plt.show()

**Question:** Using a pie chart, explore the proportion (percentage) of new immigrants grouped by continents in the year 2013.

In [None]:
colors_list = ['gold', 'yellowgreen', 'lightcoral', 'lightskyblue', 'lightgreen', 'pink']

# Offset ("explode") the three continents with the smallest 2013 values; the
# smallest one gets the largest offset. Deriving the list from the 2013 column
# keeps it aligned with the wedges whatever order the rows are in.
counts_2013 = df_continents['2013']
offsets = dict(zip(counts_2013.nsmallest(3).index, (0.3, 0.2, 0.1)))
explode_list = [offsets.get(continent, 0) for continent in counts_2013.index]

counts_2013.plot(kind='pie',
                 figsize=(15, 6),
                 autopct='%1.1f%%',
                 startangle=90,
                 shadow=True,
                 labels=None,           # turn off labels on pie chart
                 pctdistance=1.15,      # the ratio between the pie center and start of text label
                 explode=explode_list,  # 'explode' lowest 3 continents
                 colors=colors_list
                 )

# place the title at y=1.12 so it sits above the percentages drawn outside the pie
plt.title('Immigration to Canada by Continent in 2013', y=1.12)
plt.axis('equal')

# add legend
plt.legend(labels=df_continents.index, loc='upper left')

# show plot
plt.show()


## Box Plots  

A `box plot` is a way of statistically representing the *distribution* of the data through five main dimensions:

*   **Minimum:** The smallest number in the dataset excluding the outliers.
*   **First quartile:** Middle number between the `minimum` and the `median`.
*   **Second quartile (Median):** Middle number of the (sorted) dataset.
*   **Third quartile:** Middle number between `median` and `maximum`.
*   **Maximum:** The largest number in the dataset excluding the outliers.


<img src="boxplot.png" width=440, align="center">


To make a `boxplot`, we can use `kind=box` in `plot` method invoked on a *pandas* series or dataframe.

Let's plot the box plot for the Japanese immigrants between 1980 - 2013.


Step 1: Get the subset of the dataset. Even though we are extracting the data for just one country, we will obtain it as a dataframe. This will help us with calling the `dataframe.describe()` method to view the percentiles.


In [None]:
# the extra square brackets around 'Japan' make .loc return a dataframe;
# transposing it puts the years on the rows
df_japan = df.loc[['Japan'], years].T
df_japan.head()

Step 2: Plot by passing in `kind='box'`.


In [None]:
# draw the box plot and label it through the returned axes
ax = df_japan.plot(kind='box', figsize=(8, 6))

ax.set_title('Box plot of Japanese Immigrants from 1980 - 2013')
ax.set_ylabel('Number of Immigrants')

plt.show()

We can immediately make a few key observations from the plot above:

1.  minimum number of immigrants is around 200 (min)
2.  maximum number is around 1300 (max)
3.  median number of immigrants is around 900 (median).
4.  25% of the years for period 1980 - 2013 had an annual immigrant count of \~500 or fewer (First quartile).
5.  75% of the years for period 1980 - 2013 had an annual immigrant count of \~1100 or fewer (Third quartile).

We can view the actual numbers by calling the `describe()` method on the dataframe.


In [None]:
# summary statistics of the Japan column
df_japan.describe()

One of the key benefits of box plots is comparing the distribution of multiple datasets.

**Question:** Compare the distribution of the number of new immigrants from India and China for the period 1980 - 2013.


Step 1: Get the dataset for China and India and call the dataframe **df_CI**.


In [None]:
# yearly counts for China and India, transposed so the years are on the rows
df_CI = df.loc[['China', 'India'], years].T
df_CI.head()

Let's view the percentiles associated with both countries using the `describe()` method.


In [None]:
# summary statistics for the China and India columns
df_CI.describe()

Step 2: Plot data.


In [None]:
# one box per country column of df_CI
ax = df_CI.plot(kind='box', figsize=(10, 7))

ax.set_title('Box plots of Immigrants from China and India (1980 - 2013)')
ax.set_ylabel('Number of Immigrants')

plt.show()

> **Conclusions**
> *  while both countries have around the same median immigrant population (\~20,000),  China's immigrant population range is more spread out than India's. 
> * The maximum population from India for any year (36,210) is around 15% lower than the maximum population from China (42,584).


To create horizontal box plots,  pass the `vert` parameter in the **plot** function and assign it to *False*. 


In [None]:
# horizontal, notched box plots (vert=False lays the boxes out horizontally)
ax = df_CI.plot(kind='box',
                figsize=(10, 7),
                color='g',
                vert=False,
                notch=True
                )

ax.set_title('Box plots of Immigrants from China and India (1980 - 2013)')
ax.set_xlabel('Number of Immigrants')

plt.show()

## Subplots

 
To visualize multiple plots together, we can create a **`figure`** (overall canvas) and divide it into **`subplots`**, each containing a plot. With **subplots**, we usually work with the **artist layer** instead of the **scripting layer**.

Typical syntax is : <br>

```python
    fig = plt.figure() # create figure
    ax = fig.add_subplot(nrows, ncols, plot_number) # create subplots
```

Where

*   `nrows` and `ncols` are used to notionally split the figure into (`nrows` \* `ncols`) sub-axes,
*   `plot_number` is used to identify the particular subplot that this function is to create within the notional grid. 
     *  `plot_number` starts at 1, increments across rows first and has a maximum of `nrows` \* `ncols` as shown below.

<img src="subplots.png" width=500 align="center">


We can then specify which subplot to place each plot in by passing the `ax` parameter to the `plot()` method as follows:


In [None]:
# create the figure at its final size; both subplots share this canvas
fig = plt.figure(figsize=(20, 6))

ax1 = fig.add_subplot(1, 2, 1)  # 1 row, 2 columns, first plot
ax2 = fig.add_subplot(1, 2, 2)  # 1 row, 2 columns, second plot (see the subplot-convention tip below)

# Subplot 1: horizontal box plot
df_CI.plot(kind='box', color='blue', vert=False, ax=ax1)
ax1.set_title('Box Plots of Immigrants from China and India (1980 - 2013)')
ax1.set_xlabel('Number of Immigrants')
ax1.set_ylabel('Countries')

# Subplot 2: line plot
df_CI.plot(kind='line', ax=ax2)
ax2.set_title('Line chart of Immigrants from China and India (1980 - 2013)')
ax2.set_ylabel('Number of Immigrants')
ax2.set_xlabel('Years')

plt.show()

**Tip regarding subplot convention**

In the case when `nrows`, `ncols`, and `plot_number` are all less than 10:

```python
   subplot(211) == subplot(2, 1, 1) 
```



**Question:** Create a box plot to visualize the distribution of the top 15 countries (based on total immigration) grouped by the *decades* `1980s`, `1990s`, and `2000s`.


Step 1: Get the dataset. Get the top 15 countries based on Total immigrant population. Name the dataframe **df_top15**.


In [None]:
# order the countries by 'Total' (largest first) and keep the first 15 rows
df_sorted_by_total = df.sort_values(by='Total', ascending=False)
df_top15 = df_sorted_by_total.iloc[:15]
df_top15

Step 2: Create a new dataframe which contains the aggregate for each decade. 

One way to do that:

1.  Create a list of all years in decades 80's, 90's, and 00's.
2.  Slice the original `dataframe` to create a series for each decade and sum across all years for each country.
3.  Merge the three series into a `new_dataframe`. 


In [None]:
# 1. build the year labels (as strings) for each decade; the end of each
#    range is exclusive, so 1980..1989, 1990..1999 and 2000..2009
years_80s = [str(year) for year in range(1980, 1990)]
years_90s = [str(year) for year in range(1990, 2000)]
years_00s = [str(year) for year in range(2000, 2010)]

years_80s

In [None]:
# 2. select each decade's year columns from df_top15 and sum them per country,
#    giving one series per decade
df_80s = df_top15[years_80s].sum(axis=1)
df_90s = df_top15[years_90s].sum(axis=1)
df_00s = df_top15[years_00s].sum(axis=1)

df_80s

In [None]:
# 3. combine the three decade series into one dataframe, one column per decade
decade_totals = {'1980s': df_80s, '1990s': df_90s, '2000s': df_00s}
new_df = pd.DataFrame(decade_totals)

# display dataframe
new_df.head()

In [None]:
# summary statistics (including the quartiles) for each decade column
new_df.describe()

Step 3: Plot the box plots.


In [None]:
# one box per decade column of new_df
ax = new_df.plot(kind='box', figsize=(10, 6))
ax.set_title('Immigration from top 15 countries for decades 80s, 90s and 2000s')

plt.show()

In order to be an outlier, the data value must be:

*   larger than Q3 by at least 1.5 times the interquartile range (IQR), or,
*   smaller than Q1 by at least 1.5 times the IQR.

Let's look at decade 2000s as an example: <br>

*   Q1 (25%) = 36,101.5 <br>
*   Q3 (75%) = 105,505.5 <br>
*   IQR = Q3 - Q1 = 69,404 <br>

Using the definition of outlier, any value that is greater than Q3 by 1.5 times IQR will be flagged as outlier.

Outlier > 105,505.5 + (1.5 \* 69,404) <br>
Outlier > 209,611.5


In [None]:
# Upper outlier fence for the 2000s: Q3 + 1.5 * IQR, computed from the data
# instead of hard-coding the value so it stays correct if the data changes.
q1, q3 = new_df['2000s'].quantile([0.25, 0.75])
upper_fence = q3 + 1.5 * (q3 - q1)

# countries whose 2000s total lies above the fence
new_df[new_df['2000s'] > upper_fence]

China and India are both considered as outliers since their population for the decade exceeds 209,611.5.

The box plot is an advanced visualization tool; refer to the [Matplotlib documentation](http://matplotlib.org/api/pyplot_api.html?utm_medium=Exinfluencer&utm_source=Exinfluencer&utm_content=000026UJ&utm_term=10006555&utm_id=NA-SkillsNetwork-Channel-SkillsNetworkCoursesIBMDeveloperSkillsNetworkDV0101ENSkillsNetwork20297740-2021-01-01#matplotlib.pyplot.boxplot) on box plots for more information.


## Scatter Plots  

*  A `scatter plot` displays the relationship between two variables and is often used to reveal a trend. 

 

**Question:** Using a `scatter plot`, visualize the trend of total immigration to Canada (all countries combined) for the years 1980 - 2013.


Step 1: Get the dataset. 

Since we are expecting to use the relationship between `years` and `total population`, we will convert `years` to `int` type.


In [None]:
# sum every year column over all countries to get the total per year, and
# turn the resulting series into a one-column dataframe
df_tot = df[years].sum(axis=0).to_frame()

df_tot.head()

In [None]:
# convert the year labels in the index to int (useful for regression later on)
df_tot.index = [int(label) for label in df_tot.index]

df_tot.head()

In [None]:
# move the years out of the index and into a regular column of df_tot
df_tot = df_tot.reset_index()

df_tot.head()

In [None]:
# give the two columns descriptive names
column_names = ['year', 'total']
df_tot.columns = column_names

# view the final dataframe
df_tot.head()

Step 2: Plot the data. 

* in `Matplotlib`, pass  `kind='scatter'` as plot argument. 
* also pass in `x` and `y` keywords to specify the columns that go on the x- and the y-axis.


In [None]:
# scatter of total immigration against year, labelled through the returned axes
ax = df_tot.plot(kind='scatter', x='year', y='total', figsize=(10, 6), color='darkblue')

ax.set_title('Total Immigration to Canada from 1980 - 2013')
ax.set_xlabel('Year')
ax.set_ylabel('Number of Immigrants')

plt.show()

> an upward trend in the data - as the years go by, the total number of immigrants increases. 

We can mathematically analyze this upward trend using a regression line (line of best fit).


So let's try to plot a linear line of best fit, and use it to  predict the number of immigrants in 2015.

Step 1: Get the equation of line of best fit. We will use **Numpy**'s `polyfit()` method by passing in the following:

*   `x`: x-coordinates of the data.
*   `y`: y-coordinates of the data.
*   `deg`: Degree of fitting polynomial. 1 = linear, 2 = quadratic, and so on.


In [None]:
# year goes on the x-axis, total on the y-axis
x, y = df_tot['year'], df_tot['total']

# degree-1 (linear) polynomial fit; coefficients come back highest power first
fit = np.polyfit(x, y, 1)

fit

* The output is an array with the polynomial coefficients, highest powers first. 
   *      Since we are plotting a linear regression `y= a * x + b`, our output has 2 elements `[5.56709228e+03, -1.09261952e+07]` with the slope in position 0 and intercept in position 1.

Step 2: Plot the regression line on the `scatter plot`.


In [None]:
df_tot.plot(kind='scatter',
            x='year',
            y='total',
            figsize=(10, 6),
            color='darkblue')

plt.title('Total Immigration to Canada from 1980 - 2013')
plt.xlabel('Year')
plt.ylabel('Number of Immigrants')

# fit holds [slope, intercept]; format the intercept with an explicit sign so a
# negative value reads "- 10926195" rather than "+ -10926195"
slope, intercept = fit
sign = '-' if intercept < 0 else '+'

# plot line of best fit
plt.plot(x,
         slope * x + intercept,
         color='red'
        ) # recall that x is the Years

plt.annotate('y={0:.0f} x {1} {2:.0f}'.format(slope, sign, abs(intercept)),
             xy=(2000, 150000))

plt.show()

# print out the line of best fit
'No. Immigrants = {0:.0f} * Year {1} {2:.0f}'.format(slope, sign, abs(intercept))

Using the equation of line of best fit, we can estimate the number of immigrants in 2015:

```python
No. Immigrants = 5567 * Year - 10926195
No. Immigrants = 5567 * 2015 - 10926195
No. Immigrants = 291,310
```

When compared to the actual from Citizenship and Immigration Canada's (CIC) [2016 Annual Report](http://www.cic.gc.ca/english/resources/publications/annual-report-2016/index.asp?utm_medium=Exinfluencer&utm_source=Exinfluencer&utm_content=000026UJ&utm_term=10006555&utm_id=NA-SkillsNetwork-Channel-SkillsNetworkCoursesIBMDeveloperSkillsNetworkDV0101ENSkillsNetwork20297740-2021-01-01), we see that Canada accepted 271,845 immigrants in 2015. Our estimated value of 291,310 is within 7% of the actual number, which is pretty good considering our original data came from United Nations (and might differ slightly from CIC data).

**Question**: Create a scatter plot of the total immigration from Denmark, Norway, and Sweden to Canada from 1980 to 2013?


**Step 1**: Get the data:

1.  Create a dataframe that consists of the numbers associated with Denmark, Norway, and Sweden only. Name it **df_countries**.
2.  Sum the immigration numbers across all three countries for each year and turn the result into a dataframe. Name this new dataframe **df_total**.
3.  Reset the index in place.
4.  Rename the columns to **year** and **total**.
5.  Display the resulting dataframe.


In [None]:
# yearly counts for the three countries, transposed so the years are on the rows
df_countries = df.loc[['Denmark', 'Norway', 'Sweden'], years].T

df_countries.head()

In [None]:
# add the three country columns together for each year and wrap the resulting
# series in a one-column dataframe
df_total = df_countries.sum(axis=1).to_frame()

df_total.head()

In [None]:
# move the years out of the index into a regular column so it can be renamed
df_total = df_total.reset_index()

df_total.head()

In [None]:
# give the two columns descriptive names
column_names = ['year', 'total']
df_total.columns = column_names

df_total.head()

In [None]:
# cast the 'year' column from string to int so it can be used on the scatter plot's x-axis
df_total = df_total.astype({'year': int})

df_total.head()

**Step 2**: Generate the scatter plot by plotting the total versus year in **df_total**.


In [None]:
# scatter of the combined total against year
ax = df_total.plot(kind='scatter', x='year', y='total', figsize=(10, 6), color='darkblue')

# title and axis labels, set on the returned axes
ax.set_title('Immigration from Denmark, Norway, and Sweden to Canada from 1980 - 2013')
ax.set_xlabel('Year')
ax.set_ylabel('Number of Immigrants')

# show plot
plt.show()

## Bubble Plots <a id="12"></a>

*  A `bubble plot` is a variation of the `scatter plot` that displays three dimensions of data (x, y, z). 
    * The data points are replaced with bubbles, and the size of the bubble is determined by the third variable `z`, also known as the weight. 
    * In `matplotlib`, we can pass an array or scalar containing the weight of each point to the parameter `s` of `plot()`.

**Question:** Compare Argentina's immigration to that of its neighbour Brazil, using a `bubble plot` of immigration from Brazil and Argentina for the years 1980 - 2013. Set the weights for the bubble as the *normalized* value of the population for each year.


**Step 1**: Get the data for Brazil and Argentina. 


In [None]:
# countries compared in the bubble plot
fav_countries = ['Brazil', 'Argentina']

In [None]:
# year columns of every country, transposed so the years are on the rows
df_t = df[years].T

df_t.head()

In [None]:
# yearly counts of the selected countries, with the years on the rows
df_ft = df.loc[fav_countries, years].T
df_ft.head()

In [None]:
# cast the Years (the index) to type int, reading the labels from df_ft itself
# so this cell does not depend on another dataframe's index
df_ft.index = [int(label) for label in df_ft.index]
df_ft.head()

In [None]:
# name the index 'Year'; the name becomes the column name once the index is reset
df_ft = df_ft.rename_axis(index='Year')
df_ft.head()

In [None]:
# turn the Year index into a regular column
df_ft = df_ft.reset_index()

df_ft.head()

**Step 2**: Create the normalized weights.

There are several methods of normalizations in statistics, each with its own use. 

In this case, we will use [feature scaling](https://en.wikipedia.org/wiki/Feature_scaling?utm_medium=Exinfluencer&utm_source=Exinfluencer&utm_content=000026UJ&utm_term=10006555&utm_id=NA-SkillsNetwork-Channel-SkillsNetworkCoursesIBMDeveloperSkillsNetworkDV0101ENSkillsNetwork20297740-2021-01-01) to bring all values into the range \[0, 1]. 

The general formula is: 

$$ X^\prime = \cfrac{X - X_{\text{min}}}{ X_{\text{max}}- X_{\text{min}}} $$



where $X$ is the original value, $X'$ is the corresponding normalized value. The formula sets the max value in the dataset to 1, and sets the min value to 0. The rest of the data points are scaled to a value between 0-1 accordingly.


In [None]:
# min-max scaling: (value - min) / (max - min)

# normalize Brazil data
brazil = df_ft['Brazil']
norm_brazil = (brazil - brazil.min()) / (brazil.max() - brazil.min())

# normalize Argentina data
argentina = df_ft['Argentina']
norm_argentina = (argentina - argentina.min()) / (argentina.max() - argentina.min())

**Step 3**: Plot the data.

*   To plot two different scatter plots in one plot, we can include the axes one plot into the other by passing it via the `ax` parameter.
*   We will also pass in the weights using the `s` parameter. Given that the normalized weights are between 0-1, they won't be visible on the plot. Therefore, we will:
    *   multiply weights by 2000 to scale it up on the graph, and,
    *   add 10 to compensate for the min value (which has a 0 weight and would therefore still have zero size after scaling by $\times 2000$).


In [None]:
# settings shared by both bubble series (alpha sets the transparency)
bubble_style = dict(kind='scatter', x='Year', alpha=0.5)

# Brazil: this call creates the axes; marker sizes are the scaled weights
ax0 = df_ft.plot(y='Brazil',
                 color='green',
                 s=norm_brazil * 2000 + 10,
                 figsize=(14, 8),
                 xlim=(1975, 2015),
                 **bubble_style)

# Argentina: drawn on the axes created above
ax1 = df_ft.plot(y='Argentina',
                 color="blue",
                 s=norm_argentina * 2000 + 10,
                 ax=ax0,
                 **bubble_style)

ax0.set_ylabel('Number of Immigrants')
ax0.set_title('Immigration from Brazil and Argentina from 1980 to 2013')
ax0.legend(['Brazil', 'Argentina'], loc='upper left', fontsize='x-large')

plt.show()

* The size of the bubble corresponds to the magnitude of immigrating population for that year, compared to the 1980 - 2013 data. 
   * The larger the bubble is, the more immigrants are in that year.

> * a corresponding increase in immigration from Argentina during the 1998 - 2002 great depression 
> * a similar spike around 1985 to 1993.  
  

**Question**: Create bubble plots of immigration from China and India to visualize any differences with time from 1980 to 2013. 


Step 1: Normalize the data pertaining to China and India.


In [None]:
fav_countries = ['China', 'India']

# yearly counts of the selected countries, with the years on the rows
df_ft = df.loc[fav_countries, years].transpose()

# cast the year labels to int using df_ft's own index (no dependency on another
# dataframe), name the index 'Year' and bring it in as a column
df_ft.index = [int(label) for label in df_ft.index]
df_ft.index.name = 'Year'
df_ft = df_ft.reset_index()
df_ft.head()

In [None]:
# min-max scaling: (value - min) / (max - min)

# normalized Chinese data
china = df_ft['China']
norm_china = (china - china.min()) / (china.max() - china.min())

# normalized Indian data
india = df_ft['India']
norm_india = (india - india.min()) / (india.max() - india.min())

Step 2: Generate the bubble plots.


In [None]:
# settings shared by both bubble series (alpha sets the transparency)
bubble_style = dict(kind='scatter', x='Year', alpha=0.5)

# China: this call creates the axes; marker sizes are the scaled weights
ax0 = df_ft.plot(y='China',
                 color='green',
                 s=norm_china * 2000 + 10,
                 figsize=(14, 8),
                 xlim=(1975, 2015),
                 **bubble_style)

# India: drawn on the axes created above
ax1 = df_ft.plot(y='India',
                 color="blue",
                 s=norm_india * 2000 + 10,
                 ax=ax0,
                 **bubble_style)

ax0.set_ylabel('Number of Immigrants')
ax0.set_title('Immigration from China and India from 1980 - 2013')
ax0.legend(['China', 'India'], loc='upper left', fontsize='x-large')
plt.show()
plt.show()