# Produisez une étude de marché

## Preamble

### Notebook config

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.ticker import PercentFormatter
from scipy import stats
import seaborn as sns
from cycler import cycler

### My functions

In [2]:
# Functions
def eta_squared(x,y):
    """Calculate the Eta Squared (correlation ratio) of two variables.

    Eta squared is the share of the variance of ``y`` that is explained by
    the grouping ``x``: the between-class sum of squares divided by the
    total sum of squares.

    Parameters
    ----------
    x : pandas.Series
        Qualitative variable (class labels), index-aligned with ``y``.
    y : pandas.Series
        Quantitative variable.

    Returns
    -------
    float
        Value in [0, 1]. NaN when it is undefined, i.e. when no complete
        (x, y) pair remains or when ``y`` has zero variance.

    Notes
    -----
    Observations where either ``x`` or ``y`` is missing are dropped before
    computing. Without this, a missing label produces an empty class whose
    mean is NaN, and a missing value in ``y`` makes the total sum of squares
    NaN; either case turned the whole result into NaN.
    """
    # Keep only complete pairs (listwise deletion).
    complete = x.notna() & y.notna()
    labels, values = x[complete], y[complete]

    overall_mean = values.mean()
    # Total sum of squares (SCT), vectorised instead of a Python-level loop.
    total_ss = ((values - overall_mean) ** 2).sum()
    if total_ss == 0:
        # Empty input or constant y: the ratio 0/0 is undefined.
        return float('nan')

    # Between-class sum of squares (SCE): n_i * (class mean - overall mean)^2.
    # observed=True ignores unused categories when x is categorical.
    by_class = values.groupby(labels, observed=True)
    between_ss = (by_class.size() * (by_class.mean() - overall_mean) ** 2).sum()
    return between_ss / total_ss

### My styles

In [4]:
# Start from the stock ggplot theme; everything below overrides it.
plt.style.use('ggplot')

# Default figure size (width, height), passed to 'figure.figsize' below.
fig_size = [10.0, 6]

params = {
    # --- figure ---------------------------------------------------------
    'figure.figsize': fig_size,
    'figure.titleweight': 'bold',
    # Optional background colour, currently disabled:
    # 'figure.facecolor': '#FFF1E5',

    # --- text and fonts -------------------------------------------------
    'font.size': 14,
    'text.color': 'black',
    'text.usetex': False,
    'legend.fontsize': 12,

    # --- axes -----------------------------------------------------------
    'axes.titlesize': '14',
    'axes.labelsize': 13,
    # Colour palette cycled through by successive plotted series.
    'axes.prop_cycle': cycler(
        'color',
        ['#0F5499', '#00994D', '#96CC28', '#CC0000', '#FF7FAA'],
    ),
    # Optional axes colours matching the disabled figure background:
    # 'axes.facecolor': '#FFF1E5',
    # 'axes.edgecolor': '#FFF1E5',

    # --- ticks ----------------------------------------------------------
    'xtick.labelsize': 11,
    'ytick.labelsize': 11,

    # --- grid: semi-transparent lines for the x axis only ---------------
    'axes.grid.axis': 'x',
    'grid.color': '#807973',
    'grid.linewidth': '1',
    'grid.alpha': '0.5',

    # --- lines and boxplots ---------------------------------------------
    'lines.linewidth': '2',
    'boxplot.medianprops.color': 'black',
}

# Apply the overrides globally for every figure created afterwards.
plt.rcParams.update(params)

# plt.rcParams.keys()

## Chargement des données

In [115]:
# Read the raw population CSV into the scratch frame `temp`.
# NOTE(review): `temp` is reassigned by the later GDP load, so the cell that
# builds `countries` must be run before that one.
temp = pd.read_csv("Total_population.csv")
# Print column names, dtypes and non-null counts to check what was loaded.
temp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 472 entries, 0 to 471
Data columns (total 15 columns):
Domain Code         472 non-null object
Domain              472 non-null object
Area Code           472 non-null int64
Area                472 non-null object
Element Code        472 non-null int64
Element             472 non-null object
Item Code           472 non-null int64
Item                472 non-null object
Year Code           472 non-null int64
Year                472 non-null int64
Unit                472 non-null object
Value               472 non-null float64
Flag                472 non-null object
Flag Description    472 non-null object
Note                4 non-null object
dtypes: float64(1), int64(5), object(9)
memory usage: 55.4+ KB


In [116]:
# Reshape the long population table (one row per country and year) into one
# row per country with one column per year. Years are kept as their integer
# values so that columns are selected by year, not by position.
countries = (
    temp[['Area', 'Year', 'Value']]
    .rename(columns={'Area': 'country', 'Year': 'year', 'Value': 'population'})
    .pivot_table(index='country', columns='year', values='population')
)

# Population growth from 2016 to 2017, in percent of the 2016 (base-year)
# population. The previous version divided by the 2017 value, which
# understates positive growth and overstates declines.
countries = (
    (countries[2017] - countries[2016]) / countries[2016] * 100
).to_frame('population_growth')

In [117]:
# Read the GDP-per-capita CSV (2017, per the file name) into `temp`,
# replacing the population frame that the name held before.
temp = pd.read_csv("GDP_per_capita-2017.csv")
# Print column names, dtypes and non-null counts to check what was loaded.
temp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 209 entries, 0 to 208
Data columns (total 15 columns):
Domain Code         209 non-null object
Domain              209 non-null object
Area Code           209 non-null int64
Area                209 non-null object
Element Code        209 non-null int64
Element             209 non-null object
Item Code           209 non-null int64
Item                209 non-null object
Year Code           209 non-null int64
Year                209 non-null int64
Unit                0 non-null float64
Value               209 non-null float64
Flag                209 non-null object
Flag Description    209 non-null object
Note                0 non-null float64
dtypes: float64(3), int64(5), object(7)
memory usage: 24.6+ KB


In [118]:
# Keep only the country name and the GDP value, under analysis-friendly names.
gdp = temp[['Area', 'Value']].rename(
    columns={'Area': 'country', 'Value': 'gdp_per_capita'}
)
# Display only: the indexed view is not assigned, so `gdp` keeps `country`
# as a regular column, which the later merge and missing-value check rely on.
gdp.set_index('country')

Unnamed: 0_level_0,gdp_per_capita
country,Unnamed: 1_level_1
Afghanistan,618.989990
Albania,4450.008107
Algeria,4055.246580
Andorra,39152.812074
Angola,4247.411049
Anguilla,18860.693789
Antigua and Barbuda,14803.010927
Argentina,14399.621069
Armenia,3936.798582
Aruba,25655.102022


In [130]:
# Outer join keeps countries present in only one of the two sources;
# `merge` returns a new frame, so `countries` itself is left untouched.
df = countries.merge(gdp, how='outer', on=['country'])
# List the countries for which at least one indicator is missing.
df.loc[df.isna().any(axis=1), 'country'].tolist()

['American Samoa',
 'Bonaire, Sint Eustatius and Saba',
 'Channel Islands',
 'China, Taiwan Province of',
 'Curaçao',
 'Falkland Islands (Malvinas)',
 'Faroe Islands',
 'French Guiana',
 'Gibraltar',
 'Guadeloupe',
 'Guam',
 'Holy See',
 'Isle of Man',
 'Martinique',
 'Mayotte',
 'Netherlands Antilles (former)',
 'Niue',
 'Northern Mariana Islands',
 'Réunion',
 'Saint Barthélemy',
 'Saint Helena, Ascension and Tristan da Cunha',
 'Saint Pierre and Miquelon',
 'Saint-Martin (French Part)',
 'Sint Maarten (Dutch Part)',
 'Tokelau',
 'United States Virgin Islands',
 'Wallis and Futuna Islands',
 'Western Sahara',
 'Kosovo']