# Statistics Introduction Applied to Data Science
## Lab : Seven
## Exploratory Data Analysis - Part Two

In [None]:
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt

# Optional: switch the figures drawn in this notebook to the ggplot-like style.
mpl.style.use('ggplot')

# Print the installed Matplotlib version; the original note asks for >= 2.0.0.
print('Matplotlib version:', mpl.__version__)

In [None]:
import numpy as np
import pandas as pd

In [None]:
# Read the fat supply CSV (relative path inside the data folder) into the dataframe df_fat.
df_fat = pd.read_csv('data/Fat_Supply_Quantity_Data.csv')

In [None]:
# Preview the first rows of the dataframe to get a feel for its columns and values.
df_fat.head()

In [None]:
# Sanity check on the column labels: each one should be plain text (str).
# The expression is True only when no label has another type.
all(map(lambda label: isinstance(label, str), df_fat.columns))

In [None]:
# Dimensions of the dataframe as a (rows, columns) tuple.
df_fat.shape

In [None]:
# Remove the rows whose Obesity value is missing.
# The result is re-assigned instead of using inplace=True, so the frame is never
# mutated in place and the cell gives the same result every time it is run.
df_fat = df_fat.dropna(subset=['Obesity'])
df_fat.shape

In [None]:
# Use the country as the index of the dataframe to make look-ups with .loc easier.
# The guard keeps the cell idempotent: once 'Country' has become the index it is no
# longer a column, and calling set_index('Country') again would raise a KeyError.
if 'Country' in df_fat.columns:
    df_fat = df_fat.set_index('Country')
df_fat.head()

In [None]:
# Show the Obesity values of the first 10 rows.
df_fat['Obesity'].head(10)

## Histogram

In [None]:
# Quickest route to a histogram: ask pandas to plot the column with kind='hist'.
df_fat['Obesity'].plot(kind='hist')

In [None]:
# np.histogram returns two arrays.
# The first array holds the count (frequency) of observations in each class.
# The second array holds the class edges, so it has one more value than there are classes.

count, bin_edges = np.histogram(df_fat['Obesity'])

print(count) # frequency of each class
print(bin_edges) # class edges; np.histogram uses 10 bins by default

In [None]:
# kind='hist' asks pandas for a histogram and figsize sets the size of the figure.
# xticks places the x-axis ticks on the class edges; bin_edges is the second value returned by np.histogram.
# plt.title sets the chart title, and plt.ylabel / plt.xlabel set the axis titles.
# The number of countries in the title is counted from the data (non-missing Obesity values)
# instead of being hard-coded, so it stays correct if the dataset or the cleaning steps change.

df_fat['Obesity'].plot(kind='hist', figsize=(8,5), xticks=bin_edges)
plt.title('Obesity Index Histogram on {} countries'.format(df_fat['Obesity'].count()))
plt.ylabel('Countries')
plt.xlabel('Obesity Index')

plt.show()

### Applying empirical criteria to determine the number of classes and the class range

In [None]:
# Empirical "2 to the k" rule: the number of classes is the smallest k for which
# 2 ** k is greater than the number of observations.
# A while loop is used so that no arbitrary upper limit caps the result on large datasets.
n_observations = df_fat.shape[0]
class_count = 1
while 2 ** class_count <= n_observations:
    class_count += 1

# Minimum and maximum Obesity Index, computed once and reused below.
obesity_min = df_fat['Obesity'].min()
obesity_max = df_fat['Obesity'].max()

# Class range (width) = (maximum - minimum) / number of classes.
class_range = (obesity_max - obesity_min)/class_count

# Print results.
print('class count: ', class_count)
print('class range: ', class_range)

# Minimum and Maximum Obesity Index.
print('Minimum: ', obesity_min)
print('Maximum: ', obesity_max)

### We create our own class range

In [None]:
# Number of classes.
num_bin = 9

# Class range (width of every class).
class_range = 5

# This list stores the class edges.
new_bin = []

# This list stores the number of countries in each class.
ncount = []

# The first class starts at 2. Counting starts at that same edge, so every count
# describes exactly one class [begin_value, end_value) of new_bin.
begin_value = 2
end_value = begin_value
new_bin.append(begin_value)

# Loop to build the class edges and count the countries of each class.
for i in range(num_bin):
    # The upper edge is the lower edge plus the class range.
    end_value += class_range

    # Save the new edge in the list of class edges.
    new_bin.append(end_value)

    # Count the countries inside the class (lower edge included, upper edge excluded).
    # Note: np.histogram also includes the upper edge in the LAST class, so the two
    # only differ if an observation is exactly equal to the final edge.
    ncount.append(df_fat[(df_fat['Obesity'] >= begin_value) & (df_fat['Obesity'] < end_value)].shape[0])

    # The next class starts where this one ends.
    begin_value = end_value

print('Countries count: ', ncount)
print('Class range:', new_bin)

### We use our custom class range to build the Histogram.

In [None]:
# Both bins and xticks receive our custom class edges "new_bin".
# np.histogram is given the same edges, so count and bin_edges describe exactly the bars
# drawn below (calling it with num_bin would produce equal-width classes that do not match the plot).

count, bin_edges = np.histogram(df_fat['Obesity'], bins=new_bin)

df_fat['Obesity'].plot(kind = 'hist',
                       figsize = (8,5),
                       bins = new_bin,
                       xticks = new_bin,
                       color ='mediumseagreen'
                      )

# The number of countries is counted from the data (non-missing Obesity values) instead of being hard-coded.
plt.title('Obesity Index Histogram on {} countries'.format(df_fat['Obesity'].count()))
plt.ylabel('Countries')
plt.xlabel('Obesity Index')

plt.show()

Notice that the x-axis labels correspond to our custom class range. Some symmetry is seen in the distribution of the obesity rate if we compare the index ranges from 12 to 37.

The largest group corresponds to countries that maintain an obesity rate between 22 and 27. According to the BMI classification table, this group made up of 47 countries is in a degree of pre-obesity. This is not very good news for health systems in general.


In [None]:
# Let's explore the column names of our dataframe (returned as an array of labels).
df_fat.columns.values

In [None]:
# A histogram can show several variables at once: select the columns and plot them together.
df_fat[['Cereals - Excluding Beer','Meat', 'Milk - Excluding Butter']].plot(kind='hist', figsize=(10,6))

plt.title('Fat Supply Quantity Data by Foods')
plt.ylabel('Countries')
plt.xlabel('Fat Supply')

plt.show()

In [None]:
# Keep the three food columns of interest in a dataframe of their own, df_three.
# np.histogram over the whole frame returns the frequencies and the class edges.

df_three = df_fat[['Cereals - Excluding Beer','Meat','Milk - Excluding Butter']]
count, bin_edges = np.histogram(df_three, bins=10)

# Un-stacked histogram: the variables are drawn over each other,
# and alpha makes the bars translucent so the overlaps remain visible.
df_three.plot.hist(
    bins=10,
    xticks=bin_edges,
    figsize=(13, 8),
    alpha=0.6,
    color=['Orange','darkslateblue','mediumseagreen'],
)

plt.title('Fat Supply Quantity Data by three Foods')
plt.ylabel('Countries')
plt.xlabel('Fat Supply')

plt.show()

Both cereals and milk show a positively skewed (right-skewed) distribution, which indicates that in most countries the fat intake from these sources is very low. Notice how different the distribution of fat from meat is: in around 80 countries it contributes between 5.43 and 10.68 percent.

### Practice One:

Construct a frequency histogram with the variables `Eggs` and `Fish, Seafood`.
What can be said about their distributions?

In [None]:
# Type your code here


## Pearson's correlation coefficient and p value.

We need the scipy stats library for the correlation coefficient.

In [None]:
from scipy import stats

### Let's explore some relationships between sources of fat and obesity.

Meat vs Obesity.

In [None]:
# Pearson correlation between fat supplied by meat and obesity, with its p value.
pearson_coef, p_value = stats.pearsonr(df_fat['Meat'], df_fat['Obesity'])
print("Pearson's correlation coefficient: ", pearson_coef, " p value: ", p_value)

# Thresholds are checked from strictest to loosest; the first one the p value
# falls below decides the message, and the else branch covers "none of them".
certainty_levels = [
    (0.001, "High certainty"),
    (0.05, "Moderate certainty"),
    (0.1, "Low certainty"),
]
for threshold, message in certainty_levels:
    if p_value < threshold:
        print(message)
        break
else:
    print("Lack of certainty")


Animal Products vs Obesity.

In [None]:
# Pearson correlation between fat supplied by animal products and obesity, with its p value.
pearson_coef, p_value = stats.pearsonr(df_fat['Animal Products'], df_fat['Obesity'])
print("Pearson's correlation coefficient: ", pearson_coef, " p value: ", p_value)

# Translate the p value into a qualitative level of certainty (strictest threshold first).
if p_value < 0.001:
    print("High certainty")
elif p_value < 0.05:
    print("Moderate certainty")
elif p_value < 0.1:
    print("Low certainty")
else:
    print("Lack of certainty")


To analyze several coefficients at once, it is better to construct a correlation matrix. We do this with the corr() method.

In [None]:
# Pairwise correlation matrix of the selected columns (corr() uses Pearson's coefficient by default).
df_fat[['Animal Products','Meat','Cereals - Excluding Beer','Sugar & Sweeteners','Obesity']].corr()

We can import the scatter_matrix function from pandas.plotting to view the same relationships graphically. The relationship between each pair of variables is shown as a scatter plot, while the diagonal, where a variable meets itself, shows a frequency histogram of that variable. This allows us to analyze the shape of each distribution.

In [None]:
from pandas.plotting import scatter_matrix

In [None]:
# Draw the scatter matrix for the same five columns used in the correlation matrix; figsize enlarges the grid of plots.
scatter_matrix(df_fat[['Animal Products','Meat','Cereals - Excluding Beer','Sugar & Sweeteners','Obesity']], figsize = (17,14))
plt.show()

We therefore see the distributions of each variable in a single graph and we can quickly detect if any pattern or trend is observed when we cross their values using the scatter plot. The most interesting correlations, whether we look at the correlation matrix or the scatter matrix, are between the variables: Obesity with Cereals and Meat with Animal Products.

## Regression Plot

To obtain the regression plot quickly and easily, we use the Seaborn library. Let's make the regression plot of the most strongly correlated variables among those identified in the previous analysis.

In [None]:
import seaborn as sns
%matplotlib inline

Animal Products vs. Meat.

In [None]:
# Regression plot with Animal Products on the x-axis and Meat on the y-axis.
sns.regplot(x = "Animal Products", y = "Meat", data = df_fat)

Cereals vs. Obesity.

In [None]:
# Regression plot of Obesity (y) against fat supplied by cereals (x).
sns.regplot(
    data = df_fat,
    x = "Cereals - Excluding Beer",
    y = "Obesity",
    color = 'mediumseagreen',
)
# Start the y-axis at zero; the upper limit is left unchanged.
plt.ylim(bottom = 0)

Code to increase the size of the regression plot.

In [None]:
# Create a larger figure first; the regression plot is then drawn on its axes.
plt.figure(figsize = (12, 8))
ax = sns.regplot(
    data = df_fat,
    x = "Cereals - Excluding Beer",
    y = "Obesity",
    color = 'mediumseagreen',
)
# Start the y-axis at zero; the upper limit is left unchanged.
ax.set_ylim(bottom = 0)

### Practice Two:
Build:

* Correlation matrix with the variables Eggs, "Fish, Seafood" and Obesity.
* Regression plot with the variables Eggs and Obesity.
* Regression plot with the variables Fish, Seafood and Obesity.
* Which of the two sources of fat is more correlated with Obesity?

Remember, obesity is the dependent variable.

In [None]:
# Type your code here


We load a new dataset for the following analyses. The new dataset covers the suicides that occurred worldwide from 1987 to 2014.

In [None]:
# Read the suicide CSV (relative path inside the data folder) into the dataframe df_suicide.
df_suicide = pd.read_csv('data/suicide.csv')

In [None]:
# Preview the first rows of the new dataframe.
df_suicide.head()

<h3>Analysis of Variance (ANOVA)</h3>

<p>Analysis of variance, as we have seen, is a statistical method that can be used to verify whether there are significant differences between the means of two or more groups. Analysis of variance returns two values: </p>

<p><b>F statistic</b>: ANOVA assumes that the means of all groups are the same until proven otherwise. Then calculate how much the real means deviate from the assumption. This is indicated by the F statistic. The larger the value of F, the greater the difference between the means.</p>

<p><b>p value</b>: p value tells us how statistically significant the F value is.</p>

<p>We will do an analysis of variance with the variables generation and number of suicides. The objective of the test is to determine if there is any correlation between generational groups and the number of suicides.</p>

<p>If our generation variable is strongly correlated with the number of suicides, the ANOVA test will return a high score in the F statistic and a small p value.</p>

In [None]:
# Keep only the two columns needed for the analysis of variance: generation and suicides_no.
df_suic_group = df_suicide[['generation','suicides_no']]
df_suic_group.head()

In [None]:
# A brief summary of the generations: number of non-missing suicides_no records in each one.
df_suic_group.groupby(['generation'], as_index = False).count()

First we group by the categorical variable generation.

In [None]:
# Group the records by generation.
# The key is passed as a plain column name rather than a one-element list, so each group
# is identified by its name alone, e.g. get_group('Millenials'); with a list key,
# recent pandas versions expect a one-element tuple such as ('Millenials',) instead.
group_df_suicide = df_suicide[['generation', 'suicides_no']].groupby('generation')

# Show the first two rows of every group.
group_df_suicide.head(2)

We can get the values of the groups using the get_group method.
For instance, to get the values of the Millenials group (spelled as in the dataset):

In [None]:
# suicides_no values of the rows that belong to the 'Millenials' group (spelled as in the data).
group_df_suicide.get_group('Millenials')['suicides_no']

Now that we know how to create the groups and obtain their values, we are going to use the f_oneway method of the stats library to perform our analysis of variance. Let's do it for all generations.


In [None]:
# One-way ANOVA over every generation present in the data.
# Iterating a GroupBy object yields (group name, sub-frame) pairs, so the suicides_no
# values of each generation are collected without hard-coding the group names
# (which could be misspelled or leave a group out).
suicides_by_generation = [frame['suicides_no'] for generation, frame in group_df_suicide]
f_val, p_val = stats.f_oneway(*suicides_by_generation)

print( "ANOVA Result: F=", f_val, ", P =", p_val)


A very interesting result. It turns out that generational groups are strongly correlated with the incidence of suicide worldwide in the years of the study. But will it be all or just some of them?


### Practice Three:

Identify among all generational groups:
* Which of them is more correlated with the incidence of suicides?
* Which of them is least correlated?
* What is the significance for these?


In [None]:
# Type your code here
