# Statistics Introduction Applied to Data Science
## Lab : Six
## Exploratory Data Analysis - Part One

### Import the necessary libraries

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Plotting libraries: matplotlib and seaborn.
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# For the boxplot
!pip install seaborn

We load the operations dataset.

In [None]:
# Read the operations CSV (path relative to the notebook) into a DataFrame.
df_operations = pd.read_csv(filepath_or_buffer='data/Operations.csv')

In [None]:
# Preview the first five rows (five is the default for head()).
df_operations.head(n=5)

## `describe()` method
By default, it summarizes only the quantitative (numeric) variables.

In [None]:
# include=None is the default: only the numeric columns are summarized.
df_operations.describe(include=None)

If we want to show only the qualitative variables, we use the `include` parameter.

In [None]:
# Summarize only the qualitative (object-dtype, i.e. string) columns.
# The builtin `object` replaces `np.object`, an alias that was deprecated in
# NumPy 1.20 and removed in NumPy 1.24, where using it raises AttributeError.
df_operations.describe(include=[object])

If we want to see both the numeric and the categorical variables, we pass `include='all'`.

In [None]:
# Summarize every column, numeric and categorical alike.
df_operations.describe(include="all")

## `value_counts()` method
It lets us summarize a categorical variable by counting how often each category occurs.

In [None]:
# Count the rows per customer type and turn the resulting Series into a
# one-column DataFrame so that it renders as a table.
df_cus_type_counts = df_operations["Customer Type"].value_counts().to_frame()
df_cus_type_counts

To improve readability, we rename the count column to `Count` and name the index `Customer Type`.

In [None]:
# Label the single count column 'Count' by position rather than by its old name.
# value_counts().to_frame() names that column after the source column in
# pandas < 2.0 but 'count' in pandas >= 2.0, so a name-based rename of
# 'Customer Type' silently does nothing on newer versions.
df_cus_type_counts.columns = ['Count']
# The index holds the categories, so name it after the variable they come from.
df_cus_type_counts.index.name = 'Customer Type'
df_cus_type_counts

## `groupby()` method

In [None]:
# Keep the two grouping keys and the value to average, then compute the mean
# of the remaining column for every (customer type, payment type) pair.
# as_index=False keeps the keys as ordinary columns instead of an index.
df_test = df_operations.loc[:, ['Customer Type', 'Payment Type', 'Sales']]
df_test.groupby(by=['Customer Type', 'Payment Type'], as_index=False).mean()

## `pivot()` method
We use it to reshape the grouped data into a grid, with one row per customer type and one column per payment type, which is easier to read.

In [None]:
# Store the grouped means, then spread the payment types across the columns
# so that each customer type occupies a single row.
df_grp = df_test.groupby(by=['Customer Type', 'Payment Type'], as_index=False).mean()
df_grp.pivot(columns='Payment Type', index='Customer Type')

## Heatmap
Notice that the heatmap is built from the pivot of the grouped data.

In [None]:
# Pivot the grouped means into a customer-type by payment-type grid.
df_pivot = df_grp.pivot(columns='Payment Type', index='Customer Type')

# Draw the grid as colored cells and add a color scale next to it.
plt.pcolor(df_pivot, cmap='RdBu')
plt.colorbar()
plt.show()

We customize the heatmap for a better presentation.

In [None]:
fig, ax = plt.subplots()
im = ax.pcolor(df_pivot, cmap='RdBu')

# Tick labels, named after the pivot axis they come from. The pivot's columns
# are a two-level index (value column, payment type), so level 1 holds the
# payment types. get_level_values() yields exactly one label per plotted
# column, in column order, whereas `.levels[1]` lists the level's unique values,
# which is not guaranteed to line up one-to-one with the columns.
col_labels = df_pivot.columns.get_level_values(1)
row_labels = df_pivot.index

# Move ticks to the center of each cell (cell edges sit on integers).
ax.set_xticks(np.arange(df_pivot.shape[1]) + 0.5, minor=False)
ax.set_yticks(np.arange(df_pivot.shape[0]) + 0.5, minor=False)

# Insert labels: pivot columns run along x, pivot rows along y. The x labels
# are rotated so that long names do not overlap.
ax.set_xticklabels(col_labels, rotation=90, minor=False)
ax.set_yticklabels(row_labels, minor=False)

fig.colorbar(im)
plt.show()

## Box Plots
Using the built-in plotting of pandas (which draws with matplotlib).

In [None]:
# Isolate the sales column as a Series.
df_oper_sales = df_operations['Sales']

# A tall, narrow figure suits a single box.
df_oper_sales.plot.box(figsize=(3, 7))
plt.title('Box Plot')
plt.show()

Using the seaborn library, we draw a box plot of sales by continent.

In [None]:
# Draw one box of "Sales" for each "Continent" category.
sns.boxplot(x="Continent", y="Sales", data=df_operations)
# Render the figure explicitly, like the other plotting cells, so the cell
# does not also echo the returned Axes object as text.
plt.show()

### Practice One
Use the value_counts() method to summarize the "Payment Type" categorical variable.

In [None]:
# Type your code here


### Practice Two
Build a Heatmap to see how "Sales" are related to "Customer Type" and "Country".

In [None]:
# Type your code here


### Practice Three
Use the seaborn library to build multiple box plots that compare the distribution of "Sales" by "Payment Type".

In [None]:
# Type your code here
