# Setup

In [None]:
import pandas as pd

# Build the example frame in a single chained expression: raw records first,
# then the derived `bmi` column (weight / height^2, rounded to 2 decimals),
# and finally the dtype conversions (`name` -> category, `id` -> string).
df = (
    pd.DataFrame({
        "name": ["Tom", "Lisa", "Peter"],
        "height": [1.68, 1.93, 1.72],
        "weight": [48.4, 89.8, 84.2],
        "id": [1, 2, 3],
        "city": ["Stuttgart", "Stuttgart", "Berlin"],
    })
    .assign(bmi=lambda d: (d["weight"] / (d["height"] * d["height"])).round(2))
    .astype({"name": "category", "id": str})
)

# Numeric Data

## Mean

- We can calculate simple statistics like the *mean*



In [None]:
# arithmetic mean of the height column
df["height"].mean()

In [None]:
# same mean, rounded to two decimals for display
round(df["height"].mean(), 2)

## Formatted string literals

- Print the value in a nice format ([using formatted string literals](https://docs.python.org/3.6/reference/lexical_analysis.html#formatted-string-literals), `f"..."`)



In [None]:
# `:.2f` formats the value as a fixed-point number with two decimals
mean_height = df["height"].mean()
print(f"The mean of height is {mean_height:.2f}")

## Median and Standard Deviation

In [None]:
# median (middle value) of the height column
df["height"].median()

In [None]:
# standard deviation of the height column
df["height"].std()

## Describe

- [describe()](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.describe.html#pandas.DataFrame.describe) shows a quick statistical summary of your numerical data.




In [None]:
# summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
df.describe()

## Describe with transpose


In [None]:
# transpose so each variable is a row and each statistic a column,
# then round every value to two decimals
df.describe().transpose().round(2)

## Describe for specific columns with groupby

- Summary statistics for numeric variables `height` and `bmi` for different levels of the categorical variable `city`:



In [None]:
# Summary statistics of the numeric columns `height` and `bmi`, computed
# separately for each level of `city`. The result is rounded to two decimals
# and transposed so that the cities become the columns.
df[['height', 'bmi', 'city']].groupby('city').describe().round(2).T

# Categorical Data

## Example

- We can also use `describe()` for categorical data



In [None]:
# restrict the summary to columns of dtype `category`, one row per column
df.describe(include="category").transpose()

## Show unique levels


- Show unique levels of a categorical variable and count with `value_counts()`



In [None]:
# number of rows per distinct city
df["city"].value_counts()

## Extract specific values


- We can also extract specific values

In [None]:
# Look the count up by its index label. Bracket lookup also works for labels
# that are not valid Python identifiers (e.g. "New York") or that clash with
# an existing Series attribute, where attribute-style access would not.
df['city'].value_counts()['Stuttgart']

## Formatted string literals


- Print the value in a nice format ([using formatted string literals](https://docs.python.org/3.6/reference/lexical_analysis.html#formatted-string-literals), `f"..."`)



In [None]:
# Look the count up by its index label; bracket lookup is used instead of
# attribute access because it also works for labels that are not valid
# Python identifiers or that clash with an existing Series attribute.
count_stuttgart = df['city'].value_counts()['Stuttgart']

print(f"There are {count_stuttgart} people from Stuttgart in the data")

# Loop over List

## Statistics for specific columns


- Example of a `for` loop to obtain statistics for specific numerical columns



In [None]:
# names of the numerical columns to summarise
list_num = ["height", "weight"]

In [None]:
# Print the median of every column in list_num. The `.4` format spec limits
# the value to four significant digits; the trailing "\n" adds a blank line
# after each entry.
for col in list_num:
    col_median = df[col].median()
    print(f'Median of {col} equals {col_median:.4} \n')

## Summary statistics

- Calculate summary statistics for our list.

In [None]:
# Print the describe() summary, rounded to two decimals, for every column in list_num.
for col in list_num:
    col_summary = df[col].describe().round(2)
    print(f'Column: {col}  \n  {col_summary}   \n')

# Create Plots

## Setup


In [None]:
# Pandas needs the module matplotlib to create plots
import matplotlib.pyplot as plt

# show plot output in Jupyter Notebook
%matplotlib inline

## One boxplot


In [None]:
# boxplot of a single column; the trailing semicolon hides the Axes repr
df.boxplot(column="weight");

In [None]:
# Draw one titled boxplot per column in list_num. plt.show() renders the
# current figure before the next iteration starts drawing.
for col in list_num:
    ax = df.boxplot(column=[col])
    ax.set_title(f"Boxplot for {col}")
    plt.show()