**Setup** <br>
Uncomment and run the lines below if you have not installed these packages yet:

In [None]:
## %pip (rather than !pip) installs into the environment of the running kernel
#%pip install pandas
#%pip install matplotlib
#%pip install seaborn
#%pip install -U pandas-profiling

In [None]:
import os

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

# NOTE(review): pandas-profiling appears to have been renamed to ydata-profiling
# upstream - consider migrating once the environment allows it.
from pandas_profiling import ProfileReport

# Exercise 3: Data exploration

In [None]:
# Data source (download 'diabetes.csv' manually):
# https://www.kaggle.com/datasets/uciml/pima-indians-diabetes-database

# Read the CSV (path is relative to the current working directory)
# and preview the first five rows
diabetes = pd.read_csv(filepath_or_buffer='diabetes.csv')
diabetes.head(n=5)

In [None]:
# Create report
# Build a profiling report for the diabetes DataFrame; `title` is the report's heading.
profile = ProfileReport(diabetes, title="Pandas Profiling Report")
# Display the report inline in the notebook
profile.to_notebook_iframe()

<br>
<br>

# Exercise 4: Visualization

## 1) Heatmap
**Diverging colormaps**: RdBu, BrBG, PuOr, coolwarm, ...

In [None]:
# Pairwise Pearson correlation between all columns of the diabetes data
# ('pearson' is the pandas default, spelled out here for clarity)
correlation_matrix = diabetes.corr(method='pearson')
correlation_matrix

In [None]:
# Plot heatmap


In [None]:
# Fetch seaborn's example 'flights' dataset and preview the first five rows
flights = sns.load_dataset(name='flights')
flights.head(n=5)

**Visualize #passengers per month and year**

In [None]:
# "Pivot" the DataFrame such that we get #passengers for each month & year
flights_2D = flights.pivot(
    index='month',        # one row per month
    columns='year',       # one column per year
    values='passengers',  # cell value: number of passengers
)
flights_2D

In [None]:
# Plot a heatmap


In [None]:
# Plot heatmap with a suitable colormap
# TODO (exercise): draw the heatmap here


# Rotate labels
# TODO (exercise): rotate the tick labels here



plt.show()

**More colormaps:**
* _Sequential colormaps:_ Blues, Greens, Greys, ...
* _Perceptually uniform sequential colormaps:_ viridis, cividis, magma, plasma, inferno

Further information: https://matplotlib.org/stable/users/explain/colors/colormaps.html

<br>

## 2) Lineplot

**Trend of #passengers over the years?**

In [None]:
# Distinct values of the 'month' column; the next cell loops over them
months = flights['month'].unique()
months

In [None]:
# Draw one line per month: number of passengers (y) over the years (x)
for month in months:
    # Keep only the rows that belong to the current month
    flights_current_month = flights[flights['month'] == month]

    # Line plot for current month (x=year, y=passengers).
    # The label is what plt.legend() picks up; without labelled lines the
    # legend call below only produces a warning and an empty figure.
    plt.plot(flights_current_month['year'],
             flights_current_month['passengers'],
             label=month)

plt.xlabel('year')
plt.ylabel('passengers')
plt.legend()

**Trend of average #passengers over the years?**

In [None]:
# Average number of passengers for each year (one value per year)
mean_passengers = (
    flights
    .groupby('year')['passengers']
    .mean()
)
mean_passengers

In [None]:
# Spread of the passenger numbers within each year (standard deviation)
std_passengers = (
    flights
    .groupby('year')['passengers']
    .std()
)
std_passengers

In [None]:
# Line plot with errorbars (mean +/- standard deviation)


In [None]:
# Line plot with error bands (mean +/- standard deviation)
# TODO (exercise): draw the mean line and the shaded band here


# Axis labels for the current axes
plt.xlabel('year')
plt.ylabel('passengers')

**Using seaborn:**

In [None]:
# plot line for each month separately, or with error bars/bands


<br>

## 3) Histogram & boxplot (age distribution)

In [None]:
# Data source (download the CSV manually):
# https://www.kaggle.com/datasets/parulpandey/2020-it-salary-survey-for-eu-region

# Read the survey (path is relative to the current working directory; note the
# double space in the file name) and preview the first five rows
df = pd.read_csv(filepath_or_buffer='IT Salary Survey EU  2020.csv')
df.head(n=5)

In [None]:
# Keep only the rows that have a value in column 'Age'
df = df.dropna(subset=['Age'])

In [None]:
# Create suitable plot for visualizing the age distribution


In [None]:
# Plot: histogram of the age distribution
ax = sns.histplot(data=df, x='Age')

# Compute average age
# (must be assigned before it is printed and drawn below)
mean_age = df['Age'].mean()
print(mean_age)

# Highlight average age by plotting a vertical line
ax.axvline(mean_age, color='red', linestyle='--',
           label=f'mean age = {mean_age:.1f}')
ax.legend()

In [None]:
# Boxplot of the 'Age' column via the pandas plotting interface
df['Age'].plot(kind='box')

# Seaborn boxplot
# sns.boxplot(df, x='Age')

In [None]:
# Plot histogram & boxplot together using plt.subplots():
# two stacked axes, the upper (boxplot) one gets 20 % of the height,
# the lower (histogram) one the remaining 80 %
fig, (ax_boxplot, ax_hist) = plt.subplots(
    nrows=2,
    gridspec_kw={"height_ratios": (.2, .8)},
)

# Boxplot

# Histogram


# Remove chart borders



# Remove duplicate axis label and ticks
ax_boxplot.set(xlabel='', yticks=[], xticks=[])

In [None]:
# Set seaborn style {darkgrid, whitegrid, dark, white, ticks}
# The style is set globally, so it applies to the plots created after this cell.
sns.set_style('ticks')

<br>

## 4) Box-, violin-, strip-, and swarmplot (salary distribution)

In [None]:
# List all column labels (to look up the exact name of the salary column)
df.columns

In [None]:
# Give the salary column a shorter label that is easier to use in plots
df = df.rename(
    columns={
        'Yearly brutto salary (without bonus and stocks) in EUR':
            'Yearly brutto salary in EUR',
    }
)

In [None]:
# Salary distribution as a vertical boxplot (data passed explicitly by keyword)
sns.boxplot(data=df, y='Yearly brutto salary in EUR')

**Remove outliers (as shown in last week's exercise)**

In [None]:
# Summary statistics of the salary column, including the quartiles (25% / 75%)
df['Yearly brutto salary in EUR'].describe()

In [None]:
# Remove outliers with the boxplot whisker rule (Tukey fences).
# The quartiles are computed from the data instead of being copied by hand
# from the describe() output, so the limits stay correct when the data or an
# earlier filtering step changes.
# NOTE: the cell overwrites `df`; re-running it recomputes the quartiles on
# the already filtered data and may trim further - run it once per kernel.
salary = df['Yearly brutto salary in EUR']
q1, q3 = salary.quantile([0.25, 0.75])
iqr = q3 - q1   # interquartile range (Q3 - Q1)
whis = 1.5      # whisker length in units of the IQR

# Q3 + whis*(Q3 - Q1)
upper_limit = q3 + whis * iqr

# Q1 - whis*(Q3 - Q1)
lower_limit = q1 - whis * iqr

# Apply limits to DataFrame (both bounds inclusive; rows with a missing
# salary do not satisfy the condition and are dropped as well)
df = df[salary.between(lower_limit, upper_limit)]

In [None]:
# Salary boxplot again, now on the filtered data
sns.boxplot(data=df, y='Yearly brutto salary in EUR')

**Is there a difference in salary between genders?**

In [None]:
# Distinct values in column 'Gender' (shows whether missing values are present)
df['Gender'].unique()

In [None]:
# Keep only the rows that have a value in column "Gender"
df = df.dropna(subset=['Gender'])

In [None]:
# Plot actual data points with one color for each gender


In [None]:
# Boxplots of salary by gender
# TODO (exercise): draw the boxplots here


# Plot data points
# TODO (exercise): overlay the individual data points here


# Remove the top and right border of the current axes
sns.despine()

In [None]:
# Violinplots of salary by gender
# TODO (exercise): draw the violinplots here


# Plot data points using swarmplot
# TODO (exercise): overlay the individual data points here


# Remove the top and right border of the current axes
sns.despine()

## 5) Bar plot

**How many data scientists are there?**

In [None]:
# The original column label has a trailing space ('Position '); rename it to 'Position'
df = df.rename(columns={'Position ': 'Position'})

In [None]:
# Number of survey entries per position (value_counts sorts by count, most frequent first)
position_counts = df['Position'].value_counts()
position_counts

In [None]:
# Bar plot with count per Position


In [None]:
# Only consider top 10 positions


In [None]:
# Bar plot
# TODO (exercise): draw the bar plot here


# Rotate tick labels
# (uncomment once the plot exists; right-aligned labels stay attached to their bars)
# plt.xticks(rotation=45, ha='right')
plt.show()

In [None]:
# Sort bars
# sort_values() sorts ascending by default, so the most frequent position comes last
position_counts = position_counts.sort_values()

# Horizontal bar plot (because of long labels)
# TODO (exercise): draw the horizontal bar plot here


plt.show()

In [None]:
# Create custom colors with Data Scientists in blue
# One color per bar, in the same order as position_counts.index, so the list
# can be passed directly as `color=` to the bar plot. Every other position
# stays light grey so that the highlighted bar stands out.
# The comparison ignores case and surrounding whitespace.
# NOTE(review): assumes the position label is 'Data Scientist' - confirm
# against position_counts.index.
custom_colors = [
    'tab:blue' if str(position).strip().lower() == 'data scientist' else 'lightgrey'
    for position in position_counts.index
]

In [None]:
# Horizontal bar plot with custom colors
# (one bar per position, bar length = count, bar color taken from custom_colors)
bars = plt.barh(
    y=position_counts.index,
    width=position_counts.values,
    color=custom_colors,
)

# Add axis labels
plt.ylabel('Position')
plt.xlabel('Count')

# Add count next to each bar


# Remove chart border & y-axis (declutter!)
sns.despine(left=True, bottom=False)
plt.tick_params(left=False)
plt.xlim(0, 400)

plt.show()

## 6) Save plots for editing in Inkscape

Uncomment the next line to install the nutil package (see: https://github.com/anki-xyz/nutil)

In [None]:
## %pip (rather than !pip) installs into the environment of the running kernel
#%pip install git+https://github.com/anki-xyz/nutil

In [None]:
from nutil.plot import paperStyle

In [None]:
# Use "paperStyle" to define style for publication (scientific paper, thesis, ...)



# Plot histogram & boxplot together using plt.subplots():
# boxplot panel on top (20 % of the height), histogram below (80 %)
fig, (ax_boxplot, ax_hist) = plt.subplots(
    nrows=2,
    figsize=(6.5, 2),
    gridspec_kw={"height_ratios": (.2, .8)},
)

sns.boxplot(df, x='Age', ax=ax_boxplot)
sns.histplot(df, x='Age', bins=49, ax=ax_hist)

# Remove chart borders
sns.despine(ax=ax_boxplot, left=True, bottom=True)
sns.despine(ax=ax_hist, trim=True)
# The boxplot panel needs neither its own x-label nor ticks
ax_boxplot.set(xlabel='', yticks=[], xticks=[])

# Save

Save the other two plots:

In [None]:
# Salary by gender (boxplot with the individual data points on top),
# drawn in publication style and saved as SVG.
with paperStyle(font_size=8):
    plt.figure(figsize=(2.5, 3))
    # NOTE(review): newer seaborn releases may warn about `palette` without
    # `hue` - check against the installed version.
    sns.boxplot(df, x='Gender', y='Yearly brutto salary in EUR',
                palette='pastel', width=0.4, fliersize=2)
    sns.stripplot(df, x='Gender', y='Yearly brutto salary in EUR', size=2, hue='Gender', legend=False)
    sns.despine()

    # savefig does not create missing directories; make sure 'files/' exists
    # so that a fresh checkout does not fail with FileNotFoundError
    os.makedirs('files', exist_ok=True)
    plt.savefig('files/salary_by_gender.svg', bbox_inches='tight')

In [None]:
# Count per position as a horizontal bar plot, drawn in publication style
# and saved as SVG.
with paperStyle(font_size=8):
    plt.figure(figsize=(3,3))
    # Horizontal bar plot with custom colors
    bars = plt.barh(y=position_counts.index, width=position_counts.values,
                    color=custom_colors)

    # Add axis labels
    plt.xlabel('Count')
    plt.ylabel('Position')

    # Add count next to each bar
    plt.bar_label(bars, padding=5, fontsize=8)

    # Remove chart border & y-axis (declutter!)
    sns.despine(bottom=False, left=True)
    plt.tick_params(left=False)
    plt.xlim([0, 400])

    # savefig does not create missing directories; make sure 'files/' exists
    # so that a fresh checkout does not fail with FileNotFoundError
    os.makedirs('files', exist_ok=True)
    plt.savefig('files/positions.svg', bbox_inches='tight')