# ISYS2407 Information Systems Solutions & Design

# Exploratory Data Analysis

In [None]:
!pip install joblib
!pip install seaborn
!pip install scikit-learn 
!pip install imblearn

# 1. Import libraries

In [None]:
# Pandas is the main library used for exploratory data analysis
# Pandas is built on top of numpy 
# Numpy allows fast advanced mathematical operations on large amount of data
import numpy as np
import pandas as pd

# Library to assign missing values - not used here, better do it in EDA
from sklearn.impute import SimpleImputer

# Matplotlib is the main plotting library for python
# Seaborn is built on top of matplotlib and it's easier to use
# pandas also have same basic plotting capabilities which are again built on top of matplotlib
import matplotlib.pyplot as plt
# The next statement instructs matplotlib to plot the graphs inside the notebook
%matplotlib inline 
import seaborn as sns

# Another useful plotting library is plotly - it is used for interactive plots

# 2. Load the data

In [None]:
# Load the data - assuming it's stored on your computer
# diabetes_df = pd.read_csv("diabetes.csv") # the suffix _df stands for dataframe
#diabetes_df = pd.read_csv("data/diabetes.csv") # if the CSV file is in folder data

# If you suspect missing values are coded in non-standard ways
#missing_values = ['?', '--', ' ', 'NA', 'N/A', '-'] # you can add more in the list
#diabetes_df = pd.read_csv("diabetes.csv", na_values=missing_values)

# If delimiter is not a comma
#diabetes_df = pd.read_csv("diabetes.csv", delimiter=';')

In [None]:
# Load the data from the web
# The CSV file is read without a header row (header=None), so the column
# names are supplied explicitly, in the order of the columns in the file
url = 'https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.csv'
col_names = [
    'num_pregnancies', 'glucose', 'blood_pressure',
    'skin_thickness', 'insulin', 'bmi',
    'pedigree', 'age', 'outcome',
]
diabetes_df = pd.read_csv(url, names=col_names, header=None)


In [None]:
# What is the data type of the object returned by read_csv()?
# In a notebook, a bare `type(diabetes_df)` as the last line of the cell
# would display the same information without print()
print(type(diabetes_df))

In [None]:
# Print a concise summary: column names, non-null counts, dtypes and memory usage
diabetes_df.info()

# 3. Data overview

In [None]:
# Print the number of rows and columns - shape is a (rows, columns) tuple
# The messages are built with python3 f-strings, which are a lot easier to use:
#   f"..." (single or double quotes) may contain any mix of text and variables,
#   and every variable/expression inside curly brackets {} is replaced by its value
# The f-string documentation can be found at https://www.python.org/dev/peps/pep-0498/
n_rows, n_cols = diabetes_df.shape
print(f"Rows     : {n_rows}")
print(f"Columns  : {n_cols}")
print()

# Print the column names
column_names = diabetes_df.columns.tolist()
print(f"Features : {column_names}")
print()

# Print the total number of null values in the data
# Null values are NaN (not a number) or blank cells
# 0 is not counted as a null value here - pandas treats it as a valid value
# NOTE(review): in this dataset zeros in columns such as glucose or blood_pressure
# may actually stand for missing measurements - confirm before modelling
total_missing = diabetes_df.isnull().sum().sum()
print(f"Missing values :  {total_missing}")

# For each column, print the number of unique values
# 2 unique values in a column - binary categorical variable e.g. male/female, yes/no, etc
# fewer than 6 values - still a categorical variable, but with more than 2 classes
# large number of values - more likely to be a continuous variable
unique_counts = diabetes_df.nunique()
print(f"Unique values :  {unique_counts}")
# NOTE(review): the original note here read "none:= 1, yes:=0"; its meaning is unclear - confirm or remove


In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of every numeric column,
# transposed so that each variable is shown on its own row
diabetes_df.describe().T # T means transpose rows to columns

# 4. Correlations

How strong is the relationship between two variables?

a. Using the dataframe's corr()  function

In [None]:
# 1. Compute the pairwise correlations with the dataframe's corr() method and display them as text
# corr() returns a new dataframe with one row and one column per variable
corr = diabetes_df.corr()
for item in (type(corr), corr):
    print(item)

b. Using Seaborn's pairplot() function

In [None]:
# SLOW!
# Takes a few minutes to plot (for the diabetes dataset) and might give warnings if there are problems with the data
# Dead slow for large datasets - you might want to skip it and use alternative methods

# 2. Using seaborn's pairplot() to plot the correlations as scatter plots and regression lines
# The seaborn pairplot documentation can be found
# at https://seaborn.pydata.org/generated/seaborn.pairplot.html
sns.set(style="ticks")
# pairplot() creates its own figure, so a preceding plt.figure(figsize=...) has no effect on it;
# use pairplot's height/aspect arguments to customise the size if required
pair_grid = sns.pairplot(data=diabetes_df, kind="reg") # kind="reg" means plot the regression line as well

# If you want to save the graph to disk
save_options = {"dpi": 300} # dpi=300 is print quality
plt.tight_layout() # automatically adjusts subplot to fit in the figure area
plt.savefig("correlation-pairplot.png", **save_options)

# The histograms on the diagonal are the frequency distributions of the values in each column
# The order is: 'num_pregnancies', 'glucose', 'blood_pressure', 'skin_thickness',
# 'insulin', 'bmi', 'pedigree', 'age', 'outcome'

# The other cells show the relationship between pairs of variables
# Example: in the first row, the y axis is num_pregnancies
# The variables on the x-axis can be seen at the bottom of the graph
# e.g. col2=glucose, col3=blood_pressure, etc
# The strength of the correlation is shown by how closely the points follow the regression line;
# the slope only shows the direction of the relationship and depends on the units of the variables

c. Using Seaborn's heatmap() function

In [None]:
# 3. Using seaborn to plot the correlations as a heatmap
# Ref: https://seaborn.pydata.org/generated/seaborn.heatmap.html
# figsize is (width, height) in inches, default (6.4, 4.8) - customise as required
# NOTE: rcParams is a global setting, so this also becomes the default size
# of every figure created later in the notebook
plt.rcParams["figure.figsize"] = (20, 15)

corr_matrix = diabetes_df.corr()
sns.heatmap(
    corr_matrix,
    annot=True,      # write the correlation value in each cell
    linewidths=0.5,
    cmap="YlGnBu",   # YellowGreenBlue colour map
)
plt.title("Correlation between features", fontsize=30)
#plt.show() # Do not use this if you want to save to file

# If you want to save the graph to disk
plt.tight_layout() # automatically adjusts subplot to fit in the figure area
plt.savefig("correlation-heatmap-rectangular.png", dpi=300) # dpi=300 is print quality

# The colours show the correlation values - from yellow (lowest) to dark blue (highest)

# Cells on the diagonal are dark blue (corr=1) - correlation of the variable with itself

Because the correlation matrix is symmetric, the rectangular heatmap above shows every value twice (mirrored across the diagonal).
It is therefore recommended to display only one triangular half, i.e. the non-duplicated part of the matrix.

In [None]:
# Changing the shape of the matrix from rectangular to triangular
# https://heartbeat.fritz.ai/seaborn-heatmaps-13-ways-to-customize-correlation-matrix-visualizations-f1c49c816f07
corr_matrix = diabetes_df.corr()
# np.triu keeps the upper triangle (including the diagonal) and zeroes the rest;
# used as a mask, its non-zero cells are hidden, so only the lower triangle is drawn
upper_triangle = np.triu(corr_matrix)
sns.heatmap(corr_matrix, mask=upper_triangle, annot=True, cmap="YlGnBu")

# If you want to save the graph to disk
plt.tight_layout() # automatically adjusts subplot to fit in the figure area
# NOTE(review): the file name spells "traingular"; kept unchanged in case other material refers to it
plt.savefig("correlation-heatmap-traingular.png", dpi=300) # dpi=300 is print quality

# 5. Visualizations

b. What is the percentage of diabetics in the sample?

A pie chart is a good fit here because the data we are going to represent (the `outcome` column) contains only the values 0 and 1.

Keep in mind that if one class is much larger than the other, the dataset is imbalanced (skewed towards that class).

In [None]:
# Plot a pie chart of the proportion of each class in the "outcome" column
# Count the rows per class and sort by class value (0 first, then 1).
# value_counts() on its own sorts by frequency, so a hard-coded label list would be
# attached to the wrong slices whenever class 1 is the more frequent one.
outcome_names = {0: "Healthy", 1: "Diabetic"} # raw labels [0,1] are not meaningful
outcome_counts = diabetes_df["outcome"].value_counts().sort_index()

# Derive the labels from the counted classes so labels and values always line up
labels = [outcome_names.get(code, str(code)) for code in outcome_counts.index]
print(f"labels: {labels}")

# Get the number of rows in each class
values = outcome_counts.tolist()
print(f"values: {values}")

# Use matplotlib to draw a simple pie chart
# Ref: https://matplotlib.org/3.1.1/api/_as_gen/matplotlib.pyplot.pie.html
plt.figure(figsize=(6,6)) #figsize(width, height) in inches - default(6.4, 4.8) - customise as required
plt.pie(values, labels=labels, autopct='%1.1f%%', shadow=True)
plt.title("Proportion of diabetics in sample", fontsize=20) # use fontsize=20 if title is too small

# If the fonts are too small with the default settings, you can adjust them as required
#plt.rc('font', size=10)        # adjust default text sizes (if required)
#plt.rc('axes', labelsize=10)  # adjust fontsize of labels on x/y axes (if required)
#plt.rc('axes', titlesize=10)  # adjust fontsize of titles on x/y axes (if required)
#plt.rc('xtick', labelsize=10) # adjust fontsize of tick labels on x axis (if required)
#plt.rc('ytick', labelsize=10) # adjust fontsize of tick labels on y axis (if required)
#plt.rc('legend', fontsize=10) # adjust fontsize of legend (if required)
#plt.rc('figure', titlesize=30) # adjust fontsize of the figure title (if required)

#plt.show() # Do not use this if you want to save to file - the file will be empty

# Save the graph to disk
plt.tight_layout() # automatically adjusts subplot to fit in the figure area
plt.savefig('proportion-diabetics.png', dpi=300)

c. Split the data into non-diabetics and diabetics

In [None]:
# Non-diabetics: keep only the rows whose outcome is 0
is_non_diabetic = diabetes_df["outcome"].eq(0)
non_diabetics_df = diabetes_df[is_non_diabetic]
non_diabetics_df.head()

In [None]:
# Diabetics: keep only the rows whose outcome is 1
is_diabetic = diabetes_df["outcome"].eq(1)
diabetics_df = diabetes_df[is_diabetic]
diabetics_df.head()

# 5.1. Visualisation of continuous variables

a. Impact of glucose on diabetes

In [None]:
# Plot the distributions of glucose for non-diabetics and diabetics on the same axes
# Ref: https://seaborn.pydata.org/generated/seaborn.histplot.html

plt.figure(figsize=(12,8)) # figsize(width, height) in inches - default(6.4, 4.8)

# Alternative: histogram with absolute numbers on the y-axis (the default, stat="count")
#sns.histplot(data=non_diabetics_df, x="glucose", color="skyblue", label="Non-Diabetics")
#sns.histplot(data=diabetics_df, x="glucose", color="red", label="Diabetics")
# (if you use it, change the y-axis label below to "Count")

# Histogram with density on the y-axis
# stat="density" normalises each group separately so that its bars have a total area of 1;
# this matches the "Density" axis label and makes groups of different sizes comparable.
# kde=True only overlays a smoothed curve - on its own it leaves the y-axis as a count.
for group_df, colour, group_label in (
    (non_diabetics_df, "skyblue", "Non-Diabetics"),
    (diabetics_df, "red", "Diabetics"),
):
    sns.histplot(data=group_df, x="glucose", color=colour, label=group_label,
                 kde=True, stat="density")

plt.legend()
plt.title('Impact of glucose levels on diabetes', fontsize = 30)
# NOTE(review): the unit is shown as "mg"; glucose concentration is usually reported in mg/dL - confirm
plt.xlabel("Glucose levels (mg)") # **be sure to add units!!
plt.ylabel("Density")

# If you want to save the graph to disk
plt.tight_layout() # automatically adjusts subplot to fit in the figure area
plt.savefig('glucose-diabetes.png', dpi=300) # dpi=300 is print quality

b. Impact of BMI on diabetes

In [None]:
# Plot the distributions of BMI for non-diabetics and diabetics on the same axes
plt.figure(figsize=(12,8)) # figsize(width, height) in inches - default(6.4, 4.8)

# stat="density" normalises each group separately so that its bars have a total area of 1;
# this matches the "Density" axis label and makes groups of different sizes comparable.
# kde=True only overlays a smoothed curve - on its own it leaves the y-axis as a count.
for group_df, colour, group_label in (
    (non_diabetics_df, "skyblue", "Non-Diabetics"),
    (diabetics_df, "red", "Diabetics"),
):
    sns.histplot(data=group_df, x="bmi", color=colour, label=group_label,
                 kde=True, stat="density")

plt.title('Impact of BMI on diabetes', fontsize = 30)
plt.xlabel("BMI")
plt.ylabel("Density")
plt.legend()

# If you want to save the graph to disk
plt.tight_layout() # automatically adjusts subplot to fit in the figure area
plt.savefig('bmi-diabetes.png', dpi=300) # dpi=300 is print quality

c. Impact of age on diabetes


In [None]:
# Plot the distributions of age for non-diabetics and diabetics on the same axes
plt.figure(figsize=(12,8)) # figsize(width, height) in inches - default(6.4, 4.8)

# stat="density" normalises each group separately so that its bars have a total area of 1;
# this matches the "Density" axis label and makes groups of different sizes comparable.
# kde=True only overlays a smoothed curve - on its own it leaves the y-axis as a count.
for group_df, colour, group_label in (
    (non_diabetics_df, "skyblue", "Non-Diabetics"),
    (diabetics_df, "red", "Diabetics"),
):
    sns.histplot(data=group_df, x="age", color=colour, label=group_label,
                 kde=True, stat="density")

plt.title('Impact of age on diabetes', fontsize = 30)
plt.xlabel("Age (years)")
plt.ylabel("Density")
plt.legend()

# If you want to save the graph to disk
plt.tight_layout() # automatically adjusts subplot to fit in the figure area
plt.savefig('age-diabetes.png', dpi=300) # dpi=300 is print quality

# 5.2. Visualisation of categorical variables

In [None]:
# When checking the number of unique values, the blood pressure column was found to contain 47 values
# This is evidence that it's a continuous variable
# Classification algorithms do not work well with continuous values - too many values
# So we need to categorise them. How? Need some domain knowledge
# less than 80 = normal, between 80-89 = high, greater than 89 = very high
# Define a function to classify this
# It receives ONE ROW of the dataframe (apply(..., axis=1) calls it once per row)
# And returns a category
def bp_category(df):
    """Return the blood-pressure category of a single row.

    Parameters
    ----------
    df : mapping-like (e.g. one dataframe row)
        Must provide a numeric value under the key "blood_pressure".

    Returns
    -------
    str or None
        "bp_normal" for readings below 80, "bp_high" for readings from 80
        up to 89, "bp_very_high" for readings above 89, and None when the
        reading cannot be compared (e.g. NaN).

    NOTE(review): a reading of 0 is classified as "bp_normal"; if zeros stand
    for missing measurements in this dataset, handle them before calling this.
    """
    reading = df["blood_pressure"]
    if reading < 80:
        return "bp_normal"
    if reading <= 89:  # 80 itself belongs to "high", as stated in the thresholds above
        return "bp_high"
    if reading > 89:
        return "bp_very_high"
    return None  # every comparison is False for NaN

# Use apply() to call the previously defined function bp_category() for every row of the dataframe
# With axis=1 the dataframe is processed row-wise: each row is passed to bp_category(),
# and the category it returns is saved in a new column named "blood_pressure_category"
# The function can be passed to apply() directly - no lambda wrapper is needed
# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.apply.html
diabetes_df["blood_pressure_category"] = diabetes_df.apply(bp_category, axis=1)
# Check
diabetes_df.head()
# This works as there's a new column named "blood_pressure_category"
# The column "blood_pressure" is no longer needed and hence can be deleted

# This example has shown how to add a column using apply() and a function
# There are other ways to add columns to a dataframe
# You can google this
# e.g https://pandas.pydata.org/docs/getting_started/intro_tutorials/05_add_columns.html

In [None]:
# Split the data again into non-diabetics and diabetics,
# so that both subsets contain the new "blood_pressure_category" column
non_diabetics_df = diabetes_df.loc[diabetes_df["outcome"].eq(0)]
diabetics_df = diabetes_df.loc[diabetes_df["outcome"].eq(1)]
# Do not confuse "diabetics_df" (the diabetic patients only) with "diabetes_df" (the whole dataset)!

a. Plotting of categorical data on separate charts

In [None]:
# Plot the charts using Seaborn's countplot()

# Fix the order of the categories on the x-axis.
# Without `order`, each plot orders the categories by first appearance in its own subset,
# so the two panels could show the bars in different orders.
bp_order = ["bp_normal", "bp_high", "bp_very_high"]

# As 2 separate plots side by side
fig, ax = plt.subplots(1,2)
ax1 = sns.countplot(x="blood_pressure_category", data=non_diabetics_df,
                    order=bp_order, ax=ax[0]) # Non-diabetics dataset
ax2 = sns.countplot(x="blood_pressure_category", data=diabetics_df,
                    order=bp_order, ax=ax[1]) # Diabetics dataset

# With the default font sizes the calls would simply be:
#ax1.set_title("Healthy")
#ax1.set_xlabel("Blood Pressure")
#ax2.set_title("Diabetics")
#ax2.set_xlabel("Blood Pressure")
# If the default labels are too small - adjust the font sizes to make them bigger
ax1.set_title("Healthy", fontsize=25)
ax1.set_xlabel("Blood Pressure", fontsize=25)
ax2.set_title("Diabetics", fontsize=25)
ax2.set_xlabel("Blood Pressure", fontsize=25)
ax1.tick_params(axis='both', which='both', labelsize=30) # x/y axes, minor/major ticks for ax1
ax2.tick_params(axis='both', which='both', labelsize=30) # x/y axes, minor/major ticks for ax2


# If you want to save the graph to disk
plt.tight_layout() # automatically adjusts subplot to fit in the figure area
plt.savefig('blood_pressure_category-side-by-side.png', dpi=300) # dpi=300 is print quality

b. Plotting of categorical data on a single chart


In [None]:
# Plot the charts using Seaborn's countplot()

# Fix the order of the categories on the x-axis (normal -> high -> very high).
# Without `order`, the categories are shown in order of first appearance in the data.
bp_order = ["bp_normal", "bp_high", "bp_very_high"]

# As a single plot
sns.countplot(x='blood_pressure_category',
              data=diabetes_df, # Whole dataset (non-diabetics + diabetics)
              order=bp_order,
              hue='outcome') # grouped by outcome (0 = non-diabetic, 1 = diabetic)

# With the default font sizes the calls would simply be:
#plt.title('Blood Pressure - Healthy vs Diabetics')
#plt.xlabel("Blood Pressure") # Change the xlabel - "blood_pressure_category" not user friendly
#plt.ylabel("Count")
# If the default labels are too small - adjust the font sizes to make them bigger
plt.title('Blood Pressure - Healthy vs Diabetics', fontsize=30)
plt.xlabel("Blood Pressure", fontsize=30)
plt.ylabel("Count", fontsize=30)
plt.tick_params(axis='both', which='both', labelsize=20) # x/y axes, minor/major ticks for plt

# If you want to save the graph to disk
plt.tight_layout() # automatically adjusts subplot to fit in the figure area
plt.savefig('blood_pressure_category-single.png', dpi=300) # dpi=300 is print quality

# 6. Univariate and bivariate visualisations

a. Example of univariate visualisation

In [None]:
# A quick example of plotting the distribution of one continuous variable (age) as a histogram.
# histplot() has been used before, so this is a barebones example:
# customise the plot (title, axis labels, font sizes) for a more decent appearance
# if you want to include it in a management report

hist_options = {"color": "green", "kde": True} # kde=True overlays a smoothed curve
sns.histplot(x="age", data=diabetics_df, **hist_options)

# If you want to save the graph to disk
plt.tight_layout() # automatically adjusts subplot to fit in the figure area
plt.savefig("age-histogram.png", dpi=300) # dpi=300 is print quality

b. Example of bivariate visualisation


In [None]:
# Regression plot: scatter plot of two variables plus the fitted regression line
# Ref: https://seaborn.pydata.org/generated/seaborn.regplot.html

sns.regplot(
    data=diabetes_df,
    x="age",
    y="num_pregnancies",
    scatter_kws=dict(color="green"), # colour of the points
    line_kws=dict(color="red"),      # colour of the regression line
)

label_size = 30
plt.title("Pregnancies by age", fontsize=label_size)
plt.xlabel("Age", fontsize=label_size)
plt.ylabel("Number of pregnancies", fontsize=label_size)
plt.tick_params(axis="both", which="both", labelsize=25) # x/y axes, minor/major ticks for plt

# If you want to save the graph to disk
plt.tight_layout() # automatically adjusts subplot to fit in the figure area
plt.savefig("scatterplot-age-pregnancies.png", dpi=300) # dpi=300 is print quality