# About this notebook

This notebook was created by Bella Ratmelia (bellar@smu.edu.sg) for SMU Libraries' Python 101 Part 2 bite-sized workshop on 20 September 2022.

In [None]:
# import the necessary packages
import pandas as pd
import seaborn as sns
from scipy import stats

# Section 1: Dataframe 101
- reading from csv
- describe and info
- rename columns
- slicing data
- filtering data, group by
- counting values
- simple plots
- deriving new columns, dropping columns

In [None]:
# Load the survey responses from a CSV file into a DataFrame.
# youth-survey.csv holds responses from youths (15-30 years old) in the Czech Republic.
data = pd.read_csv(filepath_or_buffer="youth-survey.csv")

In [None]:
# preview the dataframe: as the last expression in the cell, it is rendered as the cell's output
data

## Retrieve basic info about the dataframe

In [None]:
# Use the DataFrame.info() method to find out more about a dataframe.


In [None]:
# The DataFrame.columns variable stores information about the dataframe's columns.
# This one doesn't have parentheses because it's not a function, but a 
# variable inside the dataframe object (member variable)



In [None]:
# quickly get the number of rows and columns of the dataframe



In [None]:
# DataFrame.describe() gets the summary statistics of the columns that have numerical data. 
# All other columns are ignored, unless you use the argument include='all'.



## Renaming columns

In [None]:
# Sometimes column names need to be renamed to make it easier for us
# rename columns to be all lowercase with no whitespace (replace whitespace with hyphen)
# rename them to something more meaningful


## Selecting a subset of dataframe ("slicing")

In [None]:
# Selecting a subset ("slicing") 
# get the age of participants


In [None]:
# Describe just a column


In [None]:
# get the height and age of participants


In [None]:
# Describe the two columns


### Try Yourself: Get the loneliness, happiness, and energy-levels columns

## Filtering the data to fit specified criteria

In [None]:
# Filtering: Get all data from participants above 18 years old


In [None]:
# Get data from Female participants above 18 years old


In [None]:
# get participants that stated their age


### Try Yourself: Get data from people whose happiness ratings are >= 3 or loneliness rating <= 3

In [None]:
# Even more granular filtering:
# get the internet-usage information of city-dwelling participants

# we can of course do it in two steps: filter the row based on the locality, and then slice the internet-usage column
# using .loc, we can filter both criteria at one go


In [None]:
# retrieve based on index number instead of column names or row values
# retrieve the first 3 rows only


In [None]:
# retrieve "lying" values (2nd column) of rows 5 to 10
# use .iloc to perform this filtering+slicing in one go



### Try Yourself: Get the happiness and loneliness rating of participants with more than 1 sibling

## Updating values

In [None]:
# we can also update the values in dataframe, especially for the empty ones
# update the missing siblings value to 0
# inplace = True so that the changes are applied to the dataframe itself



### Try yourself: Update values
* Update the missing values in gender to "No Gender"
* Update the values "left handed" to "l" and "right handed" to "r" (hint: you can use .loc for this!)

## Counting and Sorting Values

In [None]:
# Find out how many participants are female or male


In [None]:
# Find out how many participants are female or male from villages and towns


In [None]:
# sort the age of participants from youngest to oldest


### Try Yourself: deal with NaN values and sorting
* Include the NaN value when counting the number of female and male participants
* Sort participants based on happiness rating, from highest to lowest

## Creating new column based on other columns, dropping columns

In [None]:
# create a new column called "height-in-m", deriving from the "height" column


In [None]:
# drop a column


In [None]:
# save to a new CSV

## Simple plots with Seaborn

In [None]:
# apply seaborn's "ticks" theme to all subsequent plots; color_codes=True remaps
# matplotlib's shorthand color codes (e.g. "b", "g", "r") to the seaborn palette
sns.set_theme(style="ticks", color_codes=True)

# create a histogram for Energy Levels data


In [None]:
# you can also create a scatterplot


# Section 2: Stats with Dataframe

- mean, mode, median, std, etc
- correlation

In [None]:
# Calculate the average age of the participants


In [None]:
# Calculate the median age of the participants


In [None]:
# What's the most common age among participants?


In [None]:
# what's the average age of female and male participants?


### Try Yourself: Find out the average loneliness rating of participants grouped by their gender and locality

## Statistical tests

In [None]:
#### inferential stats ####
# is there any relationship between these two variables?
# let's check the value for pearson's r (assuming we're treating the variables as continuous numbers)


In [None]:
# the data has some NaN values, let's replace them with 0


In [None]:
# if we're treating the data as ordinal/categorical value, we can use spearman's rho or kendall's tau


In [None]:
# we can also quickly calculate the correlation coefficient between numerical variables
# and keep them in a matrix


In [None]:
# show the matrix in a heatmap using seaborn
