# Weighted Average

* What's the gender percentage in each class?
* What's the overall gender percentage on the Titanic?

In [1]:
# import packages
import seaborn as sns
import pandas as pd
import numpy as np

In [2]:
# Load seaborn's example "titanic" dataset into a DataFrame.
# NOTE(review): per the seaborn docs, load_dataset fetches the data from an
# online repository and caches it, so a first run presumably needs network access.
titanic_df = sns.load_dataset(name='titanic')
# Preview the first five rows to see which columns are available.
titanic_df.head(n=5)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [13]:
# Collect the distinct values of the 'sex' column, then sort them
# alphabetically so later loops visit the genders in a stable order.
gender_list = list(titanic_df['sex'].unique())
gender_list.sort()
gender_list

['female', 'male']

In [4]:
# Number of passengers per class.
# value_counts() alone orders by frequency; sorting by the index afterwards
# lists the classes as First, Second, Third.
class_count = (
    titanic_df.loc[:, 'class']
    .value_counts()
    .sort_index(ascending=True)
)
class_count

First     216
Second    184
Third     491
Name: class, dtype: int64

In [5]:
# Passenger counts broken down by gender and then by class.
# The result is a Series with a two-level index: (sex, class).
gender_count = (
    titanic_df
    .groupby(by='sex')['class']
    .value_counts()
)
gender_count

sex     class 
female  Third     144
        First      94
        Second     76
male    Third     347
        First     122
        Second    108
Name: class, dtype: int64

In [6]:
# Selecting a label from the outer index level ('sex') returns the
# per-class counts for that gender only.
gender_count.loc['female']

class
Third     144
First      94
Second     76
Name: class, dtype: int64

Let's put this into a dataframe.

In [7]:
# Turn the per-class counts into a DataFrame; reset_index() moves the class
# labels out of the index into an ordinary column.
df = class_count.to_frame().reset_index()
df

Unnamed: 0,index,class
0,First,216
1,Second,184
2,Third,491


In [8]:
# Give the two columns descriptive names: the class label and the total
# number of passengers in that class. The assignment is positional.
column_names = ['class', 'total']
df.columns = column_names
df

Unnamed: 0,class,total
0,First,216
1,Second,184
2,Third,491


In [10]:
# Reminder: the female counts, indexed by class. This is the Series that
# gets mapped onto the class column next.
gender_count.loc['female']

class
Third     144
First      94
Second     76
Name: class, dtype: int64

In [11]:
# Add a 'female' column: for each row, look up that row's class label in the
# female per-class counts.
df['female'] = df['class'].map(gender_count.loc['female'])
df

Unnamed: 0,class,total,female
0,First,216,94
1,Second,184,76
2,Third,491,144


In [12]:
# Share of women within each class: female count divided by class total.
# Stored as a fraction between 0 and 1, not multiplied by 100.
df['rel_female'] = df['female'].div(df['total'])
df

Unnamed: 0,class,total,female,rel_female
0,First,216,94,0.435185
1,Second,184,76,0.413043
2,Third,491,144,0.293279


Let's make a loop to make the same columns for male.

In [15]:
# For every gender, add a count column and a within-class share column.
# The female columns are simply recomputed with the same values.
for gender_name in gender_list:
    # Per-class head count for this gender, looked up by class label.
    df[gender_name] = df['class'].map(gender_count.loc[gender_name])
    # Fraction of the class made up by this gender.
    df[f'rel_{gender_name}'] = df[gender_name].div(df['total'])
df

Unnamed: 0,class,total,female,rel_female,male,rel_male
0,First,216,94,0.435185,122,0.564815
1,Second,184,76,0.413043,108,0.586957
2,Third,491,144,0.293279,347,0.706721


What's the overall gender percentage on the Titanic?

In [16]:
# The wrong way to calculate it: a plain mean of the three per-class shares
# gives every class the same weight, although the classes differ in size.
df.loc[:, 'rel_female'].mean()

0.3805025619497711

In [17]:
# Let's double check with the direct ratio: all women / all passengers.
# Sum only the two numeric columns we need. Calling df.sum() on the whole
# frame would also try to reduce the non-numeric 'class' label column, which
# raises a TypeError on recent pandas versions, and it would do so twice.
df['female'].sum() / df['total'].sum()

0.35241301907968575

In [27]:
# Weighted mean of the per-class female shares, with each class weighted by
# its passenger total. This equals total women / total passengers.
np.average(df['rel_female'], weights=df['total'])

0.35241301907968575

Use a weighted average: `np.average()` with the class totals passed as `weights`.

In [25]:
# Report the overall share of each gender as the class-size-weighted average
# of its per-class shares.
class_sizes = df['total']  # the weights are the same for every gender
for gender in gender_list:
    percentage = np.average(df[f'rel_{gender}'], weights=class_sizes)
    print(f'The overall percentage of {gender} on the Titanic is {percentage * 100:.2f}%.')

The overall percentage of female on the Titanic is 35.24%.
The overall percentage of male on the Titanic is 64.76%.
