In [1]:
# I will want pandas to manipulate my data
import pandas as pd 
# I will want numpy for calculating the descriptive statistics
import numpy as np

In [2]:
# The data is sourced from the Central Statistics Office (CSO) of Ireland
# The dataset is titled "Population by Age Group and Sex"
# The data is available in CSV format via the CSO API
url = "https://ws.cso.ie/public/api.restful/PxStat.Data.Cube_API.ReadDataset/FY006A/CSV/1.0/en"

# read in the data from the csv file into a pandas DataFrame
df = pd.read_csv(url)

# lets take a look at the first few rows of the DataFrame
print(f"{df.head()}\n")

   STATISTIC Statistic Label  TLIST(A1)  CensusYear C02199V02655         Sex  \
0  FY006AC01      Population       2022        2022            -  Both sexes   
1  FY006AC01      Population       2022        2022            -  Both sexes   
2  FY006AC01      Population       2022        2022            -  Both sexes   
3  FY006AC01      Population       2022        2022            -  Both sexes   
4  FY006AC01      Population       2022        2022            -  Both sexes   

  C02076V03371 Single Year of Age                          C03789V04537  \
0            -           All ages                                   IE0   
1            -           All ages  2ae19629-1492-13a3-e055-000000000001   
2            -           All ages  2ae19629-1433-13a3-e055-000000000001   
3            -           All ages  2ae19629-149f-13a3-e055-000000000001   
4            -           All ages  2ae19629-14a0-13a3-e055-000000000001   

                 Administrative Counties    UNIT    VALUE  
0       

In [3]:
# Lets views the column headers to see what columns we want to drop
headers = df.columns.tolist()
print(f"{headers}\n")



['STATISTIC', 'Statistic Label', 'TLIST(A1)', 'CensusYear', 'C02199V02655', 'Sex', 'C02076V03371', 'Single Year of Age', 'C03789V04537', 'Administrative Counties', 'UNIT', 'VALUE']



In [4]:
# we only want data for Ireland as a whole, so let's filter the DataFrame to only include rows where the Administrative Counties column is Ireland
df=df[df["Administrative Counties"]== "Ireland"]
df.to_csv("gender_population_for_analysis.csv")


In [5]:
# so there are quite a few columns we don't need for our analysis, so let's drop them

# first we create a list of the columns to drop
drop_col_list = ["STATISTIC","Statistic Label","TLIST(A1)","CensusYear","C02199V02655","C02076V03371","C03789V04537",'Administrative Counties', "UNIT"]
# then we feed the list of columns to drop into the pandas drop() method
df.drop(columns=drop_col_list, inplace=True)
# let's view the column headers again to confirm the columns were dropped 
headers = df.columns.tolist()
print(f"{headers}\n")


['Sex', 'Single Year of Age', 'VALUE']



In [6]:
# let's have a quick look at the first few rows of the cleaned DataFrame
print(f"{df.head()}")
df.to_csv("gender_population_for_analysis.csv")  

            Sex Single Year of Age    VALUE
0    Both sexes           All ages  5149139
32   Both sexes       Under 1 year    57796
64   Both sexes             1 year    56420
96   Both sexes            2 years    59210
128  Both sexes            3 years    60484


In [7]:
# From our gender_population_for_analysis.csv file, we have three entries for Sex, (both sexes, male, and female) but we only want male and female,
# so let's remove both sexes from the DataFrame 
df = df[df["Sex"]!="Both sexes"]
# just to see what the first few rows look like now
print(f"{df.head()}")
#now to view that our CSV file does not include both sexes entries
df.to_csv("gender_population_for_analysis.csv")  

       Sex Single Year of Age    VALUE
3264  Male           All ages  2544549
3296  Male       Under 1 year    29610
3328  Male             1 year    28875
3360  Male            2 years    30236
3392  Male            3 years    31001


In [8]:
## From our gender_population_for_analysis.csv file, we have entries for All ages which we do not need for our analysis
# so let's remove all ages from the DataFrame 
df = df[df["Single Year of Age"]!="All ages"]
# just to see what the first few rows look like now
print(f"{df.head()}")
#now to view that our CSV file does not include all ages
df.to_csv("gender_population_for_analysis.csv")  

       Sex Single Year of Age  VALUE
3296  Male       Under 1 year  29610
3328  Male             1 year  28875
3360  Male            2 years  30236
3392  Male            3 years  31001
3424  Male            4 years  31686


In [9]:
# we also don't want under 1 year of age, so let's convert that to 0 
df["Single Year of Age"] = df["Single Year of Age"].str.replace("Under 1 year","0")

# we also don't want over 100 year of age, so let's convert that to 100 
df["Single Year of Age"] = df["Single Year of Age"].str.replace("100 years and over","100")

# we also don't want the word years, so let's remove text in that series altogether
df["Single Year of Age"] = df["Single Year of Age"].str.replace("\D", "", regex=True)

# now we can convert the Single Year of Age column to integer data type
df["Single Year of Age"] = df["Single Year of Age"].astype("Int64")

# and likewise with the VALUE column 
df["VALUE"] = df["VALUE"].astype("Int64")

print(f"{df.tail()}\n")

#now to view that our CSV file shows cleaned age data
df.to_csv("gender_population_for_analysis.csv") 


         Sex  Single Year of Age  VALUE
9632  Female                  96    956
9664  Female                  97    732
9696  Female                  98    492
9728  Female                  99    336
9760  Female                 100    584



In [10]:
headers = df.columns.tolist()
print(f"{headers}\n")

['Sex', 'Single Year of Age', 'VALUE']



In [11]:
# Let's create a pivot table from the cleaned DataFrame for our analysis

df_analysis = pd.pivot_table(df, "VALUE","Single Year of Age","Sex")
# let's see what the first few rows of the analysis DataFrame look like
print(f"{df_analysis.head(5)}\n")


df_analysis.to_csv("gender_population_for_analysis.csv")   


Sex                  Female     Male
Single Year of Age                  
0                   28186.0  29610.0
1                   27545.0  28875.0
2                   28974.0  30236.0
3                   29483.0  31001.0
4                   29819.0  31686.0



In [12]:
#just to list our headers in the analysis DataFrame
headers = list(df_analysis.columns)[0:]
print(f"{headers}\n") 
# so now we know we have two possible values for columns, male an female


['Female', 'Male']



In [13]:
# weighted mean
# weighted mean is sum (age * female population at age) / (female populations at age)

#Let's assume we take the first column, female
gender = headers[0]
weighted_mean= np.average(df_analysis.index, weights=df_analysis[gender])
print(f"The weighted mean age of {gender} is {round(weighted_mean,1)}\n")

gender = headers[1]
weighted_mean= np.average(df_analysis.index, weights=df_analysis[gender])
print(f"The weighted mean age of {gender} is {round(weighted_mean,1)}\n")

The weighted mean age of Female is 38.9

The weighted mean age of Male is 37.7



In [14]:
# Weighted median

# the data is already sorted by age 
# df.sort_values('Single Year of Age', inplace=True)

# create a series called the cumulative sum, and we find the index of the middle value

#Let's assume we take the first column, female
gender = headers[0]
cumsum = df_analysis[gender].cumsum()
cutoff = df_analysis[gender].sum() / 2.0
#print(f"The cumulative sum is: {cumsum}\n")
#print(f"The middle index is: {cutoff}\n")

#see where the cumulative sum is greater than or equal to the cutoff
#print(f"{df_analysis[gender][cumsum >= cutoff]}\n")
# to find the index of the first value where this is true we find the index 0 of this series
#print(f"{df_analysis[gender][cumsum >= cutoff].index[0]}\n")
median = df_analysis[gender][cumsum >= cutoff].index[0]

#median = df_analysis["Single Year of Age"][cumsum >= cutoff].iloc[0]
print(f"The weighted median of {gender} is {median}\n")

gender = headers[1]
cumsum = df_analysis[gender].cumsum()
cutoff = df_analysis[gender].sum() / 2.0
#print(f"The cumulative sum is: {cumsum}\n")
#print(f"The middle index is: {cutoff}\n")

#see where the cumulative sum is greater than or equal to the cutoff
#print(f"{df_analysis[gender][cumsum >= cutoff]}\n")
# to find the index of the first value where this is true we find the index 0 of this series
#print(f"{df_analysis[gender][cumsum >= cutoff].index[0]}\n")
median = df_analysis[gender][cumsum >= cutoff].index[0]

#median = df_analysis["Single Year of Age"][cumsum >= cutoff].iloc[0]
print(f"The weighted median of {gender} is {median}\n")



The weighted median of Female is 39

The weighted median of Male is 38



In [15]:
gender = headers[0]
mode = df_analysis[gender].idxmax()
print(f"The weighted mode age of {gender} is {mode}\n")

gender = headers[1]
mode = df_analysis[gender].idxmax()
print(f"The weighted mode age of {gender} is {mode}\n")


The weighted mode age of Female is 41

The weighted mode age of Male is 42



In [16]:
# Weighted Variance
gender = headers[0]
w_variance_female = np.average((df_analysis.index - weighted_mean)**2, weights=df_analysis[gender])
print(f"The weighted variance of age for {gender} is: {round(w_variance_female,1)}\n")

gender = headers[1]
w_variance_male = np.average((df_analysis.index - weighted_mean)**2, weights=df_analysis[gender])
print(f"The weighted variance of age for {gender} is: {round(w_variance_male,1)}\n")

The weighted variance of age for Female is: 530.4

The weighted variance of age for Male is: 514.0



In [17]:
# Weighted Standard Deviation
gender = headers[0]
w_stddev_female = np.sqrt(w_variance_female)
print(f"The weighted standard deviation of age for {gender} is: {round(w_stddev_female,1)}\n")

gender = headers[1]
w_stddev_male = np.sqrt(w_variance_male)
print(f"The weighted standard deviation of age for {gender} is: {round(w_stddev_male,1)}\n")

The weighted standard deviation of age for Female is: 23.0

The weighted standard deviation of age for Male is: 22.7



Part 2 20%

In the same notebook, make a variable that stores an age (say 35).

Write that code that would group the people within 5 years of that age together, into one age group 

- maybe make 35 the index, then use iloc of above and below that age.

Calculate the population difference between the sexes in that age group.

- sum up the total males and female in those rows

Part 3 10%

In the same notebook.

Write the code that would work out which region in Ireland has the biggest population difference between the sexes in that age group

So the biggest population difference would be standard deviation. 
In English, we need the descriptive statistics for each administrative county,
and compare them
