In [1]:
# I will want pandas to manipulate my data
import pandas as pd 
# I will want numpy for calculating the descriptive statistics
import numpy as np

# Reading in the Data
We will read in the data in a the form of a CSV file from the CSO.

In [2]:
# The data is sourced from the Central Statistics Office (CSO) of Ireland
# The dataset is titled "Population by Age Group and Sex"
# The data is available in CSV format via the CSO API
url = "https://ws.cso.ie/public/api.restful/PxStat.Data.Cube_API.ReadDataset/FY006A/CSV/1.0/en"

# read in the data from the csv file into a pandas DataFrame
df = pd.read_csv(url)

# lets take a look at the first few rows of the DataFrame
print(f"{df.head()}\n")

   STATISTIC Statistic Label  TLIST(A1)  CensusYear C02199V02655         Sex  \
0  FY006AC01      Population       2022        2022            -  Both sexes   
1  FY006AC01      Population       2022        2022            -  Both sexes   
2  FY006AC01      Population       2022        2022            -  Both sexes   
3  FY006AC01      Population       2022        2022            -  Both sexes   
4  FY006AC01      Population       2022        2022            -  Both sexes   

  C02076V03371 Single Year of Age                          C03789V04537  \
0            -           All ages                                   IE0   
1            -           All ages  2ae19629-1492-13a3-e055-000000000001   
2            -           All ages  2ae19629-1433-13a3-e055-000000000001   
3            -           All ages  2ae19629-149f-13a3-e055-000000000001   
4            -           All ages  2ae19629-14a0-13a3-e055-000000000001   

                 Administrative Counties    UNIT    VALUE  
0       

# Cleaning/Simplifying the data
While the data might be clean, there may be a lot of superfluous information in it that we can remove to simplify things.

In [3]:
# we only want data for Ireland as a whole - not all the administrative counties, 
# so let's filter the DataFrame to only include rows 
# where the Administrative Counties column is equal to "Ireland"
# We set our dataframe "df" equal that contains a dataframe comprising of data where for every row, 
# the entry for administrative county is Ireland
df=df[df["Administrative Counties"]== "Ireland"]
df.to_csv("gender_population_for_analysis.csv")


In [4]:
# Lets view all the column headers to see what columns we can drop
headers = df.columns.tolist()
print(f"{headers}\n")

['STATISTIC', 'Statistic Label', 'TLIST(A1)', 'CensusYear', 'C02199V02655', 'Sex', 'C02076V03371', 'Single Year of Age', 'C03789V04537', 'Administrative Counties', 'UNIT', 'VALUE']



In [5]:
# so there are quite a few columns we don't need for our analysis, so let's drop them

# first we create a list of the columns to drop, 
# we can drop administrative counties now because we have already filterd the data for just Ireland
drop_col_list = ["STATISTIC","Statistic Label","TLIST(A1)","CensusYear","C02199V02655","C02076V03371","C03789V04537",'Administrative Counties', "UNIT"]
# then we feed the list of columns to drop into the pandas drop() method, in place means remove the data from the existing dataframe
# we could alternatively have created a new dataframe and set it equal to the columns that we only want to be left
df.drop(columns=drop_col_list, inplace=True)
# let's view the column headers again to confirm the columns were dropped 
headers = df.columns.tolist()
print(f"{headers}\n")


['Sex', 'Single Year of Age', 'VALUE']



In [6]:
# let's have a quick look at the first few rows of the cleaned DataFrame
print(f"{df.head()}")
df.to_csv("gender_population_for_analysis.csv")  

            Sex Single Year of Age    VALUE
0    Both sexes           All ages  5149139
32   Both sexes       Under 1 year    57796
64   Both sexes             1 year    56420
96   Both sexes            2 years    59210
128  Both sexes            3 years    60484


In [7]:
# From our gender_population_for_analysis.csv file, we have three entries for Sex, 
# (both sexes, male, and female) but we only want male and female,
# so let's remove both sexes from the DataFrame 
df = df[df["Sex"]!="Both sexes"]
# just to see what the first few rows look like now
print(f"{df.head()}")
#now to view that our CSV file does not include both sexes entries
df.to_csv("gender_population_for_analysis.csv")  

       Sex Single Year of Age    VALUE
3264  Male           All ages  2544549
3296  Male       Under 1 year    29610
3328  Male             1 year    28875
3360  Male            2 years    30236
3392  Male            3 years    31001


In [8]:
## From our gender_population_for_analysis.csv file, 
# we have entries for "All ages" which we also do not need for our analysis
# so let's remove all ages from the DataFrame 
df = df[df["Single Year of Age"]!="All ages"]
# just to see what the first few rows look like now
print(f"{df.head()}")
#now to view that our CSV file does not include all ages
df.to_csv("gender_population_for_analysis.csv")  

       Sex Single Year of Age  VALUE
3296  Male       Under 1 year  29610
3328  Male             1 year  28875
3360  Male            2 years  30236
3392  Male            3 years  31001
3424  Male            4 years  31686


# Reformating the Data
There are some entries in the data that make it difficult for us to work with,
so let's simplify things for ourselves

In [9]:
# we also don't want under 1 year of age, so let's convert that to 0 
df["Single Year of Age"] = df["Single Year of Age"].str.replace("Under 1 year","0")

# we also don't want over 100 year of age, so let's convert that to 100 
df["Single Year of Age"] = df["Single Year of Age"].str.replace("100 years and over","100")

# we also don't want the word years, so let's remove text in that series altogether
df["Single Year of Age"] = df["Single Year of Age"].str.replace("\D", "", regex=True)

# now we can convert the Single Year of Age column to integer data type
df["Single Year of Age"] = df["Single Year of Age"].astype("Int64")

# and likewise with the VALUE column 
df["VALUE"] = df["VALUE"].astype("Int64")

print(f"{df.tail()}\n")

#now to view that our CSV file shows cleaned age data
df.to_csv("gender_population_for_analysis.csv") 


         Sex  Single Year of Age  VALUE
9632  Female                  96    956
9664  Female                  97    732
9696  Female                  98    492
9728  Female                  99    336
9760  Female                 100    584



In [10]:
# just another confirmation of the columns we have remaining
headers = df.columns.tolist()
print(f"{headers}\n")


['Sex', 'Single Year of Age', 'VALUE']



# Create a Pivot Table
Now that we have the data simplified down to Single Year of Age and Gender(s), 
and values each combination of age and gender, putting it in a table allows us to work with the data

In [11]:
# Let's create a pivot table from the cleaned DataFrame for our analysis
df_analysis = pd.pivot_table(df, "VALUE","Single Year of Age","Sex")

# let's see what the first few rows of the analysis DataFrame look like
print(f"{df_analysis.head(5)}\n")

#now to view that our CSV file shows cleaned age data
df_analysis.to_csv("gender_population_for_analysis.csv")

# Note Single Year of Age is now our index, it is not a column,
# this is something we need to be wary of later when performing calculations between columns.


Sex                  Female     Male
Single Year of Age                  
0                   28186.0  29610.0
1                   27545.0  28875.0
2                   28974.0  30236.0
3                   29483.0  31001.0
4                   29819.0  31686.0




Normally, if we calculate the mean of a series of figures we just sum them and divide by the number of observations.<br/> 
But, in this case, in our population sample, the people are not evenly distributed across every single age.<br/>

We must weight each age by the number of people at that age. This is why we use the weighted mean.<br/>
The same is the case for the weighted variance and standard deviation.

# Weighted Mean (Average)

The formula for calculating the weighted mean is as follows:<br/>
<img src="https:\\www.mathcracker.com\images\legacy\weighted-average-calculator.png" /><br/>

In our case, the weights (w) are the number of people at each age while the values (x) are all the ages. <br/>
So, the weighted mean in our case is: sum (each age * female population at that age) / sum (female population)

In [12]:
# In our dataframe, we have ages (0-100) as our index and then two columns, female and male, 
# with the numbers of males and female at each age.

#Let's assume we take the first column, females at each age, in our dataframe, that is column 0.
gender = headers[0]
# We are setting our variable gender equal to all the values in the female column
# Using numpy, we give it the column containing the ages, that is, the index as X
# and we tell numpy that our weights w are the value in the column 0 of the dataframe df.
weighted_mean_female= np.average(df_analysis.index, weights=df_analysis[gender])
print(f"The weighted mean age of {gender}s is {round(weighted_mean_female,1)} years.\n")

gender = headers[1]
weighted_mean_male= np.average(df_analysis.index, weights=df_analysis[gender])
print(f"The weighted mean age of {gender}s is {round(weighted_mean_male,1)} years.\n")

# Their averages are quite close, only one year between them.

KeyError: 'Sex'

# Weighted Median

While often confused with the mean, the median is what the mean would be <br/>
if it was adjusted for certain values skewing the overall dataset.<br/>
It does this by splitting the data equally in two in terms of numbers of observations, not size, 
and then focuses on what the mid-point is, it's one value if we have an odd number of observations <br/>   
because we can split the data in even numbers of observations around that value.<br/>
But, the median can be equal to the average of two values if we have an even number of observations <br/>
because we can have an equal number of observations either side of those two values.

In [None]:
# First, we must sort the data. In our case, the data is already sorted by age (from 0 up to 100) 
# If it wasn't already sorted, we might sort it as follows.
df.sort_values('Single Year of Age', inplace=True)

# We must create a series called the cumulative sum, and incrementally add up all the values at each age
# Then we can simply halve that, and find the index of that middle value

#Let's assume we take the first column, female
gender = headers[0]
cumsum_female = df_analysis[gender].cumsum()
cutoff_female = df_analysis[gender].sum() / 2.0

# So, adding up all the observations at each age tells us there are 2,604,590 women
#print(f"The cumulative sum is: {cumsum_female}\n")
# and the woman in the middle is in the 1,302,295th position 
#print(f"The middle index is: {cutoff_female}\n")

# To find where that woman in the 1,302,295th position is,
# starting at age 0, we add up the numbers of observations at each age until we reach the 1,302,295th woman.
# This happens where the cumulative sum is greater than or equal to the cutoff (middle index)
# To find the index of the first value where this is true 
# we take out dataframe,select the relevant gender column and then the row 
# where the cumulative sum which we calcualted above is greater than the middle figure which we calculated above.
median_female = df_analysis[gender][cumsum_female >= cutoff_female].index[0]

# Alternatively, becuase Single Year of Age is our index, we could reset it as a column
# df_analysis=df_analysis.reset_index()
# then find the age index at that 1,302,295th position as follows
#median = df_analysis["Single Year of Age"][cumsum_female >= cutoff_female].iloc[0]
# just to see how the dataframe looks with the reset index
#print(f"{df_analysis.head()}")

print(f"The weighted median age of {gender}s is {median_female}\n")

gender = headers[1]
cumsum_male = df_analysis[gender].cumsum()
cutoff_male = df_analysis[gender].sum() / 2.0
#print(f"The cumulative sum is: {cumsum_male}\n")
#print(f"The middle index is: {cutoff_male}\n")

median_male = df_analysis[gender][cumsum_male >= cutoff_male].index[0]

print(f"The weighted median age of {gender}s is {median_male}\n")

The median ages of males and females are very close to one another, and indeed are both very close to their respective means.<br/>
This is intuitive because it is unlikely that we would abnormally large numbers of observations at either end of the age scale.

## Weighted Mode

The mode is the most popular occurance of a variable. In this context, what age has the highest numbers of observations.<br/>
If we think about this, we are simply looking for the maximum value in the column, and then the age index that maximum occurs.


Note that I used the max() function in pandas to calculate the maximum of the relevant gender column. 
Then I went looking inside the dataframe df_analysis, under the relevant gender column,
until that part of the dataframe was equal to the highest_observation. In effect, I masked all other values as false.

Another alternative I found online was to use [idxmax()](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.idxmax.html) 
I showed how to use that also.

In [None]:
gender = headers[0]
highest_observations= df_analysis[gender].max()
print(f"The highest number of observations in any one female age was: {highest_observations}\n")

# let's look in the df_analysis dataframe, under the relevant gender column where that value occurs
# and at what index ( that is, age)
mode_female = df_analysis[gender][df_analysis[gender]==highest_observations].index[0]
print(f"The weighted mode age of {gender} is {mode_female}\n")

# althernaive method
#mode_female = df_analysis[gender].idxmax()
#print(f"The weighted mode age of {gender} is {mode_female}\n")


gender = headers[1]
highest_observations= df_analysis[gender].max()
print(f"The highest number of observations in any one male age was: {highest_observations}\n")

# let's look in the df_analysis dataframe, under the relevant gender column where that value occurs
# and at what index ( that is, age)
mode_male = df_analysis[gender][df_analysis[gender]==highest_observations].index[0]
print(f"The weighted mode age of {gender} is {mode_male}\n")

# alternative method
# mode_male = df_analysis[gender].idxmax()
# print(f"The weighted mode age of {gender} is {mode_male}\n")


Similar to their means and medians, the most popular ages for men and women are close together, at 41 and 42.<br/>
Interestingly, the mode ages are relatively close to the media ages, which suggests relatively similar datasets to date.

# Weighted Variance

The variance tells us how dispersed the data is around the mean.<br/> 
The formula for calculating the weighted variance is as follows:<br/>
<img src="https://help.altair.com/panopticon/authoring/assets/images/weighted.png"/><br/>
We will simplify this to variance being the AVERAGE of the square of the differences from the mean


In [None]:
# we have already calculated our weighted means up above
# We can now simply use numpy to calculate the average of the squared difference from the mean
# then divide this by the number of that gender being observed

gender = headers[0]
weighted_variance_female = np.average((df_analysis.index - weighted_mean_female)**2, weights=df_analysis[gender])
print(f"The weighted variance of age for {gender}s is: {round(weighted_variance_female,1)}\n")

gender = headers[1]
weighted_variance_male = np.average((df_analysis.index - weighted_mean_male)**2, weights=df_analysis[gender])
print(f"The weighted variance of age for {gender}s is: {round(weighted_variance_male,1)}\n")

While the variance for females is greater than that of men, it is not by much, which definitely tells us how similar their datasets are.

# Weighted Standard Deviation

One issue with variance is that its size is not denominated in the same terms as the underlying observations. This is attributable to us squaring the differentials in its calculation. Consequently, to bring it back into similar terms, we must calculate its square root, giving us a figure that we know as the standard deviation.

In [None]:
# we have already calculated our variances up above
# We can now simply use numpy to calculate the square roots of the variances

gender = headers[0]
weighted_stddev_female = np.sqrt(weighted_variance_female)
print(f"The weighted standard deviation of age for {gender} is: {round(weighted_stddev_female,1)}\n")

gender = headers[1]
weighted_stddev_male = np.sqrt(weighted_variance_male)
print(f"The weighted standard deviation of age for {gender} is: {round(weighted_stddev_male,1)}\n")

Given how relatively similar their variances were, we expect the standard deviation of male and female ages to be similar. With a standard deviation of 22 approximately, and mean ages of 38 approximately, we can say that 95% of the population (2 standard deviations) lie in the age group of 0 to 80 which is perhaps expected given that the average life expectancy would be in the 70s.

Part 2 20%

In the same notebook, make a variable that stores an age (say 35).

Write that code that would group the people within 5 years of that age together, into one age group 

- maybe make 35 the index, then use iloc of above and below that age.

Calculate the population difference between the sexes in that age group.

- sum up the total males and female in those rows

In [None]:
target_age = 35
lower_target = target_age - 5
upper_target = target_age + 6

#Group the people within 5 years of that age into one age group
# the simplest thing for me to do is to reset the index 
# so that Single Year of Age is a column that i can manipulate
df_analysis=df_analysis.reset_index()
df_population_subset= df_analysis.iloc[lower_target:upper_target, :]
print(f"{df_population_subset}")

Sex  level_0  index  Single Year of Age   Female     Male
30        30     30                  30  32841.0  30858.0
31        31     31                  31  33710.0  32237.0
32        32     32                  32  34382.0  32413.0
33        33     33                  33  34489.0  31888.0
34        34     34                  34  36284.0  33121.0
35        35     35                  35  37940.0  34695.0
36        36     36                  36  39030.0  35828.0
37        37     37                  37  39193.0  36427.0
38        38     38                  38  40902.0  37513.0
39        39     39                  39  42592.0  38749.0
40        40     40                  40  43143.0  40301.0


# Calculating the sum of the smaller sample
Now we can simply sum up the value in each column using the sum function.

In [None]:

# sum up the females
gender = headers[1]
total_number_female = df_population_subset[gender].sum()
print(f"The total amount of {gender}s in this subset is: {total_number_female}\n")

# sum up the males
gender = headers[2]
total_number_male = df_population_subset[gender].sum()
print(f"The total amount of {gender}s in this subset is: {total_number_male}\n")

# Now to calculate the population difference
if total_number_female > total_number_male:
    print(f"There are {(total_number_female - total_number_male)} more females than males in this subsample.")
else:
    print(f"There are {(total_number_male - total_number_female)} more males than females in this subsample.")


The total amount of Females in this subset is: 414506.0

The total amount of Males in this subset is: 384030.0

There are 30476.0 more females than males in this subsample


# Part 3 10%



In the same notebook.

Write the code that would work out which region in Ireland has the biggest population difference between the sexes in that age group

So the biggest population difference would be standard deviation. 
In English, we need the descriptive statistics for each administrative county,
and compare them
