In [116]:

import os
import sys
import itertools
from ummalqura.hijri_date import HijriDate
from datetime import date
import glob 
import numpy as np 
import pandas as pd 
import plotly
from plotly.graph_objs import Layout,Histogram
import plotly.graph_objects as go
import seaborn as sns; sns.set()

import matplotlib.pyplot as plt
%matplotlib inline  

The data that will be used in this analysis was scarped using the method in get_data notebook. For time sake, I scarped only around 20k data samples accross multiple years. 

In [70]:
df = pd.read_csv("../data/data_m.csv")

In [71]:
df.head()

Unnamed: 0,name,sex,nationality,age,DOD
0,عيشه بنت محمد تكروني,أنثى,سعودي,,1364/08/29
1,عفيه بنت محمد,أنثى,سعودي,,1364/08/29
2,منير بنت عوده,أنثى,سعودي,2 سنة,1365/01/10
3,زهره علي بادري,أنثى,سعودي,50 سنة,1365/04/17
4,زينب بنت عقل سي,أنثى,سعودي,,1365/05/22


In [72]:
df.shape

(20499, 5)

Okay we have 20499 samples. let's clean up the features

In [73]:
# check missing values
df.count()

name           20498
sex            20499
nationality    20378
age            20460
DOD            20499
dtype: int64

In [74]:
# name has one, drop it
df = df.dropna(subset=["name"])

I will keep the nationality column just for the sake of the analysis, otherwise I don't think it matters in our task 

In [75]:
# for the nationality, let's assume that the missing values are Saudies since the sample majority is Saudies
df.nationality = df.nationality.fillna("سعودي")

In [76]:
# from what I noticed from the names column, most nan values in age is for new born babies who died 
# (sometimes they are named "the baby of [name of the father]")
# for that reason I will put them all in one category of 1 year old, along with any age value lower than 1 year old
df.age = df.age.fillna('1 سنة')
year_vocab = ['سنة', 'سنوات']
df.age = df.age.apply(lambda x: x if x.split()[1] in year_vocab else '1 سنة' )
# now just keep the numerical value since we know it's in years
df.age = df.age.apply(lambda x: int(x.split()[0]))

In [77]:
df.head()

Unnamed: 0,name,sex,nationality,age,DOD
0,عيشه بنت محمد تكروني,أنثى,سعودي,1,1364/08/29
1,عفيه بنت محمد,أنثى,سعودي,1,1364/08/29
2,منير بنت عوده,أنثى,سعودي,2,1365/01/10
3,زهره علي بادري,أنثى,سعودي,50,1365/04/17
4,زينب بنت عقل سي,أنثى,سعودي,1,1365/05/22


In [78]:
# let's seperate years from months
df["DOD_y"] = df.DOD.apply(lambda x: int(x.split("/")[0]))
df["DOD_m"] = df.DOD.apply(lambda x: int(x.split("/")[1]))

In [79]:
# Since it's all about Shaban, let's make a column for it
df["is_shaban"] = df.DOD_m.apply(lambda x: 1 if x==8 else 0)

In [80]:
df.head()

Unnamed: 0,name,sex,nationality,age,DOD,DOD_y,DOD_m,is_shaban
0,عيشه بنت محمد تكروني,أنثى,سعودي,1,1364/08/29,1364,8,1
1,عفيه بنت محمد,أنثى,سعودي,1,1364/08/29,1364,8,1
2,منير بنت عوده,أنثى,سعودي,2,1365/01/10,1365,1,0
3,زهره علي بادري,أنثى,سعودي,50,1365/04/17,1365,4,0
4,زينب بنت عقل سي,أنثى,سعودي,1,1365/05/22,1365,5,0


----
### Death value count by month

In [82]:

trace1 = go.Bar(x=df.DOD_m.value_counts().index, y=df.DOD_m.value_counts().values)
layout = go.Layout(
    title="Number of Deaths per Month",
    xaxis = go.layout.XAxis(
        tickmode = 'linear'
    )
)
fig = go.Figure(data=[trace1], layout=layout)

fig.show()

Shaban (8) doesn't have the most number of deaths. Let's look deeper into sex and age groups

### By sex groups

In [83]:
by_sex = df.groupby(["DOD_m"])["sex"].value_counts().unstack().reset_index()

In [84]:
by_sex.head()

sex,DOD_m,أنثى,ذكر
0,1,725,1143
1,2,719,1120
2,3,582,944
3,4,532,813
4,5,495,728


In [85]:


trace1 = go.Bar(
    x=by_sex["DOD_m"].values,
    y=by_sex["ذكر"].values,
    name='Males',
    marker_color='indianred'
)
trace2  = go.Bar(
    x=by_sex["DOD_m"].values,
    y=by_sex["أنثى"].values,
    name='Females',
    marker_color='lightsalmon'
)
data = [trace1, trace2]
layout = go.Layout(
    title= "Number of Deaths by Sex per Month",
    xaxis = go.layout.XAxis(
        tickmode = 'linear'
    ), barmode='group' )
    
fig = go.Figure(data=data, layout=layout)
# Here we modify the tickangle of the xaxis, resulting in rotated labels.
fig.show()

The trend is the same among all months, more deaths in males tho

----

### Divide by age groups
since the saying the Shaban has the most deaths comes from old people, it make sence that we look at the deaths in different groups. 
We will assume that some death as certain ages doesn't leave impact as much as others

In [89]:
# we will divide age groups from 1-20 as young, 21-50 as mid-age, and 51-100 as senior

df["age_groups"] = pd.cut(df.age,[0,20,50,100], labels=["young", "mid-age", "senior"])

In [114]:
def highlight_max(s):
    '''
    highlight the maximum in a Series yellow.
    '''
    is_max = s == s.max()
    return ['background-color: yellow' if v else '' for v in is_max]

In [113]:
cm = sns.light_palette("green", as_cmap=True)
df.groupby(["age_groups", "DOD_m"])["name"].count().unstack().reset_index().style.background_gradient(cmap=cm).apply(highlight_max,subset=[i for i in range(1,13)])

DOD_m,age_groups,1,2,3,4,5,6,7,8,9,10,11,12
0,young,708,850,644,630,555,577,566,682,723,651,618,796
1,mid-age,184,208,191,163,117,131,133,167,242,195,176,261
2,senior,969,777,685,550,546,607,635,809,1106,755,1161,1660


It seems that young people die more between months 2-5, whereas senior people die at the end of the year and the start of the new year

What about age groups and sex groups together? 

In [109]:
df.groupby(["age_groups", "DOD_m","sex"])["name"].count().unstack().reset_index().style\
.bar(subset=["أنثى","ذكر"], align='mid', color=['#d6005f', '#5fba7d'])


sex,age_groups,DOD_m,أنثى,ذكر
0,young,1,279,429
1,young,2,334,516
2,young,3,250,394
3,young,4,241,389
4,young,5,234,321
5,young,6,222,355
6,young,7,233,333
7,young,8,269,413
8,young,9,290,433
9,young,10,227,424


It doesn't tell us anything special about Shaban. 

In [124]:
v_counts_index, v_counts_values= zip(*sorted(zip(df.DOD_m.value_counts().index, df.DOD_m.value_counts().values)))
trace1 = go.Scatter(x=v_counts_index, y=v_counts_values,
                mode='lines+markers')

layout = go.Layout(
    title= "Trend for number of Deaths",
    xaxis = go.layout.XAxis(
        tickmode = 'linear'
    ))
fig = go.Figure(data=[trace1], layout=layout)
    

fig.show()

## Conclusion

my conclusion is based on two observations: 
1 - The trend plot above, it shows that Shaban is the month where deaths start to rise until the beginning of the next year.
2 - From the analysis that was done on senior people, Shaban is when the trend start as well. 
Those two observations might give Shaban a psychological significance, especially among older people. 

On another note, I think this notion of deaths are more common in Shaban is caused by the high anticipation of Ramadan. Dying a month before Ramadan is more significant than any other time.

~ The End 