In [None]:
import pandas as pd
import seaborn
import matplotlib.pyplot as plt

In [None]:
%matplotlib inline

In [None]:
# Load the raw data set; the path is relative to the notebook's working directory.
sourcedata = pd.read_csv('egl_DS_test_data.csv')

In [None]:
# Summary statistics for the raw frame.
sourcedata.describe()

In [None]:
# Column dtypes and non-null counts for the raw frame.
sourcedata.info()

In [None]:
# Print how many distinct non-null values each column of the raw frame holds.
for column, n_distinct in sourcedata.nunique().items():
    print(f'{column}:', n_distinct)

In [None]:
# Preview the first rows of the raw frame.
sourcedata.head()

In [None]:
# Does every id_code map to a single postcode, so that one id_code can be
# treated as one user?

# Number of distinct 'postcode.1' values recorded against each id_code.
postcodes_per_id = sourcedata.groupby('id_code')['postcode.1'].nunique()
dist_pcds = postcodes_per_id.reset_index()

# How many id_codes share each distinct-postcode count (first rows only).
dist_pcds.groupby('postcode.1').count().head()

In [None]:
# Is there a usable time series for a large number of users?

# Number of distinct dates recorded against each id_code.
dates_per_id = sourcedata.groupby('id_code')['date'].nunique()
dist_dates = dates_per_id.reset_index()

# Indexed by distinct-date count; the id_code column holds how many
# id_codes have exactly that many dates.
date_users = dist_dates.groupby('date').count()

In [None]:
# Total number of id_codes across every distinct-date count.
date_users.sum()

In [None]:
# Number of id_codes that have more than `min_dates` distinct dates.
min_dates = 10
has_enough_dates = date_users.index > min_dates
date_users.loc[has_enough_dates].sum()

In [None]:
# Distinct raw anxiety labels, in order of first appearance. Series.unique()
# reports a missing value once, whereas set() output is unordered and can
# contain NaN more than once.
sourcedata['anxiety'].unique()

In [None]:
# Distinct raw sex labels, in order of first appearance. Series.unique()
# reports a missing value once, whereas set() output is unordered and can
# contain NaN more than once.
sourcedata['sex'].unique()

In [None]:
# Map the text labels onto numbers, one column at a time. The nested-dict form
# of DataFrame.replace restricts each mapping to the named column, so a value
# such as 'M' or 'F' appearing in any other column is left untouched.
enum = sourcedata.replace(
    {
        'anxiety': {
            '0 - not at all anxious': 0,
            '10 - completely anxious': 10,
        },
        'sex': {
            'Male': 0,
            'M': 0,
            'Female': 1,
            'F': 1,
        },
    }
)
# sex is cast to float rather than int - presumably so that missing values can
# remain NaN (TODO confirm). The int cast on anxiety raises if that column
# holds NaN or any label that is not mapped above.
typed = enum.astype(
    {
        'anxiety': int,
        'sex': float,
    }
)
# NOTE(review): no format/dayfirst is passed, so parsing relies on pandas'
# inference - confirm against the date layout used in the CSV.
typed['date'] = pd.to_datetime(typed['date'])

In [None]:
# Check the column dtypes after the conversion.
typed.info()

In [None]:
# One histogram per plottable column of the typed frame.
typed.hist()

In [None]:
# Persist the typed frame. to_csv writes the index by default, so the file
# gains a leading unnamed index column.
# NOTE(review): pass index=False if no consumer relies on that column - confirm.
typed.to_csv('egl_typed.csv')

In [None]:
# Load the age reference table; later cells read its 'age', 'people' and
# 'yearofbirth' columns.
uk_ages = pd.read_csv('uk_ages.csv')

In [None]:
# Keep the reference rows for ages 16 to 89 (Series.between includes both
# ends). .copy() makes the result an independent frame, so adding 'pc' below
# neither triggers SettingWithCopyWarning nor risks writing to a temporary.
relevant_ages = uk_ages[uk_ages['age'].between(16, 89)].copy()

# Each age's share of the selected population, as a percentage. Series.sum()
# skips missing values, where the builtin sum() would turn every share into NaN.
relevant_ages['pc'] = (relevant_ages['people'] / relevant_ages['people'].sum()) * 100

In [None]:
# Distinct id_codes per year of birth, plus each year's share of all counted
# id_codes expressed as a percentage.
ids_per_birth_year = typed.groupby('dateofbirth_year')['id_code'].nunique()
app_ages = ids_per_birth_year.reset_index()
app_ages['pc'] = app_ages['id_code'].div(app_ages['id_code'].sum()).mul(100)

In [None]:
# Join the reference and app age tables on year of birth (merge defaults to an
# inner join). Both frames carry a 'pc' column, which the merge exposes as
# 'pc_x' (reference) and 'pc_y' (app).
age_compare = pd.merge(
    relevant_ages,
    app_ages,
    left_on='yearofbirth',
    right_on='dateofbirth_year',
)
age_compare.info()

In [None]:
# Compare the age profile of app users with the reference population.
# plot() and fill_between(interpolate=True) both draw points in row order, so
# sort by age to guarantee a left-to-right x axis.
age_sorted = age_compare.sort_values('age')
ages = age_sorted['age']
pop_pc = age_sorted['pc_x']  # share of each age in the reference table
app_pc = age_sorted['pc_y']  # share of each age among app id_codes

fig, ax = plt.subplots(figsize=(8, 6))
# Colours are explicit for both lines: previously the colour string passed to
# a single ax.plot(...) call only applied to the second line and the first
# relied on matplotlib's default colour.
ax.plot(ages, pop_pc, color='#1f77b4', label='Population')
ax.plot(ages, app_pc, color='#3eb64e', label='App users')

# Shade the gap between the curves in the colour of whichever is higher.
ax.fill_between(
    ages, pop_pc, app_pc,
    where=pop_pc <= app_pc,
    facecolor='#3eb64e', alpha=0.4, interpolate=True
)
ax.fill_between(
    ages, pop_pc, app_pc,
    where=pop_pc >= app_pc,
    facecolor='#1f77b4', alpha=0.4, interpolate=True
)
ax.set_xlabel('Age (approx)')
ax.set_ylabel('% of Population')
ax.legend()
seaborn.despine()
# Save through the figure handle rather than pyplot's implicit current figure.
fig.savefig('users_pop.png')

In [None]:
# Share of id_codes recorded under each sex value, relative to the number of
# distinct id_codes that have a sex recorded at all.
rows_with_sex = typed.dropna(subset=['sex'])
n_ids_with_sex = len(set(rows_with_sex['id_code']))
ids_per_sex = typed.groupby('sex')['id_code'].nunique()
ids_per_sex / n_ids_with_sex

In [None]:
# Distribution of anxiety scores. seaborn.distplot is deprecated (it warns on
# seaborn >= 0.11 and is scheduled for removal); this is the histplot
# equivalent: a density-normalised histogram with a KDE overlay.
seaborn.histplot(
    typed['anxiety'].dropna(),
    kde=True,
    stat='density',
    kde_kws={'cut': 3},
)

In [None]:
# Correlation heatmap of the numeric columns. Since pandas 2.0,
# DataFrame.corr() no longer drops non-numeric columns silently and raises on
# them instead, so restrict the frame to numeric/bool columns first. This
# select_dtypes form also works on older pandas versions.
numeric_cols = typed.select_dtypes(include=['number', 'bool'])
seaborn.heatmap(numeric_cols.corr())

In [None]:
# Distinct id_codes per (year of birth, sex), reshaped so each year of birth
# is a row and each sex value is a column.
m_f = (
    typed
    .groupby(['dateofbirth_year', 'sex'])['id_code']
    .nunique()
    .unstack('sex')
)

In [None]:
# Inspect the pivot with the year of birth moved back into a regular column.
m_f.reset_index().info()

In [None]:
# Share of each birth-year cohort that is male (sex 0.0) and female (sex 1.0).
# A birth year with id_codes of only one sex has NaN in the other pivot
# column; count that as zero so the proportions come out as 0 and 1 instead of
# both being NaN.
male_ids = m_f[0.0].fillna(0)
female_ids = m_f[1.0].fillna(0)
m_f['prop_m'] = male_ids / (male_ids + female_ids)
m_f['prop_f'] = 1 - m_f['prop_m']

In [None]:
# List the pivot's columns: the sex values plus the proportion columns.
m_f.columns

In [None]:
# Preview the first birth years of the pivot.
m_f.head()

In [None]:
import plotly.graph_objects as go
from plotly.offline import plot

# Stacked bars of the male/female share per year of birth, with a flat 50:50
# reference line drawn across the same x values.
mf_fig = go.Figure(data=[
    go.Bar(
        name='Male',
        x=m_f.index, y=m_f['prop_m'],
        marker_color='#feffa3'
    ),
    go.Bar(
        name='Female',
        x=m_f.index, y=m_f['prop_f'],
        marker_color='#a4a3ff'
    ),
    go.Scatter(
        name='50:50',
        x=m_f.index, y=[0.5] * len(m_f.index),
        line={
            'color': '#ff8b94',
            'width': 3
        }
    )
])
mf_fig.update_layout(
    barmode='stack',
    xaxis={
        'title': {'text': 'Year of Birth'},
        'tickfont': {'size': 14},
    },
    yaxis={
        # title.font replaces the deprecated `titlefont` attribute, which
        # newer plotly releases reject.
        'title': {'text': '% of Total', 'font': {'size': 14}},
        'tickfont': {'size': 14},
        # The plotted values are 0-1 fractions; render ticks as percentages so
        # they agree with the '% of Total' axis title.
        'tickformat': '.0%',
    },
)

# Writes the chart to an HTML file (plotly's offline plot also opens it in a
# browser by default).
plot(mf_fig, filename='male_female.html')

In [None]:
import plotly.graph_objects as go
from plotly.offline import plot

# `anxiety_hist` is not defined in any cell above, so this cell raised
# NameError on a fresh kernel. Build it here from `typed`.
# NOTE(review): reconstructed from the columns the chart reads ('anxiety',
# 'id_code') and the 'Count of Users' axis title - assumed to be the number of
# distinct id_codes per anxiety score; confirm this matches the original intent.
anxiety_hist = typed.groupby('anxiety')['id_code'].nunique().reset_index()

# Bar chart of how many id_codes reported each anxiety score.
anx_fig = go.Figure(data=[
    go.Bar(
        name='Anxiety Scores',
        x=anxiety_hist['anxiety'], y=anxiety_hist['id_code'],
    ),
])
anx_fig.update_layout(
    xaxis={
        'title': {'text': 'Anxiety'},
        'tickfont': {'size': 14},
    },
    yaxis={
        # title.font replaces the deprecated `titlefont` attribute, which
        # newer plotly releases reject.
        'title': {'text': 'Count of Users', 'font': {'size': 14}},
        'tickfont': {'size': 14},
    },
)

# Writes the chart to an HTML file (plotly's offline plot also opens it in a
# browser by default).
plot(anx_fig, filename='test.html')