# Import Libraries

In [59]:
# importing libraries
import datetime
import pandas as pd
import numpy as np
import plotly.graph_objects as go
import plotly.offline as pyo
import plotly.express as px
import collections
# must be installed manually
# conda install -c conda-forge pycountry
import pycountry
import difflib
from plotly.subplots import make_subplots
# statsmodels needs to be installed; it is used for trendlines

# Read CSV

In [60]:
# --- Suicides dataset ---------------------------------------------------------
# Source: https://www.kaggle.com/russellyates88/suicide-rates-overview-1985-to-2016
# Used to get the suicides data; loaded into a pandas dataframe.
datset1_url = "https://stojkovski.ch/master.csv"
df = pd.read_csv(datset1_url)

# --- Continent dataset --------------------------------------------------------
# Source: https://pkgstore.datahub.io/JohnSnowLabs/country-and-continent-codes-list/country-and-continent-codes-list-csv_csv/data/b7876b7f496677669644f3d1069d3121/country-and-continent-codes-list-csv_csv.csv
# Used to get the continent of each country.
datset2_url = "https://stojkovski.ch/country_codes.csv"
df2 = pd.read_csv(datset2_url)

# --- UV Radiation in 2014 dataset ---------------------------------------------
# Source: https://apps.who.int/gho/data/view.main.35300
# Used to get the UV radiation per country.
# NOTE(review): this comment says 2014, but the UV analysis further down filters
# the suicides data on 2004 -- confirm which year the UV figures refer to.
datset3_url = "https://stojkovski.ch/UV_1.csv"
df3 = pd.read_csv(datset3_url)

In [61]:
# Show the country/continent lookup table (long frames are truncated in the display)
df2

Unnamed: 0,Continent_Name,Continent_Code,Country_Name,Two_Letter_Country_Code,Three_Letter_Country_Code,Country_Number
0,Asia,AS,"Afghanistan, Islamic Republic of",AF,AFG,4.0
1,Europe,EU,"Albania, Republic of",AL,ALB,8.0
2,Antarctica,AN,Antarctica (the territory South of 60 deg S),AQ,ATA,10.0
3,Africa,AF,"Algeria, People's Democratic Republic of",DZ,DZA,12.0
4,Oceania,OC,American Samoa,AS,ASM,16.0
...,...,...,...,...,...,...
257,Africa,AF,"Zambia, Republic of",ZM,ZMB,894.0
258,Oceania,OC,Disputed Territory,XX,,
259,Asia,AS,Iraq-Saudi Arabia Neutral Zone,XE,,
260,Asia,AS,United Nations Neutral Zone,XD,,


# Data Cleaning

In [62]:
# TODO:
# - 7 countries removed (<= 3 years of data total)
# - 2016 data was removed (few countries had any, those that did often had data missing)
# - HDI was removed due to 2/3 missing data
# - Continent was added to the dataset using the countrycode package

# Years listed here are dropped because their data is incomplete or inaccurate.
blacklisted_years = [2016]
rows_in_blacklisted_years = df.index[df['year'].isin(blacklisted_years)]
df = df.drop(index=rows_in_blacklisted_years)

# Data Wrangling

In [63]:
# Replace country names that deviate from ISO 3166-1 with their ISO spelling
iso_country_names = {
    'Czech Republic': 'Czechia',
    'Republic of Korea': 'Korea, Republic of',
    'Macau': 'Macao',
}
for old_name, iso_name in iso_country_names.items():
    df.loc[df['country'] == old_name, 'country'] = iso_name

In [64]:
# Rename columns containing special characters.
# The two columns are still addressed by position (9 and 10), but the rename goes
# through DataFrame.rename instead of writing into `df.columns.values`: a pandas
# Index is immutable, and mutating its underlying array in place bypasses pandas,
# can leave cached label lookups stale, and fails where that array is read-only.
df = df.rename(columns={
    df.columns[9]: 'gdp_for_year_dollar',
    df.columns[10]: 'gdp_per_capita_dollar',
})

In [65]:
# Remove comma
# old way
# df['gdp_for_year_dollar'].replace(',', '').astype(float)
# better way
# df['gdp_for_year_dollar'].apply(lambda x: x.replace(',', ''))

In [66]:
# Add country code
# Names and alpha-3 codes of every country known to pycountry.
country_names = [x.name for x in pycountry.countries]
alpha3_by_name = {x.name: x.alpha_3 for x in pycountry.countries}

# Dataset country name -> most similar pycountry name.
# A candidate is accepted only when its similarity ratio is above 0.8; among the
# accepted candidates the one with the highest ratio wins.
best_match = dict()
for country in set(df['country']):
    best_confidence = 0
    # get_close_matches always returns a list (empty when nothing is similar
    # enough), so there is no None case to handle here.
    for match in difflib.get_close_matches(country, country_names):
        confidence = difflib.SequenceMatcher(None, match, country).ratio()
        if confidence > 0.8 and confidence > best_confidence:
            best_confidence = confidence
            best_match[country] = match

# Dataset country name -> alpha-3 code of its matched pycountry entry.
best_matchh = {name: alpha3_by_name[matched] for name, matched in best_match.items()}

# Countries for which no candidate passed the 0.8 threshold; kept for inspection.
unmatched_countries = sorted(set(df['country']) - set(best_matchh))

# One vectorised lookup instead of one full-frame assignment per country.
# Countries without a match get a missing value as their country code.
df['country code'] = df['country'].map(best_matchh)

In [67]:
# Add continent
# Build the country-code -> continent lookup once and map it onto every row.
# This replaces the slower variants that filtered df2 once per row or once per
# country, and that raised an IndexError as soon as a country code had no entry
# in df2. Codes without an entry now yield a missing continent instead.
continent_by_code = (
    df2.dropna(subset=['Three_Letter_Country_Code'])
       # if a code is listed more than once, keep its first row, which is the
       # row the per-country lookup selected before
       .drop_duplicates(subset='Three_Letter_Country_Code', keep='first')
       .set_index('Three_Letter_Country_Code')['Continent_Name']
)
df['continent'] = df['country code'].map(continent_by_code)
    

In [68]:
# Total suicides_no and population for each year
suicides_and_population_per_year = (
    df.groupby("year")[['suicides_no', 'population']].agg('sum')
)
# Suicides per 100k people for every year
yearly_suicides = suicides_and_population_per_year["suicides_no"]
yearly_population = suicides_and_population_per_year["population"]
year_suicidesno_relation = yearly_suicides.div(yearly_population).mul(100000)
# Mean of the yearly per-100k rates
suicides_mean = year_suicidesno_relation.mean()

In [69]:

# Line chart of the yearly suicide rate (per 100k people)
fig = go.Figure()
fig.add_trace(go.Scatter(
    x=year_suicidesno_relation.index,
    y=year_suicidesno_relation.values,
    line={},
))

# Horizontal dotted line at the mean rate, spanning the first to the last year
first_year = suicides_and_population_per_year.index.min()
last_year = suicides_and_population_per_year.index.max()
fig.add_shape(
    type="line",
    x0=first_year,
    y0=suicides_mean,
    x1=last_year,
    y1=suicides_mean,
    line={"color": "MediumPurple", "width": 4, "dash": "dot"},
)

fig.update_layout(
    title={
        "text": 'Global suicides (per 100k people)',
        "y": 0.9,
        "x": 0.5,
        "xanchor": 'center',
        "yanchor": 'top',
    },
    xaxis={"title": 'Year', "linecolor": 'rgb(204, 204, 204)'},
    yaxis={"title": 'Suicides per 100k'},
)

fig.show()


In [70]:
# Mean suicide rate for every (year, continent) pair
continent_df = (
    df[['country', 'year', 'country code', 'continent', 'suicides/100k pop']]
    .groupby(['year', 'continent'])['suicides/100k pop']
    .mean()
    .reset_index()
)

# One frame per continent, selected by continent name
continent_of_row = continent_df['continent']
africa_df = continent_df.loc[continent_of_row == 'Africa']
asia_df = continent_df.loc[continent_of_row == 'Asia']
europe_df = continent_df.loc[continent_of_row == 'Europe']
namerica_df = continent_df.loc[continent_of_row == 'North America']
oceania_df = continent_df.loc[continent_of_row == 'Oceania']
samerica_df = continent_df.loc[continent_of_row == 'South America']

In [71]:
# One panel per continent: (panel title / trace name, data, row, col),
# listed row by row so the titles line up with the 3x2 subplot grid.
continent_panels = [
    ("Europe", europe_df, 1, 1),
    ("Asia", asia_df, 1, 2),
    ("Africa", africa_df, 2, 1),
    ("Oceania", oceania_df, 2, 2),
    ("North America", namerica_df, 3, 1),
    ("South America", samerica_df, 3, 2),
]

fig = make_subplots(rows=3,
                    cols=2,
                    subplot_titles=tuple(panel[0] for panel in continent_panels),
                    shared_xaxes=True,
                    x_title="Year",
                    y_title="Suicide rate (per 100k people)"
                   )

# add_trace(row=, col=) replaces the deprecated append_trace
for panel_name, panel_df, panel_row, panel_col in continent_panels:
    fig.add_trace(go.Scatter(
        name=panel_name,
        x=panel_df['year'],
        y=panel_df['suicides/100k pop'],
    ), row=panel_row, col=panel_col)

fig.update_layout(height=700, width=900,
        title=dict(
            text='Trends over time by Continent (per 100k people)',
            y=0.95,
            x=0.5,
            xanchor='center',
            yanchor='top'
        ),
)

# Update xaxis properties (without row/col the update applies to every panel)
# NOTE(review): a logarithmic axis is unusual for calendar years -- confirm it is intended.
fig.update_xaxes(type="log")

# The y axes need no per-panel title: the shared y_title above labels all panels.

fig.show()

In [72]:
# Mean suicide rate per country, highest rate first
suicides_per_country = (
    df.groupby(['country'])['suicides/100k pop']
      .mean()
      .sort_values(ascending=False)
)

In [73]:
# The ten countries with the highest mean rate. suicides_per_country is sorted in
# descending order, so these are its first ten entries (the previous [10::-1]
# slice selected eleven). They are reversed so that the highest rate ends up at
# the top of the horizontal bar chart.
most_impacted = suicides_per_country.head(10).iloc[::-1]

fig = go.Figure(go.Bar(
            x=most_impacted.values,
            y=most_impacted.index,
            orientation='h',
            text=most_impacted.values))
fig.update_layout(
    title=dict(
        text='Top 10 most impacted countries (per 100k people)',
        y=0.9,
        x=0.5,
        xanchor='center',
        yanchor='top'
    ),
    xaxis=dict(
        title='Suicides',
        linecolor='rgb(204, 204, 204)',
    ),
    yaxis=dict(
        title='Country',
    )
)
# Print each bar's value next to it, hiding labels that would need a font below size 8
fig.update_traces(texttemplate='%{text:.2s}', textposition='outside')
fig.update_layout(uniformtext_minsize=8, uniformtext_mode='hide')
fig.show()

In [74]:
# The last ten entries of the descending ranking, i.e. the lowest mean rates
least_impacted = suicides_per_country.iloc[-10:]

fig = go.Figure()
fig.add_trace(go.Bar(
    x=least_impacted.values,
    y=least_impacted.index,
    orientation='h',
    text=least_impacted.values,
))
fig.update_layout(
    title={
        "text": 'Top 10 least impacted countries (per 100k people)',
        "y": 0.9,
        "x": 0.5,
        "xanchor": 'center',
        "yanchor": 'top',
    },
    xaxis={"title": 'Suicides', "linecolor": 'rgb(204, 204, 204)'},
    yaxis={"title": 'Country'},
)
# Value labels outside the bars; labels needing a font below size 8 are hidden
fig.update_traces(texttemplate='%{text:.2s}', textposition='outside')
fig.update_layout(uniformtext_minsize=8, uniformtext_mode='hide')
fig.show()

In [75]:
# Mean suicide rate per country, keeping the country code and continent for the map
world_columns = ['country', 'year', 'country code', 'continent', 'suicides/100k pop']
world_df = (
    df[world_columns]
    .groupby(['country', 'country code', 'continent'])['suicides/100k pop']
    .mean()
    .reset_index()
)

# Bubble map: one marker per country, sized by its mean rate, coloured by continent
fig = px.scatter_geo(
    world_df,
    locations="country code",
    color="continent",
    hover_name="country",
    size="suicides/100k pop",
    projection="natural earth",
)
fig.update_layout(
    title={
        "text": 'Suicides map (100k people mean)',
        "y": 0.95,
        "x": 0.5,
        "xanchor": 'center',
        "yanchor": 'top',
    },
)

fig.show()

In [76]:
# Mean suicide rate per (age group, sex), ordered from lowest to highest rate
sex_age_df = (
    df[['age', 'sex', 'suicides_no', 'suicides/100k pop']]
    .groupby(['age', 'sex'])['suicides/100k pop']
    .mean()
    .reset_index()
    .sort_values(by='suicides/100k pop', ascending=True)
)

# Split by sex
males = sex_age_df[sex_age_df['sex'] == "male"]
females = sex_age_df[sex_age_df['sex'] == "female"]

# Age-group labels in the same order as the rows above
males_list = males['age'].tolist()
females_list = females['age'].tolist()

In [77]:
# Grouped bars: one trace per sex, one bar per age group
fig = go.Figure()
for trace_name, rates, age_labels in (
    ('Males', males, males_list),
    ('Females', females, females_list),
):
    fig.add_trace(go.Bar(
        name=trace_name,
        x=age_labels,
        y=rates['suicides/100k pop'],
        text=rates['suicides/100k pop'],
    ))

# Bars of both sexes side by side, value labels outside the bars
fig.update_layout(barmode='group')
fig.update_traces(texttemplate='%{text:.2s}', textposition='outside')
fig.update_layout(uniformtext_minsize=8, uniformtext_mode='hide')
fig.update_layout(
    title={
        "text": 'Age Distribution',
        "y": 0.90,
        "x": 0.5,
        "xanchor": 'center',
        "yanchor": 'top',
    },
    xaxis={"title": 'Age Range', "linecolor": 'rgb(204, 204, 204)'},
    yaxis={"title": 'Suicides (per 100k people)'},
)
fig.show()

In [78]:
# Mean suicide rate per sex, as a two-column frame (sex, rate)
mean_rate_by_sex = df.groupby(['sex'])['suicides/100k pop'].mean()
sex_df = mean_rate_by_sex.reset_index()

In [79]:
# Pie chart of the mean suicide rate by sex.
# px.pie documents `title` as a plain string, so only the text is passed to it;
# the title placement is applied through update_layout, as in the other figures.
fig = px.pie(sex_df, values='suicides/100k pop', names='sex',
             title='Sex Distribution',
             hover_data=['suicides/100k pop'])
fig.update_layout(
    title=dict(
        text='Sex Distribution',
        y=0.90,
        x=0.5,
        xanchor='center',
        yanchor='top'
    ),
)
# Show the percentage and the label inside each slice
fig.update_traces(textposition='inside', textinfo='percent+label')
fig.show()

In [80]:
# Per country (with its continent): mean GDP per capita and mean suicide rate
scatter = (
    df.groupby(['country', 'continent'])[['gdp_per_capita_dollar', 'suicides/100k pop']]
      .mean()
      .reset_index()
)

In [81]:
# GDP per capita vs. suicide rate, one point per country, with an OLS trendline
fig = px.scatter(
    scatter,
    x="gdp_per_capita_dollar",
    y="suicides/100k pop",
    trendline='ols',
)
fig.show()

In [82]:
# Keep only the rows that have an HDI value
hdi_not_null_df = df.dropna(subset=['HDI for year'])
# Per country (with its continent): mean HDI and mean suicide rate
scatter2 = (
    hdi_not_null_df
    .groupby(['country', 'continent'])[['HDI for year', 'suicides/100k pop']]
    .mean()
    .reset_index()
)

In [83]:
# HDI vs. suicide rate, coloured by continent, with an OLS trendline
fig = px.scatter(
    scatter2,
    x="HDI for year",
    y="suicides/100k pop",
    color='continent',
    trendline='ols',
)
fig.show()

In [27]:
# Rows for the year 2004, taken as an independent copy so that adding a column
# below does not write into a slice of df (avoids SettingWithCopyWarning).
# NOTE(review): the filter compares `year` with the string "2004" -- confirm this
# matches the dtype of the year column.
only_2004 = df.query('year == "2004"').copy()

# Country -> UV radiation lookup built once from df3; if a country is listed
# more than once, its first row is used.
uv_by_country = (
    df3.dropna(subset=['Country'])
       .drop_duplicates(subset='Country', keep='first')
       .set_index('Country')['UV radiation']
)

# Countries missing from df3 get NaN. The column is created even when no country
# matches, so the filter below cannot fail on a missing column.
only_2004['UV radiation'] = only_2004['country'].map(uv_by_country)

only_2004_uv_not_null = only_2004.loc[only_2004['UV radiation'].notnull()]

In [28]:
# Mean suicide rate per (country, continent, UV radiation) over the 2004 rows
# that have a UV value. The bare `only_2004_uv_not_null` expression that used to
# open this cell was removed: only the last expression of a cell is displayed,
# so it had no effect.
scatter3 = (
    only_2004_uv_not_null
    .groupby(['country', 'continent', 'UV radiation'])[['suicides/100k pop']]
    .mean()
    .reset_index()
)

In [88]:
# UV radiation vs. suicide rate, one point per country, with an OLS trendline
fig = px.scatter(
    scatter3,
    x="UV radiation",
    y="suicides/100k pop",
    trendline='ols',
)
fig.show()

In [85]:
# Show the 2004 rows including the added 'UV radiation' column (display is truncated)
only_2004

Unnamed: 0,country,year,sex,age,suicides_no,population,suicides/100k pop,country-year,HDI for year,gdp_for_year_dollar,gdp_per_capita_dollar,generation,country code,continent,UV radiation
180,Albania,2004,male,75+ years,4,35526,11.26,Albania2004,,7314865176,2544,Silent,ALB,Europe,2542.0
181,Albania,2004,male,35-54 years,39,391767,9.95,Albania2004,,7314865176,2544,Boomers,ALB,Europe,2542.0
182,Albania,2004,male,25-34 years,16,203938,7.85,Albania2004,,7314865176,2544,Generation X,ALB,Europe,2542.0
183,Albania,2004,female,15-24 years,20,292268,6.84,Albania2004,,7314865176,2544,Millenials,ALB,Europe,2542.0
184,Albania,2004,male,15-24 years,19,286768,6.63,Albania2004,,7314865176,2544,Millenials,ALB,Europe,2542.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27731,Uzbekistan,2004,female,25-34 years,69,2003119,3.44,Uzbekistan2004,,12030023548,516,Generation X,UZB,Asia,3172.0
27732,Uzbekistan,2004,female,35-54 years,70,2761019,2.54,Uzbekistan2004,,12030023548,516,Boomers,UZB,Asia,3172.0
27733,Uzbekistan,2004,female,55-74 years,18,880059,2.05,Uzbekistan2004,,12030023548,516,Silent,UZB,Asia,3172.0
27734,Uzbekistan,2004,male,5-14 years,29,3167129,0.92,Uzbekistan2004,,12030023548,516,Millenials,UZB,Asia,3172.0
