# Introduction

This notebook is our attempt to apply what we learned in our Data Science class to a real-world analysis. We are students in the IT Bachelor's degree program at UFRN, taking the course taught by Professor Ivanovitch Silva.

We analyze two datasets: the World Happiness Report, available at [World Happiness Report (Kaggle)](https://www.kaggle.com/unsdsn/world-happiness), and Suicide Rates Overview (1985 to 2016), available at [Suicide Rates Overview (Kaggle)](https://www.kaggle.com/russellyates88/suicide-rates-overview-1985-to-2016). We merge the two and plot the results.

# Importing and Pre-processing Data

In [None]:
# install the latest version of folium
!pip install folium

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px

import numpy as np

import folium
import requests
from folium import plugins
import json
import os

In [3]:
!unzip 'happiness-rates.zip'
!unzip 'suicide-rates.zip'

Archive:  happiness-rates.zip
  inflating: 2015.csv                
  inflating: 2016.csv                
  inflating: 2017.csv                
  inflating: 2018.csv                
  inflating: 2019.csv                
Archive:  suicide-rates.zip
  inflating: master.csv              


In [4]:
# Load the suicide-rates table extracted from the archive and preview it.
suicides_csv = "master.csv"
df_suicides = pd.read_csv(suicides_csv)
df_suicides.head()

Unnamed: 0,country,year,sex,age,suicides_no,population,suicides/100k pop,country-year,HDI for year,gdp_for_year ($),gdp_per_capita ($),generation
0,Albania,1987,male,15-24 years,21,312900,6.71,Albania1987,,2156624900,796,Generation X
1,Albania,1987,male,35-54 years,16,308000,5.19,Albania1987,,2156624900,796,Silent
2,Albania,1987,female,15-24 years,14,289700,4.83,Albania1987,,2156624900,796,Generation X
3,Albania,1987,male,75+ years,1,21800,4.59,Albania1987,,2156624900,796,G.I. Generation
4,Albania,1987,male,25-34 years,9,274300,3.28,Albania1987,,2156624900,796,Boomers


In [5]:
# Distinct years present in the suicide data
df_suicides["year"].unique()

array([1987, 1988, 1989, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
       2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010,
       1985, 1986, 1990, 1991, 2012, 2013, 2014, 2015, 2011, 2016])

We want to join the tables by country and year. Unfortunately, there are only two years that appear in both suicide and happiness data - 2015 and 2016.

In [6]:
# Happiness data: one CSV per report year (only the two years used below)
df_2015, df_2016 = (pd.read_csv(csv_name) for csv_name in ('2015.csv', '2016.csv'))

# Cleaning and joining data

In [7]:
# Drop the columns that appear in one yearly table but not in the other, so
# both frames end up with the same set of columns.
# errors='ignore' keeps this cell re-runnable: on a second execution the
# columns are already gone and a plain drop would raise a KeyError.
df_2016 = df_2016.drop(columns=['Lower Confidence Interval', 'Upper Confidence Interval'],
                       errors='ignore')
df_2015 = df_2015.drop(columns=['Standard Error'], errors='ignore')

# Tag every row with the report year it came from
df_2015['year'] = 2015
df_2016['year'] = 2016

# Stack both years into one table. ignore_index=True gives the result a fresh
# 0..n-1 index instead of repeating each yearly frame's own row labels.
df_happiness = pd.concat([df_2015, df_2016], ignore_index=True)

In [8]:
# "country-year" only repeats the separate "country" and "year" columns, so it
# is removed when present (nothing happens if it is already gone).
suicides_trimmed = df_suicides.drop(columns=["country-year"], errors="ignore")

# Shorter column labels, assigned by position
suicides_trimmed.columns = ["country", "year", "sex","age","suicides_no","pop","suicides/100kpop","HDI","GDP_year","GDP_capita","gen"]

# Keep only the years that also exist in the happiness data
in_shared_years = suicides_trimmed['year'].isin([2015,2016])
df_suicides = suicides_trimmed[in_shared_years]

df_suicides.head()

Unnamed: 0,country,year,sex,age,suicides_no,pop,suicides/100kpop,HDI,GDP_year,GDP_capita,gen
576,Antigua and Barbuda,2015,female,55-74 years,1,6403,15.62,,1364863037,14853,Boomers
577,Antigua and Barbuda,2015,female,15-24 years,0,8561,0.0,,1364863037,14853,Millenials
578,Antigua and Barbuda,2015,female,25-34 years,0,7740,0.0,,1364863037,14853,Millenials
579,Antigua and Barbuda,2015,female,35-54 years,0,15323,0.0,,1364863037,14853,Generation X
580,Antigua and Barbuda,2015,female,5-14 years,0,8239,0.0,,1364863037,14853,Generation Z


In [9]:
# Short snake_case labels for the happiness table, assigned by position
happiness_labels = [
    "country", "region", "happiness_rank", 'happiness_score',
    "economy_pc", "family", "life_expect", "freedom",
    "trust", "generosity", "dystopia", "year",
]
df_happiness.columns = happiness_labels

df_happiness.head()


Unnamed: 0,country,region,happiness_rank,happiness_score,economy_pc,family,life_expect,freedom,trust,generosity,dystopia,year
0,Switzerland,Western Europe,1,7.587,1.39651,1.34951,0.94143,0.66557,0.41978,0.29678,2.51738,2015
1,Iceland,Western Europe,2,7.561,1.30232,1.40223,0.94784,0.62877,0.14145,0.4363,2.70201,2015
2,Denmark,Western Europe,3,7.527,1.32548,1.36058,0.87464,0.64938,0.48357,0.34139,2.49204,2015
3,Norway,Western Europe,4,7.522,1.459,1.33095,0.88521,0.66973,0.36503,0.34699,2.46531,2015
4,Canada,North America,5,7.427,1.32629,1.32261,0.90563,0.63297,0.32957,0.45811,2.45176,2015


In [10]:
# Join happiness and suicide records on the shared (country, year) key.
# No `how` is given, so pandas' default inner join applies: only keys present
# in both tables are kept.
# NOTE(review): countries spelled differently in the two sources will be
# dropped silently — worth checking which ones are lost.
merge_keys = ['country', 'year']
df_all = pd.merge(df_happiness, df_suicides, on=merge_keys)

df_all.head()

Unnamed: 0,country,region,happiness_rank,happiness_score,economy_pc,family,life_expect,freedom,trust,generosity,dystopia,year,sex,age,suicides_no,pop,suicides/100kpop,HDI,GDP_year,GDP_capita,gen
0,Switzerland,Western Europe,1,7.587,1.39651,1.34951,0.94143,0.66557,0.41978,0.29678,2.51738,2015,male,75+ years,143,275569,51.89,,679289166858,86068,Silent
1,Switzerland,Western Europe,1,7.587,1.39651,1.34951,0.94143,0.66557,0.41978,0.29678,2.51738,2015,male,55-74 years,264,891482,29.61,,679289166858,86068,Boomers
2,Switzerland,Western Europe,1,7.587,1.39651,1.34951,0.94143,0.66557,0.41978,0.29678,2.51738,2015,male,35-54 years,242,1248988,19.38,,679289166858,86068,Generation X
3,Switzerland,Western Europe,1,7.587,1.39651,1.34951,0.94143,0.66557,0.41978,0.29678,2.51738,2015,male,25-34 years,80,586880,13.63,,679289166858,86068,Millenials
4,Switzerland,Western Europe,1,7.587,1.39651,1.34951,0.94143,0.66557,0.41978,0.29678,2.51738,2015,male,15-24 years,62,482708,12.84,,679289166858,86068,Millenials


# Happiest and Saddest Countries

In [None]:
# Base URL of folium's example data and, below it, the world-countries GeoJSON
url = 'https://raw.githubusercontent.com/python-visualization/folium/master/examples/data'
country_shapes = '/'.join((url, 'world-countries.json'))

In [None]:
# Translate the happiness-report country names into the names used by the
# GeoJSON features, so the map can match them via `feature.properties.name`.
replace_values = {'United States':'United States of America',
                  'Tanzania':'United Republic of Tanzania',
                  'Congo (Brazzaville)':'Republic of the Congo',
                  # Kinshasa is the capital of the Democratic Republic of the Congo.
                  # Mapping it to 'Republic of the Congo' merged two different
                  # countries into a single averaged score.
                  'Congo (Kinshasa)': 'Democratic Republic of the Congo',
                  # NOTE(review): Hong Kong is folded into China, so China's mean
                  # below also averages in Hong Kong's rows — confirm this is intended.
                  'Hong Kong':'China',
                  'North Cyprus':'Northern Cyprus',
                  'Serbia':'Republic of Serbia'}

# Assign the result back to the column instead of calling
# replace(..., inplace=True) on the selected column: the chained in-place form
# does not update df_happiness when pandas Copy-on-Write is active.
df_happiness['country'] = df_happiness['country'].replace(replace_values)

# Mean happiness score per country over the years present in df_happiness
df = df_happiness.groupby('country')['happiness_score'].mean()


In [None]:
# Create the map
m = folium.Map(
    zoom_start=5,
)

# Ten evenly spaced bin edges covering the observed range of mean scores
threshold_scale = np.linspace(df.values.min(),
                              df.values.max(), 10, dtype=float).tolist()

folium.Choropleth(
    # The GeoJSON data describing the world's countries
    geo_data=country_shapes,
    # Series of mean happiness score indexed by country name
    data=df,
    # List with 2 values: the country names and the numerical values
    columns=[df.index, df.values],
    # GeoJSON property that is matched against the country names in `data`
    key_on='feature.properties.name',
    fill_color='PuRd',
    nan_fill_color='white',
    legend_name='Happinness Score across the world - 2015/2016',
    highlight=True,
    line_color = '#00000000',
    # `bins` is the current name of the deprecated `threshold_scale` keyword
    bins = threshold_scale
).add_to(m)


m

#https://medium.com/swlh/interactive-choropleth-maps-in-python-dd943b99df50

# Relation between suicides and happiness rates

In [11]:
# Per-country averages of the suicide rate and the happiness score.
# NOTE(review): this is an unweighted mean over every row of a country (the
# rows are sex/age groups); it is not a population-weighted national rate.
averaged_columns = ['suicides/100kpop', 'happiness_score']
df_grouped = df_all.groupby('country')[averaged_columns].mean()

# One bubble per country: happiness on x, suicide rate on y;
# bubble size follows the suicide rate and colour follows the happiness score.
fig = px.scatter(
    data_frame=df_grouped,
    x="happiness_score",
    y="suicides/100kpop",
    color='happiness_score',
    size='suicides/100kpop',
    hover_name=df_grouped.index,
)
fig.show()

In [None]:
# Inspect the suicide records of a single country
is_lithuania = df_suicides["country"].eq("Lithuania")
df_suicides[is_lithuania]

In [17]:
# Overall mean suicide rate, taken over every row of the merged table
mean_suicides = df_all['suicides/100kpop'].mean()

# Countries whose own average rate lies above that overall mean
above_mean = df_grouped['suicides/100kpop'].gt(mean_suicides)
most_suidices_index = df_grouped.loc[above_mean].index
