In [282]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt



In [283]:
# IDs of the questions kept for this analysis; used further down to filter
# the answers table.
correct_columns = [
    1, 2, 3, 5, 6, 7, 8, 9,
    13, 17, 18, 19,
    30, 32, 33, 34,
    93, 97, 99, 100,
    117, 118,
]

# Answers table (has at least the UserID and QuestionID columns used below).
answers = pd.read_csv('./data/Answer.csv')

# Separate export of the 2020 survey; its question columns are named 'q<N>'.
survey_2020 = pd.read_csv('./data/2020-survey.csv')

In [284]:
# Keep only the question columns of the 2020 survey, i.e. those whose name
# starts with 'q'; every other column is dropped.
is_question_column = survey_2020.columns.str.startswith('q')
unused_questions_removed = survey_2020.columns[is_question_column]
survey_2020 = survey_2020[unused_questions_removed]


## Filter out irrelevant questions

In [286]:
# Keep only the answers whose QuestionID is one of the selected questions.
is_selected_question = answers['QuestionID'].isin(correct_columns)
answers = answers[is_selected_question]

## Create survey DataFrame

We want a DataFrame in which each row represents one user's answers to one survey. We create an empty DataFrame and add one row for each unique UserID found in the answers table.

In [287]:
# Columns of the wide survey table: one column per selected question plus the
# survey year. Every entry must be followed by a comma: adjacent string
# literals are silently concatenated ('q118' 'Year' -> 'q118Year'), which would
# create one bogus column instead of the two intended ones.
survey_columns = [
    'q1', 'q2', 'q3', 'q5', 'q6', 'q7', 'q8', 'q9',
    'q13', 'q17', 'q18', 'q19',
    'q30', 'q32', 'q33', 'q34',
    'q93', 'q97', 'q99', 'q100',
    'q117', 'q118',
    'Year',
]
surveys = pd.DataFrame(columns=survey_columns)

# One row per distinct user found in the (already filtered) answers table.
# Assigning the array to the empty frame also creates the row index 0..N-1.
user_ids = answers['UserID'].unique()
surveys['UserID'] = user_ids

In order to fill out the survey DataFrame we have to iterate through each row in the answers table and set the values in the corresponding row in surveys.

In [290]:
# Map every UserID to the label of the row that carries it in `surveys`.
# Looking the row up (instead of assuming row == UserID - 1) keeps each answer
# on the right user's row even when UserIDs have gaps or are not in sorted
# order.
row_of_user = dict(zip(surveys['UserID'], surveys.index))

# itertuples yields plain tuples, so the fields are read strictly by position
# (integer keys on a label-indexed Series are deprecated in pandas).
# Positions used: 0 = answer text, 1 = year, 2 = user id, 3 = question id.
# NOTE(review): this relies on the column order of Answer.csv -- confirm that
# column 1 really holds the survey year.
for answer in answers.itertuples(index=False, name=None):
    answer_text, year, user_id, question_id = answer[:4]
    question_id = 'q' + str(question_id)  # e.g. 17 -> 'q17', the column name
    row = row_of_user[user_id]
    surveys.at[row, question_id] = answer_text
    surveys.at[row, 'Year'] = year
        
    

Some questions are very similar but only exist in some years, so we combine them.

In [291]:
# Question kept -> near-duplicate question that only fills its gaps.
merged_pairs = {'q17': 'q97', 'q18': 'q99', 'q19': 'q100'}

# Where the kept question has no answer, take the answer of its duplicate.
for kept, duplicate in merged_pairs.items():
    surveys[kept] = surveys[kept].combine_first(surveys[duplicate])

# The duplicates are now folded into the kept columns and can be removed.
surveys = surveys.drop(columns=list(merged_pairs.values()))


## Preprocess the 2020 survey
To concatenate the 2020 survey with our surveys DataFrame, we first have to do some preprocessing.

In [292]:
# Restrict the 2020 survey to the question columns it shares with `surveys`,
# in the same order as they appear there.
shared_question_columns = [
    'q1', 'q2', 'q3', 'q5', 'q6', 'q7', 'q8', 'q9',
    'q13', 'q17', 'q18', 'q19',
    'q30', 'q32', 'q33', 'q34',
]
survey_2020 = survey_2020[shared_question_columns]


## Negative age values
Some age values are negative; therefore, those entries are changed to positive values.

In [305]:
# The age answers in q1 are text, so a negative age is made positive by
# removing every '-' character (e.g. '-29' -> '29'). regex=False spells out
# that '-' is matched literally.
age_answers = surveys['q1']
surveys['q1'] = age_answers.str.replace('-', '', regex=False)

## Plot mental health issues for males and females

**TODO:** Lowercase all gender values; anything that is not male or female should become "other" or be removed.

In [331]:
# Peek at 20 randomly drawn gender answers (q2) to see how varied the free
# text is before normalizing it.
surveys['q2'].sample(n=20)

3587                    Nonbinary
3744                       Female
2400                      Unicorn
3910                         Male
350                          Male
2747                         male
1545                         Male
1713                         Male
1722                         Male
1145                       Female
1031                         Male
3593    Male (or female, or both)
466                          Male
3811                         Male
3463                       female
76                           Male
2264             Male/genderqueer
3092                         Male
3685                         Male
1284                         Male
Name: q2, dtype: object

In [None]:
# Collapse the free-text gender answers in q2 into three groups. The text is
# trimmed and lower-cased first; anything that is not exactly 'male' or
# 'female' afterwards is counted as 'other'. Missing answers are left out.
gender_text = surveys['q2'].dropna().astype(str).str.strip().str.lower()
gender_group = gender_text.where(gender_text.isin(['male', 'female']), 'other')
respondents_per_gender = gender_group.value_counts()

# Both lists have one entry per group, in the same order, so the frame can be
# built (lists of unequal length make pd.DataFrame raise a ValueError).
data = {'Gender': ['Male', 'Female', 'Other'],
        'Respondents': [int(respondents_per_gender.get(group, 0))
                        for group in ('male', 'female', 'other')]
       }

# NOTE(review): this plots the size of each gender group only; the breakdown
# of mental health issues per group still has to be added.
df = pd.DataFrame(data, columns=['Gender', 'Respondents'])
ax = df.plot(x='Gender', y='Respondents', kind='bar', legend=False)
ax.set(title='Survey respondents by gender', ylabel='Respondents')
plt.show()