### Importing packages 

In [3]:
import pandas as pd 
import matplotlib.pyplot as plt 
import numpy as np 

### Reading in the data 

In [4]:
hbsc_data = pd.read_csv("HBSC2018OAed1.csv") 
hbsc_data.head() 

  hbsc_data = pd.read_csv("HBSC2018OAed1.csv")


Unnamed: 0,HBSC,seqno_int,cluster,countryno,region,id1,id2,id3,id4,weight,...,talkstepmo,famhelp,famsup,famtalk,famdec,MBMI,IRFAS,IRRELFAS_LMH,IOTF4,oweight_who
0,2018,100001,100231,8000,AL,10,58,231,321,1.0,...,5,7,6,7,5,17.981667470725,6,2,1.79769313486232e+308,1.79769313486232e+308
1,2018,100002,100238,8000,AL,10,60,238,611,1.0,...,5,7,7,7,7,17.7832495468691,1.79769313486232e+308,1.79769313486232e+308,1.79769313486232e+308,1.79769313486232e+308
2,2018,100004,100066,8000,AL,3,16,66,1225,1.0,...,5,7,7,1,7,24.2439184746877,1.79769313486232e+308,1.79769313486232e+308,1.79769313486232e+308,1.79769313486232e+308
3,2018,100005,100047,8000,AL,3,14,47,1371,1.0,...,5,7,7,7,7,15.0310509869072,1.79769313486232e+308,1.79769313486232e+308,1.79769313486232e+308,1.79769313486232e+308
4,2018,100007,100132,8000,AL,5,30,132,1604,1.0,...,5,2,1,1,1,15.5709342560554,1.79769313486232e+308,1.79769313486232e+308,1.79769313486232e+308,1.79769313486232e+308


In [5]:
hbsc_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244097 entries, 0 to 244096
Columns: 120 entries, HBSC to oweight_who
dtypes: float64(1), int64(10), object(109)
memory usage: 223.5+ MB


### Filtering for our countries of interest
The Boniel-Nissim et al., 2023 study only analyzed 18 out of the 48 total countries that partook in the study. This is because
only these 18 countries answered questions regarding the sleep questionnaire. Under the methods section, subsection "Data and participants" the following 18 countries are listed with their accompanying sample sizes. **Note**: The study uses "Republic of Moldova" while the table shows "Moldova". This has been done to simplify renaming of countries from ID number to country name later): 

Country            | Sample Size 
-------------------|--------------------
Flemish Belgium    | 3761 
French Belgium     | 3285
Canada             | 9122
Czech Republic     | 10,612
Denmark            | 2823 
Estonia            | 4461 
Finland            | 2899
Greece             | 3633 
Hungary            | 3561 
Iceland            | 6198 
Latvia             | 3985 
Netherlands        | 4540 
Norway             | 2572 
Poland             | 4828 
Portugal           | 5413 
Moldova            | 4325 
Scotland           | 4541
Ukraine            | 5983 

We will filter the rows under the `countryno` column to only contain these countries.  

In [6]:
country_dict = {56001 : "Flemish Belgium", 56002: "French Belgium",124000: "Canada", 
                203000: "Czech Republic", 208000: "Denmark", 233000: "Estonia", 
                246000: "Finland", 300000: "Greece", 348000: "Hungary",
                352000: "Iceland", 428000: "Latvia", 528000: "Netherlands",
                578000: "Norway", 616000: "Poland", 620000: "Portugual",  
                498000: "Maldova", 826002: "Scotland", 804000: "Ukraine" 
                } 

def filter_country_ids(countries: dict) -> list:
    """
    Takes a dictionary with country ID numbers as keys and names as values and compiles a list of all the keys to make filtering 
    rows by the specific keys easier. 
    """
    country_ids = []
    for key in countries:
        country_ids.append(key) 
    return country_ids 
    
relevant_country_ids = filter_country_ids(country_dict) 

#Filtering for the 18 relevant countries in hbsc dataset 
hbsc_data = hbsc_data.loc[hbsc_data["countryno"].isin(relevant_country_ids)]
hbsc_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 100079 entries, 53 to 244089
Columns: 120 entries, HBSC to oweight_who
dtypes: float64(1), int64(10), object(109)
memory usage: 92.4+ MB


In [7]:
#Renaming values in the countryno column from country ID to country name 

def obtain_country_names(countries: dict) -> list: 
    """
    Takes a dictionary with country ID numbers as keys and names as values and compiles a list of all the values to allow 
    us to identify countries by their names as opposed to their ID's.
    """
    country_names = []
    for key, value in countries.items(): 
        country_names.append(value) 
    return country_names 

relevant_country_names = obtain_country_names(country_dict) 

#Converting names and counting number of occurances 
hbsc_data = hbsc_data.replace(relevant_country_ids, relevant_country_names) 
counts = hbsc_data["countryno"].value_counts()
counts

countryno
Canada             12950
Czech Republic     11564
Iceland             6996
Ukraine             6660
Portugual           6126
French Belgium      5578
Poland              5224
Scotland            5021
Estonia             4725
Netherlands         4698
Maldova             4686
Latvia              4412
Flemish Belgium     4333
Greece              3863
Hungary             3789
Denmark             3181
Finland             3146
Norway              3127
Name: count, dtype: int64

### Further data filteration 
The values above from the filtered dataset do not match the values provided in the paper. We must further filter out no-response
values to try and get values as similar to Boniel-Nissim et al., 2023 study. In particular, we have to get rid of no-response 
values for the Social Media Use questions. In the dataset no-repsonse is indicated either by a value of 99 or 1.79769313486232e+308. The following colunms will be filtered to exclude these values. 

- emconlfreq1  
- emconlfreq2
- emconlfreq3 
- emconlfreq4 
- emconlpref1
- emconlpref2
- emconlpref3
- emcsocmed1
- emcsocmed2
- emcsocmed3
- emcsocmed4
- emcsocmed5
- emcsocmed5
- emcsocmed6
- emcsocmed7
- emcsocmed8
- emcsocmed9

In [67]:
hbsc_data["sleepdificulty"] = pd.to_numeric(hbsc_data["sleepdificulty"], errors='coerce')
hbsc_data["emconlfreq1"] = pd.to_numeric(hbsc_data["emconlfreq1"], errors='coerce')
hbsc_data["emconlfreq2"] = pd.to_numeric(hbsc_data["emconlfreq2"], errors='coerce')
hbsc_data["emconlfreq3"] = pd.to_numeric(hbsc_data["emconlfreq3"], errors='coerce')
hbsc_data["emconlfreq4"] = pd.to_numeric(hbsc_data["emconlfreq4"], errors='coerce')
hbsc_data["emconpref1"] = pd.to_numeric(hbsc_data["emconlpref1"], errors='coerce')
hbsc_data["emconpref2"] = pd.to_numeric(hbsc_data["emconlpref2"], errors='coerce')
hbsc_data["emconpref3"] = pd.to_numeric(hbsc_data["emconlpref3"], errors='coerce')
hbsc_data["emcsocmed1"] = pd.to_numeric(hbsc_data["emcsocmed1"], errors='coerce')
hbsc_data["emcsocmed2"] = pd.to_numeric(hbsc_data["emcsocmed2"], errors='coerce')
hbsc_data["emcsocmed3"] = pd.to_numeric(hbsc_data["emcsocmed3"], errors='coerce')
hbsc_data["emcsocmed4"] = pd.to_numeric(hbsc_data["emcsocmed4"], errors='coerce')
hbsc_data["emcsocmed5"] = pd.to_numeric(hbsc_data["emcsocmed5"], errors='coerce')
hbsc_data["emcsocmed6"] = pd.to_numeric(hbsc_data["emcsocmed6"], errors='coerce')
hbsc_data["emcsocmed7"] = pd.to_numeric(hbsc_data["emcsocmed7"], errors='coerce')
hbsc_data["emcsocmed8"] = pd.to_numeric(hbsc_data["emcsocmed8"], errors='coerce')
hbsc_data["emcsocmed9"] = pd.to_numeric(hbsc_data["emcsocmed9"], errors='coerce')

#Applying indexing to column names 
column_mapping = [f"{x[0]}:{x[1]}" for x in enumerate(hbsc_data.columns)]
column_mapping
# We need to select columns 3, 33, 74-89
selected_columns_dataframe = hbsc_data.iloc[:, np.r_[3, 33, 74:90]]

filter_out_99 = selected_columns_dataframe.loc[
    (selected_columns_dataframe["emconlfreq1"] != 99.0) &
    (selected_columns_dataframe["emconlfreq2"] != 99.0) &
    (selected_columns_dataframe["emconlfreq3"] != 99.0) &
    (selected_columns_dataframe["emconlfreq4"] != 99.0) &
    (selected_columns_dataframe["emconlpref1"] != 99) &
    (selected_columns_dataframe["emconlpref2"] != 99) &
    (selected_columns_dataframe["emconlpref3"] != 99) &
    (selected_columns_dataframe["emcsocmed1"] != 99.0) &
    (selected_columns_dataframe["emcsocmed2"] != 99.0) &
    (selected_columns_dataframe["emcsocmed3"] != 99.0) &
    (selected_columns_dataframe["emcsocmed4"] != 99.0) &
    (selected_columns_dataframe["emcsocmed5"] != 99.0) &
    (selected_columns_dataframe["emcsocmed6"] != 99.0) &
    (selected_columns_dataframe["emcsocmed7"] != 99.0) &
    (selected_columns_dataframe["emcsocmed8"] != 99.0) &
    (selected_columns_dataframe["emcsocmed9"] != 99.0) ]

filter_out_NaN = selected_columns_dataframe.dropna()
hbsc_data_filtered = pd.concat([filter_out_NaN, filter_out_99]).drop_duplicates() 
hbsc_data_filtered.head() 
count = hbsc_data_filtered["countryno"].value_counts() 
count

countryno
Canada             10039
Czech Republic      9098
Ukraine             5648
Iceland             5536
Poland              4737
French Belgium      4678
Portugual           4638
Scotland            4507
Estonia             4303
Maldova             4230
Netherlands         4025
Latvia              3928
Flemish Belgium     3710
Hungary             3493
Greece              3439
Finland             2827
Denmark             2774
Norway              2642
Name: count, dtype: int64