In [108]:
import os

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

import dossier

### Setup
---

In [109]:
# read and inspect dataset
# The parquet location can be overridden with the WVS_W7_PATH environment
# variable, so the notebook also runs on machines that do not share this exact
# folder layout. When the variable is unset, the original local path is used.
w7_path = os.environ.get(
    "WVS_W7_PATH",
    r"C:\Users\andre\OneDrive\Shared\Machine Learning Bootcamp\content\projects\wvs\w7.parquet",
)
data = pd.read_parquet(w7_path)
data

Unnamed: 0,version,doi,A_WAVE,A_YEAR,A_STUDY,B_COUNTRY,B_COUNTRY_ALPHA,C_COW_NUM,C_COW_ALPHA,D_INTERVIEW,...,WVS_Polmistrust_PartyVoter,WVS_LR_MedianVoter,WVS_LibCon_MedianVoter,v2psbars,v2psorgs,v2psprbrch,v2psprlnks,v2psplats,v2xnp_client,v2xps_party
0,5-0-0 (2022-12-23),doi.org/10.14281/18241.20,7,2018,2,20,AND,232,AND,20070001,...,62.4342105263158,-999,-999,-999,-999,-999,-999,-999,-999,-999
1,5-0-0 (2022-12-23),doi.org/10.14281/18241.20,7,2018,2,20,AND,232,AND,20070002,...,62.4342105263158,-999,-999,-999,-999,-999,-999,-999,-999,-999
2,5-0-0 (2022-12-23),doi.org/10.14281/18241.20,7,2018,2,20,AND,232,AND,20070003,...,62.4342105263158,-999,-999,-999,-999,-999,-999,-999,-999,-999
3,5-0-0 (2022-12-23),doi.org/10.14281/18241.20,7,2018,2,20,AND,232,AND,20070004,...,,,,,,,,,,
4,5-0-0 (2022-12-23),doi.org/10.14281/18241.20,7,2018,2,20,AND,232,AND,20070005,...,66.9642857142857,-999,-999,-999,-999,-999,-999,-999,-999,-999
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
94273,5-0-0 (2022-12-23),doi.org/10.14281/18241.20,7,2022,2,909,NIR,202,NIRL,909070443,...,,,,,,,,,,
94274,5-0-0 (2022-12-23),doi.org/10.14281/18241.20,7,2022,2,909,NIR,202,NIRL,909070444,...,,,,,,,,,,
94275,5-0-0 (2022-12-23),doi.org/10.14281/18241.20,7,2022,2,909,NIR,202,NIRL,909070445,...,,,,,,,,,,
94276,5-0-0 (2022-12-23),doi.org/10.14281/18241.20,7,2022,2,909,NIR,202,NIRL,909070446,...,,,,,,,,,,


In [110]:
# Narrow the raw survey to the items of interest (606 columns down to 37) and,
# in the same chain, swap the survey item codes for readable names
# (e.g., Q260 > sex). Both the selection and the renaming are driven by
# dossier.columns_to_retain (item code -> readable name).
df = (
    data
    .loc[:, list(dossier.columns_to_retain.keys())]
    .rename(columns=dossier.columns_to_retain)
)
df

Unnamed: 0,country,interview,mode,settlment,urbrural,respint,literacy,intprivacy,weight,s018,...,donated_campaign,contacted_government,encouraged_political_action,encouraged_voting,fosters_political_knowledge,signed_petitions_online,encouraged_political_action_online,organized_political_events,voted_locally,voted_nationally
0,20,20070001,2,1,1,1,-4,1,1,0.9960159,...,3,3,3,3,3,3,3,3,1,1
1,20,20070002,2,2,1,1,-4,1,1,0.9960159,...,1,2,1,1,1,2,2,3,1,1
2,20,20070003,2,2,1,1,-4,1,1,0.9960159,...,1,1,1,2,2,1,2,2,2,2
3,20,20070004,2,2,1,2,-4,1,1,0.9960159,...,2,3,2,2,2,3,3,3,2,2
4,20,20070005,2,2,1,2,-4,1,1,0.9960159,...,1,3,2,2,1,2,2,3,2,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
94273,909,909070443,1,5,2,1,1,1,4.000926545,2.2371365,...,2,2,2,2,2,3,3,3,1,1
94274,909,909070444,3,4,1,-2,-2,-2,4.647935732,2.2371365,...,3,2,3,3,3,2,3,3,3,3
94275,909,909070445,1,5,2,1,1,1,4.848000132,2.2371365,...,3,3,2,2,2,1,2,3,2,1
94276,909,909070446,1,3,1,1,1,1,4.848000132,2.2371365,...,1,3,2,3,2,2,2,3,2,4


### EDA
---

In [111]:
# Each item has additional response possibilities besides the substantive
# answers, where -1=don't know, -2=no answer... We'll treat these as missing
# values.

# first, save the number of rows of the df (the denominator of the share)
total_values = df.shape[0]
# then, count the alternative (negative) responses per variable.
# Values are parsed as numbers and compared with 0 rather than searched for a
# "-" substring: a hyphen elsewhere in a value (e.g. "1e-05") is no longer
# counted as missing, and non-string cells no longer raise a TypeError.
# Cells that cannot be parsed become NaN and are simply not counted here.
missing_values = df.apply(pd.to_numeric, errors="coerce").lt(0).sum()
# finally, behold result and cross thy fingers
print(r"% of Missing values by variable")
(missing_values / total_values * 100).round(2).sort_values(ascending=False)

# this doesn't bode well. We'll need to employ data imputation

% of Missing values by variable


literacy                              58.37
spouse_occupation                     39.48
spouse_employment                     39.18
spouse_education_level                36.72
employment_sector                     24.23
father_occupation                     13.66
mother_education_level                13.05
intprivacy                            12.84
organized_political_events            12.52
encouraged_political_action_online    12.32
signed_petitions_online               11.44
respint                               10.80
fosters_political_knowledge            9.26
father_birth_country                   8.03
mother_birth_country                   7.91
encouraged_political_action            7.41
occupation                             6.89
subjective_social_class                5.77
voted_nationally                       5.71
citizenship                            5.52
father_immigrant                       5.17
mother_immigrant                       4.96
voted_locally                   

In [112]:
# Replace alternate (negative-coded) responses with nulls and make every
# column numeric.
# Parsing first and masking `< 0` (instead of testing `"-" in val` on the raw
# strings) keeps this cell safe to re-run: on a second run `df` already holds
# numbers/NaN, for which the substring test would raise a TypeError. It also
# avoids nulling values that merely contain a hyphen (e.g. "1e-05"), and it
# converts string columns whatever dtype they are stored with.
# NOTE(review): assumes every retained column holds numeric codes;
# pd.to_numeric raises on anything else, as the original conversion did.
df = df.apply(pd.to_numeric)
# negative codes become NaN; existing NaN stay NaN (NaN < 0 is False)
df = df.mask(df < 0)
df

In [None]:
# df.info() 
# df.describe()

In [4]:
# weight and s018 (equilibrated weight) are special variables
# weight adjusts the results of the sample to the target population
# s018 controls the sample size between countries to round them all to N=1000


# does it really make any sense to adjust variables such as year of birth? 

In [5]:
# political engagement is decomposed in the survey between several items
# these items have different scales
# to avoid undue influence of items with higher scales, we normalize the items


In [6]:
# adjust each subject's responses with the weights

In [7]:
# if I delete a subset of the data, then I would need to update s018, since it has been computed on the original responses

In [10]:
# df.age * df.weight * df.s018
