The analysis here refers to the Factor Analysis section of the following paper:

[The sustainable development oxymoron: quantifying and modelling the incompatibility of sustainable development goals](https://www.tandfonline.com/doi/full/10.1080/13504509.2016.1235624)

### Import Modules

In [31]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

### Import Data

In [32]:
# Load the raw dataset and preview its first five rows.
df_raw = pd.read_csv(
    filepath_or_buffer='data/oxymoron/SDG_paper_data_all_raw.csv',
)
df_raw.head(n=5)

Unnamed: 0,Row.names,country,year,civil_libert,polit_rights,freedom_press,democ,autoc,polity,polity2,...,RL.PER.RNK,RL.STD.ERR,RQ.EST,RQ.NO.SRC,RQ.PER.RNK,RQ.STD.ERR,VA.EST,VA.NO.SRC,VA.PER.RNK,VA.STD.ERR
0,Afghanistan.1980,Afghanistan,1980,7.0,7.0,,,,,,...,,,,,,,,,,
1,Afghanistan.1981,Afghanistan,1981,7.0,7.0,,,,,,...,,,,,,,,,,
2,Afghanistan.1982,Afghanistan,1982,7.0,7.0,,,,,,...,,,,,,,,,,
3,Afghanistan.1983,Afghanistan,1983,7.0,7.0,,,,,,...,,,,,,,,,,
4,Afghanistan.1984,Afghanistan,1984,7.0,7.0,,,,,,...,,,,,,,,,,


Unnamed: 0,Row.names,country,year,civil_libert,polit_rights,freedom_press,democ,autoc,polity,polity2,...,RL.PER.RNK,RL.STD.ERR,RQ.EST,RQ.NO.SRC,RQ.PER.RNK,RQ.STD.ERR,VA.EST,VA.NO.SRC,VA.PER.RNK,VA.STD.ERR
0,Afghanistan.1980,Afghanistan,1980,7.0,7.0,,,,,,...,,,,,,,,,,
1,Afghanistan.1981,Afghanistan,1981,7.0,7.0,,,,,,...,,,,,,,,,,
2,Afghanistan.1982,Afghanistan,1982,7.0,7.0,,,,,,...,,,,,,,,,,
3,Afghanistan.1983,Afghanistan,1983,7.0,7.0,,,,,,...,,,,,,,,,,
4,Afghanistan.1984,Afghanistan,1984,7.0,7.0,,,,,,...,,,,,,,,,,


In [33]:
# Load the second dataset (SDG_paper_data.csv) and preview its first five rows.
df = pd.read_csv(
    filepath_or_buffer='data/oxymoron/SDG_paper_data.csv',
)
df.head(n=5)

Unnamed: 0,country,year,Row.names,civil_libert,polit_rights,freedom_press,polity2,durable,INTTOT,CIVTOT,...,trade_freedom.resc,TX.VAL.AGRI.ZS.UN.resc,WORKER.resc,government_spending.resc,IT.NET.USER.P2.resc,RL.EST.resc,NE.CON.TOTL.CD.resc,NY.ADJ.DRES.GN.ZS.resc,NE.CON.TETC.CD.resc,NY.GDS.TOTL.CD.resc
0,Afghanistan,1980,Afghanistan-1980,7.0,7.0,,,0.0,0.0,7.0,...,,,,,,,,0.078607,,
1,Afghanistan,1981,Afghanistan-1981,7.0,7.0,,,0.0,0.0,7.0,...,,,1.0,,,,,0.074783,,
2,Afghanistan,1982,Afghanistan-1982,7.0,7.0,,,0.0,0.0,7.0,...,,,1.0,,,,,0.072438,,
3,Afghanistan,1983,Afghanistan-1983,7.0,7.0,,,0.0,0.0,7.0,...,,,1.0,,,,,0.070094,,
4,Afghanistan,1984,Afghanistan-1984,7.0,7.0,,,0.0,0.0,7.0,...,,,1.0,,,,,0.067749,,


Unnamed: 0,country,year,Row.names,civil_libert,polit_rights,freedom_press,polity2,durable,INTTOT,CIVTOT,...,trade_freedom.resc,TX.VAL.AGRI.ZS.UN.resc,WORKER.resc,government_spending.resc,IT.NET.USER.P2.resc,RL.EST.resc,NE.CON.TOTL.CD.resc,NY.ADJ.DRES.GN.ZS.resc,NE.CON.TETC.CD.resc,NY.GDS.TOTL.CD.resc
0,Afghanistan,1980,Afghanistan-1980,7.0,7.0,,,0.0,0.0,7.0,...,,,,,,,,0.078607,,
1,Afghanistan,1981,Afghanistan-1981,7.0,7.0,,,0.0,0.0,7.0,...,,,1.0,,,,,0.074783,,
2,Afghanistan,1982,Afghanistan-1982,7.0,7.0,,,0.0,0.0,7.0,...,,,1.0,,,,,0.072438,,
3,Afghanistan,1983,Afghanistan-1983,7.0,7.0,,,0.0,0.0,7.0,...,,,1.0,,,,,0.070094,,
4,Afghanistan,1984,Afghanistan-1984,7.0,7.0,,,0.0,0.0,7.0,...,,,1.0,,,,,0.067749,,


### Check whether the targets mentioned in the paper are available in these datasets

In [34]:
# Human-readable label for each target, in the order used throughout
# this notebook.
target_labels = [
    'Poverty',
    'Hunger',
    'Child.Mortality',
    'Education',
    'Women.Parliament',
    'Water',
    'Alternative.Energy',
    'Youth.Unemployment',
    'Internet',
    'GINI',
    'Sanitation',
    'Air.Pollution',
    'CO2.emissions',
    'Protected.Sea',
    'Protected.Land',
    'Violence',
]

# Dataset column name for each target; position i here corresponds to
# position i in target_labels.
target_columns = [
    'SI.POV.DDAY',
    'SN.ITK.DEFC.ZS',
    'SH.DYN.MORT',
    'SE.SEC.ENRR',
    'SG.GEN.PARL.ZS',
    'SH.H2O.SAFE.ZS',
    'EG.USE.COMM.CL.ZS',
    'SL.UEM.1524.ZS',
    'IT.NET.USER.P2',
    'SI.POV.GINI',
    'SH.STA.ACSN',
    'SH.STA.AIRP.P5',
    'EN.ATM.CO2E.PC',
    'ER.MRN.PTMR.ZS',
    'ER.LND.PTLD.ZS',
    'VC.IHR.PSRC.P5',
]

# The two lists are paired by position, so a length mismatch would pair
# labels with the wrong columns; fail loudly instead.
if len(target_labels) != len(target_columns):
    raise ValueError(
        'target_labels has {} entries but target_columns has {}'.format(
            len(target_labels), len(target_columns)
        )
    )

# One row per target, numbered from 1; the index length follows the lists
# rather than being hard-coded.
goals_df = pd.DataFrame(
    {'target_label': target_labels, 'target_column': target_columns},
    index=pd.RangeIndex(1, len(target_labels) + 1),
)

# Flag, for every target, whether its column is present in df.
columns = df.columns.tolist()
goals_df['Available?'] = goals_df['target_column'].isin(columns)

# Same check against the raw dataset, df_raw.
columns_raw = df_raw.columns.tolist()
goals_df['Available in Raw?'] = goals_df['target_column'].isin(columns_raw)

goals_df

Unnamed: 0,target_label,target_column,Available?,Available in Raw?
1,Poverty,SI.POV.DDAY,False,True
2,Hunger,SN.ITK.DEFC.ZS,False,True
3,Child.Mortality,SH.DYN.MORT,True,True
4,Education,SE.SEC.ENRR,True,True
5,Women.Parliament,SG.GEN.PARL.ZS,True,True
6,Water,SH.H2O.SAFE.ZS,True,True
7,Alternative.Energy,EG.USE.COMM.CL.ZS,True,True
8,Youth.Unemployment,SL.UEM.1524.ZS,True,True
9,Internet,IT.NET.USER.P2,True,True
10,GINI,SI.POV.GINI,False,True


Unnamed: 0,target_label,target_column,Available?,Available in Raw?
1,Poverty,SI.POV.DDAY,False,True
2,Hunger,SN.ITK.DEFC.ZS,False,True
3,Child.Mortality,SH.DYN.MORT,True,True
4,Education,SE.SEC.ENRR,True,True
5,Women.Parliament,SG.GEN.PARL.ZS,True,True
6,Water,SH.H2O.SAFE.ZS,True,True
7,Alternative.Energy,EG.USE.COMM.CL.ZS,True,True
8,Youth.Unemployment,SL.UEM.1524.ZS,True,True
9,Internet,IT.NET.USER.P2,True,True
10,GINI,SI.POV.GINI,False,True


## Processing
### Add the `country` and `year` index columns from SDG_paper_data to the un-indexed mydata and save
The saved output is used as the input to the R script factorAnalysis.R.

In [35]:
# Load mydata (presumably exported from an RData file, judging by the path).
# The 'Unnamed: 0' column is read as the index and then discarded by
# reset_index(drop=True), leaving a fresh 0..n-1 index.
mydata = pd.read_csv(
    'data/oxymoron/from_RData/mydata_export.csv',
    index_col='Unnamed: 0',
).reset_index(drop=True)

# DataFrame.insert aligns a Series on the index, so country/year are matched
# to mydata rows purely by row number. If the row counts differ, rows would
# silently receive NaN or labels would be dropped, so stop instead.
if len(mydata) != len(df):
    raise ValueError(
        'mydata has {} rows but df has {}; cannot attach country/year '
        'by row position'.format(len(mydata), len(df))
    )

# Put the identifying columns first: country at position 0, year at position 1.
mydata.insert(0, 'country', df['country'])
mydata.insert(1, 'year', df['year'])

mydata.head()

Unnamed: 0,country,year,CO2.emissions,Air.Pollution,Protected.Land,Education,Women.Parliament,Child.Mortality,Water,GINI,...,Hunger,Violence,Internet,Protected.Sea,Sanitation,Poverty,Alternative.Energy,oneoverco2,oneoverpl,oneoverps
0,Afghanistan,1980,0.133543,,,0.832369,,25.5,,,...,,,,,,,,7.432544,,
1,Afghanistan,1981,0.15303,,,0.806355,,24.87,,,...,,,,,,,,6.492247,,
2,Afghanistan,1982,0.166306,,,0.893583,,24.21,,,...,,,,,,,,5.977075,,
3,Afghanistan,1983,0.206086,,,0.88419,,23.51,,,...,,,,,,,,4.8289,,
4,Afghanistan,1984,0.238812,,,0.874798,,22.75,,,...,,,,,,,,4.169937,,


Unnamed: 0,country,year,CO2.emissions,Air.Pollution,Protected.Land,Education,Women.Parliament,Child.Mortality,Water,GINI,...,Hunger,Violence,Internet,Protected.Sea,Sanitation,Poverty,Alternative.Energy,oneoverco2,oneoverpl,oneoverps
0,Afghanistan,1980,0.133543,,,0.832369,,25.5,,,...,,,,,,,,7.432544,,
1,Afghanistan,1981,0.15303,,,0.806355,,24.87,,,...,,,,,,,,6.492247,,
2,Afghanistan,1982,0.166306,,,0.893583,,24.21,,,...,,,,,,,,5.977075,,
3,Afghanistan,1983,0.206086,,,0.88419,,23.51,,,...,,,,,,,,4.8289,,
4,Afghanistan,1984,0.238812,,,0.874798,,22.75,,,...,,,,,,,,4.169937,,


In [36]:
# Write this new mydata (which now has country and year columns) to file.
# The default index=True is kept, so the row index is still written as an
# unnamed first column, exactly as before.
output_path = Path('outputs/data/for_r/mydata_with_indices.csv')

# to_csv does not create missing directories, so make sure the target
# folder exists before writing.
output_path.parent.mkdir(parents=True, exist_ok=True)
mydata.to_csv(output_path)