In [134]:
# libraries
import pandas as pd
import numpy as np
from load_data import load_train_data
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm

df, X_valid, y_train, y_valid = load_train_data()
# note X_train is named as df for convenience

### Data Cleaning
- There are a few features which are coded as strings but mix strings and integers/floats (see example below):
    - 'dependency', 
    - 'edjefe', 
    - 'edjefa'

We will need to decide how to handle these.

### Lit review

[Understanding the Determinants of Poverty](https://web.worldbank.org/archive/website01407/WEB/IMAGES/PMCH8.PDF)

- using the highest level of the individuals in the household as the 
household level characteristic. IE, education level of the most highly educated
person in the household

[Introduction to Poverty Analysis](https://documents1.worldbank.org/curated/en/775871468331250546/pdf/902880WP0Box380okPovertyAnalysisEng.pdf)

- p88 - use household head characteristics

[HOUSEHOLD CHARACTERISTICS AND POVERTY: A LOGISTIC REGRESSION ANALYSIS](https://www.jstor.org/stable/23612271?seq=8)

- p310
    - use presence of disability, able-bodied persons, in the household
    - sex ratio in household
    - child/woman ratio in household
    - proportion of female workers to total workers
    - dependency ratio

[Understanding poverty through household and individual level characteristics](https://worldbank.github.io/SARMD_guidelines/note-hhdchars.html)

- "For example, it is not true in general that female-headed households have lower levels of expenditures per capita"
- "It is true, however, that urban households have significantly higher expenditures per capita"

[The DHS Wealth Index](https://dhsprogram.com/pubs/pdf/cr6/cr6.pdf)

- "For this reason, Filmer and Pritchett recommended using principal components analysis
(PCA) to assign the indicator weights, the procedure that is used for the DHS wealth index."

[Poverty and its measurement](https://www.ine.es/en/daco/daco42/sociales/pobreza_en.pdf)

- p8-9 - calculate income per consumption unit rather than per capita

[ARE POOR INDIVIDUALS MAINLY FOUND IN POOR HOUSEHOLDS? EVIDENCE USING NUTRITION DATA FOR AFRICA](https://www.nber.org/system/files/working_papers/w24047/w24047.pdf)

[Moving from the Household to the Individual: Multidimensional Poverty Analysis](https://arxiv.org/ftp/arxiv/papers/1304/1304.5816.pdf)
- "Using longitudinal data Medeiros and Costa (2008) conclude that
feminisation of poverty has not occurred in the eight Latin American countries they
studied. Their findings are invariant to different measures and definitions of poverty."
- "marital status is an important consideration when discussing poverty incidence"

In [135]:
features_to_include = [
    col
    for col in df.columns
    if col
    in [
        "v2a1",
        "hacdor",
        "rooms",
        "hacapo",
        "v14a",
        "refrig",
        "v18q",
        "v18q1",
        "r4h1",
        "r4h2",
        "r4h3",
        "r4m1",
        "r4m2",
        "r4m3",
        "r4t1",
        "r4t2",
        "r4t3",
        "tamhog",
        "tamviv",
        "escolari",
        "rez_esc",
        "hhsize",
        "paredblolad",
        "paredzocalo",
        "paredpreb",
        "pareddes",
        "paredmad",
        "paredzinc",
        "paredfibras",
        "paredother",
        "pisomoscer",
        "pisocemento",
        "pisoother",
        "pisonatur",
        "pisonotiene",
        "pisomadera",
        "techozinc",
        "techoentrepiso",
        "techocane",
        "techootro",
        "cielorazo",
        "abastaguadentro",
        "abastaguafuera",
        "abastaguano",
        "public",
        "planpri",
        "noelec",
        "coopele",
        "sanitario1",
        "sanitario2",
        "sanitario3",
        "sanitario5",
        "sanitario6",
        "energcocinar1",
        "energcocinar2",
        "energcocinar3",
        "energcocinar4",
        "elimbasu1",
        "elimbasu2",
        "elimbasu3",
        "elimbasu4",
        "elimbasu5",
        "elimbasu6",
        "epared1",
        "epared2",
        "epared3",
        "etecho1",
        "etecho2",
        "etecho3",
        "eviv1",
        "eviv2",
        "eviv3",
        "dis",
        "male",
        "female",
        "estadocivil1",
        "estadocivil2",
        "estadocivil3",
        "estadocivil4",
        "estadocivil5",
        "estadocivil6",
        "estadocivil7",
        "parentesco1",
        "parentesco2",
        "parentesco3",
        "parentesco4",
        "parentesco5",
        "parentesco6",
        "parentesco7",
        "parentesco8",
        "parentesco9",
        "parentesco10",
        "parentesco11",
        "parentesco12",
        "hogar_nin",
        "hogar_adul",
        "hogar_mayor",
        "hogar_total",
        "meaneduc",
        "instlevel1",
        "instlevel2",
        "instlevel3",
        "instlevel4",
        "instlevel5",
        "instlevel6",
        "instlevel7",
        "instlevel8",
        "instlevel9",
        "bedrooms",
        "overcrowding",
        "tipovivi1",
        "tipovivi2",
        "tipovivi3",
        "tipovivi4",
        "tipovivi5",
        "computer",
        "television",
        "mobilephone",
        "qmobilephone",
        "lugar1",
        "lugar2",
        "lugar3",
        "lugar4",
        "lugar5",
        "lugar6",
        "area1",
        "area2",
        "age",
        "SQBescolari",
        "SQBage",
        "SQBhogar_total",
        "SQBedjefe",
        "SQBhogar_nin",
        "SQBovercrowding",
        "SQBdependency",
        "SQBmeaned",
        "agesq",
        "Target",
    ]
]
df_subset = df[features_to_include].copy()
df_subset

# Calculate the standard deviation of each column
stds = df_subset[features_to_include].std()
top_st_dev = stds.nlargest(5)
top_st_dev


var_desc = {
    "v2a1": "Monthly rent payment",
    "hacdor": "Overcrowding by bedrooms",
    "rooms": "Number of all rooms in the house",
    "hacapo": "Overcrowding by rooms",
    "v14a": "Has toilet in the household",
    "refrig": "Household has refrigerator",
    "v18q": "Owns a tablet",
    "v18q1": "Number of tablets household owns",
    "r4h1": "Males younger than 12 years of age",
    "r4h2": "Males 12 years of age and older",
    "r4h3": "Total males in the household",
    "r4m1": "Females younger than 12 years of age",
    "r4m2": "Females 12 years of age and older",
    "r4m3": "Total females in the household",
    "r4t1": "Persons younger than 12 years of age",
    "r4t2": "Persons 12 years of age and older",
    "r4t3": "Total persons in the household",
    "tamhog": "Size of the household",
    "tamviv": "TamViv",
    "escolari": "Years of schooling",
    "rez_esc": "Years behind in school",
    "hhsize": "Household size",
    "paredblolad": "Predominant material on the outside wall is block or brick",
    "paredzocalo": "Predominant material on the outside wall is socket (wood, zinc or absbesto)",
    "paredpreb": "Predominant material on the outside wall is prefabricated or cement",
    "pareddes": "Predominant material on the outside wall is waste material",
    "paredmad": "Predominant material on the outside wall is wood",
    "paredzinc": "Predominant material on the outside wall is zink",
    "paredfibras": "Predominant material on the outside wall is natural fibers",
    "paredother": "Predominant material on the outside wall is other",
    "pisomoscer": "Predominant material on the floor is mosaic, ceramic, terrazo",
    "pisocemento": "Predominant material on the floor is cement",
    "pisoother": "Predominant material on the floor is other",
    "pisonatur": "Predominant material on the floor is natural material",
    "pisonotiene": "No floor at the household",
    "pisomadera": "Predominant material on the floor is wood",
    "techozinc": "Predominant material on the roof is metal foil or zink",
    "techoentrepiso": "Predominant material on the roof is fiber cement, mezzanine",
    "techocane": "Predominant material on the roof is natural fibers",
    "techootro": "Predominant material on the roof is other",
    "cielorazo": "House has ceiling",
    "abastaguadentro": "Water provision inside the dwelling",
    "abastaguafuera": "Water provision outside the dwelling",
    "abastaguano": "No water provision",
    "public": "Electricity from CNFL, ICE, ESPH/JASEC",
    "planpri": "Electricity from private plant",
    "noelec": "No electricity in the dwelling",
    "noelec": "no electricity in the dwelling",
    "coopele": "electricity from cooperative",
    "sanitario1": "no toilet in the dwelling",
    "sanitario2": "toilet connected to sewer or cesspool",
    "sanitario3": "toilet connected to septic tank",
    "sanitario5": "toilet connected to black hole or letrine",
    "sanitario6": "toilet connected to other system",
    "energcocinar1": "no main source of energy used for cooking (no kitchen)",
    "energcocinar2": "main source of energy used for cooking electricity",
    "energcocinar3": "main source of energy used for cooking gas",
    "energcocinar4": "main source of energy used for cooking wood charcoal",
    "elimbasu1": "rubbish disposal mainly by tanker truck",
    "elimbasu2": "rubbish disposal mainly by botan hollow or buried",
    "elimbasu3": "rubbish disposal mainly by burning",
    "elimbasu4": "rubbish disposal mainly by throwing in an unoccupied space",
    "elimbasu5": "rubbish disposal mainly by throwing in river, creek or sea",
    "elimbasu6": "rubbish disposal mainly other",
    "epared1": "walls are bad",
    "epared2": "walls are regular",
    "epared3": "walls are good",
    "etecho1": "roof is bad",
    "etecho2": "roof is regular",
    "etecho3": "roof is good",
    "eviv1": "floor is bad",
    "eviv2": "floor is regular",
    "eviv3": "floor is good",
    "dis": "disable person",
    "male": "male",
    "female": "female",
    "estadocivil1": "less than 10 years old",
    "estadocivil2": "free or coupled union",
    "estadocivil3": "married",
    "estadocivil4": "divorced",
    "estadocivil5": "separated",
    "estadocivil6": "widow/er",
    "estadocivil7": "single",
    "parentesco1": "household head",
    "parentesco2": "spouse/partner",
    "parentesco3": "son/daughter",
    "parentesco4": "stepson/daughter",
    "parentesco5": "son/daughter in law",
    "parentesco6": "grandson/daughter",
    "parentesco7": "mother/father",
    "parentesco8": "father/mother in law",
    "parentesco9": "brother/sister",
    "parentesco10": "brother/sister in law",
    "parentesco11": "other family member",
    "parentesco12": "other non-family member",
    "idhogar": "household level identifier",
    "hogar_nin": "number of children 0 to 19 in household",
    "hogar_adul": "number of adults in household",
    "hogar_mayor": "number of individuals 65+ in the household",
    "hogar_total": "number of total individuals in the household",
    "dependency": "dependency rate",
    "edjefe": "years of education of male head of household",
    "edjefa": "years of education of female head of household",
    "meaneduc": "average years of education for adults (18+)",
    "instlevel1": "=1 no level of education",
    "instlevel2": "=1 incomplete primary",
    "instlevel3": "=1 complete primary",
    "instlevel4": "=1 incomplete academic secondary level",
    "instlevel5": "=1 complete academic secondary level",
    "instlevel6": "=1 incomplete technical secondary level",
    "instlevel7": "=1 complete technical secondary level",
    "instlevel8": "=1 undergraduate and higher education",
    "instlevel9": "=1 postgraduate higher education",
    "bedrooms": "number of bedrooms",
    "overcrowding": "# persons per room",
    "tipovivi1": "=1 own and fully paid house",
    "tipovivi2": "=1 own, paying in installments",
    "tipovivi3": "=1 rented",
    "tipovivi4": "=1 precarious",
    "tipovivi5": "=1 other(assigned, borrowed)",
    "computer": "=1 if the household has notebook or desktop computer",
    "television": "=1 if the household has TV",
    "mobilephone": "=1 if mobile phone",
    "qmobilephone": "# of mobile phones",
    "lugar1": "=1 region Central",
    "lugar2": "=1 region Chorotega",
    "lugar3": "=1 region Pacífico central",
    "lugar4": "=1 region Brunca",
    "lugar5": "=1 region Huetar Atlántica",
    "lugar6": "=1 region Huetar Norte",
    "area1": "=1 zona urbana",
    "area2": "=2 zona rural",
    "age": "Age in years",
    "SQBescolari": "escolari squared",
    "SQBage": "age squared",
    "SQBhogar_total": "hogar_total squared",
    "SQBedjefe": "edjefe squared",
    "SQBhogar_nin": "hogar_nin squared",
    "SQBovercrowding": "overcrowding squared",
    "SQBdependency": "dependency squared",
    "SQBmeaned": "meaned squared",
    "agesq": "Age squared",
}

In [136]:
df_subset = df_subset.fillna(df_subset.mean())

# Select the target column and the other columns of interest
target_col = "Target"
other_cols = [
    "v2a1",
    "hacdor",
    "rooms",
    "hacapo",
    "v14a",
    "refrig",
    "v18q",
    "v18q1",
    "r4h1",
    "r4h2",
    "r4h3",
    "r4m1",
    "r4m2",
    "r4m3",
    "r4t1",
    "r4t2",
    "r4t3",
    "tamhog",
    "tamviv",
    "escolari",
    "rez_esc",
    "hhsize",
    "paredblolad",
    "paredzocalo",
    "paredpreb",
    "pareddes",
    "paredmad",
    "paredzinc",
    "paredfibras",
    "paredother",
    "pisomoscer",
    "pisocemento",
    "pisoother",
    "pisonatur",
    "pisonotiene",
    "pisomadera",
    "techozinc",
    "techoentrepiso",
    "techocane",
    "techootro",
    "cielorazo",
    "abastaguadentro",
    "abastaguafuera",
    "abastaguano",
    "public",
    "planpri",
    "noelec",
    "coopele",
    "sanitario1",
    "sanitario2",
    "sanitario3",
    "sanitario5",
    "sanitario6",
    "energcocinar1",
    "energcocinar2",
    "energcocinar3",
    "energcocinar4",
    "elimbasu1",
    "elimbasu2",
    "elimbasu3",
    "elimbasu4",
    "elimbasu5",
    "elimbasu6",
    "epared1",
    "epared2",
    "epared3",
    "etecho1",
    "etecho2",
    "etecho3",
    "eviv1",
    "eviv2",
    "eviv3",
    "dis",
    "male",
    "female",
    "estadocivil1",
    "estadocivil2",
    "estadocivil3",
    "estadocivil4",
    "estadocivil5",
    "estadocivil6",
    "estadocivil7",
    "parentesco1",
    "parentesco2",
    "parentesco3",
    "parentesco4",
    "parentesco5",
    "parentesco6",
    "parentesco7",
    "parentesco8",
    "parentesco9",
    "parentesco10",
    "parentesco11",
    "parentesco12",
    "hogar_nin",
    "hogar_adul",
    "hogar_mayor",
    "hogar_total",
    "meaneduc",
    "instlevel1",
    "instlevel2",
    "instlevel3",
    "instlevel4",
    "instlevel5",
    "instlevel6",
    "instlevel7",
    "instlevel8",
    "instlevel9",
    "bedrooms",
    "overcrowding",
    "tipovivi1",
    "tipovivi2",
    "tipovivi3",
    "tipovivi4",
    "tipovivi5",
    "computer",
    "television",
    "mobilephone",
    "qmobilephone",
    "lugar1",
    "lugar2",
    "lugar3",
    "lugar4",
    "lugar5",
    "lugar6",
    "area1",
    "area2",
    "age",
    "SQBescolari",
    "SQBage",
    "SQBhogar_total",
    "SQBedjefe",
    "SQBhogar_nin",
    "SQBovercrowding",
    "SQBdependency",
    "SQBmeaned",
    "agesq",
]

# Create an empty dataframe to store the regression results
results_df = pd.DataFrame(columns=["variable", "coefficient", "p_value", "r_squared"])

# Iterate over each independent variable in the dataframe
for col in df_subset.columns[:-1]:
    # Fit a linear regression model on the independent variable and target
    X = df_subset[[col]]
    y = y_train["Target"]
    X = sm.add_constant(X)
    model = sm.OLS(y, X).fit()

    # Get the coefficient, p-value, and R-squared for the model
    coeff = model.params[1]
    p_value = model.pvalues[1]
    r_squared = model.rsquared

    # Add the results to the results dataframe
    results_df.loc[len(results_df)] = [col, coeff, p_value, r_squared]

# Add a column with the variable descriptions
results_df["variable_desc"] = results_df["variable"].map(var_desc)

# Print the results dataframe
print(results_df)

            variable  coefficient        p_value  r_squared  \
0               v2a1     0.000002   2.349945e-30   0.017008   
1             hacdor    -0.972788   4.170707e-59   0.033804   
2              rooms     0.153738   2.459432e-88   0.050622   
3             hacapo    -0.895870   3.957915e-33   0.018638   
4               v14a     0.866027   1.164341e-07   0.003667   
..               ...          ...            ...        ...   
131        SQBedjefe     0.003137  2.616150e-106   0.060825   
132     SQBhogar_nin    -0.044599  7.923814e-172   0.097124   
133  SQBovercrowding    -0.063836  1.688727e-119   0.068246   
134    SQBdependency    -0.006762   3.934324e-13   0.006869   
135        SQBmeaned     0.003062  5.851780e-136   0.077426   

                        variable_desc  
0                Monthly rent payment  
1            Overcrowding by bedrooms  
2    Number of all rooms in the house  
3               Overcrowding by rooms  
4         Has toilet in the household  
.. 

In [137]:
# Filter the results where p_value is less than or equal to 0.05
results_df = results_df[results_df["p_value"] <= 0.05]

# Sort the results by r-squared from least to greatest
results_df = results_df.sort_values(by="r_squared", ascending=False)

# Print the results dataframe
print(results_df)

           variable  coefficient        p_value  r_squared  \
98         meaneduc     0.080755  1.120972e-196   0.110527   
94        hogar_nin    -0.242501  6.047454e-193   0.108526   
14             r4t1    -0.300630  1.343977e-174   0.098628   
132    SQBhogar_nin    -0.044599  7.923814e-172   0.097124   
19         escolari     0.063796  1.027866e-157   0.089423   
..              ...          ...            ...        ...   
46           noelec    -0.643940   6.657750e-03   0.000963   
39        techootro     0.692177   1.008356e-02   0.000866   
101      instlevel3    -0.064319   2.317320e-02   0.000674   
37   techoentrepiso     0.190463   3.039188e-02   0.000613   
36        techozinc     0.144453   3.379452e-02   0.000589   

                                         variable_desc  
98         average years of education for adults (18+)  
94             number of children 0 to 19 in household  
14                Persons younger than 12 years of age  
132                        

In [138]:
# Filter the results where p_value is less than or equal to 0.05
results_df = results_df[results_df["p_value"] <= 0.05]

# Sort the results by r-squared from least to greatest
results_df = results_df.sort_values(by="coefficient", ascending=False)

# Print the results dataframe
print(results_df)

        variable  coefficient        p_value  r_squared  \
4           v14a     0.866027   1.164341e-07   0.003667   
117  mobilephone     0.723237   4.267062e-22   0.012147   
107   instlevel9     0.701181   2.202417e-13   0.007017   
39     techootro     0.692177   1.008356e-02   0.000866   
40     cielorazo     0.638890  6.784136e-156   0.088426   
..           ...          ...            ...        ...   
28   paredfibras    -0.946817   1.804639e-03   0.001273   
1         hacdor    -0.972788   4.170707e-59   0.033804   
60     elimbasu4    -0.976908   3.583327e-03   0.001109   
43   abastaguano    -1.212262   7.133514e-08   0.003791   
33     pisonatur    -1.421876   2.228287e-05   0.002350   

                                         variable_desc  
4                          Has toilet in the household  
117                                 =1 if mobile phone  
107                   =1 postgraduate higher education  
39           Predominant material on the roof is other  
40    

In [147]:
results_df.to_csv("regression_results.csv", index=False)

The following represent the split between individual characteristics and household characteristics: 

In [None]:
'''
Household
v2a1,Monthly rent payment
hacdor,=1 Overcrowding by bedrooms
rooms, number of all rooms in the house
hacapo,=1 Overcrowding by rooms
v14a,=1 has toilet in the household
refrig,=1 if the household has refrigerator
v18q1,number of tablets household owns
r4h1,Males younger than 12 years of age
r4h2,Males 12 years of age and older
r4h3,Total males in the household
r4m1,Females younger than 12 years of age
r4m2,Females 12 years of age and older
r4m3,Total females in the household
r4t1,persons younger than 12 years of age
r4t2,persons 12 years of age and older
r4t3,Total persons in the household
tamhog,size of the household
hhsize,household size


paredblolad,=1 if predominant material on the outside wall is block or brick
paredzocalo,"=1 if predominant material on the outside wall is socket (wood, zinc or absbesto"
paredpreb,=1 if predominant material on the outside wall is prefabricated or cement
pareddes,=1 if predominant material on the outside wall is waste material
paredmad,=1 if predominant material on the outside wall is wood 
paredzinc,=1 if predominant material on the outside wall is zink
paredfibras,=1 if predominant material on the outside wall is natural fibers
paredother,=1 if predominant material on the outside wall is other
pisomoscer,"=1 if predominant material on the floor is mosaic, ceramic, terrazo"
pisocemento,=1 if predominant material on the floor is cement
pisoother,=1 if predominant material on the floor is other
pisonatur,=1 if predominant material on the floor is  natural material
pisonotiene,=1 if no floor at the household
pisomadera,=1 if predominant material on the floor is wood
techozinc,=1 if predominant material on the roof is metal foil or zink
techoentrepiso,"=1 if predominant material on the roof is fiber cement, mezzanine "
techocane,=1 if predominant material on the roof is natural fibers
techootro,=1 if predominant material on the roof is other
cielorazo,=1 if the house has ceiling
abastaguadentro,=1 if water provision inside the dwelling
abastaguafuera,=1 if water provision outside the dwelling
abastaguano,=1 if no water provision
public,"=1 electricity from CNFL, ICE, ESPH/JASEC"
planpri,=1 electricity from private plant
noelec,=1 no electricity in the dwelling
coopele,=1 electricity from cooperative
sanitario1,=1 no toilet in the dwelling
sanitario2,=1 toilet connected to sewer or cesspool
sanitario3,=1 toilet connected to  septic tank
sanitario5,=1 toilet connected to black hole or letrine
sanitario6,=1 toilet connected to other system
energcocinar1,=1 no main source of energy used for cooking (no kitchen)
energcocinar2,=1 main source of energy used for cooking electricity
energcocinar3,=1 main source of energy used for cooking gas
energcocinar4,=1 main source of energy used for cooking wood charcoal
elimbasu1,=1 if rubbish disposal mainly by tanker truck
elimbasu2,=1 if rubbish disposal mainly by botan hollow or buried
elimbasu3,=1 if rubbish disposal mainly by burning
elimbasu4,=1 if rubbish disposal mainly by throwing in an unoccupied space
elimbasu5,"=1 if rubbish disposal mainly by throwing in river, creek or sea"
elimbasu6,=1 if rubbish disposal mainly other
epared1,=1 if walls are bad
epared2,=1 if walls are regular
epared3,=1 if walls are good
etecho1,=1 if roof are bad
etecho2,=1 if roof are regular
etecho3,=1 if roof are good
eviv1,=1 if floor are bad
eviv2,=1 if floor are regular
eviv3,=1 if floor are good
idhogar,Household level identifier
hogar_nin,Number of children 0 to 19 in household
hogar_adul,Number of adults in household
hogar_mayor,# of individuals 65+ in the household
hogar_total,# of total individuals in the household
dependency,Dependency rate
edjefe,years of education of male head of household
edjefa,years of education of female head of household
meaneduc,average years of education for adults (18+)
bedrooms,number of bedrooms
overcrowding,# persons per room
tipovivi1,=1 own and fully paid house
tipovivi2,"=1 own, paying in installments"
tipovivi3,=1 rented
tipovivi4,=1 precarious
tipovivi5,"=1 other(assigned, borrowed)"
computer,=1 if the household has notebook or desktop computer
television,=1 if the household has TV
qmobilephone,# of mobile phones
lugar1,=1 region Central
lugar2,=1 region Chorotega
lugar3,=1 region PacÃƒÂ­fico central
lugar4,=1 region Brunca
lugar5,=1 region Huetar AtlÃƒÂ¡ntica
lugar6,=1 region Huetar Norte
area1,=1 zona urbana
area2,=2 zona rural

Individual 
v18q,owns a tablet
tamviv,TamViv
escolari,years of schooling
rez_esc,Years behind in school
dis,=1 if disable person
male,=1 if male
female,=1 if female
estadocivil1,=1 if less than 10 years old
estadocivil2,=1 if free or coupled uunion
estadocivil3,=1 if married
estadocivil4,=1 if divorced
estadocivil5,=1 if separated
estadocivil6,=1 if widow/er
estadocivil7,=1 if single
parentesco1,=1 if household head
parentesco2,=1 if spouse/partner
parentesco3,=1 if son/doughter
parentesco4,=1 if stepson/doughter
parentesco5,=1 if son/doughter in law
parentesco6,=1 if grandson/doughter
parentesco7,=1 if mother/father
parentesco8,=1 if father/mother in law
parentesco9,=1 if brother/sister
parentesco10,=1 if brother/sister in law
parentesco11,=1 if other family member
parentesco12,=1 if other non family member
instlevel1,=1 no level of education
instlevel2,=1 incomplete primary
instlevel3,=1 complete primary
instlevel4,=1 incomplete academic secondary level
instlevel5,=1 complete academic secondary level
instlevel6,=1 incomplete technical secondary level
instlevel7,=1 complete technical secondary level
instlevel8,=1 undergraduate and higher education
instlevel9,=1 postgraduate higher education
mobilephone,=1 if mobile phone
'''

In [139]:
Individual = ['v18q', 'tamviv', 'escolari', 'rez_esc', 'dis', 'male', 'female', 'estadocivil1','estadocivil2','estadocivil3','estadocivil4','estadocivil5','estadocivil6',\
'estadocivil7','parentesco1','parentesco2','parentesco3','parentesco4', 'parentesco5', 'parentesco6', 'parentesco7',\
'parentesco8', 'parentesco9', 'parentesco10', 'parentesco11' ,'parentesco12', 'instlevel1', 'instlevel2', 'instlevel3', 'instlevel6', 'instlevel7', 'instlevel8', 'instlevel9', 'mobilephone']

Household = [
    'v2a1','hacdor',
'rooms', 'hacapo','v14a', 'refrig', 'v18q1', 'r4h1', 'r4h2', 'r4h3', 'r4m1', 'r4m2','r4m3','r4t1', 'r4t2','r4t3','tamhog','hhsize',
'paredblolad','paredzocalo','paredpreb','pareddes','paredmad','paredzinc','paredfibras','paredother','pisomoscer','pisocemento',
'pisoother','pisonatur','pisonotiene','pisomadera','techozinc', 'techoentrepiso', 'techocane', 'techootro','cielorazo','abastaguadentro',
'abastaguafuera','abastaguano', 'public','planpri','noelec','coopele','sanitario1','sanitario2','sanitario3','sanitario5','sanitario6','energcocinar1',
'energcocinar2','energcocinar3','energcocinar4','elimbasu1','elimbasu2','elimbasu3','elimbasu4','elimbasu5','elimbasu6','epared1','epared2','epared3','etecho1',
'etecho2','etecho3','eviv1','eviv2','eviv3',#'idhogar',
'hogar_nin','hogar_adul','hogar_mayor','hogar_total',#'dependency',#'edjefe','edjefa',
'meaneduc','bedrooms','overcrowding','tipovivi1','tipovivi2','tipovivi3','tipovivi4','tipovivi5','computer','television','qmobilephone',
'lugar1','lugar2','lugar3','lugar4','lugar5','lugar6',
'area1','area2']


If we split on these we learn the following:

In [146]:
df_subset2 = df_subset.copy() 
df_subset = df_subset[Household].fillna(df_subset[Household].mean())

# Select the target column and the other columns of interest
target_col = "Target"
other_cols = Household

# Create an empty dataframe to store the regression results
results_df = pd.DataFrame(columns=["variable", "coefficient", "p_value", "r_squared"])

# Iterate over each independent variable in the dataframe
for col in df_subset.columns[:-1]:
    # Fit a linear regression model on the independent variable and target
    X = df_subset[[col]]
    y = y_train["Target"]
    X = sm.add_constant(X)
    model = sm.OLS(y, X).fit()

    # Get the coefficient, p-value, and R-squared for the model
    coeff = model.params[1]
    p_value = model.pvalues[1]
    r_squared = model.rsquared

    # Add the results to the results dataframe
    results_df.loc[len(results_df)] = [col, coeff, p_value, r_squared]

# Add a column with the variable descriptions
results_df["variable_desc"] = results_df["variable"].map(var_desc)

# Filter the results where p_value is less than or equal to 0.05
results_df = results_df[results_df["p_value"] <= 0.05]

# Sort the results by r-squared from least to greatest
results_df = results_df.sort_values(by="r_squared", ascending=False)

# Print the results dataframe
print(results_df)

          variable  coefficient        p_value  r_squared  \
72        meaneduc     0.080755  1.120972e-196   0.110527   
68       hogar_nin    -0.242501  6.047454e-193   0.108526   
13            r4t1    -0.300630  1.343977e-174   0.098628   
36       cielorazo     0.638890  6.784136e-156   0.088426   
74    overcrowding    -0.361997  3.274840e-152   0.086403   
..             ...          ...            ...        ...   
56       elimbasu4    -0.976908   3.583327e-03   0.001109   
42          noelec    -0.643940   6.657750e-03   0.000963   
35       techootro     0.692177   1.008356e-02   0.000866   
33  techoentrepiso     0.190463   3.039188e-02   0.000613   
32       techozinc     0.144453   3.379452e-02   0.000589   

                                        variable_desc  
72        average years of education for adults (18+)  
68            number of children 0 to 19 in household  
13               Persons younger than 12 years of age  
36                                  House h

In [145]:
df_subset = df_subset2.copy()
df_subset = df_subset[Individual].fillna(df_subset[Individual].mean())

# Select the target column and the other columns of interest
target_col = "Target"
other_cols = Individual 

# Create an empty dataframe to store the regression results
results_df = pd.DataFrame(columns=["variable", "coefficient", "p_value", "r_squared"])

# Iterate over each independent variable in the dataframe
for col in df_subset.columns[:-1]:
    # Fit a linear regression model on the independent variable and target
    X = df_subset[[col]]
    y = y_train["Target"]
    X = sm.add_constant(X)
    model = sm.OLS(y, X).fit()

    # Get the coefficient, p-value, and R-squared for the model
    coeff = model.params[1]
    p_value = model.pvalues[1]
    r_squared = model.rsquared

    # Add the results to the results dataframe
    results_df.loc[len(results_df)] = [col, coeff, p_value, r_squared]

# Add a column with the variable descriptions
results_df["variable_desc"] = results_df["variable"].map(var_desc)

df_subset=df_subset2.copy()

# Filter the results where p_value is less than or equal to 0.05
results_df = results_df[results_df["p_value"] <= 0.05]

# Sort the results by r-squared from least to greatest
results_df = results_df.sort_values(by="r_squared", ascending=False)

# Print the results dataframe
print(results_df)

        variable  coefficient        p_value  r_squared  \
2       escolari     0.063796  1.027866e-157   0.089423   
0           v18q     0.561255   4.283957e-97   0.055608   
31    instlevel8     0.613720   2.505349e-78   0.044895   
1         tamviv    -0.084752   2.108554e-44   0.025243   
27    instlevel2    -0.420081   1.126248e-43   0.024818   
26    instlevel1    -0.425687   1.074212e-36   0.020731   
7   estadocivil1    -0.394486   3.489173e-32   0.018083   
9   estadocivil3     0.292192   9.319299e-30   0.016656   
32    instlevel9     0.701181   2.202417e-13   0.007017   
19   parentesco6    -0.301802   6.379435e-09   0.004402   
11  estadocivil5    -0.225072   2.087719e-06   0.002941   
4            dis    -0.228945   4.469559e-06   0.002751   
16   parentesco3    -0.102964   1.327795e-05   0.002479   
15   parentesco2     0.126881   1.938825e-05   0.002385   
10  estadocivil4     0.269694   4.014296e-05   0.002204   
14   parentesco1     0.090992   2.529281e-04   0.001751 

This demonstrates that on an individual level the Household characteristics provide a lot more explanitory value than the individual characteristics. Understanding that poverty does not exist in a vacuum-- the conditions of someones family and environment are important factors in understanding their financial status. 