# Social Capital Analytics Challenge

---

The Social Capital








# Upload and Clean Data

In [None]:
#Import libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn import linear_model
from sklearn.tree import DecisionTreeRegressor
import statsmodels.api as sm
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
!pip install statsmodels
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from statsmodels.imputation import mice
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline



In [None]:
# Mount Google Drive into the Colab runtime at /content/drive (Colab-only step)
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Load the merged dataset (already joined with county population stats to
# address nulls) and discard, in the same chain, the columns that are not
# needed or have the wrong data type.
sc = (
    pd.read_csv("/content/drive/MyDrive/Social Capital Data/merged_df.csv")
    .drop(columns=[
        'Unnamed: 0',
        'county',
        'county_name',
        'child_ec_se_county',
        'ec_high_se_county',
        'child_high_ec_se_county',
        'ec_se_county',
    ])
)

To address the missing population statistics for 106 counties, we sourced census data from 2021 and used it as a proxy for each county's 2018 population so that it integrates with the existing data. We then used the Excel lookup formula below to join the missing population figures to each county, **reducing the null values for 'pop2018' from 106 to 0**, with a high level of confidence in our manual imputation.



=IF(ISBLANK(C2), "Missing", IF(ISERROR(VLOOKUP(C2, A:A, 1, FALSE)), "Missing", VLOOKUP(C2, A:B, 2, FALSE)))



Source: https://www.census.gov/data/tables/time-series/demo/popest/2020s-counties-total.html

In [None]:
# Show the dtype of every column that remains after the drop
sc.dtypes

num_below_p50                   float64
pop2018                         float64
ec_county                       float64
child_ec_county                 float64
ec_grp_mem_county               float64
ec_high_county                  float64
child_high_ec_county            float64
ec_grp_mem_high_county          float64
exposure_grp_mem_county         float64
exposure_grp_mem_high_county    float64
child_exposure_county           float64
child_high_exposure_county      float64
bias_grp_mem_county             float64
bias_grp_mem_high_county        float64
child_bias_county               float64
child_high_bias_county          float64
clustering_county               float64
support_ratio_county            float64
volunteering_rate_county        float64
civic_organizations_county      float64
dtype: object

In [None]:
# Count missing values per column to see which metrics need imputation
sc.isna().sum()

num_below_p50                     2
pop2018                           0
ec_county                        71
child_ec_county                 360
ec_grp_mem_county                77
ec_high_county                   71
child_high_ec_county            360
ec_grp_mem_high_county           77
exposure_grp_mem_county          77
exposure_grp_mem_high_county     77
child_exposure_county           360
child_high_exposure_county      360
bias_grp_mem_county              77
bias_grp_mem_high_county         77
child_bias_county               360
child_high_bias_county          360
clustering_county                 0
support_ratio_county              0
volunteering_rate_county          0
civic_organizations_county        0
dtype: int64

# Imputation of child metrics

In [None]:
# child_ec_county imputation

from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import cross_val_score
import numpy as np
import pandas as pd

class NumericImputer:
    """Fill gaps in one numeric column using a k-nearest-neighbours regressor.

    The regressor is fitted on rows where the target and every feature are
    present and then predicts the target where it is missing.  All methods
    only consider rows whose threshold column is <= max_pop_threshold.

    NOTE(review): unless threshold_column is given, the threshold is compared
    with the FIRST feature column rather than with a population column,
    despite the parameter name -- confirm which was intended.
    """

    def __init__(self, n_neighbors=5):
        """Build the imputer.

        Args:
            n_neighbors: neighbour count passed to KNeighborsRegressor.
        """
        self.n_neighbors = n_neighbors
        self.knn_regressor = KNeighborsRegressor(n_neighbors=self.n_neighbors)

    @staticmethod
    def _as_feature_list(feature_variables):
        """Wrap a bare column name in a list; lists are returned unchanged."""
        if isinstance(feature_variables, list):
            return feature_variables
        return [feature_variables]

    @staticmethod
    def _eligible_rows(data, feature_variables, max_pop_threshold, threshold_column=None):
        """Boolean mask: every feature present and threshold column within the limit."""
        column = feature_variables[0] if threshold_column is None else threshold_column
        features_present = data[feature_variables].notnull().all(axis=1)
        return features_present & (data[column] <= max_pop_threshold)

    def train_model(self, data, target_variable, feature_variables, max_pop_threshold,
                    threshold_column=None):
        """Fit the KNN regressor on rows where target and features are known.

        Args:
            data: DataFrame holding the target and feature columns.
            target_variable: name of the column to be imputed.
            feature_variables: predictor column name, or list of names.
            max_pop_threshold: upper bound applied to the threshold column.
            threshold_column: column compared with max_pop_threshold;
                defaults to the first feature column.

        Returns:
            self, so calls can be chained.

        Raises:
            ValueError: if no row qualifies for training.
        """
        feature_variables = self._as_feature_list(feature_variables)
        train_rows = data[target_variable].notnull() & self._eligible_rows(
            data, feature_variables, max_pop_threshold, threshold_column
        )
        if not train_rows.any():
            raise ValueError(
                f"No rows available to train an imputer for {target_variable!r}"
            )

        self.knn_regressor.fit(
            data.loc[train_rows, feature_variables],
            data.loc[train_rows, target_variable],
        )
        return self

    def impute_missing_values(self, data, target_variable, feature_variables, max_pop_threshold,
                              threshold_column=None):
        """Write KNN predictions into the missing cells of target_variable.

        The frame is modified in place and also returned.  Rows with a
        missing feature, or outside the threshold, are left untouched.  When
        nothing is left to impute (e.g. the cell is re-run) the frame is
        returned unchanged instead of asking the regressor to predict on
        zero rows, which scikit-learn rejects.

        Args: same as train_model.

        Returns:
            The same DataFrame object that was passed in.
        """
        feature_variables = self._as_feature_list(feature_variables)
        missing_rows = data[target_variable].isnull() & self._eligible_rows(
            data, feature_variables, max_pop_threshold, threshold_column
        )
        if not missing_rows.any():
            return data

        # A boolean mask (rather than index labels) keeps the assignment
        # one-to-one with the predictions even if index labels repeat.
        data.loc[missing_rows, target_variable] = self.knn_regressor.predict(
            data.loc[missing_rows, feature_variables]
        )
        return data

    def evaluate_model(self, data, target_variable, feature_variables, max_pop_threshold,
                       threshold_column=None):
        """Estimate the imputation error as 5-fold cross-validated MAE.

        Call this BEFORE impute_missing_values on the same frame: afterwards
        the formerly missing cells hold the model's own predictions and
        would be scored as if they were observed values.

        Args: same as train_model.

        Returns:
            Mean absolute error averaged over the 5 folds (positive number).
        """
        feature_variables = self._as_feature_list(feature_variables)
        known_rows = data[target_variable].notnull() & self._eligible_rows(
            data, feature_variables, max_pop_threshold, threshold_column
        )
        X = data.loc[known_rows, feature_variables]
        y = data.loc[known_rows, target_variable]

        # scikit-learn scorers are "greater is better", hence the negated MAE.
        cv_scores = cross_val_score(
            self.knn_regressor, X, y, cv=5, scoring='neg_mean_absolute_error'
        )
        return -cv_scores.mean()

# Impute child_ec_county from support_ratio_county and report the CV error
if __name__ == '__main__':
    data = pd.DataFrame(sc)  # working view of the notebook-wide frame
    target_variable = 'child_ec_county'  # column to impute
    feature_variables = ['support_ratio_county']  # predictor(s) for the KNN model
    # NOTE(review): NumericImputer compares this limit with the first feature
    # column by default, not with a population column -- confirm intent.
    max_pop_threshold = 100000

    imputer = NumericImputer(n_neighbors=5)
    imputer.train_model(data, target_variable, feature_variables, max_pop_threshold)
    # Score BEFORE imputing: once the gaps are filled the frame contains the
    # model's own predictions, and cross-validating on them understates the MAE.
    mae_estimate = imputer.evaluate_model(data, target_variable, feature_variables, max_pop_threshold)
    imputed_data = imputer.impute_missing_values(data, target_variable, feature_variables, max_pop_threshold)
    # Write the result back explicitly; relying on `data` sharing memory with
    # `sc` stops working when pandas Copy-on-Write is enabled.
    sc[target_variable] = imputed_data[target_variable]

    print('MAE Estimate:', mae_estimate)


MAE Estimate: 0.1462141139530802


In [None]:
# child_high_ec_county imputation

from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import cross_val_score
import numpy as np
import pandas as pd

class NumericImputer:
    """Fill gaps in one numeric column using a k-nearest-neighbours regressor.

    The regressor is fitted on rows where the target and every feature are
    present and then predicts the target where it is missing.  All methods
    only consider rows whose threshold column is <= max_pop_threshold.

    NOTE(review): unless threshold_column is given, the threshold is compared
    with the FIRST feature column rather than with a population column,
    despite the parameter name -- confirm which was intended.
    """

    def __init__(self, n_neighbors=5):
        """Build the imputer.

        Args:
            n_neighbors: neighbour count passed to KNeighborsRegressor.
        """
        self.n_neighbors = n_neighbors
        self.knn_regressor = KNeighborsRegressor(n_neighbors=self.n_neighbors)

    @staticmethod
    def _as_feature_list(feature_variables):
        """Wrap a bare column name in a list; lists are returned unchanged."""
        if isinstance(feature_variables, list):
            return feature_variables
        return [feature_variables]

    @staticmethod
    def _eligible_rows(data, feature_variables, max_pop_threshold, threshold_column=None):
        """Boolean mask: every feature present and threshold column within the limit."""
        column = feature_variables[0] if threshold_column is None else threshold_column
        features_present = data[feature_variables].notnull().all(axis=1)
        return features_present & (data[column] <= max_pop_threshold)

    def train_model(self, data, target_variable, feature_variables, max_pop_threshold,
                    threshold_column=None):
        """Fit the KNN regressor on rows where target and features are known.

        Args:
            data: DataFrame holding the target and feature columns.
            target_variable: name of the column to be imputed.
            feature_variables: predictor column name, or list of names.
            max_pop_threshold: upper bound applied to the threshold column.
            threshold_column: column compared with max_pop_threshold;
                defaults to the first feature column.

        Returns:
            self, so calls can be chained.

        Raises:
            ValueError: if no row qualifies for training.
        """
        feature_variables = self._as_feature_list(feature_variables)
        train_rows = data[target_variable].notnull() & self._eligible_rows(
            data, feature_variables, max_pop_threshold, threshold_column
        )
        if not train_rows.any():
            raise ValueError(
                f"No rows available to train an imputer for {target_variable!r}"
            )

        self.knn_regressor.fit(
            data.loc[train_rows, feature_variables],
            data.loc[train_rows, target_variable],
        )
        return self

    def impute_missing_values(self, data, target_variable, feature_variables, max_pop_threshold,
                              threshold_column=None):
        """Write KNN predictions into the missing cells of target_variable.

        The frame is modified in place and also returned.  Rows with a
        missing feature, or outside the threshold, are left untouched.  When
        nothing is left to impute (e.g. the cell is re-run) the frame is
        returned unchanged instead of asking the regressor to predict on
        zero rows, which scikit-learn rejects.

        Args: same as train_model.

        Returns:
            The same DataFrame object that was passed in.
        """
        feature_variables = self._as_feature_list(feature_variables)
        missing_rows = data[target_variable].isnull() & self._eligible_rows(
            data, feature_variables, max_pop_threshold, threshold_column
        )
        if not missing_rows.any():
            return data

        # A boolean mask (rather than index labels) keeps the assignment
        # one-to-one with the predictions even if index labels repeat.
        data.loc[missing_rows, target_variable] = self.knn_regressor.predict(
            data.loc[missing_rows, feature_variables]
        )
        return data

    def evaluate_model(self, data, target_variable, feature_variables, max_pop_threshold,
                       threshold_column=None):
        """Estimate the imputation error as 5-fold cross-validated MAE.

        Call this BEFORE impute_missing_values on the same frame: afterwards
        the formerly missing cells hold the model's own predictions and
        would be scored as if they were observed values.

        Args: same as train_model.

        Returns:
            Mean absolute error averaged over the 5 folds (positive number).
        """
        feature_variables = self._as_feature_list(feature_variables)
        known_rows = data[target_variable].notnull() & self._eligible_rows(
            data, feature_variables, max_pop_threshold, threshold_column
        )
        X = data.loc[known_rows, feature_variables]
        y = data.loc[known_rows, target_variable]

        # scikit-learn scorers are "greater is better", hence the negated MAE.
        cv_scores = cross_val_score(
            self.knn_regressor, X, y, cv=5, scoring='neg_mean_absolute_error'
        )
        return -cv_scores.mean()

# Impute child_high_ec_county from support_ratio_county and report the CV error
if __name__ == '__main__':
    data = pd.DataFrame(sc)  # working view of the notebook-wide frame
    target_variable = 'child_high_ec_county'  # column to impute
    feature_variables = ['support_ratio_county']  # predictor(s) for the KNN model
    # NOTE(review): NumericImputer compares this limit with the first feature
    # column by default, not with a population column -- confirm intent.
    max_pop_threshold = 100000

    imputer = NumericImputer(n_neighbors=5)
    imputer.train_model(data, target_variable, feature_variables, max_pop_threshold)
    # Score BEFORE imputing: once the gaps are filled the frame contains the
    # model's own predictions, and cross-validating on them understates the MAE.
    mae_estimate = imputer.evaluate_model(data, target_variable, feature_variables, max_pop_threshold)
    imputed_data = imputer.impute_missing_values(data, target_variable, feature_variables, max_pop_threshold)
    # Write the result back explicitly; relying on `data` sharing memory with
    # `sc` stops working when pandas Copy-on-Write is enabled.
    sc[target_variable] = imputed_data[target_variable]

    print('MAE Estimate:', mae_estimate)


MAE Estimate: 0.15864721767449622


In [None]:
# child_exposure_county

from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import cross_val_score
import numpy as np
import pandas as pd

class NumericImputer:
    """Fill gaps in one numeric column using a k-nearest-neighbours regressor.

    The regressor is fitted on rows where the target and every feature are
    present and then predicts the target where it is missing.  All methods
    only consider rows whose threshold column is <= max_pop_threshold.

    NOTE(review): unless threshold_column is given, the threshold is compared
    with the FIRST feature column rather than with a population column,
    despite the parameter name -- confirm which was intended.
    """

    def __init__(self, n_neighbors=5):
        """Build the imputer.

        Args:
            n_neighbors: neighbour count passed to KNeighborsRegressor.
        """
        self.n_neighbors = n_neighbors
        self.knn_regressor = KNeighborsRegressor(n_neighbors=self.n_neighbors)

    @staticmethod
    def _as_feature_list(feature_variables):
        """Wrap a bare column name in a list; lists are returned unchanged."""
        if isinstance(feature_variables, list):
            return feature_variables
        return [feature_variables]

    @staticmethod
    def _eligible_rows(data, feature_variables, max_pop_threshold, threshold_column=None):
        """Boolean mask: every feature present and threshold column within the limit."""
        column = feature_variables[0] if threshold_column is None else threshold_column
        features_present = data[feature_variables].notnull().all(axis=1)
        return features_present & (data[column] <= max_pop_threshold)

    def train_model(self, data, target_variable, feature_variables, max_pop_threshold,
                    threshold_column=None):
        """Fit the KNN regressor on rows where target and features are known.

        Args:
            data: DataFrame holding the target and feature columns.
            target_variable: name of the column to be imputed.
            feature_variables: predictor column name, or list of names.
            max_pop_threshold: upper bound applied to the threshold column.
            threshold_column: column compared with max_pop_threshold;
                defaults to the first feature column.

        Returns:
            self, so calls can be chained.

        Raises:
            ValueError: if no row qualifies for training.
        """
        feature_variables = self._as_feature_list(feature_variables)
        train_rows = data[target_variable].notnull() & self._eligible_rows(
            data, feature_variables, max_pop_threshold, threshold_column
        )
        if not train_rows.any():
            raise ValueError(
                f"No rows available to train an imputer for {target_variable!r}"
            )

        self.knn_regressor.fit(
            data.loc[train_rows, feature_variables],
            data.loc[train_rows, target_variable],
        )
        return self

    def impute_missing_values(self, data, target_variable, feature_variables, max_pop_threshold,
                              threshold_column=None):
        """Write KNN predictions into the missing cells of target_variable.

        The frame is modified in place and also returned.  Rows with a
        missing feature, or outside the threshold, are left untouched.  When
        nothing is left to impute (e.g. the cell is re-run) the frame is
        returned unchanged instead of asking the regressor to predict on
        zero rows, which scikit-learn rejects.

        Args: same as train_model.

        Returns:
            The same DataFrame object that was passed in.
        """
        feature_variables = self._as_feature_list(feature_variables)
        missing_rows = data[target_variable].isnull() & self._eligible_rows(
            data, feature_variables, max_pop_threshold, threshold_column
        )
        if not missing_rows.any():
            return data

        # A boolean mask (rather than index labels) keeps the assignment
        # one-to-one with the predictions even if index labels repeat.
        data.loc[missing_rows, target_variable] = self.knn_regressor.predict(
            data.loc[missing_rows, feature_variables]
        )
        return data

    def evaluate_model(self, data, target_variable, feature_variables, max_pop_threshold,
                       threshold_column=None):
        """Estimate the imputation error as 5-fold cross-validated MAE.

        Call this BEFORE impute_missing_values on the same frame: afterwards
        the formerly missing cells hold the model's own predictions and
        would be scored as if they were observed values.

        Args: same as train_model.

        Returns:
            Mean absolute error averaged over the 5 folds (positive number).
        """
        feature_variables = self._as_feature_list(feature_variables)
        known_rows = data[target_variable].notnull() & self._eligible_rows(
            data, feature_variables, max_pop_threshold, threshold_column
        )
        X = data.loc[known_rows, feature_variables]
        y = data.loc[known_rows, target_variable]

        # scikit-learn scorers are "greater is better", hence the negated MAE.
        cv_scores = cross_val_score(
            self.knn_regressor, X, y, cv=5, scoring='neg_mean_absolute_error'
        )
        return -cv_scores.mean()

# Impute child_exposure_county from support_ratio_county and report the CV error
if __name__ == '__main__':
    data = pd.DataFrame(sc)  # working view of the notebook-wide frame
    target_variable = 'child_exposure_county'  # column to impute
    feature_variables = ['support_ratio_county']  # predictor(s) for the KNN model
    # NOTE(review): NumericImputer compares this limit with the first feature
    # column by default, not with a population column -- confirm intent.
    max_pop_threshold = 100000

    imputer = NumericImputer(n_neighbors=5)
    imputer.train_model(data, target_variable, feature_variables, max_pop_threshold)
    # Score BEFORE imputing: once the gaps are filled the frame contains the
    # model's own predictions, and cross-validating on them understates the MAE.
    mae_estimate = imputer.evaluate_model(data, target_variable, feature_variables, max_pop_threshold)
    imputed_data = imputer.impute_missing_values(data, target_variable, feature_variables, max_pop_threshold)
    # Write the result back explicitly; relying on `data` sharing memory with
    # `sc` stops working when pandas Copy-on-Write is enabled.
    sc[target_variable] = imputed_data[target_variable]

    print('MAE Estimate:', mae_estimate)


MAE Estimate: 0.1477273992004301


In [None]:
# child_high_exposure_county

from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import cross_val_score
import numpy as np
import pandas as pd

class NumericImputer:
    """Fill gaps in one numeric column using a k-nearest-neighbours regressor.

    The regressor is fitted on rows where the target and every feature are
    present and then predicts the target where it is missing.  All methods
    only consider rows whose threshold column is <= max_pop_threshold.

    NOTE(review): unless threshold_column is given, the threshold is compared
    with the FIRST feature column rather than with a population column,
    despite the parameter name -- confirm which was intended.
    """

    def __init__(self, n_neighbors=5):
        """Build the imputer.

        Args:
            n_neighbors: neighbour count passed to KNeighborsRegressor.
        """
        self.n_neighbors = n_neighbors
        self.knn_regressor = KNeighborsRegressor(n_neighbors=self.n_neighbors)

    @staticmethod
    def _as_feature_list(feature_variables):
        """Wrap a bare column name in a list; lists are returned unchanged."""
        if isinstance(feature_variables, list):
            return feature_variables
        return [feature_variables]

    @staticmethod
    def _eligible_rows(data, feature_variables, max_pop_threshold, threshold_column=None):
        """Boolean mask: every feature present and threshold column within the limit."""
        column = feature_variables[0] if threshold_column is None else threshold_column
        features_present = data[feature_variables].notnull().all(axis=1)
        return features_present & (data[column] <= max_pop_threshold)

    def train_model(self, data, target_variable, feature_variables, max_pop_threshold,
                    threshold_column=None):
        """Fit the KNN regressor on rows where target and features are known.

        Args:
            data: DataFrame holding the target and feature columns.
            target_variable: name of the column to be imputed.
            feature_variables: predictor column name, or list of names.
            max_pop_threshold: upper bound applied to the threshold column.
            threshold_column: column compared with max_pop_threshold;
                defaults to the first feature column.

        Returns:
            self, so calls can be chained.

        Raises:
            ValueError: if no row qualifies for training.
        """
        feature_variables = self._as_feature_list(feature_variables)
        train_rows = data[target_variable].notnull() & self._eligible_rows(
            data, feature_variables, max_pop_threshold, threshold_column
        )
        if not train_rows.any():
            raise ValueError(
                f"No rows available to train an imputer for {target_variable!r}"
            )

        self.knn_regressor.fit(
            data.loc[train_rows, feature_variables],
            data.loc[train_rows, target_variable],
        )
        return self

    def impute_missing_values(self, data, target_variable, feature_variables, max_pop_threshold,
                              threshold_column=None):
        """Write KNN predictions into the missing cells of target_variable.

        The frame is modified in place and also returned.  Rows with a
        missing feature, or outside the threshold, are left untouched.  When
        nothing is left to impute (e.g. the cell is re-run) the frame is
        returned unchanged instead of asking the regressor to predict on
        zero rows, which scikit-learn rejects.

        Args: same as train_model.

        Returns:
            The same DataFrame object that was passed in.
        """
        feature_variables = self._as_feature_list(feature_variables)
        missing_rows = data[target_variable].isnull() & self._eligible_rows(
            data, feature_variables, max_pop_threshold, threshold_column
        )
        if not missing_rows.any():
            return data

        # A boolean mask (rather than index labels) keeps the assignment
        # one-to-one with the predictions even if index labels repeat.
        data.loc[missing_rows, target_variable] = self.knn_regressor.predict(
            data.loc[missing_rows, feature_variables]
        )
        return data

    def evaluate_model(self, data, target_variable, feature_variables, max_pop_threshold,
                       threshold_column=None):
        """Estimate the imputation error as 5-fold cross-validated MAE.

        Call this BEFORE impute_missing_values on the same frame: afterwards
        the formerly missing cells hold the model's own predictions and
        would be scored as if they were observed values.

        Args: same as train_model.

        Returns:
            Mean absolute error averaged over the 5 folds (positive number).
        """
        feature_variables = self._as_feature_list(feature_variables)
        known_rows = data[target_variable].notnull() & self._eligible_rows(
            data, feature_variables, max_pop_threshold, threshold_column
        )
        X = data.loc[known_rows, feature_variables]
        y = data.loc[known_rows, target_variable]

        # scikit-learn scorers are "greater is better", hence the negated MAE.
        cv_scores = cross_val_score(
            self.knn_regressor, X, y, cv=5, scoring='neg_mean_absolute_error'
        )
        return -cv_scores.mean()

# Impute child_high_exposure_county from support_ratio_county and report the CV error
if __name__ == '__main__':
    data = pd.DataFrame(sc)  # working view of the notebook-wide frame
    target_variable = 'child_high_exposure_county'  # column to impute
    feature_variables = ['support_ratio_county']  # predictor(s) for the KNN model
    # NOTE(review): NumericImputer compares this limit with the first feature
    # column by default, not with a population column -- confirm intent.
    max_pop_threshold = 100000

    imputer = NumericImputer(n_neighbors=5)
    imputer.train_model(data, target_variable, feature_variables, max_pop_threshold)
    # Score BEFORE imputing: once the gaps are filled the frame contains the
    # model's own predictions, and cross-validating on them understates the MAE.
    mae_estimate = imputer.evaluate_model(data, target_variable, feature_variables, max_pop_threshold)
    imputed_data = imputer.impute_missing_values(data, target_variable, feature_variables, max_pop_threshold)
    # Write the result back explicitly; relying on `data` sharing memory with
    # `sc` stops working when pandas Copy-on-Write is enabled.
    sc[target_variable] = imputed_data[target_variable]

    print('MAE Estimate:', mae_estimate)


MAE Estimate: 0.15377747045777093


In [None]:
# child_bias_county imputation

from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import cross_val_score
import numpy as np
import pandas as pd

class NumericImputer:
    """Fill gaps in one numeric column using a k-nearest-neighbours regressor.

    The regressor is fitted on rows where the target and every feature are
    present and then predicts the target where it is missing.  All methods
    only consider rows whose threshold column is <= max_pop_threshold.

    NOTE(review): unless threshold_column is given, the threshold is compared
    with the FIRST feature column rather than with a population column,
    despite the parameter name -- confirm which was intended.
    """

    def __init__(self, n_neighbors=5):
        """Build the imputer.

        Args:
            n_neighbors: neighbour count passed to KNeighborsRegressor.
        """
        self.n_neighbors = n_neighbors
        self.knn_regressor = KNeighborsRegressor(n_neighbors=self.n_neighbors)

    @staticmethod
    def _as_feature_list(feature_variables):
        """Wrap a bare column name in a list; lists are returned unchanged."""
        if isinstance(feature_variables, list):
            return feature_variables
        return [feature_variables]

    @staticmethod
    def _eligible_rows(data, feature_variables, max_pop_threshold, threshold_column=None):
        """Boolean mask: every feature present and threshold column within the limit."""
        column = feature_variables[0] if threshold_column is None else threshold_column
        features_present = data[feature_variables].notnull().all(axis=1)
        return features_present & (data[column] <= max_pop_threshold)

    def train_model(self, data, target_variable, feature_variables, max_pop_threshold,
                    threshold_column=None):
        """Fit the KNN regressor on rows where target and features are known.

        Args:
            data: DataFrame holding the target and feature columns.
            target_variable: name of the column to be imputed.
            feature_variables: predictor column name, or list of names.
            max_pop_threshold: upper bound applied to the threshold column.
            threshold_column: column compared with max_pop_threshold;
                defaults to the first feature column.

        Returns:
            self, so calls can be chained.

        Raises:
            ValueError: if no row qualifies for training.
        """
        feature_variables = self._as_feature_list(feature_variables)
        train_rows = data[target_variable].notnull() & self._eligible_rows(
            data, feature_variables, max_pop_threshold, threshold_column
        )
        if not train_rows.any():
            raise ValueError(
                f"No rows available to train an imputer for {target_variable!r}"
            )

        self.knn_regressor.fit(
            data.loc[train_rows, feature_variables],
            data.loc[train_rows, target_variable],
        )
        return self

    def impute_missing_values(self, data, target_variable, feature_variables, max_pop_threshold,
                              threshold_column=None):
        """Write KNN predictions into the missing cells of target_variable.

        The frame is modified in place and also returned.  Rows with a
        missing feature, or outside the threshold, are left untouched.  When
        nothing is left to impute (e.g. the cell is re-run) the frame is
        returned unchanged instead of asking the regressor to predict on
        zero rows, which scikit-learn rejects.

        Args: same as train_model.

        Returns:
            The same DataFrame object that was passed in.
        """
        feature_variables = self._as_feature_list(feature_variables)
        missing_rows = data[target_variable].isnull() & self._eligible_rows(
            data, feature_variables, max_pop_threshold, threshold_column
        )
        if not missing_rows.any():
            return data

        # A boolean mask (rather than index labels) keeps the assignment
        # one-to-one with the predictions even if index labels repeat.
        data.loc[missing_rows, target_variable] = self.knn_regressor.predict(
            data.loc[missing_rows, feature_variables]
        )
        return data

    def evaluate_model(self, data, target_variable, feature_variables, max_pop_threshold,
                       threshold_column=None):
        """Estimate the imputation error as 5-fold cross-validated MAE.

        Call this BEFORE impute_missing_values on the same frame: afterwards
        the formerly missing cells hold the model's own predictions and
        would be scored as if they were observed values.

        Args: same as train_model.

        Returns:
            Mean absolute error averaged over the 5 folds (positive number).
        """
        feature_variables = self._as_feature_list(feature_variables)
        known_rows = data[target_variable].notnull() & self._eligible_rows(
            data, feature_variables, max_pop_threshold, threshold_column
        )
        X = data.loc[known_rows, feature_variables]
        y = data.loc[known_rows, target_variable]

        # scikit-learn scorers are "greater is better", hence the negated MAE.
        cv_scores = cross_val_score(
            self.knn_regressor, X, y, cv=5, scoring='neg_mean_absolute_error'
        )
        return -cv_scores.mean()

# Impute child_bias_county from support_ratio_county and report the CV error
if __name__ == '__main__':
    data = pd.DataFrame(sc)  # working view of the notebook-wide frame
    target_variable = 'child_bias_county'  # column to impute
    feature_variables = ['support_ratio_county']  # predictor(s) for the KNN model
    # NOTE(review): NumericImputer compares this limit with the first feature
    # column by default, not with a population column -- confirm intent.
    max_pop_threshold = 100000

    imputer = NumericImputer(n_neighbors=5)
    imputer.train_model(data, target_variable, feature_variables, max_pop_threshold)
    # Score BEFORE imputing: once the gaps are filled the frame contains the
    # model's own predictions, and cross-validating on them understates the MAE.
    mae_estimate = imputer.evaluate_model(data, target_variable, feature_variables, max_pop_threshold)
    imputed_data = imputer.impute_missing_values(data, target_variable, feature_variables, max_pop_threshold)
    # Write the result back explicitly; relying on `data` sharing memory with
    # `sc` stops working when pandas Copy-on-Write is enabled.
    sc[target_variable] = imputed_data[target_variable]

    print('MAE Estimate:', mae_estimate)


MAE Estimate: 0.024329562354170414


In [None]:
# child_high_bias_county imputation

from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import cross_val_score
import numpy as np
import pandas as pd

class NumericImputer:
    """Fill gaps in one numeric column using a k-nearest-neighbours regressor.

    The regressor is fitted on rows where the target and every feature are
    present and then predicts the target where it is missing.  All methods
    only consider rows whose threshold column is <= max_pop_threshold.

    NOTE(review): unless threshold_column is given, the threshold is compared
    with the FIRST feature column rather than with a population column,
    despite the parameter name -- confirm which was intended.
    """

    def __init__(self, n_neighbors=5):
        """Build the imputer.

        Args:
            n_neighbors: neighbour count passed to KNeighborsRegressor.
        """
        self.n_neighbors = n_neighbors
        self.knn_regressor = KNeighborsRegressor(n_neighbors=self.n_neighbors)

    @staticmethod
    def _as_feature_list(feature_variables):
        """Wrap a bare column name in a list; lists are returned unchanged."""
        if isinstance(feature_variables, list):
            return feature_variables
        return [feature_variables]

    @staticmethod
    def _eligible_rows(data, feature_variables, max_pop_threshold, threshold_column=None):
        """Boolean mask: every feature present and threshold column within the limit."""
        column = feature_variables[0] if threshold_column is None else threshold_column
        features_present = data[feature_variables].notnull().all(axis=1)
        return features_present & (data[column] <= max_pop_threshold)

    def train_model(self, data, target_variable, feature_variables, max_pop_threshold,
                    threshold_column=None):
        """Fit the KNN regressor on rows where target and features are known.

        Args:
            data: DataFrame holding the target and feature columns.
            target_variable: name of the column to be imputed.
            feature_variables: predictor column name, or list of names.
            max_pop_threshold: upper bound applied to the threshold column.
            threshold_column: column compared with max_pop_threshold;
                defaults to the first feature column.

        Returns:
            self, so calls can be chained.

        Raises:
            ValueError: if no row qualifies for training.
        """
        feature_variables = self._as_feature_list(feature_variables)
        train_rows = data[target_variable].notnull() & self._eligible_rows(
            data, feature_variables, max_pop_threshold, threshold_column
        )
        if not train_rows.any():
            raise ValueError(
                f"No rows available to train an imputer for {target_variable!r}"
            )

        self.knn_regressor.fit(
            data.loc[train_rows, feature_variables],
            data.loc[train_rows, target_variable],
        )
        return self

    def impute_missing_values(self, data, target_variable, feature_variables, max_pop_threshold,
                              threshold_column=None):
        """Write KNN predictions into the missing cells of target_variable.

        The frame is modified in place and also returned.  Rows with a
        missing feature, or outside the threshold, are left untouched.  When
        nothing is left to impute (e.g. the cell is re-run) the frame is
        returned unchanged instead of asking the regressor to predict on
        zero rows, which scikit-learn rejects.

        Args: same as train_model.

        Returns:
            The same DataFrame object that was passed in.
        """
        feature_variables = self._as_feature_list(feature_variables)
        missing_rows = data[target_variable].isnull() & self._eligible_rows(
            data, feature_variables, max_pop_threshold, threshold_column
        )
        if not missing_rows.any():
            return data

        # A boolean mask (rather than index labels) keeps the assignment
        # one-to-one with the predictions even if index labels repeat.
        data.loc[missing_rows, target_variable] = self.knn_regressor.predict(
            data.loc[missing_rows, feature_variables]
        )
        return data

    def evaluate_model(self, data, target_variable, feature_variables, max_pop_threshold,
                       threshold_column=None):
        """Estimate the imputation error as 5-fold cross-validated MAE.

        Call this BEFORE impute_missing_values on the same frame: afterwards
        the formerly missing cells hold the model's own predictions and
        would be scored as if they were observed values.

        Args: same as train_model.

        Returns:
            Mean absolute error averaged over the 5 folds (positive number).
        """
        feature_variables = self._as_feature_list(feature_variables)
        known_rows = data[target_variable].notnull() & self._eligible_rows(
            data, feature_variables, max_pop_threshold, threshold_column
        )
        X = data.loc[known_rows, feature_variables]
        y = data.loc[known_rows, target_variable]

        # scikit-learn scorers are "greater is better", hence the negated MAE.
        cv_scores = cross_val_score(
            self.knn_regressor, X, y, cv=5, scoring='neg_mean_absolute_error'
        )
        return -cv_scores.mean()

# Impute child_high_bias_county from support_ratio_county and report the CV error
if __name__ == '__main__':
    data = pd.DataFrame(sc)  # working view of the notebook-wide frame
    target_variable = 'child_high_bias_county'  # column to impute
    feature_variables = ['support_ratio_county']  # predictor(s) for the KNN model
    # NOTE(review): NumericImputer compares this limit with the first feature
    # column by default, not with a population column -- confirm intent.
    max_pop_threshold = 100000

    imputer = NumericImputer(n_neighbors=5)
    imputer.train_model(data, target_variable, feature_variables, max_pop_threshold)
    # Score BEFORE imputing: once the gaps are filled the frame contains the
    # model's own predictions, and cross-validating on them understates the MAE.
    mae_estimate = imputer.evaluate_model(data, target_variable, feature_variables, max_pop_threshold)
    imputed_data = imputer.impute_missing_values(data, target_variable, feature_variables, max_pop_threshold)
    # Write the result back explicitly; relying on `data` sharing memory with
    # `sc` stops working when pandas Copy-on-Write is enabled.
    sc[target_variable] = imputed_data[target_variable]

    print('MAE Estimate:', mae_estimate)


MAE Estimate: 0.03581092584877005


# Imputation of ec metrics

In [None]:
# ec_county imputation

from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import cross_val_score
import numpy as np
import pandas as pd

class NumericImputer:
    """Fill gaps in one numeric column using a k-nearest-neighbours regressor.

    The regressor is fitted on rows where the target and every feature are
    present and then predicts the target where it is missing.  All methods
    only consider rows whose threshold column is <= max_pop_threshold.

    NOTE(review): unless threshold_column is given, the threshold is compared
    with the FIRST feature column rather than with a population column,
    despite the parameter name -- confirm which was intended.
    """

    def __init__(self, n_neighbors=5):
        """Build the imputer.

        Args:
            n_neighbors: neighbour count passed to KNeighborsRegressor.
        """
        self.n_neighbors = n_neighbors
        self.knn_regressor = KNeighborsRegressor(n_neighbors=self.n_neighbors)

    @staticmethod
    def _as_feature_list(feature_variables):
        """Wrap a bare column name in a list; lists are returned unchanged."""
        if isinstance(feature_variables, list):
            return feature_variables
        return [feature_variables]

    @staticmethod
    def _eligible_rows(data, feature_variables, max_pop_threshold, threshold_column=None):
        """Boolean mask: every feature present and threshold column within the limit."""
        column = feature_variables[0] if threshold_column is None else threshold_column
        features_present = data[feature_variables].notnull().all(axis=1)
        return features_present & (data[column] <= max_pop_threshold)

    def train_model(self, data, target_variable, feature_variables, max_pop_threshold,
                    threshold_column=None):
        """Fit the KNN regressor on rows where target and features are known.

        Args:
            data: DataFrame holding the target and feature columns.
            target_variable: name of the column to be imputed.
            feature_variables: predictor column name, or list of names.
            max_pop_threshold: upper bound applied to the threshold column.
            threshold_column: column compared with max_pop_threshold;
                defaults to the first feature column.

        Returns:
            self, so calls can be chained.

        Raises:
            ValueError: if no row qualifies for training.
        """
        feature_variables = self._as_feature_list(feature_variables)
        train_rows = data[target_variable].notnull() & self._eligible_rows(
            data, feature_variables, max_pop_threshold, threshold_column
        )
        if not train_rows.any():
            raise ValueError(
                f"No rows available to train an imputer for {target_variable!r}"
            )

        self.knn_regressor.fit(
            data.loc[train_rows, feature_variables],
            data.loc[train_rows, target_variable],
        )
        return self

    def impute_missing_values(self, data, target_variable, feature_variables, max_pop_threshold,
                              threshold_column=None):
        """Write KNN predictions into the missing cells of target_variable.

        The frame is modified in place and also returned.  Rows with a
        missing feature, or outside the threshold, are left untouched.  When
        nothing is left to impute (e.g. the cell is re-run) the frame is
        returned unchanged instead of asking the regressor to predict on
        zero rows, which scikit-learn rejects.

        Args: same as train_model.

        Returns:
            The same DataFrame object that was passed in.
        """
        feature_variables = self._as_feature_list(feature_variables)
        missing_rows = data[target_variable].isnull() & self._eligible_rows(
            data, feature_variables, max_pop_threshold, threshold_column
        )
        if not missing_rows.any():
            return data

        # A boolean mask (rather than index labels) keeps the assignment
        # one-to-one with the predictions even if index labels repeat.
        data.loc[missing_rows, target_variable] = self.knn_regressor.predict(
            data.loc[missing_rows, feature_variables]
        )
        return data

    def evaluate_model(self, data, target_variable, feature_variables, max_pop_threshold,
                       threshold_column=None):
        """Estimate the imputation error as 5-fold cross-validated MAE.

        Call this BEFORE impute_missing_values on the same frame: afterwards
        the formerly missing cells hold the model's own predictions and
        would be scored as if they were observed values.

        Args: same as train_model.

        Returns:
            Mean absolute error averaged over the 5 folds (positive number).
        """
        feature_variables = self._as_feature_list(feature_variables)
        known_rows = data[target_variable].notnull() & self._eligible_rows(
            data, feature_variables, max_pop_threshold, threshold_column
        )
        X = data.loc[known_rows, feature_variables]
        y = data.loc[known_rows, target_variable]

        # scikit-learn scorers are "greater is better", hence the negated MAE.
        cv_scores = cross_val_score(
            self.knn_regressor, X, y, cv=5, scoring='neg_mean_absolute_error'
        )
        return -cv_scores.mean()

# Impute ec_county from volunteering_rate_county and report the CV error
if __name__ == '__main__':
    data = pd.DataFrame(sc)  # working view of the notebook-wide frame
    target_variable = 'ec_county'  # column to impute
    feature_variables = ['volunteering_rate_county']  # predictor(s) for the KNN model
    # NOTE(review): NumericImputer compares this limit with the first feature
    # column by default, not with a population column -- confirm intent.
    max_pop_threshold = 100000

    imputer = NumericImputer(n_neighbors=5)
    imputer.train_model(data, target_variable, feature_variables, max_pop_threshold)
    # Score BEFORE imputing: once the gaps are filled the frame contains the
    # model's own predictions, and cross-validating on them understates the MAE.
    mae_estimate = imputer.evaluate_model(data, target_variable, feature_variables, max_pop_threshold)
    imputed_data = imputer.impute_missing_values(data, target_variable, feature_variables, max_pop_threshold)
    # Write the result back explicitly; relying on `data` sharing memory with
    # `sc` stops working when pandas Copy-on-Write is enabled.
    sc[target_variable] = imputed_data[target_variable]

    print('MAE Estimate:', mae_estimate)


MAE Estimate: 0.12899990750009158


In [None]:
# ec_grp_mem_county imputation

from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import cross_val_score
import numpy as np
import pandas as pd

class NumericImputer:
    """Fill gaps in one numeric column using a k-nearest-neighbours regressor.

    The regressor is fitted on rows where the target and every feature are
    present and then predicts the target where it is missing.  All methods
    only consider rows whose threshold column is <= max_pop_threshold.

    NOTE(review): unless threshold_column is given, the threshold is compared
    with the FIRST feature column rather than with a population column,
    despite the parameter name -- confirm which was intended.
    """

    def __init__(self, n_neighbors=5):
        """Build the imputer.

        Args:
            n_neighbors: neighbour count passed to KNeighborsRegressor.
        """
        self.n_neighbors = n_neighbors
        self.knn_regressor = KNeighborsRegressor(n_neighbors=self.n_neighbors)

    @staticmethod
    def _as_feature_list(feature_variables):
        """Wrap a bare column name in a list; lists are returned unchanged."""
        if isinstance(feature_variables, list):
            return feature_variables
        return [feature_variables]

    @staticmethod
    def _eligible_rows(data, feature_variables, max_pop_threshold, threshold_column=None):
        """Boolean mask: every feature present and threshold column within the limit."""
        column = feature_variables[0] if threshold_column is None else threshold_column
        features_present = data[feature_variables].notnull().all(axis=1)
        return features_present & (data[column] <= max_pop_threshold)

    def train_model(self, data, target_variable, feature_variables, max_pop_threshold,
                    threshold_column=None):
        """Fit the KNN regressor on rows where target and features are known.

        Args:
            data: DataFrame holding the target and feature columns.
            target_variable: name of the column to be imputed.
            feature_variables: predictor column name, or list of names.
            max_pop_threshold: upper bound applied to the threshold column.
            threshold_column: column compared with max_pop_threshold;
                defaults to the first feature column.

        Returns:
            self, so calls can be chained.

        Raises:
            ValueError: if no row qualifies for training.
        """
        feature_variables = self._as_feature_list(feature_variables)
        train_rows = data[target_variable].notnull() & self._eligible_rows(
            data, feature_variables, max_pop_threshold, threshold_column
        )
        if not train_rows.any():
            raise ValueError(
                f"No rows available to train an imputer for {target_variable!r}"
            )

        self.knn_regressor.fit(
            data.loc[train_rows, feature_variables],
            data.loc[train_rows, target_variable],
        )
        return self

    def impute_missing_values(self, data, target_variable, feature_variables, max_pop_threshold,
                              threshold_column=None):
        """Write KNN predictions into the missing cells of target_variable.

        The frame is modified in place and also returned.  Rows with a
        missing feature, or outside the threshold, are left untouched.  When
        nothing is left to impute (e.g. the cell is re-run) the frame is
        returned unchanged instead of asking the regressor to predict on
        zero rows, which scikit-learn rejects.

        Args: same as train_model.

        Returns:
            The same DataFrame object that was passed in.
        """
        feature_variables = self._as_feature_list(feature_variables)
        missing_rows = data[target_variable].isnull() & self._eligible_rows(
            data, feature_variables, max_pop_threshold, threshold_column
        )
        if not missing_rows.any():
            return data

        # A boolean mask (rather than index labels) keeps the assignment
        # one-to-one with the predictions even if index labels repeat.
        data.loc[missing_rows, target_variable] = self.knn_regressor.predict(
            data.loc[missing_rows, feature_variables]
        )
        return data

    def evaluate_model(self, data, target_variable, feature_variables, max_pop_threshold,
                       threshold_column=None):
        """Estimate the imputation error as 5-fold cross-validated MAE.

        Call this BEFORE impute_missing_values on the same frame: afterwards
        the formerly missing cells hold the model's own predictions and
        would be scored as if they were observed values.

        Args: same as train_model.

        Returns:
            Mean absolute error averaged over the 5 folds (positive number).
        """
        feature_variables = self._as_feature_list(feature_variables)
        known_rows = data[target_variable].notnull() & self._eligible_rows(
            data, feature_variables, max_pop_threshold, threshold_column
        )
        X = data.loc[known_rows, feature_variables]
        y = data.loc[known_rows, target_variable]

        # scikit-learn scorers are "greater is better", hence the negated MAE.
        cv_scores = cross_val_score(
            self.knn_regressor, X, y, cv=5, scoring='neg_mean_absolute_error'
        )
        return -cv_scores.mean()

# Impute ec_grp_mem_county from volunteering_rate_county and report the CV error
if __name__ == '__main__':
    data = pd.DataFrame(sc)  # working view of the notebook-wide frame
    target_variable = 'ec_grp_mem_county'  # column to impute
    feature_variables = ['volunteering_rate_county']  # predictor(s) for the KNN model
    # NOTE(review): NumericImputer compares this limit with the first feature
    # column by default, not with a population column -- confirm intent.
    max_pop_threshold = 100000

    imputer = NumericImputer(n_neighbors=5)
    imputer.train_model(data, target_variable, feature_variables, max_pop_threshold)
    # Score BEFORE imputing: once the gaps are filled the frame contains the
    # model's own predictions, and cross-validating on them understates the MAE.
    mae_estimate = imputer.evaluate_model(data, target_variable, feature_variables, max_pop_threshold)
    imputed_data = imputer.impute_missing_values(data, target_variable, feature_variables, max_pop_threshold)
    # Write the result back explicitly; relying on `data` sharing memory with
    # `sc` stops working when pandas Copy-on-Write is enabled.
    sc[target_variable] = imputed_data[target_variable]

    print('MAE Estimate:', mae_estimate)


MAE Estimate: 0.1612528275648899


In [None]:
# ec_high_county imputation

from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import cross_val_score
import numpy as np
import pandas as pd

class NumericImputer:
    """Impute one numeric column with a K-nearest-neighbours regressor.

    The regressor is fit on rows where the target is observed and then used to
    predict the target where it is missing. In every method, rows take part
    only if the FIRST feature column is <= ``max_pop_threshold``.

    NOTE(review): despite its name, ``max_pop_threshold`` is compared against
    ``feature_variables[0]`` rather than a population column - confirm intent.
    """

    def __init__(self, n_neighbors=5):
        """Create the underlying KNeighborsRegressor with ``n_neighbors`` neighbours."""
        self.n_neighbors = n_neighbors
        self.knn_regressor = KNeighborsRegressor(n_neighbors=self.n_neighbors)

    @staticmethod
    def _as_feature_list(feature_variables):
        """Return ``feature_variables`` as a list, wrapping a single column name."""
        if isinstance(feature_variables, list):
            return feature_variables
        return [feature_variables]

    def train_model(self, data, target_variable, feature_variables, max_pop_threshold):
        """Fit the regressor on rows with a complete target and complete features.

        Parameters
        ----------
        data : pandas.DataFrame
            Source frame; not modified.
        target_variable : str
            Column to be predicted.
        feature_variables : list or str
            Predictor column name(s).
        max_pop_threshold : number
            Upper bound applied to the first feature column.

        Returns
        -------
        NumericImputer
            ``self``, so calls can be chained.

        Raises
        ------
        ValueError
            If no row satisfies the filters.
        """
        features = self._as_feature_list(feature_variables)
        columns = features + [target_variable]

        # dropna already removes rows with a missing target, so only the
        # threshold filter is still needed afterwards.
        complete = data.dropna(subset=columns)
        training_data = complete.loc[complete[features[0]] <= max_pop_threshold, columns]
        if training_data.empty:
            raise ValueError(
                "No complete rows available to train on for target "
                f"{target_variable!r} with features {features!r}."
            )

        self.knn_regressor.fit(training_data[features], training_data[target_variable])
        return self

    def impute_missing_values(self, data, target_variable, feature_variables, max_pop_threshold):
        """Fill missing target values in ``data`` with the regressor's predictions.

        ``data`` is modified in place and also returned. Rows above the
        threshold, or with any missing feature, are left untouched. When no
        row needs imputing the frame is returned unchanged, so re-running the
        cell is a no-op rather than an error.
        """
        features = self._as_feature_list(feature_variables)

        needs_imputation = (
            data[target_variable].isnull()
            & (data[features[0]] <= max_pop_threshold)
            & data[features].notnull().all(axis=1)
        )
        missing_data_subset = data.loc[needs_imputation, features]

        # Predicting on an empty frame raises in scikit-learn; nothing to do.
        if missing_data_subset.empty:
            return data

        predicted_values = self.knn_regressor.predict(missing_data_subset)

        # Write back through the boolean mask (positional) rather than index
        # labels, so a non-unique index cannot misplace predictions.
        data.loc[needs_imputation, target_variable] = predicted_values
        return data

    def evaluate_model(self, data, target_variable, feature_variables, max_pop_threshold):
        """Return the mean 5-fold cross-validated MAE on rows with an observed target.

        Call this before ``impute_missing_values`` on the same frame; after
        imputation the filled-in predictions would be scored as observed values.
        """
        features = self._as_feature_list(feature_variables)

        observed = (
            data[target_variable].notnull()
            & (data[features[0]] <= max_pop_threshold)
            & data[features].notnull().all(axis=1)
        )
        X = data.loc[observed, features]
        y = data.loc[observed, target_variable]

        # The built-in scorer yields negative MAE, so flip the sign on return.
        cv_scores = cross_val_score(
            self.knn_regressor, X, y, cv=5, scoring='neg_mean_absolute_error'
        )
        return -cv_scores.mean()

# Impute ec_high_county from civic_organizations_county
if __name__ == '__main__':
    target_variable = 'ec_high_county'  # The variable you wish to impute
    feature_variables = ['civic_organizations_county']
    # NOTE(review): NumericImputer compares this bound against feature_variables[0],
    # not a population column, despite the name - confirm intent.
    max_pop_threshold = 100000  # Example threshold value

    imputer = NumericImputer(n_neighbors=5)

    # Cross-validate BEFORE imputing: impute_missing_values writes its
    # predictions into the frame in place, so scoring afterwards would count
    # the model's own predictions as observed ground truth.
    mae_estimate = imputer.evaluate_model(sc, target_variable, feature_variables, max_pop_threshold)

    imputer.train_model(sc, target_variable, feature_variables, max_pop_threshold)
    # Rebind sc explicitly instead of relying on pd.DataFrame(sc) sharing
    # memory with sc, which is not guaranteed (e.g. under pandas Copy-on-Write).
    sc = imputer.impute_missing_values(sc, target_variable, feature_variables, max_pop_threshold)
    data = imputed_data = sc  # names kept from the original cell

    print('MAE Estimate:', mae_estimate)


MAE Estimate: 0.14676499566871226


In [None]:
# exposure_grp_mem_county imputation

from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import cross_val_score
import numpy as np
import pandas as pd

class NumericImputer:
    """Impute one numeric column with a K-nearest-neighbours regressor.

    The regressor is fit on rows where the target is observed and then used to
    predict the target where it is missing. In every method, rows take part
    only if the FIRST feature column is <= ``max_pop_threshold``.

    NOTE(review): despite its name, ``max_pop_threshold`` is compared against
    ``feature_variables[0]`` rather than a population column - confirm intent.
    """

    def __init__(self, n_neighbors=5):
        """Create the underlying KNeighborsRegressor with ``n_neighbors`` neighbours."""
        self.n_neighbors = n_neighbors
        self.knn_regressor = KNeighborsRegressor(n_neighbors=self.n_neighbors)

    @staticmethod
    def _as_feature_list(feature_variables):
        """Return ``feature_variables`` as a list, wrapping a single column name."""
        if isinstance(feature_variables, list):
            return feature_variables
        return [feature_variables]

    def train_model(self, data, target_variable, feature_variables, max_pop_threshold):
        """Fit the regressor on rows with a complete target and complete features.

        Parameters
        ----------
        data : pandas.DataFrame
            Source frame; not modified.
        target_variable : str
            Column to be predicted.
        feature_variables : list or str
            Predictor column name(s).
        max_pop_threshold : number
            Upper bound applied to the first feature column.

        Returns
        -------
        NumericImputer
            ``self``, so calls can be chained.

        Raises
        ------
        ValueError
            If no row satisfies the filters.
        """
        features = self._as_feature_list(feature_variables)
        columns = features + [target_variable]

        # dropna already removes rows with a missing target, so only the
        # threshold filter is still needed afterwards.
        complete = data.dropna(subset=columns)
        training_data = complete.loc[complete[features[0]] <= max_pop_threshold, columns]
        if training_data.empty:
            raise ValueError(
                "No complete rows available to train on for target "
                f"{target_variable!r} with features {features!r}."
            )

        self.knn_regressor.fit(training_data[features], training_data[target_variable])
        return self

    def impute_missing_values(self, data, target_variable, feature_variables, max_pop_threshold):
        """Fill missing target values in ``data`` with the regressor's predictions.

        ``data`` is modified in place and also returned. Rows above the
        threshold, or with any missing feature, are left untouched. When no
        row needs imputing the frame is returned unchanged, so re-running the
        cell is a no-op rather than an error.
        """
        features = self._as_feature_list(feature_variables)

        needs_imputation = (
            data[target_variable].isnull()
            & (data[features[0]] <= max_pop_threshold)
            & data[features].notnull().all(axis=1)
        )
        missing_data_subset = data.loc[needs_imputation, features]

        # Predicting on an empty frame raises in scikit-learn; nothing to do.
        if missing_data_subset.empty:
            return data

        predicted_values = self.knn_regressor.predict(missing_data_subset)

        # Write back through the boolean mask (positional) rather than index
        # labels, so a non-unique index cannot misplace predictions.
        data.loc[needs_imputation, target_variable] = predicted_values
        return data

    def evaluate_model(self, data, target_variable, feature_variables, max_pop_threshold):
        """Return the mean 5-fold cross-validated MAE on rows with an observed target.

        Call this before ``impute_missing_values`` on the same frame; after
        imputation the filled-in predictions would be scored as observed values.
        """
        features = self._as_feature_list(feature_variables)

        observed = (
            data[target_variable].notnull()
            & (data[features[0]] <= max_pop_threshold)
            & data[features].notnull().all(axis=1)
        )
        X = data.loc[observed, features]
        y = data.loc[observed, target_variable]

        # The built-in scorer yields negative MAE, so flip the sign on return.
        cv_scores = cross_val_score(
            self.knn_regressor, X, y, cv=5, scoring='neg_mean_absolute_error'
        )
        return -cv_scores.mean()

# Impute exposure_grp_mem_county from civic_organizations_county
if __name__ == '__main__':
    target_variable = 'exposure_grp_mem_county'  # The variable you wish to impute
    feature_variables = ['civic_organizations_county']
    # NOTE(review): NumericImputer compares this bound against feature_variables[0],
    # not a population column, despite the name - confirm intent.
    max_pop_threshold = 100000  # Example threshold value

    imputer = NumericImputer(n_neighbors=5)

    # Cross-validate BEFORE imputing: impute_missing_values writes its
    # predictions into the frame in place, so scoring afterwards would count
    # the model's own predictions as observed ground truth.
    mae_estimate = imputer.evaluate_model(sc, target_variable, feature_variables, max_pop_threshold)

    imputer.train_model(sc, target_variable, feature_variables, max_pop_threshold)
    # Rebind sc explicitly instead of relying on pd.DataFrame(sc) sharing
    # memory with sc, which is not guaranteed (e.g. under pandas Copy-on-Write).
    sc = imputer.impute_missing_values(sc, target_variable, feature_variables, max_pop_threshold)
    data = imputed_data = sc  # names kept from the original cell

    print('MAE Estimate:', mae_estimate)


MAE Estimate: 0.17806420308847212


In [None]:
# exposure_grp_mem_high_county imputation

from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import cross_val_score
import numpy as np
import pandas as pd

class NumericImputer:
    """Impute one numeric column with a K-nearest-neighbours regressor.

    The regressor is fit on rows where the target is observed and then used to
    predict the target where it is missing. In every method, rows take part
    only if the FIRST feature column is <= ``max_pop_threshold``.

    NOTE(review): despite its name, ``max_pop_threshold`` is compared against
    ``feature_variables[0]`` rather than a population column - confirm intent.
    """

    def __init__(self, n_neighbors=5):
        """Create the underlying KNeighborsRegressor with ``n_neighbors`` neighbours."""
        self.n_neighbors = n_neighbors
        self.knn_regressor = KNeighborsRegressor(n_neighbors=self.n_neighbors)

    @staticmethod
    def _as_feature_list(feature_variables):
        """Return ``feature_variables`` as a list, wrapping a single column name."""
        if isinstance(feature_variables, list):
            return feature_variables
        return [feature_variables]

    def train_model(self, data, target_variable, feature_variables, max_pop_threshold):
        """Fit the regressor on rows with a complete target and complete features.

        Parameters
        ----------
        data : pandas.DataFrame
            Source frame; not modified.
        target_variable : str
            Column to be predicted.
        feature_variables : list or str
            Predictor column name(s).
        max_pop_threshold : number
            Upper bound applied to the first feature column.

        Returns
        -------
        NumericImputer
            ``self``, so calls can be chained.

        Raises
        ------
        ValueError
            If no row satisfies the filters.
        """
        features = self._as_feature_list(feature_variables)
        columns = features + [target_variable]

        # dropna already removes rows with a missing target, so only the
        # threshold filter is still needed afterwards.
        complete = data.dropna(subset=columns)
        training_data = complete.loc[complete[features[0]] <= max_pop_threshold, columns]
        if training_data.empty:
            raise ValueError(
                "No complete rows available to train on for target "
                f"{target_variable!r} with features {features!r}."
            )

        self.knn_regressor.fit(training_data[features], training_data[target_variable])
        return self

    def impute_missing_values(self, data, target_variable, feature_variables, max_pop_threshold):
        """Fill missing target values in ``data`` with the regressor's predictions.

        ``data`` is modified in place and also returned. Rows above the
        threshold, or with any missing feature, are left untouched. When no
        row needs imputing the frame is returned unchanged, so re-running the
        cell is a no-op rather than an error.
        """
        features = self._as_feature_list(feature_variables)

        needs_imputation = (
            data[target_variable].isnull()
            & (data[features[0]] <= max_pop_threshold)
            & data[features].notnull().all(axis=1)
        )
        missing_data_subset = data.loc[needs_imputation, features]

        # Predicting on an empty frame raises in scikit-learn; nothing to do.
        if missing_data_subset.empty:
            return data

        predicted_values = self.knn_regressor.predict(missing_data_subset)

        # Write back through the boolean mask (positional) rather than index
        # labels, so a non-unique index cannot misplace predictions.
        data.loc[needs_imputation, target_variable] = predicted_values
        return data

    def evaluate_model(self, data, target_variable, feature_variables, max_pop_threshold):
        """Return the mean 5-fold cross-validated MAE on rows with an observed target.

        Call this before ``impute_missing_values`` on the same frame; after
        imputation the filled-in predictions would be scored as observed values.
        """
        features = self._as_feature_list(feature_variables)

        observed = (
            data[target_variable].notnull()
            & (data[features[0]] <= max_pop_threshold)
            & data[features].notnull().all(axis=1)
        )
        X = data.loc[observed, features]
        y = data.loc[observed, target_variable]

        # The built-in scorer yields negative MAE, so flip the sign on return.
        cv_scores = cross_val_score(
            self.knn_regressor, X, y, cv=5, scoring='neg_mean_absolute_error'
        )
        return -cv_scores.mean()

# Impute exposure_grp_mem_high_county from support_ratio_county
if __name__ == '__main__':
    target_variable = 'exposure_grp_mem_high_county'  # The variable you wish to impute
    feature_variables = ['support_ratio_county']
    # NOTE(review): NumericImputer compares this bound against feature_variables[0],
    # not a population column, despite the name - confirm intent.
    max_pop_threshold = 100000  # Example threshold value

    imputer = NumericImputer(n_neighbors=5)

    # Cross-validate BEFORE imputing: impute_missing_values writes its
    # predictions into the frame in place, so scoring afterwards would count
    # the model's own predictions as observed ground truth.
    mae_estimate = imputer.evaluate_model(sc, target_variable, feature_variables, max_pop_threshold)

    imputer.train_model(sc, target_variable, feature_variables, max_pop_threshold)
    # Rebind sc explicitly instead of relying on pd.DataFrame(sc) sharing
    # memory with sc, which is not guaranteed (e.g. under pandas Copy-on-Write).
    sc = imputer.impute_missing_values(sc, target_variable, feature_variables, max_pop_threshold)
    data = imputed_data = sc  # names kept from the original cell

    print('MAE Estimate:', mae_estimate)


MAE Estimate: 0.15402975096665858


In [None]:
sc.isna().sum()

num_below_p50                    2
pop2018                          0
ec_county                        0
child_ec_county                  0
ec_grp_mem_county                0
ec_high_county                   0
child_high_ec_county             0
ec_grp_mem_high_county          77
exposure_grp_mem_county          0
exposure_grp_mem_high_county     0
child_exposure_county            0
child_high_exposure_county       0
bias_grp_mem_county             77
bias_grp_mem_high_county        77
child_bias_county                0
child_high_bias_county           0
clustering_county                0
support_ratio_county             0
volunteering_rate_county         0
civic_organizations_county       0
dtype: int64

# Model Development

In [None]:
# Separate the response (child_ec_county) from every other column of sc, then
# hold out 30% of the rows as a test set with a fixed seed.
target = sc['child_ec_county']
predictors = sc.drop(columns=['child_ec_county'])
(
    predictors_train,
    predictors_test,
    target_train,
    target_test,
) = train_test_split(predictors, target, test_size=0.3, random_state=0)
print(predictors_train.shape, predictors_test.shape, target_train.shape, target_test.shape)

(2162, 19) (927, 19) (2162,) (927,)


In [None]:
 # Ensure predictors_train does not contain missing or infinite values
# Training rows are kept only when the target and every predictor are finite
# (np.isfinite is False for NaN and +/-inf).
valid_indices_train = np.isfinite(target_train) & np.isfinite(predictors_train).all(axis=1)
predictors_train_clean = predictors_train.loc[valid_indices_train]
target_train_clean = target_train.loc[valid_indices_train]

# Ordinary least squares fit on the filtered training rows
model1 = linear_model.LinearRegression().fit(predictors_train_clean, target_train_clean)

# Apply the predictor-side filter to the test split; the test target is
# subset with the same mask so both stay row-aligned.
valid_indices_test = np.isfinite(predictors_test).all(axis=1)
predictors_test_clean = predictors_test.loc[valid_indices_test]
target_test_clean = target_test.loc[valid_indices_test]

In [None]:
# Rebuild the finite-row mask for the training split (target and all predictors)
valid_indices = np.isfinite(target_train) & np.isfinite(predictors_train).all(axis=1)
predictors_train_clean = predictors_train.loc[valid_indices]
target_train_clean = target_train.loc[valid_indices]

# Fit a statsmodels OLS on the same rows, with an explicit intercept column
# added by add_constant, and print its summary table.
y = target_train_clean
X2 = sm.add_constant(predictors_train_clean)
est = sm.OLS(y, X2)
est2 = est.fit()
print(est2.summary())

                            OLS Regression Results                            
Dep. Variable:        child_ec_county   R-squared:                       0.999
Model:                            OLS   Adj. R-squared:                  0.999
Method:                 Least Squares   F-statistic:                 1.104e+05
Date:                Fri, 09 Feb 2024   Prob (F-statistic):               0.00
Time:                        06:22:14   Log-Likelihood:                 7526.6
No. Observations:                2106   AIC:                        -1.501e+04
Df Residuals:                    2086   BIC:                        -1.490e+04
Df Model:                          19                                         
Covariance Type:            nonrobust                                         
                                   coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------------------
const           

In [None]:
# Score the filtered test predictors with the fitted linear regression (model1)
prediction_on_test = model1.predict(X=predictors_test_clean)

In [None]:
# Examine the evaluation results on testing data: MAE and RMSE
MAE = mean_absolute_error(target_test_clean, prediction_on_test)
# Take the square root of the MSE explicitly: the `squared=False` keyword of
# mean_squared_error is deprecated in newer scikit-learn releases, whereas
# this form gives the same RMSE on every version.
RMSE = np.sqrt(mean_squared_error(target_test_clean, prediction_on_test))
print("MAE:", MAE)
print("RMSE:", RMSE)

MAE: 0.004524477213677742
RMSE: 0.006964966950267455
