In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
import matplotlib.pyplot as plt
from sklearn.feature_selection import RFE, SequentialFeatureSelector
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm  # For progress display

This code processes a dataset by selecting specific columns, applying one-hot encoding to the GEO_ID variable, and saving the updated data for further analysis. It combines manually defined and pre-selected features, ensures the inclusion of key identifiers like GEO_ID and Year, and generates indicator variables for geographic data to enhance model usability.

In [4]:
import pandas as pd

# --- Configuration -----------------------------------------------------------
# Absolute, machine-specific paths. Remember to point `selected_features_path`
# at the feature subset that is currently being evaluated.
selected_features_path = "/Users/avakrocheski-meyer/Downloads/TASK: 1119_start_to_finish/3_feature_selection/Subset_4.csv"
data_path = "/Users/avakrocheski-meyer/Downloads/TASK: 1119_start_to_finish/2_feature_standardization/final_data_transformed.csv"
output_path = "/Users/avakrocheski-meyer/Downloads/TASK: 1119_start_to_finish/4_indicator_variables/geo_indicators.csv"

# Columns added by hand on top of the pre-selected features (the original note
# refers to these as "our desired lags"; the list includes
# 'target_TOT_POPULATION').
columns_to_process = ['TOT_HH', 'CIV_POP_18+', 'TOT_CIV_POP', 'CIV_POP_16', 'TOT_HOUSING_UNITS', 'TOT_POPULATION', 'target_TOT_POPULATION']

# Identifier columns that must always be present in the output.
required_id_columns = ['GEO_ID', 'Year']

# --- Build the list of columns to keep ---------------------------------------
# Load the selected features file; the names are read from its 'Features'
# column (adjust the column name if the file uses a different one).
selected_features_df = pd.read_csv(selected_features_path)

# De-duplicate AFTER the identifiers have been appended and keep first-seen
# order. `dict.fromkeys` is used instead of `set` for two reasons:
#   * set ordering of strings varies between kernel sessions (hash
#     randomization), which made the saved CSV's column order unstable;
#   * appending 'GEO_ID'/'Year' after de-duplication produced duplicate column
#     labels whenever the features file already listed one of them, turning
#     `filtered_data_df['GEO_ID']` into a DataFrame instead of a Series.
selected_columns = list(dict.fromkeys(
    selected_features_df['Features'].tolist()
    + columns_to_process
    + required_id_columns
))

# --- Load and filter the transformed data ------------------------------------
data_df = pd.read_csv(data_path)

# Fail early with an explicit list of what is missing (same exception type that
# the column selection below would raise).
missing_columns = [col for col in selected_columns if col not in data_df.columns]
if missing_columns:
    raise KeyError(f"Columns not found in '{data_path}': {missing_columns}")

filtered_data_df = data_df[selected_columns]

# --- One-hot encode GEO_ID ---------------------------------------------------
# One indicator column per distinct GEO_ID value, named 'GEO_ID_<value>'.
# All levels are kept (no `drop_first`), and the original 'GEO_ID' column stays
# in the frame next to its indicators.
geo_id_dummies = pd.get_dummies(filtered_data_df['GEO_ID'], prefix='GEO_ID')

# Append the indicator columns to the right of the filtered columns.
filtered_data_with_dummies = pd.concat([filtered_data_df, geo_id_dummies], axis=1)

# Verify that the one-hot encoded columns were added.
print("Columns in the DataFrame after adding one-hot encoding for GEO_ID:")
print(filtered_data_with_dummies.columns)

# --- Save --------------------------------------------------------------------
# All rows are written; rows where Year is 2010 are NOT filtered out here.
filtered_data_with_dummies.to_csv(output_path, index=False)
print(f"Data with GEO_ID indicators saved to '{output_path}'.")


Columns in the DataFrame after adding one-hot encoding for GEO_ID:
Index(['AVG_COMMUTE_TIME', 'P_3PLUS_VEHICLES', 'P_HH_VALUE_1MIL_PLUS',
       'P_WORK_FINANCE', 'P_WORK_INFORMATION', 'TOT_HH', 'P_3_BEDROOMS',
       'TOT_POPULATION', 'P_WORK_PROFESSIONAL', 'P_1_BEDROOM',
       ...
       'GEO_ID_1600000US4837000', 'GEO_ID_1600000US4865000',
       'GEO_ID_1600000US5157000', 'GEO_ID_1600000US5167000',
       'GEO_ID_1600000US5182000', 'GEO_ID_1600000US5363000',
       'GEO_ID_1600000US5370000', 'GEO_ID_1600000US5374060',
       'GEO_ID_1600000US5548000', 'GEO_ID_1600000US5553000'],
      dtype='object', length=128)
Data with GEO_ID indicators saved to '/Users/avakrocheski-meyer/Downloads/TASK: 1119_start_to_finish/4_indicator_variables/geo_indicators.csv'.
