# GEM Data Hackathon - Data Import and Processing
This notebook converts the original R code for data processing into Python.

In [8]:
# Import necessary libraries
import pandas as pd
import numpy as np
import os
import gc

# Clear memory (Python equivalent of rm(list = ls()) in R)
gc.collect()

0

In [9]:
# Load the data
# Note: Adjust the file path as needed for your system
data = pd.read_csv("/Users/connorraney/gem-data-hackathon/Hackathon_GEM_Data.csv")

# Display basic info about the dataset
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15868 entries, 0 to 15867
Data columns (total 19 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   weight                     15868 non-null  float64
 1   new_entrepreneur           15868 non-null  object 
 2   established_entrepreneur   15868 non-null  object 
 3   knows_entrepreneur         15761 non-null  object 
 4   local_opportunity          13316 non-null  object 
 5   entrepreneurial_skill      14795 non-null  object 
 6   fear_of_failure            15120 non-null  object 
 7   wants_entrepreneurship     12070 non-null  object 
 8   respects_entrepreneurship  12082 non-null  object 
 9   follows_entrepreneurship   12417 non-null  object 
 10  future_startup             15420 non-null  object 
 11  discontinued_business      15827 non-null  object 
 12  is_investor                15820 non-null  object 
 13  gender                     15868 non-null  obj

## Create a function to recode all variables to their labels

In [10]:
def recode_data_labels(data):
    # Create a copy of the original data
    data_labeled = data.copy()
    
    # recode gender
    data_labeled['gender'] = pd.Categorical(
        data['gender'],
        categories=[1, 2],
        ordered=False
    ).rename_categories({1: 'male', 2: 'female'})
    
    # recode age range
    data_labeled['age9c'] = pd.Categorical(
        data['age9c'], 
        categories=[2, 3, 4, 5, 6, 7],
        ordered=True
    ).rename_categories({2: '18-24', 3: '25-34', 4: '35-44', 5: '45-54', 6: '55-64', 7: '65-74'})
    
    # recode household size (special case with numeric values to preserve)
    data_labeled['hhsize'] = data['hhsize'].apply(lambda x: 
        'Refused' if x == -2 else 
        "Don't know" if x == -1 else 
        str(x) if 1 <= x <= 44 else 
        np.nan
    )
    
    # recode income brackets
    income_levels = {
        1: 'Under $15,000',
        2: '$15,000 to under $25,000',
        3: '$25,000 to under $35,000',
        4: '$35,000 to under $50,000',
        5: '$50,000 to under $75,000',
        6: '$75,000 to under $100,000',
        7: '$100,000 to under $150,000',
        8: '$150,000 to under $200,000',
        9: 'Over $200,000'
    }
    data_labeled['ushhinc'] = pd.Categorical(
        data['ushhinc'], 
        categories=list(income_levels.keys()),
        ordered=True
    ).rename_categories(income_levels)
    
    # recode education level
    education_levels = {
        1: 'None/Less than High School',
        2: 'Some High School',
        3: 'Completed High School',
        4: 'Some College/University',
        5: 'Completed College/University',
        6: 'Degree Graduate (Master\'s or PhD)'
    }
    data_labeled['usreduc'] = pd.Categorical(
        data['usreduc'], 
        categories=list(education_levels.keys()),
        ordered=True
    ).rename_categories(education_levels)
    
    # recode race
    race_levels = {
        -2: 'Refused', 
        -1: "Don't know", 
        1: 'White', 
        2: 'Black', 
        3: 'Hispanic', 
        4: 'Other'
    }
    data_labeled['race'] = pd.Categorical(
        data['race'], 
        categories=list(race_levels.keys()),
        ordered=False
    ).rename_categories(race_levels)
    
    # recode region
    region_levels = {
        1: 'New England',
        2: 'New York-New Jersey',
        3: 'Mid-Atlantic',
        4: 'Southeast',
        5: 'Great Lakes',
        6: 'South',
        7: 'Central Midwest',
        8: 'Mountain and Plains',
        9: 'Pacific Southwest',
        10: 'Pacific Northwest'
    }
    data_labeled['region'] = pd.Categorical(
        data['region'], 
        categories=list(region_levels.keys()),
        ordered=False
    ).rename_categories(region_levels)
    
    # recode variables with standard Yes/No patterns
    yes_no_vars = [
        "knowent", "opport", "suskill", "fearfail", "nbgoodc", "nbstatus", 
        "nbmedia", "TEA", "ESTBBUSO", "futsup", "discent", "busang"
    ]
    
    for var in yes_no_vars:
        if var in data.columns:
            data_labeled[var] = pd.Categorical(
                data[var],
                categories=[-2, -1, 0, 1],
                ordered=False
            ).rename_categories({-2: 'Refused', -1: "Don't know", 0: 'No', 1: 'Yes'})
    
    # recode TEANEWPR
    data_labeled['TEANEWPR'] = pd.Categorical(
        data['TEANEWPR'],
        categories=[-2, -1, 1, 2],
        ordered=False
    ).rename_categories({-2: 'Refused', -1: "Don't know", 1: 'Yes', 2: 'No'})
    
    # recode TEANEWPROD
    new_prod_levels = {
        -2: 'Refused',
        -1: "Don't know",
        1: 'No, not new product or service',
        2: 'New to people in the area where you live',
        3: 'New to people in your country',
        4: 'New to the world'
    }
    data_labeled['TEANEWPROD'] = pd.Categorical(
        data['TEANEWPROD'],
        categories=list(new_prod_levels.keys()),
        ordered=False
    ).rename_categories(new_prod_levels)
    
    # recode TEAEXP4C
    tea_exp_levels = {
        -2: 'Refused',
        -1: "Don't know",
        1: 'More than 75%',
        2: '25 to 75%',
        3: 'Under 25%',
        4: 'None'
    }
    data_labeled['TEAEXP4C'] = pd.Categorical(
        data['TEAEXP4C'],
        categories=list(tea_exp_levels.keys()),
        ordered=False
    ).rename_categories(tea_exp_levels)
    
    # recode EB_EXP4C
    eb_exp_levels = {
        -2: 'Refused',
        -1: "Don't know",
        5: 'More than 75%',
        6: '25 to 75%',
        7: 'Under 25%',
        8: 'None'
    }
    data_labeled['EB_EXP4C'] = pd.Categorical(
        data['EB_EXP4C'],
        categories=list(eb_exp_levels.keys()),
        ordered=False
    ).rename_categories(eb_exp_levels)
    
    # recode industry classifications (both TEA and EB use same coding)
    industry_labels = {
        -2: 'NOT CLASSIFIED/MISSING',
        1: 'AGRICULTURE,FORESTRY,FISHING',
        2: 'MINING,CONSTRUCTION',
        3: 'MANUFACTURING',
        4: 'UTILISATION, TRANSPORT, STORAGE',
        5: 'WHOLESALE TRADE',
        6: 'RETAIL TRADE, HOTELS & RESTAURANTS',
        7: 'INFORMATION AND COMMUNICATION',
        8: 'FINANCIAL INTERMEDIATION, REAL ESTATE',
        9: 'PROFESSIONAL SERVICES',
        10: 'ADMINISTRATIVE SERVICES',
        11: 'GOVERNMENT, HEALTH, EDUCATION, SOCIAL SERVICES',
        12: 'PERSONAL/CONSUMER SERVICE ACTIVITIES'
    }
    
    data_labeled['TEAISIC4_1D'] = pd.Categorical(
        data['TEAISIC4_1D'],
        categories=list(industry_labels.keys()),
        ordered=False
    ).rename_categories(industry_labels)
    
    data_labeled['EB_ISIC4_1D'] = pd.Categorical(
        data['EB_ISIC4_1D'],
        categories=list(industry_labels.keys()),
        ordered=False
    ).rename_categories(industry_labels)
    
    # recode exbuscon
    exbuscon_levels = {
        -2: 'Refused',
        -1: "Don't know",
        1: 'Yes',
        2: 'No',
        3: 'Business continued but activities changed'
    }
    data_labeled['exbuscon'] = pd.Categorical(
        data['exbuscon'],
        categories=list(exbuscon_levels.keys()),
        ordered=False
    ).rename_categories(exbuscon_levels)
    
    # recode bafund
    bafund_levels = {
        -3: 'Have not provided funds',
        -2: 'Refused',
        -1: "Don't know"
    }
    data_labeled['bafund'] = pd.Categorical(
        data['bafund'],
        categories=list(bafund_levels.keys()),
        ordered=False
    ).rename_categories(bafund_levels)
    
    # recode barel
    barel_levels = {
        -2: 'Refused',
        -1: "Don't know",
        1: 'Close family member, such as a spouse, brother, child, parent, or grandchild',
        2: 'Some other relative, kin, or blood relation',
        3: 'A work colleague',
        4: 'A friend or neighbor, or',
        5: 'A stranger with a good business idea',
        6: 'Other'
    }
    data_labeled['barel'] = pd.Categorical(
        data['barel'],
        categories=list(barel_levels.keys()),
        ordered=False
    ).rename_categories(barel_levels)
    
    return data_labeled

In [11]:
# Apply the function to the dataset
data = recode_data_labels(data)

# Display the first few rows of the recoded data
data.head()

KeyError: 'age9c'

## Replace "Refused" and "Don't know" values with NaN

In [None]:
# List of variables to clean
variables_to_clean = [
    'knowent', 'opport', 'suskill', 'fearfail', 'nbgoodc', 'nbstatus', 'nbmedia',
    'futsup', 'discent', 'exbuscon', 'busang', 'barel', 'ESTBBUSO', 'TEAEXP4C',
    'EB_EXP4C', 'TEANEWPR', 'TEANEWPROD', 'TEA', 'race'
]

# Replace "Refused" and "Don't know" with NaN for each variable
for var in variables_to_clean:
    if var in data.columns:
        # Check if the variable is categorical
        if pd.api.types.is_categorical_dtype(data[var]):
            # Create a mask for values to replace with NaN
            mask = data[var].isin(['Refused', "Don't know"])
            # Convert to string to handle NaN assignment
            data.loc[mask, var] = np.nan

## Convert specific columns to appropriate data types

In [None]:
# Convert columns to integer where needed
# First replace non-numeric strings with NaN
data['hhsize'] = pd.to_numeric(data['hhsize'], errors='coerce')

# Convert other columns to integers
int_columns = ['TEAJOBGR', 'EB_JOBGR', 'EB_OWNER', 'TEAOWNER', 'EB_JOBNOW', 'TEAJOBNOW']
for col in int_columns:
    if col in data.columns:
        data[col] = pd.to_numeric(data[col], errors='coerce').astype('Int64')  # Int64 handles NaN values

# Convert yrsurv to category
if 'yrsurv' in data.columns:
    data['yrsurv'] = data['yrsurv'].astype('category')

## Rename and reorder columns

In [None]:
# Reorder columns - first put WEIGHT_L, TEA, ESTBBUSO at the beginning
columns = ['WEIGHT_L', 'TEA', 'ESTBBUSO'] + [col for col in data.columns if col not in ['WEIGHT_L', 'TEA', 'ESTBBUSO']]
data = data[columns]

# Rename columns
column_mapping = {
    'yrsurv': 'year',
    'gender': 'gender',
    'age9c': 'age_range',
    'hhsize': 'household_size',
    'ushhinc': 'household_income',
    'usreduc': 'education',
    'race': 'race',
    'region': 'region',
    'knowent': 'knows_entrepreneur',
    'opport': 'local_opportunity',
    'suskill': 'entrepreneurial_skill',
    'fearfail': 'fear_of_failure',
    'nbgoodc': 'wants_entrepreneurship',
    'nbstatus': 'respects_entrepreneurship',
    'nbmedia': 'follows_entrepreneurship',
    'TEA': 'new_entrepreneur',
    'ESTBBUSO': 'established_entrepreneur',
    'TEAOWNER': 'new_entrepreneur_owners',
    'EB_OWNER': 'established_entrepreneur_owners',
    'TEAJOBNOW': 'new_entrepreneur_employees',
    'EB_JOBNOW': 'established_entrepreneur_employees',
    'TEAJOBGR': 'new_entrepreneur_new_jobs',
    'EB_JOBGR': 'established_entrepreneur_new_jobs',
    'TEANEWPR': 'new_entrepreneur_innovation',
    'TEANEWPROD': 'new_entrepreneur_local_innovation',
    'TEAEXP4C': 'new_entrepreneur_external_sales',
    'EB_EXP4C': 'established_entrepreneur_external_sales',
    'TEAISIC4_1D': 'new_entrepreneur_industry',
    'EB_ISIC4_1D': 'established_entrepreneur_industry',
    'futsup': 'future_startup',
    'discent': 'discontinued_business',
    'exbuscon': 'discontinued_business_continuation',
    'busang': 'is_investor',
    'bafund': 'investment',
    'barel': 'investment_relationship',
    'WEIGHT_L': 'weight'
}

# Rename only columns that exist in the dataframe
existing_columns = {k: v for k, v in column_mapping.items() if k in data.columns}
data = data.rename(columns=existing_columns)

## Remove mostly empty columns

In [None]:
# Columns to drop
columns_to_drop = [
    'discontinued_business_continuation', 'investment', 'new_entrepreneur_local_innovation',
    'investment_relationship', 'household_income', 'education', 'new_entrepreneur_industry',
    'established_entrepreneur_industry', 'new_entrepreneur_new_jobs', 'established_entrepreneur_new_jobs',
    'new_entrepreneur_owners', 'established_entrepreneur_owners', 'new_entrepreneur_external_sales',
    'established_entrepreneur_external_sales', 'new_entrepreneur_innovation', 'new_entrepreneur_employees',
    'established_entrepreneur_employees'
]

# Drop only columns that exist in the dataframe
existing_columns_to_drop = [col for col in columns_to_drop if col in data.columns]
data = data.drop(columns=existing_columns_to_drop)

# Display data info after cleaning
data.info()

In [None]:
# Save the processed data
data.to_csv('Hackathon_GEM_Data_Python.csv', index=False)
print("Data saved successfully!")