In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import re

from sklearn.model_selection import train_test_split

from IPython.display import Markdown

from charles_acquire import *

## It looks like the columns of `QKEY` & `WEIGHT_W41` are float64 and the rest are categories

In [2]:
def from_categorical_to_object(df: pd.DataFrame, exclude = ['QKEY', 'WEIGHT_W41']) -> pd.DataFrame:
    '''Takes a dataframe and a column name and will iterate through and input into an object
    and then return the dataframe with that column as an object with rows
    '''
    
    # Iterates through dataframe column names and will switch from categorical to object/string
    for col in [name for name in df.columns if name not in exclude]:
            # Targets each individual series
            df[col] = df[col].astype('str')
    
    return df

In [3]:
def train_validate_test_split(df: pd.DataFrame, target: str, seed=123):
    '''This function takes in a dataframe, the name of the target variable
    (for stratification purposes), and an integer for a setting a seed
    and splits the data into train, validate and test. 
    Test is 20% of the original dataset, validate is .30*.80= 24% of the 
    original dataset, and train is .70*.80= 56% of the original dataset. 
    The function returns, in this order, train, validate and test dataframes. 
    '''

    train_validate, test = train_test_split(df, test_size=0.2, 

                                            random_state=seed, 

                                            stratify=df[target])

    train, validate = train_test_split(train_validate, test_size=0.3, 

                                       random_state=seed,

                                       stratify=train_validate[target])

    return train, validate, test

In [4]:
def rename_columns(df: pd.DataFrame) -> pd.DataFrame:
    '''Will take the American Trends Dataframe and rename the columns to a less verbose name
    then it will return that new dataframe
    '''
    
    # create column_dict to modify the names of what the columns will be named too
    column_dict = {col: col.lower() for col in df.columns}
    
    # Manual renaming of the columns to a name that is more indicative of the question
    manual_dict = {
        'FTRWORRYa_W41': 'worry_economy',
        'FTRWORRYb_W41': 'worry_public_schools',
        'FTRWORRYc_W41': 'worry_government',
        'FTRWORRYd_W41': 'worry_leaders',
        'FTRWORRYe_W41': 'worry_morals',
        'FTRWORRYf_W41': 'worry_climate',
        'ELDCARE_W41eldcare': 'elder_care',
        'ELDFINANCEF1_W41': 'elder_finance_1',
        'ELDFINANCEF2_W41': 'elder_finance_2',
        'GOVPRIOa_W41': 'priority_debt',
        'GOVPRIOb_W41': 'priority_education',
        'GOVPRIOc_W41': 'priority_healthcare',
        'GOVPRIOd_W41': 'priority_science',
        'GOVPRIOe_W41': 'priority_inequality',
        'GOVPRIOfF1_W41': 'priority_reduce_military',
        'GOVPRIOgF1_W41': 'priority_undocumented_immigration',
        'GOVPRIOhF1_W41': 'priority_reduce_social_security',
        'GOVPRIOjF1_W41': 'priority_increase_spending_infrastructure',
        'GOVPRIOjF1_W41': 'priority_avoid_tax_increase',
        'GOVPRIOkF2_W41': 'priority_increase_military',
        'GOVPRIOlF2_W41': 'priority_more_immigration',
        'GOVPRIOhF1_W41': 'priority_increase_social_security',
        'GOVPRIOnF2_W41': 'priority_reducing_spending_infrastructure',
        'GOVPRIOoF2_W41': 'priority_climate',
        'SOLVPROBa_W41': 'sci_tech',
        'SOLVPROBb_W41': 'major_corps',
        'SOLVPROBc_W41': 'rel_groups',
        'SOLVPROBdF1_W41': 'gov_in_wash',
        'SOLVPROBeF2_W41': 'state_local',
        'SOLVPROBf_W41': 'media',
        'SOLVPROBg_W41': 'military',
        'SOLVPROBh_W41': 'college_uni',
        'SOLVPROBi_W41': 'schools',
        'HARASS1F1a_W41': 'harass_false_f1',
        'HARASS1F1b_W41': 'harass_fired_f1',
        'HARASS1F1c_W41': 'harass_unpunished_f1',
        'HARASS1F1d_W41': 'harass_unbelieved_f1',
        'HARASS1NOWRKF2a_W41': 'harass_false_f2',
        'HARASS1NOWRKF2c_W41': 'harass_unpunished_f2',
        'HARASS1NOWRKF2d_W41': 'harass_unbelieved_f2',
        'HARASS3F1_W41': 'harass_interactions_f1',
        'HARASS3NOWRKF2_W41': 'harass_interactions_f2',
        'HARASS4_W41': 'harass_personal_exp',
        'HARASS5_W41': 'harass_sexual_personal_exp',
        'GNATPROB_W41': 'worries_federal_government',
        'WRKTRN1F1_W41': 'most_responsible_for_workers_f1',
        'WRKTRN2F1_W41': 'most_responsible_for_workers_f2',
        'WRKTRN2F1_W41': 'second_most_responsible_for_workers_f1',
        'WRKTRN2F2_W41': 'second_most_responsible_for_workers_f2',
        'JOBSECURITY_W41': 'job_security',
        'JOBBENEFITS_W41': 'job_benefits',
        'AUTOWKPLC_W41': 'automation_good_or_bad',
        'ROBWRK_W41': 'replacement_by_robots_likelihood',
        'ROBWRK2_W41': 'replacement_by_robots_good_or_bad',
        'AUTOLKLY_W41': 'likelihood_my_job_replaced_by_robots',
        'ROBIMPACTa_W41': 'robot_replacement_increase_inequality',
        'ROBIMPACTb_W41': 'robot_replacement_means_better_jobs_for_humans',
        'LEGALIMG_W41': 'legal_immigration_levels',
        'FUTRCLASSa_W41': 'share_americans_in_upper_class',
        'FUTRCLASSb_W41': 'share_americans_in_middle_class',
        'FUTRCLASSc_W41': 'share_americans_in_lower_class',
        'F_EDUCCAT': 'highest_education_three_categories',
        'F_EDUCCAT2': 'highest_education_six_categories',
        'F_HISP':'hispanic_or_latino',
        'F_RACECMB': 'race', 
        'F_RACETHN': 'race_and_ethnicity',
        'F_NATIVITY': 'birthplace',
        'F_CITIZEN': 'us_citizen',
        'F_MARITAL': 'marital_status',
        'F_RELIG': 'religion',
        'F_BORN': 'evangelical_christian',
        'F_ATTEND': 'church_attendance',
        'F_PARTY_FINAL': 'political_party_identity',
        'F_PARTYLN_FINAL': 'political_party_lean',
        'F_PARTYSUM_FINAL': 'summary_of_political_party_data',
        'F_INCOME': 'family_income_nine_categories',
        'F_INCOME_RECODE': 'family_income_three_categories',
        'F_REG': 'registered_to_vote',
        'F_IDEO': 'political_views',
        'F_INTUSER': 'internet_access',
        'F_VOLSUM': 'volunteer'    
    }
    
    
    for k, v in column_dict.items():
        # Remove the _w41
        v = v.replace('_w41', '').replace('f_', '')
        column_dict[k] = v

In [5]:
def create_dummy_data_from_responses(df: pd.DataFrame, exclude_cols: list) -> (pd.DataFrame, dict, dict):
    '''Takes a dataframe and exclude columns list and will go through each of the values and maps them to
    an associated mapping and will return the mapping for the revert_key and the replacement_key
    '''
    # Remove the columns that you do not want the key-value replacement mapping to be performed on
    columns = [col for col in df.columns if col not in exclude_cols]
    
    # Origin Replacement Key Values - This will revert back to original values/responses
    revert_key_values = dict()
    
    # Replacement Key Values - This will change the values/responses to dummy values
    replacement_key_values = dict()
    
    # Iterate though each column
    for col in columns:
        # Select the series
        s = df[col]
        # Fetch the unique values from the series
        uniques = s.unique()
        
        # Build the series replacement dictionary
        temp_replace_dict = dict()
        
        # Build revert replacement dictionary
        temp_revert_dict = dict()
        
        # Enumerate through the unique values to assign the integers to the value
        for new_val, response in enumerate(uniques):
            # define the key for the replacement dic
            temp_replace_dict[response] = new_val
            # define the key for the replacement 
            temp_revert_dict[new_val] = response
            
        # Define the columns replacement values
        replacement_key_values[col] = temp_replace_dict

        # Define the columns for reversion dicts
        revert_key_values[col] = temp_revert_dict
        
    ##FIXMME##
    df = df.replace(replacement_key_values)
    
    return df, revert_key_values, replacement_key_values
    

In [6]:
def wrangle_data():
    # Get origin data from the folder
    df = get_atp_w41_spss_data()

    # Convert the datafrom Categorical to Object/string
    df = from_categorical_to_object(df)

    # Remove optimisim_refused from the dataframe
    df = df[df.OPTIMIST_W41 != 'Refused']
    
    rename_columns(df)
    return df

In [7]:
manual_dict = {
        'FTRWORRYa_W41': 'worry_economy',
        'FTRWORRYb_W41': 'worry_public_schools',
        'FTRWORRYc_W41': 'worry_government',
        'FTRWORRYd_W41': 'worry_leaders',
        'FTRWORRYe_W41': 'worry_morals',
        'FTRWORRYf_W41': 'worry_climate',
        'ELDCARE_W41eldcare': 'elder_care',
        'ELDFINANCEF1_W41': 'elder_finance_1',
        'ELDFINANCEF2_W41': 'elder_finance_2',
        'GOVPRIOa_W41': 'priority_debt',
        'GOVPRIOb_W41': 'priority_education',
        'GOVPRIOc_W41': 'priority_healthcare',
        'GOVPRIOd_W41': 'priority_science',
        'GOVPRIOe_W41': 'priority_inequality',
        'GOVPRIOfF1_W41': 'priority_reduce_military',
        'GOVPRIOgF1_W41': 'priority_undocumented_immigration',
        'GOVPRIOhF1_W41': 'priority_reduce_social_security',
        'GOVPRIOjF1_W41': 'priority_increase_spending_infrastructure',
        'GOVPRIOjF1_W41': 'priority_avoid_tax_increase',
        'GOVPRIOkF2_W41': 'priority_increase_military',
        'GOVPRIOlF2_W41': 'priority_more_immigration',
        'GOVPRIOhF1_W41': 'priority_increase_social_security',
        'GOVPRIOnF2_W41': 'priority_reducing_spending_infrastructure',
        'GOVPRIOoF2_W41': 'priority_climate',
        'SOLVPROBa_W41': 'sci_tech',
        'SOLVPROBb_W41': 'major_corps',
        'SOLVPROBc_W41': 'rel_groups',
        'SOLVPROBdF1_W41': 'gov_in_wash',
        'SOLVPROBeF2_W41': 'state_local',
        'SOLVPROBf_W41': 'media',
        'SOLVPROBg_W41': 'military',
        'SOLVPROBh_W41': 'college_uni',
        'SOLVPROBi_W41': 'schools',
        'HARASS1F1a_W41': 'harass_false_f1',
        'HARASS1F1b_W41': 'harass_fired_f1',
        'HARASS1F1c_W41': 'harass_unpunished_f1',
        'HARASS1F1d_W41': 'harass_unbelieved_f1',
        'HARASS1NOWRKF2a_W41': 'harass_false_f2',
        'HARASS1NOWRKF2c_W41': 'harass_unpunished_f2',
        'HARASS1NOWRKF2d_W41': 'harass_unbelieved_f2',
        'HARASS3F1_W41': 'harass_interactions_f1',
        'HARASS3NOWRKF2_W41': 'harass_interactions_f2',
        'HARASS4_W41': 'harass_personal_exp',
        'HARASS5_W41': 'harass_sexual_personal_exp',
        'GNATPROB_W41': 'worries_federal_government',
        'WRKTRN1F1_W41': 'most_responsible_for_workers_f1',
        'WRKTRN2F1_W41': 'most_responsible_for_workers_f2',
        'WRKTRN2F1_W41': 'second_most_responsible_for_workers_f1',
        'WRKTRN2F2_W41': 'second_most_responsible_for_workers_f2',
        'JOBSECURITY_W41': 'job_security',
        'JOBBENEFITS_W41': 'job_benefits',
        'AUTOWKPLC_W41': 'automation_good_or_bad',
        'ROBWRK_W41': 'replacement_by_robots_likelihood',
        'ROBWRK2_W41': 'replacement_by_robots_good_or_bad',
        'AUTOLKLY_W41': 'likelihood_my_job_replaced_by_robots',
        'ROBIMPACTa_W41': 'robot_replacement_increase_inequality',
        'ROBIMPACTb_W41': 'robot_replacement_means_better_jobs_for_humans',
        'LEGALIMG_W41': 'legal_immigration_levels',
        'FUTRCLASSa_W41': 'share_americans_in_upper_class',
        'FUTRCLASSb_W41': 'share_americans_in_middle_class',
        'FUTRCLASSc_W41': 'share_americans_in_lower_class',
        'F_EDUCCAT': 'highest_education_three_categories',
        'F_EDUCCAT2': 'highest_education_six_categories',
        'F_HISP':'hispanic_or_latino',
        'F_RACECMB': 'race', 
        'F_RACETHN': 'race_and_ethnicity',
        'F_NATIVITY': 'birthplace',
        'F_CITIZEN': 'us_citizen',
        'F_MARITAL': 'marital_status',
        'F_RELIG': 'religion',
        'F_BORN': 'evangelical_christian',
        'F_ATTEND': 'church_attendance',
        'F_PARTY_FINAL': 'political_party_identity',
        'F_PARTYLN_FINAL': 'political_party_lean',
        'F_PARTYSUM_FINAL': 'summary_of_political_party_data',
        'F_INCOME': 'family_income_nine_categories',
        'F_INCOME_RECODE': 'family_income_three_categories',
        'F_REG': 'registered_to_vote',
        'F_IDEO': 'political_views',
        'F_INTUSER': 'internet_access',
        'F_VOLSUM': 'volunteer'    
    }

In [8]:
tmp = list()
for k,v in manual_dict.items():
    tmp.append({'Original_Column_Name': f'<code>{k}</code>',
                '': '&rarr;',
                'New_Column_Name': f'<code>{v}</code>'})
_ = pd.DataFrame(tmp)

Markdown(_.to_markdown())

|    | Original_Column_Name             |        | New_Column_Name                                             |
|---:|:---------------------------------|:-------|:------------------------------------------------------------|
|  0 | <code>FTRWORRYa_W41</code>       | &rarr; | <code>worry_economy</code>                                  |
|  1 | <code>FTRWORRYb_W41</code>       | &rarr; | <code>worry_public_schools</code>                           |
|  2 | <code>FTRWORRYc_W41</code>       | &rarr; | <code>worry_government</code>                               |
|  3 | <code>FTRWORRYd_W41</code>       | &rarr; | <code>worry_leaders</code>                                  |
|  4 | <code>FTRWORRYe_W41</code>       | &rarr; | <code>worry_morals</code>                                   |
|  5 | <code>FTRWORRYf_W41</code>       | &rarr; | <code>worry_climate</code>                                  |
|  6 | <code>ELDCARE_W41eldcare</code>  | &rarr; | <code>elder_care</code>                                     |
|  7 | <code>ELDFINANCEF1_W41</code>    | &rarr; | <code>elder_finance_1</code>                                |
|  8 | <code>ELDFINANCEF2_W41</code>    | &rarr; | <code>elder_finance_2</code>                                |
|  9 | <code>GOVPRIOa_W41</code>        | &rarr; | <code>priority_debt</code>                                  |
| 10 | <code>GOVPRIOb_W41</code>        | &rarr; | <code>priority_education</code>                             |
| 11 | <code>GOVPRIOc_W41</code>        | &rarr; | <code>priority_healthcare</code>                            |
| 12 | <code>GOVPRIOd_W41</code>        | &rarr; | <code>priority_science</code>                               |
| 13 | <code>GOVPRIOe_W41</code>        | &rarr; | <code>priority_inequality</code>                            |
| 14 | <code>GOVPRIOfF1_W41</code>      | &rarr; | <code>priority_reduce_military</code>                       |
| 15 | <code>GOVPRIOgF1_W41</code>      | &rarr; | <code>priority_undocumented_immigration</code>              |
| 16 | <code>GOVPRIOhF1_W41</code>      | &rarr; | <code>priority_increase_social_security</code>              |
| 17 | <code>GOVPRIOjF1_W41</code>      | &rarr; | <code>priority_avoid_tax_increase</code>                    |
| 18 | <code>GOVPRIOkF2_W41</code>      | &rarr; | <code>priority_increase_military</code>                     |
| 19 | <code>GOVPRIOlF2_W41</code>      | &rarr; | <code>priority_more_immigration</code>                      |
| 20 | <code>GOVPRIOnF2_W41</code>      | &rarr; | <code>priority_reducing_spending_infrastructure</code>      |
| 21 | <code>GOVPRIOoF2_W41</code>      | &rarr; | <code>priority_climate</code>                               |
| 22 | <code>SOLVPROBa_W41</code>       | &rarr; | <code>sci_tech</code>                                       |
| 23 | <code>SOLVPROBb_W41</code>       | &rarr; | <code>major_corps</code>                                    |
| 24 | <code>SOLVPROBc_W41</code>       | &rarr; | <code>rel_groups</code>                                     |
| 25 | <code>SOLVPROBdF1_W41</code>     | &rarr; | <code>gov_in_wash</code>                                    |
| 26 | <code>SOLVPROBeF2_W41</code>     | &rarr; | <code>state_local</code>                                    |
| 27 | <code>SOLVPROBf_W41</code>       | &rarr; | <code>media</code>                                          |
| 28 | <code>SOLVPROBg_W41</code>       | &rarr; | <code>military</code>                                       |
| 29 | <code>SOLVPROBh_W41</code>       | &rarr; | <code>college_uni</code>                                    |
| 30 | <code>SOLVPROBi_W41</code>       | &rarr; | <code>schools</code>                                        |
| 31 | <code>HARASS1F1a_W41</code>      | &rarr; | <code>harass_false_f1</code>                                |
| 32 | <code>HARASS1F1b_W41</code>      | &rarr; | <code>harass_fired_f1</code>                                |
| 33 | <code>HARASS1F1c_W41</code>      | &rarr; | <code>harass_unpunished_f1</code>                           |
| 34 | <code>HARASS1F1d_W41</code>      | &rarr; | <code>harass_unbelieved_f1</code>                           |
| 35 | <code>HARASS1NOWRKF2a_W41</code> | &rarr; | <code>harass_false_f2</code>                                |
| 36 | <code>HARASS1NOWRKF2c_W41</code> | &rarr; | <code>harass_unpunished_f2</code>                           |
| 37 | <code>HARASS1NOWRKF2d_W41</code> | &rarr; | <code>harass_unbelieved_f2</code>                           |
| 38 | <code>HARASS3F1_W41</code>       | &rarr; | <code>harass_interactions_f1</code>                         |
| 39 | <code>HARASS3NOWRKF2_W41</code>  | &rarr; | <code>harass_interactions_f2</code>                         |
| 40 | <code>HARASS4_W41</code>         | &rarr; | <code>harass_personal_exp</code>                            |
| 41 | <code>HARASS5_W41</code>         | &rarr; | <code>harass_sexual_personal_exp</code>                     |
| 42 | <code>GNATPROB_W41</code>        | &rarr; | <code>worries_federal_government</code>                     |
| 43 | <code>WRKTRN1F1_W41</code>       | &rarr; | <code>most_responsible_for_workers_f1</code>                |
| 44 | <code>WRKTRN2F1_W41</code>       | &rarr; | <code>second_most_responsible_for_workers_f1</code>         |
| 45 | <code>WRKTRN2F2_W41</code>       | &rarr; | <code>second_most_responsible_for_workers_f2</code>         |
| 46 | <code>JOBSECURITY_W41</code>     | &rarr; | <code>job_security</code>                                   |
| 47 | <code>JOBBENEFITS_W41</code>     | &rarr; | <code>job_benefits</code>                                   |
| 48 | <code>AUTOWKPLC_W41</code>       | &rarr; | <code>automation_good_or_bad</code>                         |
| 49 | <code>ROBWRK_W41</code>          | &rarr; | <code>replacement_by_robots_likelihood</code>               |
| 50 | <code>ROBWRK2_W41</code>         | &rarr; | <code>replacement_by_robots_good_or_bad</code>              |
| 51 | <code>AUTOLKLY_W41</code>        | &rarr; | <code>likelihood_my_job_replaced_by_robots</code>           |
| 52 | <code>ROBIMPACTa_W41</code>      | &rarr; | <code>robot_replacement_increase_inequality</code>          |
| 53 | <code>ROBIMPACTb_W41</code>      | &rarr; | <code>robot_replacement_means_better_jobs_for_humans</code> |
| 54 | <code>LEGALIMG_W41</code>        | &rarr; | <code>legal_immigration_levels</code>                       |
| 55 | <code>FUTRCLASSa_W41</code>      | &rarr; | <code>share_americans_in_upper_class</code>                 |
| 56 | <code>FUTRCLASSb_W41</code>      | &rarr; | <code>share_americans_in_middle_class</code>                |
| 57 | <code>FUTRCLASSc_W41</code>      | &rarr; | <code>share_americans_in_lower_class</code>                 |
| 58 | <code>F_EDUCCAT</code>           | &rarr; | <code>highest_education_three_categories</code>             |
| 59 | <code>F_EDUCCAT2</code>          | &rarr; | <code>highest_education_six_categories</code>               |
| 60 | <code>F_HISP</code>              | &rarr; | <code>hispanic_or_latino</code>                             |
| 61 | <code>F_RACECMB</code>           | &rarr; | <code>race</code>                                           |
| 62 | <code>F_RACETHN</code>           | &rarr; | <code>race_and_ethnicity</code>                             |
| 63 | <code>F_NATIVITY</code>          | &rarr; | <code>birthplace</code>                                     |
| 64 | <code>F_CITIZEN</code>           | &rarr; | <code>us_citizen</code>                                     |
| 65 | <code>F_MARITAL</code>           | &rarr; | <code>marital_status</code>                                 |
| 66 | <code>F_RELIG</code>             | &rarr; | <code>religion</code>                                       |
| 67 | <code>F_BORN</code>              | &rarr; | <code>evangelical_christian</code>                          |
| 68 | <code>F_ATTEND</code>            | &rarr; | <code>church_attendance</code>                              |
| 69 | <code>F_PARTY_FINAL</code>       | &rarr; | <code>political_party_identity</code>                       |
| 70 | <code>F_PARTYLN_FINAL</code>     | &rarr; | <code>political_party_lean</code>                           |
| 71 | <code>F_PARTYSUM_FINAL</code>    | &rarr; | <code>summary_of_political_party_data</code>                |
| 72 | <code>F_INCOME</code>            | &rarr; | <code>family_income_nine_categories</code>                  |
| 73 | <code>F_INCOME_RECODE</code>     | &rarr; | <code>family_income_three_categories</code>                 |
| 74 | <code>F_REG</code>               | &rarr; | <code>registered_to_vote</code>                             |
| 75 | <code>F_IDEO</code>              | &rarr; | <code>political_views</code>                                |
| 76 | <code>F_INTUSER</code>           | &rarr; | <code>internet_access</code>                                |
| 77 | <code>F_VOLSUM</code>            | &rarr; | <code>volunteer</code>                                      |

In [9]:
class Attitudes_explore():
    '''
    Performs a series of analyses and explore functions on various features in our data.
    '''
    
    def __init__(self, df):
        self.df =  df
        
    def printout_chi_squared(self):
        categorical_features = df_dummies.drop(columns = ['QKEY', "WEIGHT_W41", "new_target_1"]).columns.to_list()

        for col in categorical_features:
    
            contingency_table = pd.crosstab(df_dummies[col], df_dummies.new_target_1)

            test_results = stats.chi2_contingency(contingency_table)

            print("Correlation between", col, "and the target")
    
            print(test_results)
    
    
            print(" ")