In [49]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import re

from sklearn.model_selection import train_test_split

from charles_acquire import *

## It looks like the columns of `QKEY` & `WEIGHT_W41` are float64 and the rest are categories

In [24]:
def from_categorical_to_object(df: pd.DataFrame, exclude = ['QKEY', 'WEIGHT_W41']) -> pd.DataFrame:
    '''Takes a dataframe and a column name and will iterate through and input into an object
    and then return the dataframe with that column as an object with rows
    '''
    
    # Iterates through dataframe column names and will switch from categorical to object/string
    for col in [name for name in df.columns if name not in exclude]:
            # Targets each individual series
            df[col] = df[col].astype('str')
    
    return df

In [19]:
def train_validate_test_split(df: pd.DataFrame, target: str, seed=123):
    '''This function takes in a dataframe, the name of the target variable
    (for stratification purposes), and an integer for a setting a seed
    and splits the data into train, validate and test. 
    Test is 20% of the original dataset, validate is .30*.80= 24% of the 
    original dataset, and train is .70*.80= 56% of the original dataset. 
    The function returns, in this order, train, validate and test dataframes. 
    '''

    train_validate, test = train_test_split(df, test_size=0.2, 

                                            random_state=seed, 

                                            stratify=df[target])

    train, validate = train_test_split(train_validate, test_size=0.3, 

                                       random_state=seed,

                                       stratify=train_validate[target])

    return train, validate, test

In [54]:
def rename_columns(df: pd.DataFrame) -> pd.DataFrame:
    '''Will take the American Trends Dataframe and rename the columns to a less verbose name
    then it will return that new dataframe
    '''
    
    # Split based on '_41'
    column_names = [col for col in df.columns]
    
    column_name_key = {
        'happena' : 'economy',
        'happenb': 'healthcare',
        'happenc': ''
    }
    
    for c in column_names:
        name = c.replace('_W41', ''
        name = name.replace('F_', '')
        name = name.lower()
                         
    
        print(name)

In [44]:
test = 'F_LANGUAGE'

test.split('F_')[1]

'LANGUAGE'

In [41]:
def wrangle_data():
    # Get origin data from the folder
    df = get_atp_w41_spss_data()

    # Convert the datafrom Categorical to Object/string
    df = from_categorical_to_object(df)

    # Remove optimisim_refused from the dataframe
    df = df[df.OPTIMIST_W41 != 'Refused']
    
    rename_columns(df)
    
    
    return df

In [55]:
# # This will only have this execute when it is the main program being run so it can be easily exported as a pyscript
# if __name__ == '__main__':
    
df = wrangle_data()
df.head()






qkey
new_device_type
language
form
optimist
avgfam
happena
happenb
happenc
happend
happene
happenf
happeng
happenhf1
happenif2
happenj
happen2a
happen2b
happen2c
happen2d
happen2e
happen2f
happen2g
happen2h
natdebt
envc
popprob
ftrworrya
ftrworryb
ftrworryc
ftrworryd
ftrworrye
ftrworryf
eldcare
eldfinancef1
eldfinancef2
govprioa
govpriob
govprioc
govpriod
govprioe
govprioff1
govpriogf1
govpriohf1
govprioif1
govpriojf1
govpriokf2
govpriolf2
govpriomf2
govprionf2
govprioof2
gnatprob
wrktrn1f1
wrktrn1f2
wrktrn2f1
wrktrn2f2
jobsecurity
jobbenefits
autowkplc
robwrk
robwrk2
autolkly
robimpacta
robimpactb
legalimg
futrclassa
futrclassb
futrclassc
harass1f1a
harass1f1b
harass1f1c
harass1f1d
harass1nowrkf2a
harass1nowrkf2c
harass1nowrkf2d
harass3f1
harass3nowrkf2
harass4
harass5
ethncmajmod
ethncmaj3
ethncmaj4
agemaj
intrmar
ssmoney
sscut
futr_abr
futr_div
futr_m
futr_k
solvproba
solvprobb
solvprobc
solvprobdf1
solvprobef2
solvprobf
solvprobg
solvprobh
solvprobi
metro
cregion
agecat
sex
educcat

Unnamed: 0,QKEY,NEW_Device_Type_W41,F_LANGUAGE,FORM_W41,OPTIMIST_W41,AVGFAM_W41,HAPPENa_W41,HAPPENb_W41,HAPPENc_W41,HAPPENd_W41,...,F_PARTY_FINAL,F_PARTYLN_FINAL,F_PARTYSUM_FINAL,F_INCOME,F_INCOME_RECODE,F_REG,F_IDEO,F_INTUSER,F_VOLSUM,WEIGHT_W41
0,100363.0,Mobile phone,English,Form 1,Somewhat optimistic,Get worse,The U.S. economy will be STRONGER,Health care will be MORE affordable,Race relations will IMPROVE,The U.S. will be MORE important in the world,...,Democrat,,Dem/Lean Dem,"$100,000 to less than $150,000","$75,000+",You are ABSOLUTELY CERTAIN that you are regist...,Liberal,Internet User,No,0.599006
1,101224.0,Mobile phone,English,Form 1,Somewhat optimistic,Get better,The U.S. economy will be STRONGER,Health care will be LESS affordable,Race relations will IMPROVE,The U.S. will be MORE important in the world,...,Republican,,Rep/Lean Rep,"$40,000 to less than $50,000","$30-$74,999",You are ABSOLUTELY CERTAIN that you are regist...,Conservative,Internet User,Yes,0.292981
2,101437.0,Desktop,English,Form 1,Somewhat pessimistic,Get worse,The U.S. economy will be WEAKER,Health care will be LESS affordable,Race relations will GET WORSE,The U.S. will be LESS important in the world,...,Republican,,Rep/Lean Rep,"$10,000 to less than $20,000","<$30,000",You are ABSOLUTELY CERTAIN that you are regist...,Conservative,Internet User,No,0.418871
3,102130.0,Mobile phone,English,Form 1,Somewhat optimistic,Stay about the same,The U.S. economy will be WEAKER,Health care will be LESS affordable,Race relations will IMPROVE,The U.S. will be LESS important in the world,...,Independent,Refused,DK/Refused/No lean,Refused,Don't know/Refused,You are ABSOLUTELY CERTAIN that you are regist...,Moderate,Non Internet User,Yes,0.342058
4,103094.0,Mobile phone,English,Form 1,Somewhat optimistic,Stay about the same,Refused,Health care will be LESS affordable,Refused,The U.S. will be LESS important in the world,...,Something else,Refused,DK/Refused/No lean,"$20,000 to less than $30,000","<$30,000",You are NOT registered to vote at your current...,Liberal,Internet User,Yes,0.329465
