In [12]:
import numpy as np
import pandas as pd
import string
# Widen pandas' display limits so wide frames and long text cells are shown in full.
pd.set_option('display.max_rows', 200)
pd.set_option('display.max_columns', 400)
pd.set_option('display.max_colwidth', 5000)

# Base name of the tab-delimited export; the output files written further down
# are named after it as well.
file_name = "Tab.delimited.Cleaned.dataset.WITH.variable.labels"

# Parse the file once. df1 used to be produced by reading the very same file a
# second time; an independent (deep) copy holds the same data without the
# second parse.
df = pd.read_table(r"./ML1/" + file_name + ".csv", sep='\t', encoding='ISO-8859-1', low_memory=False)
df1 = df.copy()

# Columns whose unique() has exactly two entries while the column contains
# nulls, i.e. one distinct non-missing value plus missing entries.
single_valued_col_with_missing = [
    name
    for name in list(df)
    if len(df[name].unique()) == 2 and any(df[name].isnull())
]

# Columns that preprocess_cols casts to float and that the one-hot encoding
# loop skips. The fixed names come first, followed by the numbered families
# (prefix + running number + optional suffix) in the order listed below.
regression_cols = [
    'Ranch1', 'Ranch2', 'Ranch3', 'Ranch4', 'age',
    'numparticipants', 'numparticipants_actual', 'sunkDV', 'anchoring1', 'anchoring2',
    'anchoring3', 'anchoring4', 'gambfalDV', 'quotearec', 'quotebrec', 'quote',
    'totalflagestimations', 'totalnoflagtimeestimations', 'flagdv', 'Sysjust', 'Imagineddv',
    'IATexpart', 'IATexpmath', 'IATexp.overall', 'artwarm', 'd_donotuse', 'gamblerfallacya',
    'gamblerfallacyb', 'mathwarm', 'moneyagea', 'sample', 'citizenship',
] + [
    stem + str(number) + tail
    for stem, tail, count in (
        ('anchoring', 'a', 4),
        ('anchoring', 'b', 4),
        ('flagdv', '', 8),
        ('iatexplicitmath', '', 6),
        ('iatexplicitart', '', 6),
        ('imaginedexplicit', '', 4),
    )
    for number in range(1, count + 1)
]

In [13]:
def preprocess_cols(df):
    """Clean the raw survey frame, encode it numerically and save it as CSV.

    The steps run in this order:

    1. Strings made up only of punctuation, spaces and the letters n/u/l
       become NaN; every other string is stripped of surrounding whitespace.
    2. Columns left with a single unique value are marked for removal.
    3. Known categorical columns are recoded through hand-written lookup
       tables.
    4. "Strongly agree" / "Strongly disagree" are replaced by "7" / "1" in the
       whole frame; ``sysjust1..8`` and every column named in the module-level
       ``regression_cols`` are cast to float.
    5. Non-numeric columns with more than 35 distinct values are marked for
       removal.
    6. Every column that is neither in ``regression_cols`` nor in the explicit
       exclusion list is label-encoded (0, 1, 2, ... in order of first
       appearance; nulls become NaN).
    7. The marked columns and a fixed list of bookkeeping columns are dropped
       and the result is written to ``ML1/<file_name>.preprocessed1.csv``.

    Relies on the module-level names ``regression_cols`` and ``file_name``.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame. It is copied first, so the caller's object is not modified.

    Returns
    -------
    pandas.DataFrame
        The cleaned, fully numeric frame (also written to disk).

    Raises
    ------
    KeyError
        If a strictly recoded column holds a value missing from its lookup
        table, if an expected column is absent, or if a column scheduled for
        removal does not exist.
    ValueError
        If a value in ``artwarm``, ``sysjust1..8`` or a regression column
        cannot be converted to a number.
    """
    # Work on a copy so a failure half-way through does not leave the caller's
    # frame partially recoded.
    df = df.copy()

    # Characters that carry no information on their own. Note that this is a
    # character set, not the word "null": any string built only from these
    # characters (including the empty string) is treated as missing.
    junk_chars = frozenset(string.punctuation + ' ' + 'null')

    def check_replace(s):
        """Return NaN for junk-only strings, the stripped string otherwise."""
        if not isinstance(s, str):
            return s
        if all(ch in junk_chars for ch in s):
            return np.nan
        return s.strip()

    strict = object()     # sentinel: unknown values raise KeyError
    keep_null = object()  # sentinel: null values are passed through unchanged

    def recode(series, mapping, other=strict, null=keep_null):
        """Translate non-null values of ``series`` through ``mapping``.

        Values absent from ``mapping`` raise KeyError unless ``other`` is
        given, in which case they become ``other``. Nulls are returned as they
        are unless ``null`` is given.
        """
        def convert(value):
            if pd.isnull(value):
                return value if null is keep_null else null
            if other is strict:
                return mapping[value]
            return mapping.get(value, other)
        return series.apply(convert)

    def to_float(series):
        """Cast every non-null value to float; nulls become NaN."""
        return series.apply(lambda v: np.nan if pd.isnull(v) else float(v))

    def label_encode(series):
        """Number distinct non-null values 0, 1, 2, ... by first appearance."""
        codes = {}

        def encode(value):
            if pd.isnull(value):
                return np.nan
            # len(codes) is evaluated before insertion, so it is the next free code.
            return codes.setdefault(value, len(codes))
        return series.apply(encode)

    # ------------------------------------------------------------------
    # 1. Replace any useless values with NaN
    # ------------------------------------------------------------------
    for col in list(df):
        df[col] = df[col].apply(check_replace)

    # ------------------------------------------------------------------
    # 2. Single valued columns (dropped at the end)
    # ------------------------------------------------------------------
    single_value_cols = [col for col in list(df) if df[col].unique().shape[0] == 1]

    # ------------------------------------------------------------------
    # 3. Preprocessing of specific columns
    # ------------------------------------------------------------------
    race_dict = {'White': 'White',
                 'Black or African American': 'Black or African American',
                 'East Asian': 'East Asian',
                 'Other or Unknown': 'Other or Unknown',
                 'South Asian': 'South Asian',
                 'More than one race - Other': 'More than one race - Other',
                 'turk': 'turk',
                 'chinese': 'East Asian',
                 'Nederlands': 'Other or Unknown',
                 'More than one race - Black/White': 'More than one race - Other',
                 'brazilwhite': 'Other or Unknown',
                 'brazilbrown': 'Other or Unknown',
                 'American Indian/Alaska Native': 'Other or Unknown',
                 'Native Hawaiian or other Pacific Islander': 'Other or Unknown',
                 'brazilblack': 'Other or Unknown',
                 'brazilyellow': 'Other or Unknown',
                 'indian': 'South Asian',
                 'malay': 'Other or Unknown',
                 'Nederlandse': 'Other or Unknown',
                 'nederlands': 'Other or Unknown',
                 'Belgisch Nederlands': 'Other or Unknown',
                 'Marokkaans Nederlands': 'Other or Unknown',
                 'brazilindigenous': 'Other or Unknown',
                 'italiaans nederlands': 'Other or Unknown',
                 'Turks Nederlands': 'Other or Unknown',
                 'duits': 'Other or Unknown',
                 'Russian': 'Other or Unknown',
                 'nl': 'Other or Unknown',
                 'Duits': 'Other or Unknown'}
    df['race'] = recode(df['race'], race_dict)

    scalesa_dict = {'Up to a half hour': 1,
                    'Half an hour to an hour': 2,
                    'One to one and a half hours': 3,
                    'One and a half to two hours': 4,
                    'Two to two and a half hours': 5,
                    'More than two and a half hours': 6}
    df['scalesa'] = recode(df['scalesa'], scalesa_dict)

    scalesb_dict = {'Up to two and a half hours': 1,
                    'Two and a half to three hours': 2,
                    'Three to three and a half hours': 3,
                    'Three and a half to four hours': 4,
                    'Four to four and a half hours': 5,
                    'More than four and a half hours': 6}
    df['scalesb'] = recode(df['scalesb'], scalesb_dict)

    noflagtimeestimate_dict = {'Morning': 1,
                               'Afternoon': 2,
                               'Evening': 3}
    for item in range(1, 5):
        col = 'noflagtimeestimate' + str(item)
        df[col] = recode(df[col], noflagtimeestimate_dict)

    # Known feedback sentences get the codes below; any other non-null text
    # (e.g. feedback in another language) is coded 10.
    text_dict = {
        'Your data suggest a strong preference for Arts compared to Mathematics.': 5,
        'Your data suggest a moderate preference for Arts compared to Mathematics.': 4,
        'Your data suggest little or no preference for Mathematics compared Arts.': 6,
        'Your data suggest a slight preference for Arts compared to Mathematics.': 3,
        'Your data suggest a slight preference for Mathematics compared to Arts.': 7,
        'There were too many errors made to determine a result.': 0,
        'Your data suggest a moderate preference for Mathematics compared to Arts.': 8,
        'There were too many fast trials to determine a result.': 1,
        'Your data suggest a strong preference for Mathematics compared to Arts.': 9
    }
    df['text'] = recode(df['text'], text_dict, other=10)

    # Since we expect that we won't enter this column if we don't get any error, 'nan' means that we actually
    # didn't get any error. Hence the column is turned into a 0/1 indicator.
    df['imptaskto'] = df['imptaskto'].apply(lambda x: 0 if pd.isnull(x) else 1)

    # The explicit art/math items share one layout per family; only the
    # adjective in the verbal labels changes from item to item.
    adjectives = ['bad', 'Sad', 'Ugly', 'Disgusting', 'Avoid', 'Afraid']
    for item, word in enumerate(adjectives, start=1):
        art_scale = {'Very ' + word: 2,
                     'Moderately ' + word: 3,
                     '4': 4, '5': 5, '6': 6, '7': 7}
        col = 'iatexplicitart' + str(item)
        df[col] = recode(df[col], art_scale, null=np.nan)
    for item, word in enumerate(adjectives, start=1):
        math_scale = {'Very ' + word: 1,
                      'Moderately ' + word: 2,
                      'Slightly ' + word: 3,
                      '4': 4, '5': 5, '6': 6}
        col = 'iatexplicitmath' + str(item)
        df[col] = recode(df[col], math_scale, null=np.nan)

    exprunafter2_dict = {'Your past and your future': 1,
                         'Thinking and Reasoning': 2,
                         'trust game': 3,
                         'A study on intentionally. Takes 5 minutes to complete. Read a scenario and answer questions about the intentions of the actor.': 4,
                         'GROUPS': 5,
                         'Groups': 5,
                         'thinking and reasoning': 6,
                         'Emotion and verbal working memory span task': 7,
                         'Emotion and Verbal Working Memory': 8,
                         'Other': 9}
    df['exprunafter2'] = recode(df['exprunafter2'], exprunafter2_dict,
                                other=exprunafter2_dict['Other'], null=np.nan)

    exprace_dict = {'1': 1,
                    '2': 2,
                    '3': 3,
                    'dutch': 4,
                    '5': 5,
                    '6': 6,
                    '7': 7,
                    '8': 8,
                    '9': 9,
                    '10': 10,
                    'malay': 11,
                    'chinese': 12,
                    'brazilwhite': 13,
                    'brazilblack': 14,
                    'brazilbrown': 15}
    df['exprace'] = recode(df['exprace'], exprace_dict)

    # Sites are numbered 1..36 in the order listed.
    sample_names = ['abington', 'brasilia', 'charles', 'conncoll', 'csun', 'help', 'ithaca', 'jmu',
                    'ku', 'laurier', 'lse', 'luc', 'mcdaniel', 'msvu', 'mturk', 'osu', 'oxy', 'pi',
                    'psu', 'qccuny', 'qccuny2', 'sdsu', 'swps', 'swpson', 'tamu', 'tamuc', 'tamuon',
                    'tilburg', 'ufl', 'unipd', 'uva', 'vcu', 'wisc', 'wku', 'wl', 'wpi']
    sample_dict = {name: code for code, name in enumerate(sample_names, start=1)}
    df['sample'] = recode(df['sample'], sample_dict)

    # The listed country codes are numbered 1..26 in order; every other
    # non-null code is lumped into 27.
    citizenship_codes = ['US', 'CN', 'IN', 'UA', 'PR', 'HT', 'RU',
                         'JP', 'GW', 'MO', 'BR', 'AO', 'CZ', 'SK',
                         'EC', 'CO', 'FR', 'KZ', 'KR', 'KW', 'CR',
                         'IQ', 'MY', 'MV', 'BN', 'SG']
    citizenship_dict = {code: number for number, code in enumerate(citizenship_codes, start=1)}
    df['citizenship'] = recode(df['citizenship'], citizenship_dict, other=27)

    flagsupplement1_dict = {str(level): level for level in range(2, 11)}
    flagsupplement1_dict.update({'Not at all': 1, 'Very much': 11})
    df['flagsupplement1'] = recode(df['flagsupplement1'], flagsupplement1_dict)

    flagsupplement2_dict = {str(level): level for level in range(2, 7)}
    flagsupplement2_dict.update({'Democrat': 1, 'Republican': 7})
    df['flagsupplement2'] = recode(df['flagsupplement2'], flagsupplement2_dict)

    flagsupplement3_dict = {str(level): level for level in range(2, 7)}
    flagsupplement3_dict.update({'Liberal': 1, 'Conservative': 7})
    df['flagsupplement3'] = recode(df['flagsupplement3'], flagsupplement3_dict)

    # artwarm holds numbers stored as text. The former hand-typed lookup table
    # mapped '69' -> 62, '31' -> 33 and '17' -> 77, silently corrupting those
    # values; a plain integer conversion cannot mistype anything.
    df['artwarm'] = df['artwarm'].apply(lambda x: x if pd.isnull(x) else int(x))

    # ------------------------------------------------------------------
    # 4. Likert endpoint labels -> digits, then numeric casts
    # ------------------------------------------------------------------
    # NOTE(review): the previous code walked column positions 155-162
    # (presumably the sysjust items - confirm) and meant to swap the endpoints
    # for positions 157 and 161. Every call was a frame-wide replace, though,
    # so the first iteration consumed all labels and the swapped branch never
    # had any effect. That effective behaviour is kept here as one pass. Real
    # reverse-scoring would have to flip all seven levels of the affected
    # columns, not just the two labelled endpoints - TODO confirm whether it
    # is wanted.
    df = df.replace("Strongly agree", str(7))
    df = df.replace("Strongly disagree", str(1))

    for col in ['sysjust' + str(item) for item in range(1, 9)]:
        df[col] = to_float(df[col])

    # Making sure all the regression columns have numeric values, not numbers in string format
    for col in regression_cols:
        df[col] = to_float(df[col])

    # ------------------------------------------------------------------
    # 5. Free-text ("nlp") columns: non-numeric with more than 35 unique values
    # ------------------------------------------------------------------
    nlp_cols = []
    for col in list(df):
        if df[col].dtype == 'float64' or df[col].dtype == 'int64':
            continue
        if len(df[col].unique()) > 35:
            nlp_cols.append(col)

    # ------------------------------------------------------------------
    # 6. Label-encode everything that has not been given numeric codes above
    # ------------------------------------------------------------------
    exclude_cols = ['scalesa', 'scalesb', 'noflagtimeestimate1', 'noflagtimeestimate2', 'noflagtimeestimate3',
                    'noflagtimeestimate4', 'text', 'exprunafter2', 'imptaskto', 'iatexplicitart1',
                    'iatexplicitart2', 'iatexplicitart3', 'iatexplicitart4', 'iatexplicitmath1',
                    'iatexplicitmath2', 'iatexplicitmath3', 'iatexplicitmath4', 'artwarm', 'flagsupplement1',
                    'flagsupplement2', 'flagsupplement3', 'citizenship', 'sample', 'exprace']
    exclude_cols.extend(["sysjust" + str(item) for item in range(1, 9)])

    already_numeric = set(exclude_cols) | set(regression_cols)
    for col in list(df):
        if col in already_numeric:
            continue
        # Each column gets its own code table (no shared/global state).
        df[col] = label_encode(df[col])

    # ------------------------------------------------------------------
    # 7. Drop bookkeeping, single-valued and free-text columns; save
    # ------------------------------------------------------------------
    remove_cols = ['user_id', 'previous_session_id', 'previous_session_schema', 'user_agent', 'citizenship2',
                   'mturk.non.US', 'mturk.exclude', 'session_id', 'session_date', 'last_update_date',
                   'session_last_update_date', 'session_creation_date', 'expcomments', 'Ranchori', 'RAN001',
                   'RAN002', 'RAN003', 'feedback', 'imagineddescribe']
    remove_cols.extend(["task_url." + str(item) for item in range(46)])
    remove_cols.extend(["task_creation_date." + str(item) for item in range(46)])

    # temporarily removed columns, can be considered later
    remove_cols.extend(['religion'])
    remove_cols.extend(["task_id." + str(item) for item in range(46)])

    remove_cols.extend(single_value_cols)
    remove_cols.extend(nlp_cols)

    # De-duplicate: a column may be listed for more than one reason.
    remove_cols = list(set(remove_cols))
    df = df.drop(remove_cols, axis=1)

    # Saving the modified dataframe
    df.to_csv(r"ML1/" + file_name + ".preprocessed1" + ".csv", sep=',', encoding='utf-8', index=False)
    return df

In [3]:
def one_hot_for_all(df, index):
    """Replace the column at position ``index`` with one-hot bit strings.

    The distinct non-null values of the column are converted to float and
    sorted; a value at rank ``k`` among ``n`` levels becomes a string of ``n``
    characters that is ``'1'`` at offset ``k`` and ``'0'`` elsewhere (for
    example ``'010'``). Null entries become NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to encode; the column is replaced on this object.
    index : int
        Zero-based position of the column to encode.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object. If a value of the column cannot be converted
        to float, a message is printed and ``df`` is returned unchanged
        (previously ``None`` was returned, which wiped out the caller's frame
        in the usual ``df = one_hot_for_all(df, i)`` pattern).
    """
    column = df.iloc[:, index]
    observed = [value for value in column.unique() if not pd.isnull(value)]

    try:
        # Convert before sorting so numbers stored as text order numerically.
        levels = sorted({float(value) for value in observed})
    except ValueError:
        print("ValueError at ", index)
        return df

    slot = {level: position for position, level in enumerate(levels)}
    width = len(levels)

    def bits(value):
        """Bit string for one cell, or NaN for a missing cell."""
        if pd.isnull(value):
            return np.nan
        # Look up by float so the key type matches the table built above.
        position = slot[float(value)]
        return '0' * position + '1' + '0' * (width - position - 1)

    encoded = [bits(value) for value in column.tolist()]

    # Assign a plain list so rows are matched by position, whatever the
    # frame's index looks like. isetitem (pandas >= 1.5) swaps in a new column
    # instead of writing strings into the existing numeric array.
    if hasattr(df, "isetitem"):
        df.isetitem(index, encoded)
    else:
        df.iloc[:, index] = encoded
    return df

In [14]:
# Run the cleaning/encoding pipeline; df is rebound to the frame it returns.
df = preprocess_cols(df)

In [15]:
# Write the preprocessed frame to ML1/<file_name>.preprocessed1.csv
# (preprocess_cols writes to this same path just before it returns).
df.to_csv(r"ML1/" + file_name + ".preprocessed1" + ".csv", sep=',', encoding='utf-8', index=False)

In [None]:
# Scratch look at pandas' built-in dummy encoding; dummy_na=True adds an
# indicator column for missing values. The original line lacked the comma
# between the two arguments and could not be parsed at all.
pd.get_dummies(df.iloc[:, :], dummy_na=True).head()

In [5]:
# One-hot encode every column except the regression columns.
# enumerate() yields each column's position directly, so there is no repeated
# list(df).index(col) search (which would also pick the first match when two
# columns share a name).
for position, col in enumerate(list(df)):
    if col in regression_cols:
        continue
    encoded = one_hot_for_all(df, position)
    # Never let a missing result replace the working frame.
    if encoded is not None:
        df = encoded

In [6]:
# Write the one-hot encoded frame to ML1/<file_name>.preprocessed_hot.csv.
df.to_csv(r"ML1/" + file_name + ".preprocessed_hot" + ".csv", sep=',', encoding='utf-8', index=False)