In [None]:
'''
This notebook contains functions to convert features from raw values to model-compatible features
'''

In [18]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import FunctionTransformer

In [None]:
'''
Feature categories:
[x] categorical_list_highest_num_not_true
[x] categorical_list_binary_1
[x] categorical_list_binary_0
[] categorical_no_idea
[x] numerical_list_highest_num_is_highest_value
[x] numerical_list_highest_num_is_lowest_value
'''

In [51]:
# Read the tab-separated cm interview file into a dataframe.
raw_data = pd.read_csv(
    '../../raw_data/sweep_6/UKDA-8156-tab/tab/mcs6_cm_interview.tab',
    sep='\t',
)

In [6]:
# function to get code when only name is available
def get_variable_code(variable_label):
    '''
    Look up the variable code for a (partial) variable label.

    EXPECTS:
        variable_label: a string to search for in the 'Variable label' column
            of the data dictionary. It is matched as a literal substring, so
            punctuation such as '.', '?' or '(' is not interpreted as regex syntax.

    RETURNS:
        the 'Variable name' of the first dictionary row whose label contains
        variable_label, or the string 'VARIABLE NOT FOUND' if no row matches.
    '''
    # get the data dictionary (it is re-read on every call)
    data = pd.read_csv('../alex_tracking_system/dict_csv.csv')

    # regex=False: labels are plain text, not patterns.
    # na=False: rows with a missing label count as "no match" instead of
    # producing an NA mask that cannot be used for boolean indexing.
    is_match = data['Variable label'].str.contains(variable_label, regex=False, na=False)
    data_filtered = data[is_match]

    # explicit check rather than a bare except, so unrelated errors are not hidden
    if data_filtered.empty:
        return 'VARIABLE NOT FOUND'

    return data_filtered.iloc[0]['Variable name']

In [7]:
# function to replace negative values with modal
def replace_missing_values(data, variable_code):
    '''
    Replace negative values in one column with the column's most common
    non-negative value.

    EXPECTS:
        data: a cm interview dataframe with variable codes for columns and unprocessed values.
        variable_code: a string containing the variable code to replace.

    RETURNS:
        data: the same dataframe object, modified in place, where the specified
            column has had negative values replaced. If the column has no
            non-negative values to take a mode from, it is left unchanged.
    '''
    column = data[variable_code]
    is_missing = column < 0

    # The mode must be taken over valid values only: if a negative code were
    # the most frequent value, it would be picked as its own replacement and
    # nothing would be imputed.
    valid_counts = column[~is_missing].value_counts()

    # nothing to impute from (empty column, or only negative / NaN values)
    if valid_counts.empty:
        return data

    # value_counts sorts by frequency, so the first index entry is the mode
    modal_value = valid_counts.index[0]

    # replace negative values with modal
    data.loc[is_missing, variable_code] = modal_value

    return data


In [67]:
# categorical_list_highest_num_not_true
def categorical_list_highest_num_not_true(data, cols):
    '''
    Impute missing values, then rescale each column to [0, 1] with its order reversed.

    EXPECTS:
        data: a cm interview dataframe with variable codes for columns and unprocessed values.
        cols: a list of variable codes to be processed.
    RETURNS:
        output: a copy of the cm interview where the specified columns have been
            transformed. After imputation, the highest value in a column maps
            to 0 and the lowest to 1. The input dataframe is not modified.
    '''
    output = data.copy()

    for col in cols:
        # replace missing values
        output = replace_missing_values(output, col)

        # turn values negative so that order is reversed
        reversed_values = output[[col]] * -1

        # apply minmax scaler (fitted per column)
        scaler = MinMaxScaler(feature_range=(0, 1))
        scaled = scaler.fit_transform(reversed_values)

        # Assign with output[col] so the column is replaced outright and can
        # change dtype from integer to float. Writing floats into the existing
        # integer column through .loc[:, col] is deprecated in recent pandas.
        # ravel() flattens the (n, 1) scaler output to one value per row.
        output[col] = scaled.ravel()

    return output


In [None]:
# categorical_list_binary_0
def categorical_list_binary_0(data, cols):
    '''
    Impute missing values in the given columns; values are otherwise left as they are.

    EXPECTS:
        data: a cm interview dataframe with variable codes for columns and unprocessed values.
        cols: a list of variable codes to be processed.
    RETURNS:
        processed: a copy of the cm interview where each listed column has been
            passed through replace_missing_values.
    '''
    # work on a copy so the caller's dataframe is left untouched
    processed = data.copy()

    # one imputation pass per requested variable code
    for code in cols:
        processed = replace_missing_values(processed, code)

    return processed

In [68]:
# categorical_list_binary_1
def categorical_list_binary_1(data, cols):
    '''
    Impute missing values, then reduce each column to a 1 / 0 indicator.

    EXPECTS:
        data: a cm interview dataframe with variable codes for columns and unprocessed values.
        cols: a list of variable codes to be processed.
    RETURNS:
        processed: a copy of the cm interview where, in each listed column,
            values equal to 1 are kept and every other value is set to 0.
    '''
    # work on a copy so the caller's dataframe is left untouched
    processed = data.copy()

    for code in cols:
        # imputation comes first, so missing entries take the modal value
        # before the column is collapsed
        processed = replace_missing_values(processed, code)

        # everything that is not exactly 1 becomes 0
        is_not_one = processed[code] != 1
        processed.loc[is_not_one, code] = 0

    return processed

In [None]:
# numerical_list_highest_num_is_highest_value
def numerical_list_highest_num_is_highest_value(data, cols):
    '''
    Impute missing values, then rescale each column to [0, 1] keeping its order.

    EXPECTS:
        data: a cm interview dataframe with variable codes for columns and unprocessed values.
        cols: a list of variable codes to be processed.
    RETURNS:
        output: a copy of the cm interview where the specified columns have been
            transformed. After imputation, the lowest value in a column maps
            to 0 and the highest to 1. The input dataframe is not modified.
    '''
    output = data.copy()

    for col in cols:
        # replace missing values
        output = replace_missing_values(output, col)

        # apply minmax scaler (fitted per column)
        scaler = MinMaxScaler(feature_range=(0, 1))
        scaled = scaler.fit_transform(output[[col]])

        # Assign with output[col] so the column is replaced outright and can
        # change dtype from integer to float. Writing floats into the existing
        # integer column through .loc[:, col] is deprecated in recent pandas.
        # ravel() flattens the (n, 1) scaler output to one value per row.
        output[col] = scaled.ravel()

    return output

In [None]:
# numerical_list_highest_num_is_lowest_value
def numerical_list_highest_num_is_lowest_value(data, cols):
    '''
    Impute missing values, then rescale each column to [0, 1] with its order reversed.

    EXPECTS:
        data: a cm interview dataframe with variable codes for columns and unprocessed values.
        cols: a list of variable codes to be processed.
    RETURNS:
        output: a copy of the cm interview where the specified columns have been
            transformed. After imputation, the highest value in a column maps
            to 0 and the lowest to 1. The input dataframe is not modified.
    '''
    output = data.copy()

    for col in cols:
        # replace missing values
        output = replace_missing_values(output, col)

        # turn values negative so that order is reversed
        reversed_values = output[[col]] * -1

        # apply minmax scaler (fitted per column)
        scaler = MinMaxScaler(feature_range=(0, 1))
        scaled = scaler.fit_transform(reversed_values)

        # Assign with output[col] so the column is replaced outright and can
        # change dtype from integer to float. Writing floats into the existing
        # integer column through .loc[:, col] is deprecated in recent pandas.
        # ravel() flattens the (n, 1) scaler output to one value per row.
        output[col] = scaled.ravel()

    return output

In [None]:
# Dispatch table from feature-category name to the function that processes
# columns of that category. Each key is the name of the function it maps to.
function_dict = {
    processor.__name__: processor
    for processor in (
        categorical_list_highest_num_not_true,
        categorical_list_binary_1,
        categorical_list_binary_0,
        numerical_list_highest_num_is_highest_value,
        numerical_list_highest_num_is_lowest_value,
    )
}

In [77]:
'''
For the processor to work, it needs a dict where the keys are the feature
categories, and the values are lists of features within that category.
'''
def feature_processor(data, var_cats):
    '''
    Apply the matching processing function to every feature category.

    EXPECTS:
        data: a cm interview dataframe with variable codes for columns and unprocessed values.
        var_cats: a dict whose keys are the feature categories and values are
            lists of feature names (variable labels) within each category.
    RETURNS:
        output: a copy of the cm interview where the features have been transformed.
    RAISES:
        KeyError: if a category has no entry in function_dict, or if a feature
            name cannot be resolved to a variable code.
    '''
    # Resolve every category and label before transforming anything, so bad
    # input fails early with a message naming the offending entry.
    planned_steps = []

    # for each feature category
    for cat in var_cats:
        if cat not in function_dict:
            raise KeyError(
                f"Unknown feature category '{cat}'; expected one of {sorted(function_dict)}"
            )

        # select all features in that category
        features = var_cats[cat]

        # convert variable labels into codes
        var_codes = [get_variable_code(feature) for feature in features]

        # get_variable_code signals a failed lookup with this sentinel string;
        # passing it on would only fail later as a missing column
        not_found = [
            feature
            for feature, code in zip(features, var_codes)
            if code == 'VARIABLE NOT FOUND'
        ]
        if not_found:
            raise KeyError(
                f"No variable code found for feature(s) in category '{cat}': {not_found}"
            )

        planned_steps.append((function_dict[cat], var_codes))

    output = data.copy()

    # pass each category's codes into the appropriate function
    for process, var_codes in planned_steps:
        output = process(output, var_codes)

    return output

### Turn into a pipeline object

In [None]:
# FeatureProcessor = FunctionTransformer()

### TESTING

In [None]:
# categorical_list_binary_1_list = [
#     'Has CM ever been attracted to a male'
# ]

# cols = [get_variable_code(var) for var in categorical_list_binary_1_list]

# data_test = categorical_list_binary_1(raw_data, cols)

# print(f"BEFORE: {raw_data[cols[0]].value_counts()}")
# print(f"AFTER: {data_test[cols[0]].value_counts()}")


In [None]:
# categorical_list_highest_num_not_true_list = [
#     'SocSupGrid: I have family and friends who help me feel safe, secure and happy.',
#     'SocSupGrid: There is someone I trust whom I would turn to if I had problems'
# ]

# cols = [get_variable_code(var) for var in categorical_list_highest_num_not_true_list]

# data_test = categorical_list_highest_num_not_true(raw_data, cols)

In [None]:
# print(f"starting values: {raw_data[get_variable_code('SocSupGrid: I have family and friends who help me feel safe, secure and happy.')].value_counts()}")
# print(f"processed values: {data_test[get_variable_code('SocSupGrid: I have family and friends who help me feel safe, secure and happy.')].value_counts()}")

In [10]:
## this is for binary features where 0 = true and 1 = false (done accidentally)
# categorical_list_highest_num_not_true
# def categorical_list_highest_num_not_true(data, cols):
#     '''
#     EXPECTS:
#         data: a cm interview dataframe with variable codes for columns and unprocessed values.
#         cols: a list of variable codes to be processed.
#     RETURNS:
#         output: a cm interview where the specified columns have be transformed.
#     '''

#     for col in cols:
#         # replace missing values
#         data = replace_missing_values(data, col)

#         data_s = data[col]

#         print(data_s.value_counts())

#         # check that there are only two values
#         assert len(data_s.value_counts()) == 2

#         # define max and min values to replace
#         max_value = max(data_s.values)
#         min_value = min(data_s.values)

#         # first 'reserve' values so they aren't overridden in the next step
#         data.loc[data[col] == max_value, col] = 1000000

#         # then replace min values with 1
#         data.loc[data[col] == min_value, col] = 1

#         # finally replace reserved values with 0
#         data.loc[data[col] == 1000000, col] = 0

#     return data

In [None]:
# data_dict = pd.read_csv('../alex_tracking_system/dict_csv.csv')
# data_dict.columns