In [None]:
'''
This notebook contains a function, add_smfq_label, that processes a raw dataframe
to add a column indicating depression based on the SMFQ scale.
'''

In [20]:
import pandas as pd
import numpy as np
import pickle
from sklearn.impute import SimpleImputer

In [None]:
def add_smfq_label(raw_data, threshold=12):
    '''This function processes a sweep 6 cm interview dataframe.
    It adds a column 'smfq_label' that indicates whether the YP shows signs
    of depression based on the answers to the Short Moods and Feelings
    Questionnaire (SMFQ).
    It also removes the SMFQ features once the label has been added.

    Processing steps:
        1. Answers coded -1, -8 or -9 (and NaN) are treated as missing and
           replaced with the most frequent valid answer (1, 2 or 3) of the
           same question; ties go to the smallest answer.
        2. Answers are scored 1 -> 0, 2 -> 1, 3 -> 2 and summed per YP.
        3. The label is 1 when the total score is >= threshold, otherwise 0.

    The index of raw_data is preserved, so the label stays attached to the
    right row even when raw_data does not have a default 0..n-1 index.
    raw_data itself is not modified.

    EXPECTS:
        raw_data: a dataframe with unprocessed values (e.g. -9, -8, -1, 1, 2, 3)
            and raw variable names (e.g. 'FCMDSA00')
        threshold: the total SMFQ score from which a YP is labelled 1
            (defaults to 12)

    RETURNS:
        A dataframe with the SMFQ features removed and an smfq_label column added.

    RAISES:
        KeyError: if an SMFQ column is absent, or an answer is neither a valid
            answer (1, 2, 3) nor a missing value.
        ValueError: if an SMFQ question has missing answers but no valid
            answer to impute them from.
    '''
    # define smfq variables
    smfq_variables = [
        'FCMDSA00',
        'FCMDSB00',
        'FCMDSC00',
        'FCMDSD00',
        'FCMDSE00',
        'FCMDSF00',
        'FCMDSG00',
        'FCMDSH00',
        'FCMDSI00',
        'FCMDSJ00',
        'FCMDSK00',
        'FCMDSL00',
        'FCMDSM00'
    ]

    # raw codes that are treated as missing data
    missing_codes = [-1, -8, -9]

    # raw answer code -> item score
    smfq_map = {
        1: 0,
        2: 1,
        3: 2
    }

    # filter data for the smfq questions
    smfq_questions = raw_data.loc[:, smfq_variables]

    # classify every answer as valid, missing or unexpected
    is_valid = smfq_questions.isin(list(smfq_map))
    is_missing = smfq_questions.isin(missing_codes) | smfq_questions.isna()
    is_unexpected = ~(is_valid | is_missing)

    if is_unexpected.to_numpy().any():
        unexpected_values = pd.unique(
            smfq_questions.to_numpy()[is_unexpected.to_numpy()]
        ).tolist()
        # KeyError matches what a failed smfq_map lookup would raise
        raise KeyError(
            'Unexpected SMFQ answer value(s): {}'.format(unexpected_values)
        )

    # replace missing data with the modal valid answer of each question;
    # the first row of mode() holds the smallest of the most frequent values
    valid_answers = smfq_questions.where(is_valid)
    modal_answers = valid_answers.mode().reindex([0]).iloc[0].dropna()
    smfq_questions_imputed = valid_answers.fillna(modal_answers)

    still_missing = smfq_questions_imputed.isna().any()
    if still_missing.any():
        raise ValueError(
            'No valid answers to impute from for SMFQ question(s): {}'.format(
                list(still_missing.index[still_missing])
            )
        )

    # total SMFQ score per YP (every answer is 1, 2 or 3 at this point)
    item_scores = smfq_questions_imputed.astype('int64').apply(
        lambda answers: answers.map(smfq_map)
    )
    smfq_final_score = item_scores.sum(axis=1)

    # depressed/not depressed label based on the threshold
    smfq_label = (smfq_final_score >= threshold).astype('int64')

    # create an output df that drops the smfq questions and adds the final label;
    # the label is assigned positionally so it cannot be misaligned by the index
    smfq_output_df = raw_data.drop(columns=smfq_variables)
    smfq_output_df['smfq_label'] = smfq_label.to_numpy()

    return smfq_output_df

### The code below is a breakdown of what's in the above function

In [11]:
# read the tab-separated sweep 6 cm interview file into a dataframe
interview_file = '../../raw_data/sweep_6/UKDA-8156-tab/tab/mcs6_cm_interview.tab'
raw_data = pd.read_csv(interview_file, sep='\t')

# preview the first rows
raw_data.head()

Unnamed: 0,MCSID,FCNUM00,FCCSEX00,FCCDBM00,FCCDBY00,FCCAGE00,FCVERSF0,FCINTROA,FCCHIC0A,FCCHIC0B,...,FCCARR02_TR3,FCCARR03_TR3,FCCARR04_TR3,FCCARR05_TR3,FCCARR06_TR3,FCCARR07_TR3,FCCARR08_TR3,FCCARR09_TR3,FCCARR10_TR3,FCCARR11_TR3
0,M10002P,1,1,3,2001,13,102,1,1,0,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
1,M10007U,1,1,10,2000,14,102,1,1,0,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
2,M10015U,1,2,5,2001,13,102,1,1,0,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
3,M10016V,1,1,11,2000,14,102,1,1,0,...,344,-1,-1,-1,-1,-1,-1,-1,-1,-1
4,M10018X,1,2,7,2001,13,102,1,1,0,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1


In [54]:
# the SMFQ question columns are named FCMDSA00 ... FCMDSM00 (letters A to M)
smfq_variables = ['FCMDS{}00'.format(letter) for letter in 'ABCDEFGHIJKLM']

In [56]:
# keep only the smfq question columns
smfq_questions = raw_data[smfq_variables]

In [37]:
# impute each missing-value code in turn, using the 'most_frequent' strategy
data_to_impute = smfq_questions.copy()

for missing_code in (-1, -8, -9):
    imputer = SimpleImputer(missing_values=missing_code, strategy='most_frequent')
    data_to_impute = pd.DataFrame(
        data=imputer.fit_transform(data_to_impute),
        columns=data_to_impute.columns,
    )

smfq_questions_imputed = data_to_impute.copy()

In [44]:
# score every answer (1 -> 0, 2 -> 1, 3 -> 2) and total the scores for each YP
smfq_map = {1: 0, 2: 1, 3: 2}

final_scores = [
    sum(smfq_map[answer] for answer in answers)
    for answers in smfq_questions_imputed.itertuples(index=False, name=None)
]

# new frame holding the imputed answers plus the total score column
smfq_questions_imputed_with_finals = smfq_questions_imputed.assign(
    smfq_final_score=final_scores
)

In [50]:
# label each YP: 0 when the total score is below the threshold of 12, otherwise 1
smfq_labels = [
    0 if score < 12 else 1
    for score in smfq_questions_imputed_with_finals['smfq_final_score']
]

# new frame holding the scores plus the depressed/not depressed label
smfq_questions_imputed_with_finals_and_threshold = smfq_questions_imputed_with_finals.assign(
    smfq_label=smfq_labels
)

In [59]:
# build the output df: the raw data without the smfq questions, plus the final label
smfq_output_df = (
    raw_data
    .drop(columns=smfq_variables)
    .assign(smfq_label=smfq_questions_imputed_with_finals_and_threshold['smfq_label'])
)