In [1]:
## Import libraries

import numpy as np
import pandas as pd
import regex as re
from imblearn.over_sampling import ADASYN
from imblearn.under_sampling import RandomUnderSampler
from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import normalize

## Show every row, and the full content of every cell, when displaying dataframes
pd.options.display.max_rows = None
pd.options.display.max_colwidth = None

In [2]:
def outcome_split(df, outcome, divider):
    '''
    Function: outcome_split

    Purpose: Replace a continuous outcome variable by a categorical "outcome"
    column obtained by comparing every value against a single cut-off.

    Input: Dataframe, outcome variable name, cut-off value (divider)

    Output: New dataframe in which the original outcome variable has been
    removed and an integer "outcome" column has been appended:
        0 -> value <= divider
        1 -> value >  divider
        2 -> value is missing
    The dataframe passed in is not modified.

    '''

    values = df[outcome]

    ## 1 above the cut-off, 0 otherwise (a missing value compares False, so it is 0 here)
    labels = (values > divider).astype(int)

    ## Missing values get their own class instead of silently joining class 0
    labels.loc[values.isna()] = 2

    ## Work on a copy so the caller's dataframe is left untouched
    result = df.copy()
    result['outcome'] = labels

    return result.drop(columns=outcome)

In [3]:
## Whole-cell answers (Portuguese and English spellings) that are recoded as 1 / 0
_YES_ANSWERS = ("sim", "Sim", "SIM", "s", "S", "yes", "Yes", "YES", "y", "Y")
_NO_ANSWERS = ("nao", "Nao", "NAO", "não", "Não", "NÃO", "n", "N", "No", "no", "NO")

## Column-name patterns to discard. They are applied with re.match, i.e. anchored
## at the start of the name.
## NOTE(review): ".*bks*" and ".*hrd*" match ANY name containing "bk" / "hr"
## (the trailing "s*" / "d*" allow zero occurrences) -- confirm this is intended
## rather than "contains bks" / "contains hrd".
_DROP_PATTERNS = (".*bks*", ".*hrd*", "ID_.*", "TMRAW.*", "RESP.*", "G05")


def _to_numeric_or_keep(column):
    ## Convert a column to a numeric dtype; hand it back unchanged if a value cannot be parsed
    try:
        return pd.to_numeric(column)
    except (ValueError, TypeError):
        return column


def clean_data(df, min_non_null=220):
    '''
    Function: clean_data

    Purpose: Transform yes and no answers into 1 and 0, remove the variables
    whose name matches one of the unwanted patterns, keep only numerical
    variables, drop variables with fewer than min_non_null non-missing entries
    and drop variables that take a single distinct value.

    Input: Dataframe with the data, minimum number of non-missing entries a
    variable needs in order to be kept (default 220)

    Output: Clean dataframe (the dataframe passed in is not modified)

    '''

    ## Replace yes or no answers for 1 and 0 (only cells equal to the whole answer)
    recode = {answer: 1 for answer in _YES_ANSWERS}
    recode.update({answer: 0 for answer in _NO_ANSWERS})
    df = df.replace(recode)

    ## Drop every column whose name matches at least one unwanted pattern
    patterns = [re.compile(pattern) for pattern in _DROP_PATTERNS]
    unwanted = [name for name in df.columns
                if any(pattern.match(name) for pattern in patterns)]
    df = df.drop(columns=unwanted)

    ## Convert what can be converted to numbers, then keep numerical variables only
    df = df.apply(_to_numeric_or_keep)
    df = df.select_dtypes(include=np.number)

    ## Drop variables with too few observed values
    df = df.dropna(thresh=min_non_null, axis=1)

    ## Drop variables with exactly one distinct (non-missing) value
    df = df.loc[:, df.nunique() != 1]

    print("\nThe clean dataset has", df.shape[0], "rows and", df.shape[1], "columns")

    return df

In [4]:
def feature_variance_finder(data, thresh):
    '''
    Function: feature_variance_finder

    Purpose: Removing variables below a variance threshold. The variance is
    measured on a normalized copy of the data; the values returned are the
    original, non-normalized ones.

    Input: Dataframe with the data and desired variance threshold

    Output: Dataframe without the low variance features

    '''

    ## normalize is called with its defaults which, per the scikit-learn
    ## documentation, rescale every ROW (sample) to unit L2 norm.
    ## NOTE(review): confirm row-wise scaling is what is wanted here rather
    ## than scaling each feature.
    data_scaled = pd.DataFrame(normalize(data))

    ## One variance per column, in the same order as data.columns
    variance = data_scaled.var()

    ## Names of the variables whose variance reaches the threshold
    variable = [name for name, value in zip(data.columns, variance) if value >= thresh]

    return data[variable]

In [5]:
## Open the data, keep the women only, and remove the problematic sleep problem
## variables, ID_continua.int and the "Unnamed: 0" column
df = (
    pd.read_csv("../data/Data_240822_adj.csv")
    .loc[lambda frame: frame["Gender"] == "woman"]
    .drop(columns=["P_SleepProblems_Life", "P_SleepProblemsLastTime", "ID_continua.int", "Unnamed: 0"])
)

In [6]:
## Create dataset for binary classification analysis:
##   1. keep the rows where the four key variables are all present
##   2. remove every column that still has a missing value
##   3. remove any row with missing data left
df_original = (
    df.dropna(subset=['T_CTQ', 'ASI_Drugs', 'CSSA5.TOTAL', 'CSSA1.TOTAL'])
    .dropna(axis=1)
    .dropna()
    .copy()
)

In [7]:
## Binarise the outcome, with CSSA5.TOTAL > 21 as the high withdrawal outcome
##   Outcome 0: CSSA5.TOTAL <= 21
##   Outcome 1: CSSA5.TOTAL > 21
df_original = outcome_split(df_original, outcome="CSSA5.TOTAL", divider=21)

In [8]:
## Create dataset including outcome for people who left treatment.
## CSSA5.TOTAL is NOT required here, so the outcome has to be computed before
## the columns with missing data are removed.
##   Outcome 0: CSSA5.TOTAL <= 21
##   Outcome 1: CSSA5.TOTAL > 21 (high withdrawal)
##   Outcome 2: CSSA5.TOTAL is missing (patient did not complete detox treatment)
df_dropouts = outcome_split(
    df.dropna(subset=['T_CTQ', 'ASI_Drugs', 'CSSA1.TOTAL']).copy(),
    "CSSA5.TOTAL",
    21,
)

## Remove any missing data left: first the incomplete columns, then the incomplete rows
df_dropouts = df_dropouts.dropna(axis=1).dropna()

In [9]:
## Recode answers as numbers, then eliminate the non-numerical variables and the
## variables with no variation (each call prints the resulting shape)
df_original = clean_data(df=df_original)

df_dropouts = clean_data(df=df_dropouts)


The clean dataset has 401 rows and 268 columns

The clean dataset has 525 rows and 262 columns


In [10]:
## Remove variables with very low variability
## NOTE(review): the "outcome" column is part of both dataframes, so it goes
## through the same filter -- confirm it always survives.
VARIANCE_THRESHOLD = 1e-7

df_original = feature_variance_finder(df_original, VARIANCE_THRESHOLD)

df_dropouts = feature_variance_finder(df_dropouts, VARIANCE_THRESHOLD)

In [11]:
## Save both datasets as CSV, without the index column
df_original.to_csv(path_or_buf="../data/two_outcomes/whole_dataset.csv", index=False)

df_dropouts.to_csv(path_or_buf="../data/three_outcomes/whole_dataset_dropouts.csv", index=False)

In [12]:
## Create undersampled dataset (RandomUnderSampler is imported in the import cell)

## Fixed random_state so the same rows are selected on every run
undersample = RandomUnderSampler(sampling_strategy='majority', random_state=27)

## Split predictors and outcome
X = df_original.drop(columns=['outcome'])
y = df_original["outcome"]

X_under, y_under = undersample.fit_resample(X, y)

## Put the resampled predictors and outcome back into a single dataframe
df_original_under = X_under.copy()
df_original_under["outcome"] = y_under

df_original_under.to_csv("../data/two_outcomes/undersampled_dataset.csv", index=False)