In [2]:
## Import libraries
import pandas as pd
import numpy as np
import regex as re

## Lift pandas' display limits: setting both options to None removes the cap
## on the number of rows shown and on the width of each column's contents
for _display_option in ("display.max_rows", "display.max_colwidth"):
    pd.set_option(_display_option, None)

In [3]:
## Locations of the three input tables (relative to the notebook folder)
SOURCE_DATA_CSV = "../data/Data_240822_adj.csv"
UNDERSAMPLED_CSV = "../data/two_outcomes/undersampled_dataset.csv"
DROPOUTS_CSV = "../data/three_outcomes/whole_dataset_dropouts.csv"

## Load the tables; `df` is the frame later cells select extra columns from,
## `df_under` and `df_dropouts` are the frames it gets merged with
df = pd.read_csv(SOURCE_DATA_CSV)
df_under = pd.read_csv(UNDERSAMPLED_CSV)
df_dropouts = pd.read_csv(DROPOUTS_CSV)

In [3]:
## Build the dataset restricted to rows with outcome == 2 and save it
## NOTE(review): this cell was described as "containing CSSA5", but no CSSA5
## column is selected from `df` here (the undersampled cell adds "CSSA5.TOTAL"
## explicitly) -- confirm whether the description or the code is out of date.

## Keep only the rows whose outcome code equals 2
df_dropouts = df_dropouts[df_dropouts["outcome"].eq(2)].copy()

## Every column except the outcome label acts as a merge key
features_dropouts = df_dropouts.columns.drop("outcome").tolist()

## Readable labels applied to the columns before the file is written
dropouts_readable_names = {
    "ASI_Drugs": "ASI-6 Drug-related problems score",
    "ASI_Psychiatric": "ASI-6 Psychiatric-related problems score",
    "D_AlcDrugsNumberTreatments": "ASI-6 Number of drug/alcohol related treatments underwent",
    "D_NicotineRegAbuseRatio": "ASI-6 Ratio between age and years of regular use of nicotine",
    "D_DrugEverCrave": "ASI-6 Drug-related craving symptoms last 30 days before treatment",
    "T_CTQ": "CTQ Total score",
    "CSSA1.TOTAL": "CSSA Score first week of treatment",
    "D_Crack_30Days": "ASI-6 Days consuming crack the last month before detoxification",
    "ASI_FamilySocialProblem": "ASI-6 Family and social related problems score",
    "T_EmotionalAbuse_CTQ": "CTQ Emotional abuse subscore",
    "T_EmotinalNeglect_CTQ": "CTQ Emotional neglect subscore",
    "D_CrackCocaine_30Days": "ASI-6 Days consuming crack or cocaine the last month before detoxification",
    "D_DrugUse_30Days": "ASI-6 Days consuming drugs in the last month before detoxification",
    "E_ControlledEnvironment_30Days": "ASI-6 Anytime in controlled environment last 30 days before treatment",
    "D_CannabisRegAbuseRatio": "ASI-6 Ratio between age and years of regular use of cannabis",
    "T_SexualAbuse_CTQ": "CTQ Sexual abuse subscore",
}

## Inner-join the key columns of `df` with the filtered rows on all of those
## keys, then swap in the readable column labels
merged_dropouts = (
    df[features_dropouts]
    .merge(df_dropouts, on=features_dropouts, how="inner")
    .rename(columns=dropouts_readable_names)
)

## Write the result without the index column
merged_dropouts.to_csv(
    "../data/two_outcomes/only_dropouts_dataset_all_variables.csv",
    index=False,
)

In [4]:
## Build the undersampled dataset extended with the "CSSA5.TOTAL" column and save it

## Every column of the undersampled table except the outcome label is a merge key
features_under = df_under.columns.drop("outcome").tolist()

## Columns taken from `df`: the merge keys plus the extra CSSA5 column
features_under_cssa5 = [*features_under, "CSSA5.TOTAL"]

## Readable labels applied to the columns before the file is written
under_readable_names = {
    "ASI_Drugs": "ASI-6 Drug-related problems score",
    "ASI_Psychiatric": "ASI-6 Psychiatric-related problems score",
    "D_AlcDrugsNumberTreatments": "ASI-6 Number of drug/alcohol related treatments underwent",
    "D_NicotineRegAbuseRatio": "ASI-6 Ratio between age and years of regular use of nicotine",
    "D_DrugEverCrave": "ASI-6 Drug-related craving symptoms last 30 days before treatment",
    "T_CTQ": "CTQ Total score",
    "CSSA1.TOTAL": "CSSA Score first week of treatment",
    "D_Crack_30Days": "ASI-6 Days consuming crack the last month before detoxification",
    "ASI_FamilySocialProblem": "ASI-6 Family and social related problems score",
    "T_EmotionalAbuse_CTQ": "CTQ Emotional abuse subscore",
    "T_EmotinalNeglect_CTQ": "CTQ Emotional neglect subscore",
    "D_CrackCocaine_30Days": "ASI-6 Days consuming crack or cocaine the last month before detoxification",
    "D_DrugUse_30Days": "ASI-6 Days consuming drugs in the last month before detoxification",
    "E_ControlledEnvironment_30Days": "ASI-6 Anytime in controlled environment last 30 days before treatment",
    "D_CannabisRegAbuseRatio": "ASI-6 Ratio between age and years of regular use of cannabis",
    "T_SexualAbuse_CTQ": "CTQ Sexual abuse subscore",
}

## Inner-join on the shared feature columns so each matched row gains
## "CSSA5.TOTAL" from `df` and "outcome" from `df_under`, then relabel
merged_under = (
    df[features_under_cssa5]
    .merge(df_under, on=features_under, how="inner")
    .rename(columns=under_readable_names)
)

## Write the result without the index column
merged_under.to_csv(
    "../data/two_outcomes/undersampled_dataset_all_variables_including_CSSA5.csv",
    index=False,
)

In [5]:
## Features that made it into the best logistic regression models (plus the outcome label)
list_old_feature_names = [
    "ASI_Drugs",
    "ASI_Psychiatric",
    "D_AlcDrugsNumberTreatments",
    "D_NicotineRegAbuseRatio",
    "D_DrugEverCrave",
    "T_CTQ",
    "CSSA1.TOTAL",
    "D_Crack_30Days",
    "ASI_FamilySocialProblem",
    "T_EmotionalAbuse_CTQ",
    "T_EmotinalNeglect_CTQ",
    "D_CrackCocaine_30Days",
    "D_DrugUse_30Days",
    "E_ControlledEnvironment_30Days",
    "D_CannabisRegAbuseRatio",
    "T_SexualAbuse_CTQ",
    "outcome",
]

## Features that made it into more than half (4/6) of the logistic regression
## models (plus the outcome label)
list_features_more_than_half = [
    "ASI_Drugs",
    "ASI_Psychiatric",
    "D_DrugEverCrave",
    "T_CTQ",
    "CSSA1.TOTAL",
    "outcome",
]

## Readable labels for the selected columns; "outcome" keeps its name
selected_readable_names = {
    "ASI_Drugs": "ASI-6 Drug-related problems score",
    "ASI_Psychiatric": "ASI-6 Psychiatric-related problems score",
    "D_AlcDrugsNumberTreatments": "ASI-6 Number of drug/alcohol related treatments underwent",
    "D_NicotineRegAbuseRatio": "ASI-6 Ratio between age and years of regular use of nicotine",
    "D_DrugEverCrave": "ASI-6 Drug-related craving symptoms last 30 days before treatment",
    "T_CTQ": "CTQ Total score",
    "CSSA1.TOTAL": "CSSA Score first week of treatment",
    "D_Crack_30Days": "ASI-6 Days consuming crack the last month before detoxification",
    "ASI_FamilySocialProblem": "ASI-6 Family and social related problems score",
    "T_EmotionalAbuse_CTQ": "CTQ Emotional abuse subscore",
    "T_EmotinalNeglect_CTQ": "CTQ Emotional neglect subscore",
    "D_CrackCocaine_30Days": "ASI-6 Days consuming crack or cocaine the last month before detoxification",
    "D_DrugUse_30Days": "ASI-6 Days consuming drugs in the last month before detoxification",
    "E_ControlledEnvironment_30Days": "ASI-6 Anytime in controlled environment last 30 days before treatment",
    "D_CannabisRegAbuseRatio": "ASI-6 Ratio between age and years of regular use of cannabis",
    "T_SexualAbuse_CTQ": "CTQ Sexual abuse subscore",
}

## Select each feature subset from the undersampled table and relabel it.
## `rename` returns a new frame, so `df_under` itself is left untouched.
df_under_only_selected_features = df_under[list_old_feature_names].rename(
    columns=selected_readable_names
)

## The same mapping serves the smaller subset: with its default errors="ignore",
## `rename` skips mapping keys that are not columns of the frame
df_under_only_selected_feature_more_than_half = df_under[list_features_more_than_half].rename(
    columns=selected_readable_names
)

## Save both frames without the index column
df_under_only_selected_features.to_csv(
    "../data/two_outcomes/undersampled_dataset_only_selected_model_features.csv",
    index=False,
)
df_under_only_selected_feature_more_than_half.to_csv(
    "../data/two_outcomes/undersampled_dataset_only_selected_features_in_more_than_half_of_models.csv",
    index=False,
)