In [1]:
import pandas as pd
import os
import numpy as np
import re

In [2]:
# Survey waves to process; kept as strings because they are spliced into
# folder names and file names further down.
survey_years = [str(wave) for wave in (2013, 2015, 2017, 2019, 2022)]

# Working directory of the kernel; every input/output path is built from it.
current_directory = os.getcwd()

In [3]:
# Make sure '<cwd>/Patient Characteristics Survey/<year>' exists for every
# survey year; folders that already exist are left untouched.
for year in survey_years:
    folder_name = str(year)
    folder_path = os.path.join(
        current_directory, 'Patient Characteristics Survey', folder_name)
    os.makedirs(folder_path, exist_ok=True)
    

In [4]:
class UniqueColumnValues:
    """Collect the distinct values of every column of a DataFrame.

    ``unique_column_values`` builds a ``{column: sorted distinct values}``
    dictionary and caches it on the instance; ``similar_values`` reports the
    values that occur in more than one column of that dictionary.
    """

    def __init__(self, df):
        """Store the frame to inspect; nothing is computed yet.

        Args:
            df: pandas DataFrame whose columns are inspected.
        """
        self.df = df
        # Result of the most recent unique_column_values() call (None until then).
        self.unique_column_values_dict = None

    @staticmethod
    def _sorted_values(values):
        """Return ``values`` sorted, even when they are not mutually comparable."""
        try:
            return sorted(values)
        except TypeError:
            # Mixed types (e.g. strings next to NaN floats or numbers) cannot be
            # compared with '<'; order them by type name, then by their text.
            return sorted(values, key=lambda v: (type(v).__name__, str(v)))

    def unique_column_values(self):
        """Compute, cache and return the distinct values of every column.

        Returns:
            dict mapping each column name to a sorted list of its distinct
            values. The same dict is stored in ``unique_column_values_dict``.
        """
        unique_column_values = {
            column: self._sorted_values(self.df[column].unique())
            for column in self.df.columns
        }
        self.unique_column_values_dict = unique_column_values
        return unique_column_values

    def similar_values(self):
        """Return the values that appear in more than one column.

        Uses the cached dictionary; if ``unique_column_values`` has not been
        called yet it is computed first instead of failing on ``None``.

        Returns:
            list of values, each listed once, in the order in which their
            second occurrence is encountered.
        """
        if self.unique_column_values_dict is None:
            self.unique_column_values()

        similar_values = []
        seen = set()      # every value met so far
        reported = set()  # values already added to similar_values
        for unique_values in self.unique_column_values_dict.values():
            for value in unique_values:
                if value not in seen:
                    seen.add(value)
                elif value not in reported:
                    reported.add(value)
                    similar_values.append(value)
        return similar_values


In [5]:
class SetColumnValues:
    """Replace the labels of selected columns with integer codes.

    Codes are assigned per column, starting at 0, in the order the values are
    returned by ``unique_column_values_instance.unique_column_values()``.
    Values listed in ``similar_values`` are never encoded and stay as they are.
    """

    def __init__(self, df, unique_column_values_instance, similar_values):
        """Store the frame to encode and the helpers used to encode it.

        Args:
            df: pandas DataFrame that is modified in place by
                ``set_column_values``.
            unique_column_values_instance: object exposing
                ``unique_column_values()`` that returns
                ``{column: iterable of distinct values}``.
            similar_values: collection of values that must not be encoded.
        """
        self.df = df
        # {column: {original value: integer code}}
        self.previous_values = {}
        self.unique_column_values_instance = unique_column_values_instance
        self.similar_values = similar_values
        # {column: {original value: value after encoding}}, filled by
        # column_dictionaries().
        self.column_value_mappings = {}

    def set_column_values(self, level_column_names):
        """Encode the given columns of ``self.df`` in place.

        Args:
            level_column_names: iterable of column names to encode. Names that
                are not columns of ``self.df`` are skipped.
        """
        unique_column_values_dict = self.unique_column_values_instance.unique_column_values()
        for level_column in level_column_names:
            if level_column not in self.df.columns:
                # Nothing to encode; the lookup below already tolerates unknown
                # names, so do not fail on the frame access either.
                continue
            codes = self.previous_values.setdefault(level_column, {})
            for value in unique_column_values_dict.get(level_column, []):
                if value in self.similar_values or value in codes:
                    continue
                # Next free code for this column.
                codes[value] = len(codes)
            # Values without a code map to NaN; restore their original label.
            self.df[level_column] = (
                self.df[level_column].map(codes).fillna(self.df[level_column]))

    def column_dictionaries(self, df_old):
        """Build, store and return the original-value -> encoded-value mappings.

        Args:
            df_old: DataFrame holding the values as they were before encoding;
                it is read for every column of ``self.df``.

        Returns:
            ``self.column_value_mappings``: one dict per column of ``self.df``
            mapping each distinct original value to its code, or to itself
            when the value was not encoded.
        """
        for column_name in self.df.columns:
            codes = self.previous_values.get(column_name, {})
            old_unique_values = df_old[column_name].unique()
            self.column_value_mappings[column_name] = {
                old_value: codes.get(old_value, old_value)
                for old_value in old_unique_values
            }
        return self.column_value_mappings


In [6]:
# Clean, encode and export the Patient Characteristics Survey for every year.
#
# For each year in `survey_years` this cell:
#   1. reads the raw CSV from 'Raw Data',
#   2. harmonises column names and value labels across years,
#   3. adds one YES/NO indicator column per value of each split variable,
#   4. writes one CSV per program category (missing-value labels become NaN),
#   5. writes the unique values, the column names and the value -> code
#      mappings as text files,
#   6. writes the integer-encoded frame as 'modified_NYC_PCS<year>.csv'.

# Columns expanded into one YES/NO indicator column per distinct value.
split_variables = ['Program Category', 'Sex', 'Sexual Orientation', 'Race', 'Living Situation',
                   'Preferred Language', 'Employment Status', 'Principal Diagnosis Class',
                   'Additional Diagnosis Class', 'Education Status']

# Labels that are carried over unchanged into the indicator columns instead of 'NO'.
na_values = ['UNKNOWN', 'CLIENT DID NOT ANSWER', 'DATA NOT AVAILABLE']

# Labels converted to NaN before a frame is written to disk.
missing_value_labels = ['NOT APPLICABLE', 'UNKNOWN', 'DATA NOT AVAILABLE',
                        'CLIENT DID NOT ANSWER', "CLIENT DIDN'T ANSWER",
                        'UNKNOWN EMPLOYMENT HOURS']

# Values that occur in several columns but must still receive an integer code,
# so they are taken out of `similar_values` before encoding. Kept as a list:
# the False/True entries also match the numbers 0 and 1 through equality.
remove_from_similar_values = ['OTHER', 'NO', 'YES', 'MENTAL ILLNESS',
                              'NOT MI - DEVELOPMENTAL DISORDERS',
                              'NOT MI - ORGANIC MENTAL DISORDER', 'NOT MI - OTHER',
                              'SUBSTANCE-RELATED AND ADDICTIVE DISORDERS',
                              'MENTAL RETARDATION/DEV. DISORDER',
                              'ORGANIC MENTAL DISORDER', 'PHYSICAL DISORDER',
                              'SUBSTANCE-RELATED DISORDER', False, True]

# (column, label to replace, replacement) applied to every year, in this order.
label_fixes = [
    ('Transgender', "CLIENT DIDN'T ANSWER", 'CLIENT DID NOT ANSWER'),
    ('Race', 'UNKNOWN RACE', 'UNKNOWN'),
    ('Living Situation', 'OTHER LIVING SITUATION', 'OTHER'),
    ('Employment Status', 'UNKNOWN EMPLOYMENT STATUS', 'UNKNOWN'),
    ('Number Of Hours Worked Each Week', 'UNKNOWN EMPLOYMENT HOURS', 'UNKNOWN'),
    ('Preferred Language', 'ALL OTHER LANGUAGES', 'OTHER'),
    ('Number Of Hours Worked Each Week', '0 UNEMPLOYED OR NOT IN LABOR FORCE', 'NOT APPLICABLE'),
]

# Employment statuses whose hours worked are reported as the zero-hours label.
unemployed_statuses = ['NOT IN LABOR FORCE:UNEMPLOYED AND NOT LOOKING FOR WORK',
                       'UNEMPLOYED, LOOKING FOR WORK']
hours_column = 'Number Of Hours Worked Each Week'
zero_hours_label = '0 UNEMPLOYED OR NOT IN LABOR FORCE'

for year in survey_years:
    # ---- 1. load -----------------------------------------------------------
    raw_data_path = os.path.join(current_directory, 'Raw Data')
    PCS_df = pd.read_csv(
        os.path.join(raw_data_path, f'Patient_Characteristics_Survey__PCS___{year}.csv'))

    folder_path = os.path.join(current_directory, 'Patient Characteristics Survey', year)

    # ---- 2. harmonise column names and labels ------------------------------
    if year == '2013':
        # 2013 uses different names for two columns.
        PCS_df = PCS_df.rename(columns={
            'Number of Hours Worked Each Week': 'Number Of Hours Worked Each Week',
            'Primary Language': 'Preferred Language'})

    if year == '2019' or year == '2022':
        PCS_df = PCS_df.drop('Religious Preference', axis=1, errors='ignore')

    # Original columns; the indicator columns created below are appended to it.
    column_names = PCS_df.columns.tolist()

    for column, old_label, new_label in label_fixes:
        PCS_df[column] = PCS_df[column].replace(old_label, new_label)

    is_unemployed = PCS_df['Employment Status'].isin(unemployed_statuses)
    if year == '2013':
        for diagnosis_column in ('Principal Diagnosis Class', 'Additional Diagnosis Class'):
            PCS_df[diagnosis_column] = (
                PCS_df[diagnosis_column].str.strip().replace('UNKNOWN/DEFERRED', 'UNKNOWN'))
        # In 2013 every unemployed row gets the zero-hours label.
        PCS_df.loc[is_unemployed, hours_column] = zero_hours_label
    else:
        # Other years: only unemployed rows currently marked 'NOT APPLICABLE'.
        PCS_df.loc[is_unemployed & (PCS_df[hours_column] == 'NOT APPLICABLE'),
                   hours_column] = zero_hours_label

    # ---- 3. indicator columns ----------------------------------------------
    # Snapshot the columns first: new columns are added while looping.
    for column_name in [name for name in PCS_df.columns if name in split_variables]:
        # NaN has no .title(); it never gets an indicator column of its own.
        for value in PCS_df[column_name].dropna().unique():
            clean_value = f"{column_name.title()} - {value.title()}"
            PCS_df[clean_value] = PCS_df[column_name].apply(
                lambda x: 'YES' if x == value else ('NO' if x not in na_values else x))
            column_names.append(clean_value)

    # ---- 4. per-category export (labels kept, missing labels -> NaN) -------
    PCS_df_copy = PCS_df.copy()
    for column_name in PCS_df_copy.columns:
        PCS_df_copy[column_name] = PCS_df_copy[column_name].replace(
            missing_value_labels, np.nan)

    # A NaN category would compare unequal to itself and yield an empty
    # 'nan.csv', so it is skipped.
    datasets = {}
    for category in PCS_df_copy['Program Category'].dropna().unique():
        datasets[category] = PCS_df_copy[PCS_df_copy['Program Category'] == category]

    data_folder_path = os.path.join(folder_path, 'Data')
    os.makedirs(data_folder_path, exist_ok=True)

    for category, subset in datasets.items():
        filename = os.path.join(data_folder_path, f'{category}.csv')
        subset.to_csv(filename, index=False)

    # ---- 5. unique values and column names ---------------------------------
    column_values = UniqueColumnValues(PCS_df)
    unique_values = column_values.unique_column_values()
    similar_values = column_values.similar_values()

    file_name = f'unique_column_values_{year}.txt'
    file_path = os.path.join(folder_path, file_name)
    with open(file_path, 'w', encoding='utf-8') as file:
        for column, values in unique_values.items():
            file.write(f"Column: {column}\n")
            for value in values:
                file.write(f"  {value}\n")  # indented for readability
            file.write("\n")

    file_name = f'column_names_{year}.txt'
    file_path = os.path.join(folder_path, file_name)
    with open(file_path, 'w', encoding='utf-8') as file:
        for column in column_names:
            file.write(column + '\n')

    # ---- 6. integer encoding -----------------------------------------------
    # Values in `similar_values` are left unencoded by SetColumnValues.
    similar_values.extend(['CLIENT DID NOT ANSWER', "CLIENT DIDN'T ANSWER",
                           'DATA NOT AVAILABLE'])
    similar_values = [x for x in similar_values if x not in remove_from_similar_values]

    # Snapshot of the labels before they are replaced by codes.
    PCS_df_original_values = PCS_df.copy()

    level_1_column_names = column_names

    set_column_values = SetColumnValues(PCS_df, column_values, similar_values)
    set_column_values.set_column_values(level_1_column_names)

    # NOTE(review): only labels that were left unencoded above can still be
    # matched here; a missing-value label that occurs in a single column would
    # already carry an integer code at this point - confirm this is intended.
    for column_name in PCS_df.columns:
        PCS_df[column_name] = PCS_df[column_name].replace(missing_value_labels, np.nan)

    # Fills set_column_values.column_value_mappings.
    set_column_values.column_dictionaries(PCS_df_original_values)
    column_value_mappings = set_column_values.column_value_mappings
    sorted_mappings = sorted(column_value_mappings.items())

    file_name = f'column_mappings_{year}.txt'
    file_path = os.path.join(folder_path, file_name)
    with open(file_path, 'w', encoding='utf-8') as file:
        for column_name, mapping in sorted_mappings:
            file.write(f"{column_name}:\n")
            for old_value, new_value in sorted(mapping.items()):
                file.write(f"{old_value}: {new_value}\n")
            file.write('\n')  # blank line between columns

    file_name = f'modified_NYC_PCS{year}.csv'
    file_path = os.path.join(data_folder_path, file_name)

    PCS_df.to_csv(file_path, index=False)
