# Load Data

In [3]:
import os
import pandas as pd


# EDA (Exploratory Data Analysis)

## Survey Answer Meaning

                               0  Never
                               1  Often
                               2  Sometimes
                               3  Yes, but not in the last 12 months
                               4  Yes, but frequency in last 12 months missing
                           (m) 9  Missing
                          (na)    Not applicable

Logic:
1. Technically, the meanings of 'missing' will be same as the 'na' value, here we assume they are together (tbh there is no value '9' in all the attribute columns)
2. Here, we are still using the 'smaller means more serious'. The computational method will be: SUM(all attribute values) / # not na or 9 or 0 col = avg of the valid answer
3. This algo should apply for each type of violence and the overall one
??: the formula should consider the number of vaild answer

# Data Cleaning

In [4]:
emotional_list = ['d101a', 'd101b', 'd101c', 'd101d', 'd101e', 'd101f', 'd103a', 'd103b']
physical_list = ['d105a', 'd105b', 'd105c', 'd105d', 'd105e', 'd105f', 'd105g']
sexual_list = ['d105h', 'd105i', 'd105k']

In [5]:
attribute_lists = {
    'emotional': emotional_list,
    'physical': physical_list,
    'sexual': sexual_list
}

In [6]:
#new
def data_cleaning(raw_data, attribute_lists):
    # Read the raw CSV dataset
    whole_data = raw_data

    # Convert column names to lowercase
    whole_data.columns = whole_data.columns.str.lower()


    # Drop the NA rows from the main dataset
    cleaned_data = whole_data.dropna(how='all')

    # Separate rows where all values in the attribute lists are NA
    attribute_columns = [col for cols in attribute_lists.values() for col in cols if col in cleaned_data.columns]
    attribute_data = cleaned_data[attribute_columns]
    all_na_rows = cleaned_data[attribute_data.isna().all(axis=1)]
    cleaned_violence_data = cleaned_data[~cleaned_data.index.isin(all_na_rows.index)]

    # Reset the index so that it can be tracked by ID later
    cleaned_violence_data = cleaned_violence_data.reset_index(drop=True)
    cleaned_violence_data['unique_id'] = cleaned_violence_data.index + 1

    # Reorder columns to have 'unique_id' first
    cols = ['unique_id'] + [col for col in cleaned_violence_data.columns if col != 'unique_id']
    cleaned_violence_data = cleaned_violence_data[cols]

    return cleaned_violence_data, all_na_rows


In [7]:
y_list=['d101a', 'd101b', 'd101c', 'd101d', 'd101e', 'd101f', 'd103a',
            'd103b','d105a', 'd105b', 'd105c', 'd105d', 'd105e', 'd105f', 'd105g',
            'd105h', 'd105i', 'd105k']



# Response Variable (y) Calculation

In [8]:
def compute_valid_and_sum(df, attribute_lists):
    # Initialize overall metrics
    df['overall_valid_count'] = 0
    df['overall_valid_sum'] = 0.0

    for name, columns in attribute_lists.items():
        existing_columns = [col for col in columns if col in df.columns]
        if not existing_columns:  # If no columns exist, set metrics to 0
            df[f'{name}_valid_count'] = 0
            df[f'{name}_valid_sum'] = 0.0
            df[f'{name}_avg'] = 0
        else:
            # Replace 0 and 9 with NaN to consider them as invalid
            valid_df = df[existing_columns].replace([0, 9], float('nan'))
            # Compute valid counts and sum for the attribute list
            df[f'{name}_valid_count'] = valid_df.notna().sum(axis=1)
            df[f'{name}_valid_sum'] = valid_df.sum(axis=1)
            # Calculate average, handling division by zero by filling NaN with 0
            df[f'{name}_avg'] = (df[f'{name}_valid_sum'] / df[f'{name}_valid_count']).fillna(0)

        # Update the overall valid count and sum
        df['overall_valid_count'] += df[f'{name}_valid_count']
        df['overall_valid_sum'] += df[f'{name}_valid_sum']

    # Compute the overall average and handle division by zero
    df['overall_avg'] = (df['overall_valid_sum'] / df['overall_valid_count']).fillna(0)

    return df

# Metadata Mapping

In [9]:
y_list=['d101a', 'd101b', 'd101c', 'd101d', 'd101e', 'd101f', 'd103a',
            'd103b','d105a', 'd105b', 'd105c', 'd105d', 'd105e', 'd105f', 'd105g',
            'd105h', 'd105i', 'd105k']

emotional_list = ['d101a', 'd101b', 'd101c', 'd101d', 'd101e', 'd101f', 'd103a', 'd103b']
physical_list = ['d105a', 'd105b', 'd105c', 'd105d', 'd105e', 'd105f', 'd105g']
sexual_list = ['d105h', 'd105i', 'd105k']


attribute_lists = {
    'emotional': emotional_list,
    'physical': physical_list,
    'sexual': sexual_list
}

In [10]:
def metadata(metadata_path):
    try:
        metadata = pd.read_csv(metadata_path, encoding='utf-8')
    except UnicodeDecodeError:
        # Try a different encoding if UTF-8 does not work
        metadata = pd.read_csv(metadata_path, encoding='ISO-8859-1')
    metadata['Item Name'] = metadata['Item Name'].apply(lambda x: x.lower() if isinstance(x, str) else x)
    return metadata

In [11]:
new_des=['Never','Yes, but not in the last 12 months','Sometimes','Often']



In [12]:
def apply_mappings(data, mappings):

    for column, mapping in mappings.items():
        if column in data.columns:
            # Convert mapping keys from string to appropriate type, if possible
            try:

                mapping_int_keys = {int(k): v for k, v in mapping.items()}

            except ValueError:
                # If conversion fails, use the mapping as is
                mapping_int_keys = mapping

            # Apply the mapping to the DataFrame column using map on the specific series
            data[column] = data[column].map(mapping_int_keys).fillna(data[column])

    return data



In [32]:
main_folder = "E:/all_asian_data(90%)_test/Individual"

dta_files = []


for root, dirs, files in os.walk(main_folder):
    dirs.sort(reverse=True)
    files.sort(reverse=True)
    for file in files:
        if file.endswith('on.csv'):
            meta_path = os.path.join(root, file)
            try:
                metadata = pd.read_csv(meta_path, encoding='utf-8')
            except UnicodeDecodeError:
             # Try a different encoding if UTF-8 does not work
                metadata = pd.read_csv(meta_path, encoding='ISO-8859-1')
            if len(metadata)==0:
                break  
            metadata['Item Name'] = metadata['Item Name'].apply(lambda x: x.lower() if isinstance(x, str) else x)
            try:
                for i in y_list:
                    indices = metadata[metadata['Item Name'] == i].index
                    for num in range(4):
                        metadata.loc[indices[num+1], 'Description'] = new_des[num]
            except IndexError as e:
                break
            
            mappings = {}
            for item in metadata['Item Name'].unique():
                mappings[item] = dict(zip(metadata[metadata['Item Name'] == item]['Code'],
                                            metadata[metadata['Item Name'] == item]['Description']))
            print(meta_path) 
        if file.endswith(').csv'):
            file_path = os.path.join(root, file)
            print(file_path)
            raw_data=pd.read_csv(file_path)
            clean_data, all_na_rows = data_cleaning(raw_data, attribute_lists)
            clean = clean_data.loc[:, ~clean_data.columns.duplicated()]
            if len(clean)==0:
                break
            clean[y_list]=clean[y_list].replace({0:1,1:4,2:3,3:2})
            result = compute_valid_and_sum(clean, attribute_lists)
            df = apply_mappings(result, mappings)
            for i in df.columns:
                try:
                    name=list(metadata[metadata['Item Name']==i]['Item Label'])[0]
                    df = df.rename(columns={i: name})
                except IndexError as e:
                    print('')
            
            if len(df)!=0:               
                dta_files.append(df)

E:/all_asian_data(90%)_test/Individual\Philippines Standard DHS 2022\Philippines Standard DHS 2022_variable_description.csv
E:/all_asian_data(90%)_test/Individual\Philippines Standard DHS 2022\Philippines Standard DHS 2022(90).csv


  data[column] = data[column].map(mapping_int_keys).fillna(data[column])
  data[column] = data[column].map(mapping_int_keys).fillna(data[column])
  data[column] = data[column].map(mapping_int_keys).fillna(data[column])
  data[column] = data[column].map(mapping_int_keys).fillna(data[column])
  data[column] = data[column].map(mapping_int_keys).fillna(data[column])
  data[column] = data[column].map(mapping_int_keys).fillna(data[column])
  data[column] = data[column].map(mapping_int_keys).fillna(data[column])
  data[column] = data[column].map(mapping_int_keys).fillna(data[column])
  data[column] = data[column].map(mapping_int_keys).fillna(data[column])
  data[column] = data[column].map(mapping_int_keys).fillna(data[column])
  data[column] = data[column].map(mapping_int_keys).fillna(data[column])
  data[column] = data[column].map(mapping_int_keys).fillna(data[column])
  data[column] = data[column].map(mapping_int_keys).fillna(data[column])
  data[column] = data[column].map(mapping_int_keys)







































































































































































































































































































































































































































































































E:/all_asian_data(90%)_test/Individual\Nepal Standard DHS 2022\Nepal Standard DHS 2022_variable_description.csv
E:/all_asian_data(90%)_test/Individual\Nepal Standard DHS 2022\Nepal Standard DHS 2022(90).csv


  data[column] = data[column].map(mapping_int_keys).fillna(data[column])
  data[column] = data[column].map(mapping_int_keys).fillna(data[column])
  data[column] = data[column].map(mapping_int_keys).fillna(data[column])
  data[column] = data[column].map(mapping_int_keys).fillna(data[column])
  data[column] = data[column].map(mapping_int_keys).fillna(data[column])
  data[column] = data[column].map(mapping_int_keys).fillna(data[column])
  data[column] = data[column].map(mapping_int_keys).fillna(data[column])
  data[column] = data[column].map(mapping_int_keys).fillna(data[column])
  data[column] = data[column].map(mapping_int_keys).fillna(data[column])
  data[column] = data[column].map(mapping_int_keys).fillna(data[column])
  data[column] = data[column].map(mapping_int_keys).fillna(data[column])
  data[column] = data[column].map(mapping_int_keys).fillna(data[column])
  data[column] = data[column].map(mapping_int_keys).fillna(data[column])
  data[column] = data[column].map(mapping_int_keys)









































































































































































































































































































































































































































































































































































E:/all_asian_data(90%)_test/Individual\Cambodia Standard DHS 2021-22\Cambodia Standard DHS 2021-22_variable_description.csv
E:/all_asian_data(90%)_test/Individual\Cambodia Standard DHS 2021-22\Cambodia Standard DHS 2021-22(90).csv


  data[column] = data[column].map(mapping_int_keys).fillna(data[column])
  data[column] = data[column].map(mapping_int_keys).fillna(data[column])
  data[column] = data[column].map(mapping_int_keys).fillna(data[column])
  data[column] = data[column].map(mapping_int_keys).fillna(data[column])
  data[column] = data[column].map(mapping_int_keys).fillna(data[column])
  data[column] = data[column].map(mapping_int_keys).fillna(data[column])
  data[column] = data[column].map(mapping_int_keys).fillna(data[column])
  data[column] = data[column].map(mapping_int_keys).fillna(data[column])
  data[column] = data[column].map(mapping_int_keys).fillna(data[column])
  data[column] = data[column].map(mapping_int_keys).fillna(data[column])
  data[column] = data[column].map(mapping_int_keys).fillna(data[column])
  data[column] = data[column].map(mapping_int_keys).fillna(data[column])
  data[column] = data[column].map(mapping_int_keys).fillna(data[column])
  data[column] = data[column].map(mapping_int_keys)















































































































































































































































































































































































































































































































































































In [30]:
num=0
for i in dta_files:
    num+=len(i)
num

25313

In [33]:
dta_files = [df.T.drop_duplicates().T for df in dta_files]

In [41]:
list(dta_files[1].columns)

['unique_id',
 'Case Identification',
 'Country code and phase',
 'Cluster number',
 'Household number',
 "Respondent's line number",
 "Women's individual sample weight (6 decimals)",
 'Month of interview',
 'Year of interview',
 'Date of interview (CMC)',
 'Date of interview Century Day Code (CDC)',
 "Respondent's month of birth",
 "Respondent's year of birth",
 'Date of birth (CMC)',
 "Respondent's current age",
 'Age in 5-year groups',
 'Completeness of age information',
 'Result of individual interview',
 'Day of interview',
 'CMC start of calendar',
 'Row of month of interview',
 'Length of calendar',
 'Number of calendar columns',
 'Ever-married sample',
 'Sample strata for sampling errors',
 'Province',
 'Type of place of residence',
 'Number of visits',
 'Interviewer identification',
 'Field supervisor',
 'Line number of husband',
 'Cluster altitude in meters',
 'Household selected for hemoglobin',
 'Selected for Domestic Violence module',
 'Language of questionnaire',
 'Langua

In [43]:
dta_files[1] = dta_files[1].rename(columns={'Province': 'Region'})

In [44]:
common_columns = list(set.intersection(*[set(df.columns) for df in dta_files]))
dta_files = [df[common_columns] for df in dta_files]
clean_data = pd.concat(dta_files, axis=0, ignore_index=True)

In [45]:
clean_data['Region']

0        BARMM - Bangsamoro Autonomous Region in Muslim...
1        BARMM - Bangsamoro Autonomous Region in Muslim...
2        BARMM - Bangsamoro Autonomous Region in Muslim...
3        BARMM - Bangsamoro Autonomous Region in Muslim...
4        BARMM - Bangsamoro Autonomous Region in Muslim...
                               ...                        
25308                                         Tboung Khmum
25309                                         Tboung Khmum
25310                                         Tboung Khmum
25311                                         Tboung Khmum
25312                                         Tboung Khmum
Name: Region, Length: 25313, dtype: object

# Response Variable (y) Calculation

In [46]:
filename="dash.csv"
clean_data.to_csv(filename, index=False)