# Load Data

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd

file_path = 'drive/MyDrive/DS Proj (Team 25)/PHI_Individual(PHIR82DT)/PHIR82FL.DTA' #adjust the path when you run on your own laptop
df = pd.read_stata(file_path, convert_categoricals= False)

df

Unnamed: 0,caseid,v000,v001,v002,v003,v004,v005,v006,v007,v008,...,s615d_3,s615d_4,s615d_5,s615d_6,s617b_1,s617b_2,s617b_3,s617b_4,s617b_5,s617b_6
0,1 4 2,PH8,1,4,2,1,116381,5,2022,1469,...,,,,,,,,,,
1,1 4 3,PH8,1,4,3,1,116381,5,2022,1469,...,,,,,,,,,,
2,1 4 4,PH8,1,4,4,1,116381,5,2022,1469,...,,,,,,,,,,
3,1 6 2,PH8,1,6,2,1,116381,5,2022,1469,...,,,,,,,,,,
4,1 7 6,PH8,1,7,6,1,116381,5,2022,1469,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27816,1247 23 2,PH8,1247,23,2,1247,694550,5,2022,1469,...,,,,,,,,,,
27817,1247 23 4,PH8,1247,23,4,1247,694550,5,2022,1469,...,,,,,,,,,,
27818,1247 26 3,PH8,1247,26,3,1247,694550,5,2022,1469,...,,,,,,,,,,
27819,1247 26 7,PH8,1247,26,7,1247,694550,5,2022,1469,...,,,,,,,,,,


# EDA (Exploratory Data Analysis)

## Survey Answer Meaning

                               0  Never
                               1  Often
                               2  Sometimes
                               3  Yes, but not in the last 12 months
                               4  Yes, but frequency in last 12 months missing
                           (m) 9  Missing
                          (na)    Not applicable

Logic:
1. Technically, the meanings of 'missing' will be same as the 'na' value, here we assume they are together (tbh there is no value '9' in all the attribute columns)
2. Here, we are still using the 'smaller means more serious'. The computational method will be: SUM(all attribute values) / # not na or 9 or 0 col = avg of the valid answer
3. This algo should apply for each type of violence and the overall one
??: the formula should consider the number of vaild answer

In [None]:
#the current data w/o the all na row
def check_column_distribution(dataframe):
    """
    Prints the distribution of values for each column in the dataframe.

    Parameters:
    dataframe (pd.DataFrame): The dataframe to check.
    """
    for col in dataframe.columns:
        try:
            print(f"Distribution for column: {col}")
            print(dataframe[col].value_counts(dropna=False))
        except ValueError as e:
            print(f"Could not process column '{col}' due to: {str(e)}")
        print("-" * 50)

In [None]:
check_column_distribution(df)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
--------------------------------------------------
Distribution for column: d115v
d115v
0.0    19156
NaN     8662
1.0        3
Name: count, dtype: int64
--------------------------------------------------
Distribution for column: d115w
d115w
0.0    19141
NaN     8662
1.0       18
Name: count, dtype: int64
--------------------------------------------------
Distribution for column: d115x
d115x
0.0    19088
NaN     8662
1.0       71
Name: count, dtype: int64
--------------------------------------------------
Distribution for column: d115y
d115y
1.0    17867
NaN     8593
0.0     1292
6.0       69
Name: count, dtype: int64
--------------------------------------------------
Distribution for column: d115xa
d115xa
NaN    27821
Name: count, dtype: int64
--------------------------------------------------
Distribution for column: d115xb
d115xb
NaN    27821
Name: count, dtype: int64
--------------------------------------------------
D

# Data Cleaning

In [None]:
# def data_cleaning(raw_data_path, attributes_list_path):
#     #read the raw csv data set
#     #27821 * 5533
#     whole_data = pd.read_csv(raw_data_path)
#     attributes_list = pd.read_csv(attributes_list_path)

#     attributes_list['Column ID'] = attributes_list['Column ID'].str.lower()

#     #extract the list of column IDs
#     column_ids = attributes_list['Column ID'].tolist()
#     #print(len(column_ids))

#     #locate which columns from column_ids exist in main_dataset
#     existing_columns = [col for col in column_ids if col in whole_data.columns]
#     missing_columns = [col for col in column_ids if col not in whole_data.columns]

#     #select only the existing columns from the main dataset
#     filtered_dataset = whole_data[existing_columns]

#     #extract the domestic violence related dataset
#     violence_data = whole_data[whole_data.columns[pd.Series(df.columns).str.startswith('d')]]

#     #drop the NA row
#     cleaned_violence_data = filtered_dataset.dropna(how='all')

#     #reset the index so that can be tracked by ID later
#     cleaned_violence_data = cleaned_violence_data.reset_index(drop=True)
#     cleaned_violence_data['unique_id'] = cleaned_violence_data.index + 1
#     cols = ['unique_id'] + [col for col in cleaned_violence_data.columns if col != 'unique_id']
#     cleaned_violence_data = cleaned_violence_data[cols]
#     return cleaned_violence_data

In [3]:
emotional_list = ['d101a', 'd101b', 'd101c', 'd101d', 'd101e', 'd101f', 'd103a', 'd103b']
physical_list = ['d105a', 'd105b', 'd105c', 'd105d', 'd105e', 'd105f', 'd105g']
sexual_list = ['d105h', 'd105i', 'd105k']

In [4]:
attribute_lists = {
    'emotional': emotional_list,
    'physical': physical_list,
    'sexual': sexual_list
}

In [5]:
#new
def data_cleaning(raw_data_path, attributes_list_path, attribute_lists):
    # Read the raw CSV dataset
    whole_data = pd.read_csv(raw_data_path)
    attributes_list = pd.read_csv(attributes_list_path)

    # Convert column names to lowercase
    whole_data.columns = whole_data.columns.str.lower()
    attributes_list['Column ID'] = attributes_list['Column ID'].str.lower()

    # Extract the list of column IDs
    column_ids = attributes_list['Column ID'].tolist()

    # Locate which columns from column_ids exist in the main dataset
    existing_columns = [col for col in column_ids if col in whole_data.columns]
    missing_columns = [col for col in column_ids if col not in whole_data.columns]

    # Select only the existing columns from the main dataset
    filtered_dataset = whole_data[existing_columns]

    # Drop the NA rows from the main dataset
    cleaned_data = filtered_dataset.dropna(how='all')

    # Separate rows where all values in the attribute lists are NA
    attribute_columns = [col for cols in attribute_lists.values() for col in cols if col in cleaned_data.columns]
    attribute_data = cleaned_data[attribute_columns]
    all_na_rows = cleaned_data[attribute_data.isna().all(axis=1)]
    cleaned_violence_data = cleaned_data[~cleaned_data.index.isin(all_na_rows.index)]

    # Reset the index so that it can be tracked by ID later
    cleaned_violence_data = cleaned_violence_data.reset_index(drop=True)
    cleaned_violence_data['unique_id'] = cleaned_violence_data.index + 1

    # Reorder columns to have 'unique_id' first
    cols = ['unique_id'] + [col for col in cleaned_violence_data.columns if col != 'unique_id']
    cleaned_violence_data = cleaned_violence_data[cols]

    return cleaned_violence_data, all_na_rows


In [6]:
raw_data_path = 'drive/MyDrive/DS Proj (Team 25)/PHI_Individual(PHIR82DT)/PHIR82FL.csv'
attributes_list_path = 'drive/MyDrive/DS Proj (Team 25)/Attributes_Template.csv'
#clean_data = data_cleaning(raw_data_path, attributes_list_path)
clean_data, all_na_rows = data_cleaning(raw_data_path, attributes_list_path, attribute_lists)

print(clean_data)

  whole_data = pd.read_csv(raw_data_path)


       unique_id v000  v001  v002  v003  v004    v005  d103e  d105h  d105h  \
0              1  PH8     1     6     2     1  116381    0.0    0.0    0.0   
1              2  PH8     1     7     6     1  116381    0.0    0.0    0.0   
2              3  PH8     1     8     2     1  116381    0.0    0.0    0.0   
3              4  PH8     1     9     2     1  116381    0.0    0.0    0.0   
4              5  PH8     1    17     2     1  116381    0.0    2.0    2.0   
...          ...  ...   ...   ...   ...   ...     ...    ...    ...    ...   
14581      14582  PH8  1247    17     3  1247  694550    0.0    0.0    0.0   
14582      14583  PH8  1247    20     2  1247  694550    0.0    0.0    0.0   
14583      14584  PH8  1247    23     2  1247  694550    0.0    0.0    0.0   
14584      14585  PH8  1247    26     7  1247  694550    0.0    0.0    0.0   
14585      14586  PH8  1247    27     2  1247  694550    0.0    0.0    0.0   

       ...  d105e  d105f  d105g  d105h  d105h  d105i  d105i  d1

In [7]:
print(all_na_rows)

      v000  v001  v002  v003  v004    v005  d103e  d105h  d105i  d105k  ...  \
0      PH8     1     4     2     1  116381    NaN    NaN    NaN    NaN  ...   
1      PH8     1     4     3     1  116381    NaN    NaN    NaN    NaN  ...   
2      PH8     1     4     4     1  116381    NaN    NaN    NaN    NaN  ...   
7      PH8     1    14     7     1  116381    NaN    NaN    NaN    NaN  ...   
13     PH8     1    27     2     1  116381    NaN    NaN    NaN    NaN  ...   
...    ...   ...   ...   ...   ...     ...    ...    ...    ...    ...  ...   
27809  PH8  1247     3     3  1247  694550    NaN    NaN    NaN    NaN  ...   
27810  PH8  1247     6     2  1247  694550    NaN    NaN    NaN    NaN  ...   
27811  PH8  1247     6     7  1247  694550    NaN    NaN    NaN    NaN  ...   
27817  PH8  1247    23     4  1247  694550    NaN    NaN    NaN    NaN  ...   
27818  PH8  1247    26     3  1247  694550    NaN    NaN    NaN    NaN  ...   

       d105b  d105c  d105d  d105e  d105f  d105g  d1

In [None]:
# Print the first row as a full string
#print(clean_data.iloc[0].to_string())


# Response Variable (y) Calculation

In [8]:
def compute_valid_and_sum(df, attribute_lists):
    # Initialize overall metrics
    df['overall_valid_count'] = 0
    df['overall_valid_sum'] = 0.0

    for name, columns in attribute_lists.items():
        existing_columns = [col for col in columns if col in df.columns]
        if not existing_columns:  # If no columns exist, set metrics to 0
            df[f'{name}_valid_count'] = 0
            df[f'{name}_valid_sum'] = 0.0
            df[f'{name}_avg'] = 0
        else:
            # Replace 0 and 9 with NaN to consider them as invalid
            valid_df = df[existing_columns].replace([0, 9], float('nan'))
            # Compute valid counts and sum for the attribute list
            df[f'{name}_valid_count'] = valid_df.notna().sum(axis=1)
            df[f'{name}_valid_sum'] = valid_df.sum(axis=1)
            # Calculate average, handling division by zero by filling NaN with 0
            df[f'{name}_avg'] = (df[f'{name}_valid_sum'] / df[f'{name}_valid_count']).fillna(0)

        # Update the overall valid count and sum
        df['overall_valid_count'] += df[f'{name}_valid_count']
        df['overall_valid_sum'] += df[f'{name}_valid_sum']

    # Compute the overall average and handle division by zero
    df['overall_avg'] = (df['overall_valid_sum'] / df['overall_valid_count']).fillna(0)

    return df


In [9]:
result = compute_valid_and_sum(clean_data, attribute_lists)
print(result)

       unique_id v000  v001  v002  v003  v004    v005  d103e  d105h  d105h  \
0              1  PH8     1     6     2     1  116381    0.0    0.0    0.0   
1              2  PH8     1     7     6     1  116381    0.0    0.0    0.0   
2              3  PH8     1     8     2     1  116381    0.0    0.0    0.0   
3              4  PH8     1     9     2     1  116381    0.0    0.0    0.0   
4              5  PH8     1    17     2     1  116381    0.0    2.0    2.0   
...          ...  ...   ...   ...   ...   ...     ...    ...    ...    ...   
14581      14582  PH8  1247    17     3  1247  694550    0.0    0.0    0.0   
14582      14583  PH8  1247    20     2  1247  694550    0.0    0.0    0.0   
14583      14584  PH8  1247    23     2  1247  694550    0.0    0.0    0.0   
14584      14585  PH8  1247    26     7  1247  694550    0.0    0.0    0.0   
14585      14586  PH8  1247    27     2  1247  694550    0.0    0.0    0.0   

       ...  emotional_valid_count  emotional_valid_sum  emotion

In [None]:
#the current data w/o the all na row
#check_column_distribution(clean_data)

In [None]:
# #print out all the col names
# column_names = result.columns.tolist()
# column_names

In [None]:
# def generate_y(violence_type, clean_data, new_col_name):
#     violence_list = [item.lower() for item in violence_type]
#     violence_data = clean_data[violence_list]
#     clean_data[new_col_name] = violence_data.sum(axis=1, skipna=True)
#     return None

# generate_y(emotional_list, clean_data, 'emotional(sum)')
# generate_y(physical_list, clean_data, 'physical(sum)')
# generate_y(sexual_list, clean_data, 'sexual(sum)')

# #the less value, the more serious case
# clean_data

In [None]:
# violence_list = ['emotional(sum)', 'physical(sum)', 'sexual(sum)']
# violence_data_list = clean_data[violence_list]
# clean_data['violence(overall sum)'] = violence_data_list.sum(axis=1, skipna=True)
# clean_data

In [13]:
def reverse_y(clean_data):
    # List of columns to reverse
    avg_columns = ['physical_avg', 'sexual_avg', 'emotional_avg', 'overall_avg']

    for col in avg_columns:
        # Compute the max and min, excluding zero values
        non_zero_values = clean_data[col][clean_data[col] != 0]

        if len(non_zero_values) == 0:
            # Skip if all values are zero
            print(f"Skipping {col} as all values are zero.")
            continue

        max_value = non_zero_values.max()
        min_value = non_zero_values.min()

        print(f"{col}: max_value = {max_value}, min_value = {min_value}")

        # Create a new column for the reversed values, only for non-zero entries
        reversed_col_name = f'{col}_reversed'

        # Apply the reverse logic only for non-zero values using a more typical reversal formula
        clean_data[reversed_col_name] = clean_data[col].apply(
            lambda x: max_value - (x - min_value) if x != 0 else 0
        )

        print(f"Reversed {col} values: \n{clean_data[reversed_col_name].head()}")

    return clean_data

# Apply the function
reverse = reverse_y(result)
reverse


physical_avg: max_value = 3.0, min_value = 1.0
Reversed physical_avg values: 
0    0.0
1    0.0
2    0.0
3    0.0
4    0.0
Name: physical_avg_reversed, dtype: float64
sexual_avg: max_value = 3.0, min_value = 1.0
Reversed sexual_avg values: 
0    0.0
1    0.0
2    0.0
3    0.0
4    2.0
Name: sexual_avg_reversed, dtype: float64
emotional_avg: max_value = 3.0, min_value = 1.0
Reversed emotional_avg values: 
0    0.0
1    2.0
2    0.0
3    0.0
4    2.0
Name: emotional_avg_reversed, dtype: float64
overall_avg: max_value = 3.0, min_value = 1.0
Reversed overall_avg values: 
0    0.0
1    2.0
2    0.0
3    0.0
4    2.0
Name: overall_avg_reversed, dtype: float64


Unnamed: 0,unique_id,v000,v001,v002,v003,v004,v005,d103e,d105h,d105h.1,...,physical_valid_sum,physical_avg,sexual_valid_count,sexual_valid_sum,sexual_avg,overall_avg,physical_avg_reversed,sexual_avg_reversed,emotional_avg_reversed,overall_avg_reversed
0,1,PH8,1,6,2,1,116381,0.0,0.0,0.0,...,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,PH8,1,7,6,1,116381,0.0,0.0,0.0,...,0.0,0.0,0,0.0,0.0,2.0,0.0,0.0,2.0,2.0
2,3,PH8,1,8,2,1,116381,0.0,0.0,0.0,...,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4,PH8,1,9,2,1,116381,0.0,0.0,0.0,...,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5,PH8,1,17,2,1,116381,0.0,2.0,2.0,...,0.0,0.0,12,24.0,2.0,2.0,0.0,2.0,2.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14581,14582,PH8,1247,17,3,1247,694550,0.0,0.0,0.0,...,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14582,14583,PH8,1247,20,2,1247,694550,0.0,0.0,0.0,...,0.0,0.0,0,0.0,0.0,2.0,0.0,0.0,2.0,2.0
14583,14584,PH8,1247,23,2,1247,694550,0.0,0.0,0.0,...,0.0,0.0,0,0.0,0.0,3.0,0.0,0.0,1.0,1.0
14584,14585,PH8,1247,26,7,1247,694550,0.0,0.0,0.0,...,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
def reverse_y(clean_data):
    # List of columns to reverse
    avg_columns = ['physical_avg', 'sexual_avg', 'emotional_avg', 'overall_avg']

    for col in avg_columns:
        # Compute the max and min, excluding zero values
        non_zero_values = clean_data[col][clean_data[col] != 0]
        max_value = non_zero_values.max()
        min_value = non_zero_values.min()

        # Create a new column for the reversed values, only for non-zero entries
        reversed_col_name = f'{col}_reversed'

        clean_data[reversed_col_name] = clean_data[col].apply(
            lambda x: max_value + min_value - x if x != 0 else 0
        )

    return clean_data


In [11]:
#reverse_list = ['emotional(sum)', 'physical(sum)', 'sexual(sum)', 'violence(overall sum)']
reverse = reverse_y(result)
reverse

Unnamed: 0,unique_id,v000,v001,v002,v003,v004,v005,d103e,d105h,d105h.1,...,physical_valid_sum,physical_avg,sexual_valid_count,sexual_valid_sum,sexual_avg,overall_avg,physical_avg_reversed,sexual_avg_reversed,emotional_avg_reversed,overall_avg_reversed
0,1,PH8,1,6,2,1,116381,0.0,0.0,0.0,...,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,PH8,1,7,6,1,116381,0.0,0.0,0.0,...,0.0,0.0,0,0.0,0.0,2.0,0.0,0.0,2.0,2.0
2,3,PH8,1,8,2,1,116381,0.0,0.0,0.0,...,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4,PH8,1,9,2,1,116381,0.0,0.0,0.0,...,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5,PH8,1,17,2,1,116381,0.0,2.0,2.0,...,0.0,0.0,12,24.0,2.0,2.0,0.0,2.0,2.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14581,14582,PH8,1247,17,3,1247,694550,0.0,0.0,0.0,...,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14582,14583,PH8,1247,20,2,1247,694550,0.0,0.0,0.0,...,0.0,0.0,0,0.0,0.0,2.0,0.0,0.0,2.0,2.0
14583,14584,PH8,1247,23,2,1247,694550,0.0,0.0,0.0,...,0.0,0.0,0,0.0,0.0,3.0,0.0,0.0,1.0,1.0
14584,14585,PH8,1247,26,7,1247,694550,0.0,0.0,0.0,...,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
clean_data_sorted = reverse_clean_data.sort_values(by='overall_avg_reversed', ascending=False)

# Resetting the index
clean_data_sorted.reset_index(drop=True, inplace=True)

In [None]:
clean_data_sorted

Unnamed: 0,unique_id,v000,v001,v002,v003,v004,v005,d103e,d105h,d105h.1,...,physical_valid_sum,physical_avg,sexual_valid_count,sexual_valid_sum,sexual_avg,overall_avg,physical_avg_reversed,sexual_avg_reversed,emotional_avg_reversed,overall_avg_reversed
0,11512,PH8,1004,12,2,1004,842025,0.0,0.0,0.0,...,0.0,0.0,0,0.0,0.0,1.0,0.0,0.0,3.0,3.0
1,13340,PH8,1143,26,1,1143,2323259,0.0,0.0,0.0,...,0.0,0.0,0,0.0,0.0,1.0,0.0,0.0,3.0,3.0
2,5964,PH8,524,5,1,524,390771,0.0,0.0,0.0,...,0.0,0.0,0,0.0,0.0,1.0,0.0,0.0,3.0,3.0
3,1463,PH8,96,21,3,96,466419,0.0,0.0,0.0,...,0.0,0.0,0,0.0,0.0,1.0,0.0,0.0,3.0,3.0
4,10457,PH8,907,8,2,907,86873,0.0,0.0,0.0,...,0.0,0.0,0,0.0,0.0,1.0,0.0,0.0,3.0,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14581,5656,PH8,492,17,2,492,1800971,0.0,0.0,0.0,...,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14582,5657,PH8,492,20,2,492,1800971,0.0,0.0,0.0,...,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14583,5659,PH8,492,23,2,492,1800971,0.0,0.0,0.0,...,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14584,5660,PH8,493,2,1,493,1522470,0.0,0.0,0.0,...,0.0,0.0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
clean_data_sorted_transposed = clean_data_sorted.T.drop_duplicates().T

# Renaming columns to ensure uniqueness, if necessary
unique_columns = {}
for col in clean_data_sorted.columns:
    if col in unique_columns:
        unique_count = unique_columns[col] + 1
        unique_columns[col] = unique_count
        clean_data_sorted.rename(columns={col: f"{col}_{unique_count}"}, inplace=True)
    else:
        unique_columns[col] = 0

print(clean_data_sorted_transposed)
#unique_clean_data_sorted = remove_duplicate_columns(clean_data_sorted)

      unique_id v000  v001 v002 v003     v005 d103e d105h d105i d105k  ...  \
0         11512  PH8  1004   12    2   842025   0.0   0.0   0.0   0.0  ...   
1         13340  PH8  1143   26    1  2323259   0.0   0.0   0.0   0.0  ...   
2          5964  PH8   524    5    1   390771   0.0   0.0   0.0   0.0  ...   
3          1463  PH8    96   21    3   466419   0.0   0.0   0.0   0.0  ...   
4         10457  PH8   907    8    2    86873   0.0   0.0   0.0   0.0  ...   
...         ...  ...   ...  ...  ...      ...   ...   ...   ...   ...  ...   
14581      5656  PH8   492   17    2  1800971   0.0   0.0   0.0   0.0  ...   
14582      5657  PH8   492   20    2  1800971   0.0   0.0   0.0   0.0  ...   
14583      5659  PH8   492   23    2  1800971   0.0   0.0   0.0   0.0  ...   
14584      5660  PH8   493    2    1  1522470   0.0   0.0   0.0   0.0  ...   
14585      7294  PH8   634   27    1  4514101   0.0   0.0   0.0   0.0  ...   

      physical_valid_sum physical_avg sexual_valid_count sexual

# Metadata Mapping

In [None]:
import pandas as pd

def metadata(metadata_path):
    try:
        metadata = pd.read_csv(metadata_path, encoding='utf-8')
    except UnicodeDecodeError:
        # Try a different encoding if UTF-8 does not work
        metadata = pd.read_csv(metadata_path, encoding='ISO-8859-1')
    metadata['Item Name'] = metadata['Item Name'].apply(lambda x: x.lower() if isinstance(x, str) else x)
    return metadata

metadata_path = 'drive/MyDrive/DS Proj (Team 25)/variable_description.csv'
metadata = metadata(metadata_path)
metadata



Unnamed: 0,Item Name,Item Label,Code,Description
0,caseid,(id) Case Identification,,record type
1,v000,Country code and phase,,
2,v001,Cluster number,,
3,v002,Household number,,
4,v003,Respondent's line number,,
...,...,...,...,...
7358,s614a,Days after diarrhea begun sought advice or trea,0.065972222,Number of days
7359,s615d,Child given probiotic,0,No
7360,s615d,Child given probiotic,1,Yes
7361,s615d,Child given probiotic,8,Don't know


In [None]:
# Assuming your reference data is structured as given, you might need to adjust keys if they differ
# Create a dictionary for each column that you need to map
mappings = {}
for item in metadata['Item Name'].unique():
    mappings[item] = dict(zip(metadata[metadata['Item Name'] == item]['Code'],
                              metadata[metadata['Item Name'] == item]['Description']))


KeyboardInterrupt: 

In [None]:
def apply_mappings(data, mappings):
    """
    Apply mappings to specified columns of a DataFrame.

    Parameters:
    - data (pd.DataFrame): The DataFrame to be modified.
    - mappings (dict): A dictionary where keys are column names and values are dictionaries
                       mapping old values to new values.

    Returns:
    - pd.DataFrame: The modified DataFrame with mapped values.
    """
    for column, mapping in mappings.items():
        if column in data.columns:
            # Convert mapping keys from string to appropriate type, if possible
            try:

                mapping_int_keys = {int(k): v for k, v in mapping.items()}

            except ValueError:
                # If conversion fails, use the mapping as is
                mapping_int_keys = mapping

            # Apply the mapping to the DataFrame column using map on the specific series
            data[column] = data[column].map(mapping_int_keys).fillna(data[column])

    return data


map_data_table = apply_mappings(clean_data_sorted_transposed, mappings)


In [None]:
#clean_data_sorted_transposed.columns = clean_data_sorted_transposed.columns.str.lower()

# Convert the emotional_list to lowercase (if not already)
emotional_list_low = [col.lower() for col in emotional_list]

# Select and display the columns in emotional_list from clean_data
emotional_data = clean_data_sorted_transposed[emotional_list_low]
emotional_data.tail(80)

In [None]:
attributes = pd.read_csv(attributes_list_path)
attributes

In [None]:
# attributes['Column ID'] = attributes['Column ID'].apply(lambda x: x.lower() if isinstance(x, str) else x)
# # Create a dictionary for column renaming
# column_mapping = dict(zip(attributes['Column ID'], attributes['Column Name']))
# # Rename the columns based on the dictionary
# map_data_head = map_data_table.rename(columns=column_mapping, inplace=True)
# map_data_head

In [None]:
# #emotional violence
# emotional_list = ['D101A', 'D101B', 'D101C', 'D101D', 'D101E', 'D101F', 'D103A', 'D103B']

# #physical violence
# physical_list = ['D105A', 'D105B', 'D105C', 'D105D', 'D105E', 'D105F', 'D105G']

# #sexual violence
# sexual_list = ['d105h', 'd105i', 'd105k']

#clean_data_sorted_transposed.columns = clean_data_sorted_transposed.columns.str.lower()

# Convert the emotional_list to lowercase (if not already)
emotional_list_low = [col.lower() for col in emotional_list]
physical_list_low = [col.lower() for col in physical_list]
sexual_list_low = [col.lower() for col in sexual_list]

# Select and display the columns in emotional_list from clean_data
emotional_data = clean_data_sorted_transposed[emotional_list_low]
physical_data = clean_data_sorted_transposed[physical_list_low]
sexual_data = clean_data_sorted_transposed[sexual_list_low]

#emotional_data.tail(80)

1. 2 minimum sum (0 value)
2.

In [None]:
#save the dataset that was extracted
test.to_csv('drive/MyDrive/DS Proj (Team 25)/Extracted_Dataset.csv', index=False)