In [25]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pprint as pp

In [5]:
data = pd.ExcelFile("SOI_DataRequest_Deidentified for sending.xlsx")
data.sheet_names

['Data Specs',
 'TIN_Q1-5_TIN_Info',
 'TIN_Q7,8-Total_Cost_EnrollType',
 'TIN_Q9_TotalCost_By_Categories',
 'TIN_Q10-13_Event_Rate',
 'TIN_VISIT_Q8-14']

## Sheet 1: TIN_Q1-5_TIN_Info


In [17]:
first_sheet = pd.read_excel(data, sheet_name='TIN_Q1-5_TIN_Info')
first_sheet.head()

Unnamed: 0,PERFORMANCE_YEAR,QRO,TIN_CCN,ENROLLMENT_TYPE,BENE_CNT,AVR_RISK_SCORE
0,2022,MKLO,866020,ESRD,<11,1.861
1,2022,MKLO,866020,Unknown,109,0.736
2,2022,MKLO,866020,Aged-Dual,344,0.876
3,2022,MKLO,866020,Disabled,713,0.863
4,2022,MKLO,866020,Aged,2868,0.94


### Handling Standardization and Categorical variables

- Since 'PERFORMANCE_YEAR' is already numerical and represents a specific year, it doesn't need to be categorized or standardized.
- 'QRO' and 'TIN_CCN' are categorical but likely identifiers, so we will encode them if needed later on.
- 'ENROLLMENT_TYPE' is categorical and should be converted into a category type.
- 'BENE_CNT' can be left as numerical but '<11' values need to be addressed.
- 'AVR_RISK_SCORE' is numerical and might need to be standardized.


In [49]:
# Pre-processing for 'ENROLLMENT_TYPE':
from sklearn.preprocessing import StandardScaler
first_sheet['ENROLLMENT_TYPE'] = first_sheet['ENROLLMENT_TYPE'].astype(
    'category')

# Handling '<11' in 'BENE_CNT':
# For now, we will replace '<11' with 10 for simplicity.
first_sheet['BENE_CNT'] = first_sheet['BENE_CNT'].replace(
    '<11', 10).astype(int)

# Standardizing 'AVR_RISK_SCORE' (mean=0, standard deviation=1)

# Reshaping the 'AVR_RISK_SCORE' for standardization as it expects 2D array
risk_scores = first_sheet['AVR_RISK_SCORE'].values.reshape(-1, 1)
scaler = StandardScaler()
first_sheet['AVR_RISK_SCORE_STANDARDIZED'] = scaler.fit_transform(risk_scores)

# Checking the pre-processed data
first_sheet

Unnamed: 0,PERFORMANCE_YEAR,QRO,TIN_CCN,ENROLLMENT_TYPE,BENE_CNT,AVR_RISK_SCORE,AVR_RISK_SCORE_STANDARDIZED
0,2022,MKLO,866020,ESRD,10,1.861,2.117089
1,2022,MKLO,866020,Unknown,109,0.736,-0.495252
2,2022,MKLO,866020,Aged-Dual,344,0.876,-0.170161
3,2022,MKLO,866020,Disabled,713,0.863,-0.200348
4,2022,MKLO,866020,Aged,2868,0.940,-0.021548
...,...,...,...,...,...,...,...
1130,2023,UPLN,430950,Unknown,24,1.787,1.945255
1131,2023,UPLN,430950,ESRD,224,1.695,1.731624
1132,2023,UPLN,430950,Aged-Dual,965,0.997,0.110811
1133,2023,UPLN,430950,Disabled,2127,1.263,0.728485


### Handling missing values


In [50]:
# Checking for missing values in the dataset

# For 'ENROLLMENT_TYPE' which is categorical, we'll fill NaN with 'Unknown'
first_sheet['ENROLLMENT_TYPE'].fillna('Unknown', inplace=True)


# For numerical columns ('BENE_CNT', 'AVR_RISK_SCORE', and 'AVR_RISK_SCORE_STANDARDIZED'),
# we'll fill NaN with the median value of each column
for column in ['BENE_CNT', 'AVR_RISK_SCORE', 'AVR_RISK_SCORE_STANDARDIZED']:
    median_value = first_sheet[column].median()
    first_sheet[column].fillna(median_value, inplace=True)


# Checking the data after filling NaN values
pp.pprint(first_sheet.isnull().sum())
first_sheet.head()

PERFORMANCE_YEAR               0
QRO                            0
TIN_CCN                        0
ENROLLMENT_TYPE                0
BENE_CNT                       0
AVR_RISK_SCORE                 0
AVR_RISK_SCORE_STANDARDIZED    0
dtype: int64


Unnamed: 0,PERFORMANCE_YEAR,QRO,TIN_CCN,ENROLLMENT_TYPE,BENE_CNT,AVR_RISK_SCORE,AVR_RISK_SCORE_STANDARDIZED
0,2022,MKLO,866020,ESRD,10,1.861,2.117089
1,2022,MKLO,866020,Unknown,109,0.736,-0.495252
2,2022,MKLO,866020,Aged-Dual,344,0.876,-0.170161
3,2022,MKLO,866020,Disabled,713,0.863,-0.200348
4,2022,MKLO,866020,Aged,2868,0.94,-0.021548


### Checking other issues

1. Duplicates - Check if there are any duplicate rows in the data.
2. Invalid data - Check for any instances that don't match expected patterns or ranges, such as negative counts.
3. Inconsistent categories - Check if categorical variables have consistent and expected categories.


In [51]:
# Check for duplicate rows
duplicate_rows = first_sheet.duplicated().sum()

# Check for any negative counts in 'BENE_CNT' which should be a positive count
negative_bene_cnt = (first_sheet['BENE_CNT'] < 0).sum()

# Check for negative or unrealistic values in 'AVR_RISK_SCORE'
# Assuming risk score should be a non-negative value
negative_risk_scores = (first_sheet['AVR_RISK_SCORE'] < 0).sum()

# Check for inconsistent categories in 'ENROLLMENT_TYPE'
# Since we already cleaned up the NaN values, we assume all categories are now consistent
inconsistent_categories = first_sheet['ENROLLMENT_TYPE'].cat.categories

# Resolving the issues
# Remove duplicate rows if any
if duplicate_rows > 0:
    first_sheet = first_sheet.drop_duplicates()

# Handling negative 'BENE_CNT'
# Here we will assume a negative count is a data entry error and will replace it with the median of the positive counts
if negative_bene_cnt > 0:
    median_positive_bene_cnt = first_sheet[first_sheet['BENE_CNT'] > 0]['BENE_CNT'].median(
    )
    first_sheet['BENE_CNT'] = first_sheet['BENE_CNT'].apply(
        lambda x: median_positive_bene_cnt if x < 0 else x)

# Handling negative 'AVR_RISK_SCORE'
# Similar to 'BENE_CNT', we will replace negative risk scores with the median of the non-negative scores
if negative_risk_scores > 0:
    median_positive_risk_score = first_sheet[first_sheet['AVR_RISK_SCORE']
                                             >= 0]['AVR_RISK_SCORE'].median()
    first_sheet['AVR_RISK_SCORE'] = first_sheet['AVR_RISK_SCORE'].apply(
        lambda x: median_positive_risk_score if x < 0 else x)

# Checking for anomalies in 'ENROLLMENT_TYPE'
category_anomalies = first_sheet['ENROLLMENT_TYPE'].cat.categories

# Returning a summary of the checks and the categories of 'ENROLLMENT_TYPE'
(duplicate_rows, negative_bene_cnt, negative_risk_scores, category_anomalies)

(0,
 0,
 0,
 Index(['Aged', 'Aged-Dual', 'Disabled', 'ESRD', 'Unknown'], dtype='object'))

Here's a summary of the checks performed to identify potential issues in the data:

- Duplicates: There were no duplicate rows in the dataset.
- Negative Counts in 'BENE_CNT': There were no negative counts, which means all beneficiary counts are non-negative as expected.
- Negative or Unrealistic 'AVR_RISK_SCORE': There were no negative risk scores, which means all average risk scores are non-negative as expected.
- Inconsistent Categories in 'ENROLLMENT_TYPE': The categories are consistent and as expected: 'Aged', 'Aged-Dual', 'Disabled', 'ESRD', and 'Unknown'.


## Sheet 2: TIN_Q7,8-Total_Cost_EnrollType


In [52]:
second_sheet = pd.read_excel(data, sheet_name=2)
second_sheet.head()

Unnamed: 0,PERFORMANCE_YEAR,QRO,TIN_CCN,TOTAL_SPEND,AGED_TOTAL_SPEND,DISABLED_TOTAL_SPEND,ESRD_TOTAL_ESRD,AGED_DUAL_TOTAL_SPEND
0,2019,MTIP,122406,18332.78,1755.58,16577.2,,
1,2019,MTIP,679036,122.3,122.3,,,
2,2019,MTIP,344888,8.79,,,8.79,
3,2019,MTIP,125904,326079.83,165138.7,61145.21,99795.92,
4,2019,MTIP,183834,51693176.93,36549230.25,7356027.2,3118860.85,4668087.62


In [53]:
# Check for missing values and determine how to handle them
missing_values_third_sheet = second_sheet.isnull().sum()

# Since the spending variables are numerical, it would be appropriate to fill missing values with a statistical measure.
# In this case, we will fill missing values with 0, assuming that no entry means no spending occurred.
second_sheet.fillna(0, inplace=True)

# Check for any negative spending values, which might be data entry errors
negative_spending_columns = second_sheet.columns[second_sheet.columns.str.contains(
    '_SPEND')]
negative_spending_values = (second_sheet[negative_spending_columns] < 0).any()

# If there are negative spending values, we'll set them to 0 for simplicity
# This is a simplistic approach and might need to be adjusted based on domain knowledge or further analysis.
if negative_spending_values.any():
    for column in negative_spending_columns:
        second_sheet[column] = second_sheet[column].apply(
            lambda x: 0 if x < 0 else x)

# Check for duplicates
duplicates_second_sheet = second_sheet.duplicated().sum()

# Remove duplicates if any are found
if duplicates_second_sheet > 0:
    second_sheet.drop_duplicates(inplace=True)

# Summary of the pre-processing steps
preprocessing_summary = {
    "missing_values_filled_with_zero": missing_values_third_sheet.to_dict(),
    "negative_values_handled": negative_spending_values.to_dict(),
    "duplicates_removed": duplicates_second_sheet
}

pp.pprint(preprocessing_summary)
second_sheet.head()

{'duplicates_removed': 0,
 'missing_values_filled_with_zero': {'AGED_DUAL_TOTAL_SPEND': 103,
                                     'AGED_TOTAL_SPEND': 23,
                                     'DISABLED_TOTAL_SPEND': 48,
                                     'ESRD_TOTAL_ESRD': 230,
                                     'PERFORMANCE_YEAR': 0,
                                     'QRO': 0,
                                     'TIN_CCN': 0,
                                     'TOTAL_SPEND': 0},
 'negative_values_handled': {'AGED_DUAL_TOTAL_SPEND': False,
                             'AGED_TOTAL_SPEND': False,
                             'DISABLED_TOTAL_SPEND': False,
                             'TOTAL_SPEND': False}}


Unnamed: 0,PERFORMANCE_YEAR,QRO,TIN_CCN,TOTAL_SPEND,AGED_TOTAL_SPEND,DISABLED_TOTAL_SPEND,ESRD_TOTAL_ESRD,AGED_DUAL_TOTAL_SPEND
0,2019,MTIP,122406,18332.78,1755.58,16577.2,0.0,0.0
1,2019,MTIP,679036,122.3,122.3,0.0,0.0,0.0
2,2019,MTIP,344888,8.79,0.0,0.0,8.79,0.0
3,2019,MTIP,125904,326079.83,165138.7,61145.21,99795.92,0.0
4,2019,MTIP,183834,51693176.93,36549230.25,7356027.2,3118860.85,4668087.62


In [54]:
# Check for any spending values that are excessively high, which might indicate outliers or data entry errors.
# Here we use a simple heuristic that flags any spending values greater than a threshold.
# For the purpose of this analysis, we'll flag any values that are more than three standard deviations from the mean.
thresholds = second_sheet[negative_spending_columns].mean(
) + (3 * second_sheet[negative_spending_columns].std())
potential_outliers = (
    second_sheet[negative_spending_columns] > thresholds).any()

# Check the integrity of the data by confirming that the total spend is equal to or greater than the sum of the subcategory spends.
# This ensures that subcategory spends do not exceed the total spend.
subcategory_columns = ['AGED_TOTAL_SPEND', 'DISABLED_TOTAL_SPEND',
                       'ESRD_TOTAL_ESRD', 'AGED_DUAL_TOTAL_SPEND']
second_sheet['SUBCATEGORY_SUM'] = second_sheet[subcategory_columns].sum(axis=1)
spend_integrity_issues = (
    second_sheet['SUBCATEGORY_SUM'] > second_sheet['TOTAL_SPEND']).any()


# TODO: How to resolve spend integrity issues?

potential_outliers, spend_integrity_issues

(TOTAL_SPEND              True
 AGED_TOTAL_SPEND         True
 DISABLED_TOTAL_SPEND     True
 AGED_DUAL_TOTAL_SPEND    True
 dtype: bool,
 True)

## Third Sheet: TIN_Q9_TotalCost_By_Categories


In [55]:
third_sheet = pd.read_excel(data, sheet_name=3)
third_sheet.head()

Unnamed: 0,PERFORMANCE_YEAR,QRO,TIN_CCN,SUM_IP_SPEND,SUM_FAC_OP_SPEND,SUM_PHY_SPEND,SUM_POSTACUTE_SPEND,SUM_HOSPICE_SPEND,SUM_DME_SPEND
0,2019,MTIP,183834,19192504.7,16020228.69,9062788.08,4477877.38,1320060.3,1349982.92
1,2019,MTIP,834843,5364179.15,4602899.62,3152931.99,1253099.57,433572.46,548635.03
2,2019,MTIP,122406,,11329.4,7003.38,,,
3,2019,MTIP,344888,,8.79,0.0,,,
4,2019,MTIP,125904,48225.71,134299.7,101493.16,12408.65,,25392.39


In [56]:
# Conducting data pre-processing for the fourth sheet.

# Handling missing values: Assuming that no entry (NaN) means no spending occurred in that category, we will fill missing values with 0.
third_sheet.fillna(0, inplace=True)

# Check for any negative spending values, which might be data entry errors
spending_columns_third_sheet = third_sheet.columns[third_sheet.columns.str.startswith(
    'SUM_')]
negative_spending_values_third_sheet = (
    third_sheet[spending_columns_third_sheet] < 0).any()

# If there are negative spending values, we'll set them to 0 for simplicity
if negative_spending_values_third_sheet.any():
    for column in spending_columns_third_sheet:
        third_sheet[column] = third_sheet[column].apply(
            lambda x: 0 if x < 0 else x)

# Check for duplicates
duplicates_fourth_sheet = third_sheet.duplicated().sum()
# Remove duplicates if any are found
if duplicates_fourth_sheet > 0:
    third_sheet.drop_duplicates(inplace=True)

# Check for outliers using the same heuristic as before: any values more than three standard deviations from the mean.
outliers_thresholds_fourth_sheet = third_sheet[spending_columns_third_sheet].mean() + \
    (3 * third_sheet[spending_columns_third_sheet].std())
potential_outliers_fourth_sheet = (
    third_sheet[spending_columns_third_sheet] > outliers_thresholds_fourth_sheet).any()

# Returning a summary of the pre-processing steps and the head of the cleaned dataframe
preprocessing_summary_fourth_sheet = {
    "missing_values_filled_with_zero": third_sheet.isnull().sum().to_dict(),
    "negative_values_set_to_zero": negative_spending_values_third_sheet.to_dict(),
    "duplicates_removed": duplicates_fourth_sheet,
    "potential_outliers_identified": potential_outliers_fourth_sheet.to_dict()
}

pp.pprint(preprocessing_summary_fourth_sheet)
third_sheet.head()

{'duplicates_removed': 0,
 'missing_values_filled_with_zero': {'PERFORMANCE_YEAR': 0,
                                     'QRO': 0,
                                     'SUM_DME_SPEND': 0,
                                     'SUM_FAC_OP_SPEND': 0,
                                     'SUM_HOSPICE_SPEND': 0,
                                     'SUM_IP_SPEND': 0,
                                     'SUM_PHY_SPEND': 0,
                                     'SUM_POSTACUTE_SPEND': 0,
                                     'TIN_CCN': 0},
 'negative_values_set_to_zero': {'SUM_DME_SPEND': False,
                                 'SUM_FAC_OP_SPEND': False,
                                 'SUM_HOSPICE_SPEND': False,
                                 'SUM_IP_SPEND': False,
                                 'SUM_PHY_SPEND': False,
                                 'SUM_POSTACUTE_SPEND': False},
 'potential_outliers_identified': {'SUM_DME_SPEND': True,
                                   'SUM_FAC_OP_S

Unnamed: 0,PERFORMANCE_YEAR,QRO,TIN_CCN,SUM_IP_SPEND,SUM_FAC_OP_SPEND,SUM_PHY_SPEND,SUM_POSTACUTE_SPEND,SUM_HOSPICE_SPEND,SUM_DME_SPEND
0,2019,MTIP,183834,19192504.7,16020228.69,9062788.08,4477877.38,1320060.3,1349982.92
1,2019,MTIP,834843,5364179.15,4602899.62,3152931.99,1253099.57,433572.46,548635.03
2,2019,MTIP,122406,0.0,11329.4,7003.38,0.0,0.0,0.0
3,2019,MTIP,344888,0.0,8.79,0.0,0.0,0.0,0.0
4,2019,MTIP,125904,48225.71,134299.7,101493.16,12408.65,0.0,25392.39


- Missing values were filled with zeros, assuming that a lack of entry indicates no spending in that category.
- Checked for negative values and confirmed there were none; all spending amounts are non-negative.
- No duplicate rows were found.
- Potential outliers were identified using a heuristic where spending values greater than three standard deviations from the mean are flagged.


## Fourth Sheet: TIN_Q10-13_Event_Rate


In [57]:
fourth_sheet = pd.read_excel(data, sheet_name=4)
fourth_sheet.head()

Unnamed: 0,PERFORMANCE_YEAR,QRO,TIN_CCN,ED_VISIT_RATE,ED_RETURN_72HRS,IP_ADMIT_RATE,READMIT_30DAY_RATE
0,2019,MTIP,183834,797.1,0.758,307.5,0.139
1,2019,MTIP,834843,688.6,0.696,230.1,0.094
2,2019,MTIP,122406,166.7,1.0,,
3,2019,MTIP,125904,1062.5,0.735,218.8,
4,2019,MTIP,496926,1004.5,0.779,354.0,0.145


In [58]:
# Pre-processing for the "fourth_sheet" (known as fifth sheet in order of Excel sheets)

# Handling missing values: Assuming that no entry (NaN) means the data was not recorded, we will fill missing values with the mean.
# This approach assumes that the missingness is random and that the mean is a reasonable estimate of the missing values.
metrics_columns = ['ED_VISIT_RATE', 'ED_RETURN_72HRS',
                   'IP_ADMIT_RATE', 'READMIT_30DAY_RATE']
for column in metrics_columns:
    mean_value = fourth_sheet[column].mean()
    fourth_sheet[column].fillna(mean_value, inplace=True)

# Check for any negative rates, which might be data entry errors
negative_rates = (fourth_sheet[metrics_columns] < 0).any()

# If there are negative rates, we'll set them to the mean for simplicity
if negative_rates.any():
    for column in metrics_columns:
        mean_value = fourth_sheet[column].mean()
        fourth_sheet[column] = fourth_sheet[column].apply(
            lambda x: mean_value if x < 0 else x)

# Check for duplicates
duplicates = fourth_sheet.duplicated().sum()
# Remove duplicates if any are found
if duplicates > 0:
    fourth_sheet.drop_duplicates(inplace=True)

# Check for outliers using the same heuristic as before: any values more than three standard deviations from the mean.
outliers_thresholds = fourth_sheet[metrics_columns].mean() + \
    (3 * fourth_sheet[metrics_columns].std())
potential_outliers = (
    fourth_sheet[metrics_columns] > outliers_thresholds).any()

# Returning a summary of the pre-processing steps and the head of the cleaned dataframe
preprocessing_summary_fifth_sheet = {
    "missing_values_filled_with_mean": fourth_sheet.isnull().sum().to_dict(),
    "negative_values_set_to_mean": negative_rates.to_dict(),
    "duplicates_removed": duplicates,
    "potential_outliers_identified": potential_outliers.to_dict()
}

pp.pprint(preprocessing_summary_fifth_sheet)
fourth_sheet.head()

{'duplicates_removed': 0,
 'missing_values_filled_with_mean': {'ED_RETURN_72HRS': 0,
                                     'ED_VISIT_RATE': 0,
                                     'IP_ADMIT_RATE': 0,
                                     'PERFORMANCE_YEAR': 0,
                                     'QRO': 0,
                                     'READMIT_30DAY_RATE': 0,
                                     'TIN_CCN': 0},
 'negative_values_set_to_mean': {'ED_RETURN_72HRS': False,
                                 'ED_VISIT_RATE': False,
                                 'IP_ADMIT_RATE': False,
                                 'READMIT_30DAY_RATE': False},
 'potential_outliers_identified': {'ED_RETURN_72HRS': False,
                                   'ED_VISIT_RATE': True,
                                   'IP_ADMIT_RATE': True,
                                   'READMIT_30DAY_RATE': True}}


Unnamed: 0,PERFORMANCE_YEAR,QRO,TIN_CCN,ED_VISIT_RATE,ED_RETURN_72HRS,IP_ADMIT_RATE,READMIT_30DAY_RATE
0,2019,MTIP,183834,797.1,0.758,307.5,0.139
1,2019,MTIP,834843,688.6,0.696,230.1,0.094
2,2019,MTIP,122406,166.7,1.0,278.515108,0.137765
3,2019,MTIP,125904,1062.5,0.735,218.8,0.137765
4,2019,MTIP,496926,1004.5,0.779,354.0,0.145


- Missing values were filled with the mean of their respective columns.
- There were no negative values in the rate columns; all rates are non-negative as expected.
- No duplicate rows were found in the sheet.
- Potential outliers have been identified in "ED_VISIT_RATE", "IP_ADMIT_RATE", and "READMIT_30DAY_RATE".

## Fifth Sheet: TIN_VISIT_Q8-14

In [59]:
# Load the last sheet of the Excel file to review the data, which we will refer to as "fifth_sheet"
fifth_sheet = pd.read_excel(data, sheet_name=5)

# Display the first few rows to understand the structure and summarize the contents
fifth_sheet.head()


Unnamed: 0,PERFORMANCE_YEAR,QRO,TIN_CCN,AWV_RATE,SUM_PCP_VISITS,CNT_PCP_VISITS,SUM_SPEC_VISITS,CNT_SPEC_VISITS
0,2019,MTIP,122406,0.0,988.59,10,460.16,12
1,2019,MTIP,679036,0.0,122.3,1,0.0,0
2,2019,MTIP,344888,0.0,8.79,1,0.0,0
3,2019,MTIP,166870,0.0,0.0,0,0.0,0
4,2019,MTIP,125904,0.438,26994.97,260,9199.88,153


In [60]:
# Handling missing values: Assuming that no entry (NaN) means the data was not recorded, we will fill missing values with 0.
# This assumes that a lack of record means no visits or costs were associated.
visit_columns = ['AWV_RATE', 'SUM_PCP_VISITS', 'CNT_PCP_VISITS', 'SUM_SPEC_VISITS', 'CNT_SPEC_VISITS']
fifth_sheet.fillna(0, inplace=True)

# Check for any negative values in the columns, which might be data entry errors
negative_values = (fifth_sheet[visit_columns] < 0).any()

# If there are negative values, we'll set them to 0 for simplicity
if negative_values.any():
    for column in visit_columns:
        fifth_sheet[column] = fifth_sheet[column].apply(lambda x: 0 if x < 0 else x)

# Check for duplicates
duplicates = fifth_sheet.duplicated().sum()
# Remove duplicates if any are found
if duplicates > 0:
    fifth_sheet.drop_duplicates(inplace=True)

# Check for outliers using the same heuristic as before: any values more than three standard deviations from the mean.
outliers_thresholds = fifth_sheet[visit_columns].mean() + \
                                   (3 * fifth_sheet[visit_columns].std())
potential_outliers = (fifth_sheet[visit_columns] > outliers_thresholds).any()

# Returning a summary of the pre-processing steps and the head of the cleaned dataframe
preprocessing_summary_last_sheet = {
    "missing_values_filled_with_zero": fifth_sheet.isnull().sum().to_dict(),
    "negative_values_set_to_zero": negative_values.to_dict(),
    "duplicates_removed": duplicates,
    "potential_outliers_identified": potential_outliers.to_dict()
}
pp.pprint(preprocessing_summary_last_sheet)
fifth_sheet.head()

{'duplicates_removed': 0,
 'missing_values_filled_with_zero': {'AWV_RATE': 0,
                                     'CNT_PCP_VISITS': 0,
                                     'CNT_SPEC_VISITS': 0,
                                     'PERFORMANCE_YEAR': 0,
                                     'QRO': 0,
                                     'SUM_PCP_VISITS': 0,
                                     'SUM_SPEC_VISITS': 0,
                                     'TIN_CCN': 0},
 'negative_values_set_to_zero': {'AWV_RATE': False,
                                 'CNT_PCP_VISITS': False,
                                 'CNT_SPEC_VISITS': False,
                                 'SUM_PCP_VISITS': False,
                                 'SUM_SPEC_VISITS': False},
 'potential_outliers_identified': {'AWV_RATE': False,
                                   'CNT_PCP_VISITS': True,
                                   'CNT_SPEC_VISITS': True,
                                   'SUM_PCP_VISITS': True,
          

Unnamed: 0,PERFORMANCE_YEAR,QRO,TIN_CCN,AWV_RATE,SUM_PCP_VISITS,CNT_PCP_VISITS,SUM_SPEC_VISITS,CNT_SPEC_VISITS
0,2019,MTIP,122406,0.0,988.59,10,460.16,12
1,2019,MTIP,679036,0.0,122.3,1,0.0,0
2,2019,MTIP,344888,0.0,8.79,1,0.0,0
3,2019,MTIP,166870,0.0,0.0,0,0.0,0
4,2019,MTIP,125904,0.438,26994.97,260,9199.88,153


- Missing values have been filled with zeros, under the assumption that no data recorded indicates no occurrence of the event.
- Checked and confirmed that there are no negative values for any of the rates or counts, ensuring all values are non-negative.
- No duplicate rows were found.
- Potential outliers have been identified in the "SUM_PCP_VISITS", "CNT_PCP_VISITS", "SUM_SPEC_VISITS", and "CNT_SPEC_VISITS" columns. These may represent legitimate extremes or errors and would typically be investigated further in consultation with domain experts.

In [63]:
# Combine all the pre-processed sheets into a single Excel file

# Since we've loaded and processed the data separately, we'll compile the dataframes into a dictionary
sheets_dict = {
    'TIN_Q1-5_TIN_Info': first_sheet,
    'TIN_Q7,8-Total_Cost_EnrollType': second_sheet,
    'TIN_Q9_TotalCost_By_Categories': third_sheet,
    'TIN_Q10-13_Event_Rate': fourth_sheet,
    'TIN_VISIT_Q8-14': fifth_sheet
}

# Create a new Excel writer object and save each sheet
output_excel_path = 'Data_Preprocessed.xlsx'
with pd.ExcelWriter(output_excel_path) as writer:
    for sheet_name, sheet_df in sheets_dict.items():
        sheet_df.to_excel(writer, sheet_name=sheet_name, index=False)


In [88]:
from functools import reduce

# List of dataframes to merge
dataframes = [second_sheet, third_sheet, fourth_sheet, fifth_sheet]
keys = ['PERFORMANCE_YEAR', 'QRO', 'TIN_CCN']

# Verify all dataframes have the keys for merging
consistent_keys_check = all(all(key in df.columns for key in keys) for df in dataframes)

if consistent_keys_check:
    # Perform an outer merge to ensure no data is lost from any dataframe
    integrated_df = reduce(lambda left, right: pd.merge(left, right, on=keys, how='outer'), dataframes)
else:
    integrated_df = None

# integrated_df now contains the merged data if the keys were consistent
