In [5]:
import pandas as pd
from datacompy import Compare, render


file1 = "Old.xlsx"
file2 = "New_3.xlsx"

def read_data(file):
    """Reads data from an uploaded file based on its extension."""
    if file.endswith('.xlsx'):
        return pd.read_excel(file)
    elif file.endswith('.csv'):
        return pd.read_csv(file)
    else:
        raise ValueError("Unsupported file format. Please upload a CSV or Excel file.")

class DatalyCompare(Compare):
    def Regression_report(
        self,
        sample_count: int = 10,
    ) -> str:
        def df_to_str(pdf: pd.DataFrame) -> str:
            if not self.on_index:
                pdf = pdf.reset_index(drop=True)
            return pdf.to_string()

        # Header
        report = ("Dataly Regression Test Output")
        report += "\n"
        report += "--------------------"
        report += "\n\n"
        report += "DataFrame Summary"
        report += "\n"
        report += "-----------------"
        report += "\n\n"
        df_header = pd.DataFrame(
            {
                "DataFrame": [self.df1_name, self.df2_name],
                "Columns": [self.df1.shape[1], self.df2.shape[1]],
                "Rows": [self.df1.shape[0], self.df2.shape[0]],
            }
        )
        report += df_header[["DataFrame", "Columns", "Rows"]].to_string()
        report += "\n\n"

        # Row Summary
        if self.on_index:
            match_on = "index"
        else:
            match_on = ", ".join(self.join_columns)
        report += render(
            "row_summary.txt",
            match_on,
            self.abs_tol,
            self.rel_tol,
            self.intersect_rows.shape[0],
            self.df1_unq_rows.shape[0],
            self.df2_unq_rows.shape[0],
            self.intersect_rows.shape[0] - self.count_matching_rows(),
            self.count_matching_rows(),
            self.df1_name,
            self.df2_name,
            "Yes" if self._any_dupes else "No",
        )

        match_stats = []
        match_sample = []
        any_mismatch = False
        for column in self.column_stats:
            if not column["all_match"]:
                any_mismatch = True
                match_stats.append(
                    {
                        "Column": column["column"],
                        f"{self.df1_name} dtype": column["dtype1"],
                        f"{self.df2_name} dtype": column["dtype2"],
                        "# Unequal": column["unequal_cnt"],
                        "Max Diff": column["max_diff"],
                        "# Null Diff": column["null_diff"],
                    }
                )
                if column["unequal_cnt"] > 0:
                    match_sample.append(
                        self.sample_mismatch(
                            column["column"], sample_count, for_display=True
                        )
                    )

        if any_mismatch:
            report += "Columns with Unequal Values or Types\n"
            report += "------------------------------------\n"
            report += "\n"
            df_match_stats = pd.DataFrame(match_stats)
            df_match_stats.sort_values("Column", inplace=True)
            # Have to specify again for sorting
            report += df_match_stats[
                [
                    "Column",
                    f"{self.df1_name} dtype",
                    f"{self.df2_name} dtype",
                    "# Unequal",
                    "Max Diff",
                    "# Null Diff",
                ]
            ].to_string()
            report += "\n\n"

            if sample_count > 0:
                report += "Sample Rows with Unequal Values\n"
                report += "-------------------------------\n"
                report += "\n"
                for sample in match_sample:
                    report += df_to_str(sample)
                    report += "\n\n"

        return report


df1 = read_data(file1)
df2 = read_data(file2)

In [7]:
join_columns = ["index"]

compare = DatalyCompare(
df1,
df2,
join_columns=join_columns, #You can also specify a list of columns
abs_tol=0.0001,
rel_tol=0,
df1_name="original",
df2_name="new")

print(compare.Regression_report())



Dataly Regression Test Output
--------------------

DataFrame Summary
-----------------

  DataFrame  Columns    Rows
0  original       73  211567
1       new       73  211567

Row Summary
-----------

Matched on: index
Any duplicates on match values: Yes
Absolute Tolerance: 0.0001
Relative Tolerance: 0
Number of rows in common: 211,567
Number of rows in original but not in new: 0
Number of rows in new but not in original: 0

Number of rows with some compared columns unequal: 120,522
Number of rows with all compared columns equal: 91,045

Columns with Unequal Values or Types
------------------------------------

              Column original dtype new dtype  # Unequal  Max Diff  # Null Diff
2  b_cover_type_code         object    object      50991       0.0            0
1       benefit_type         object    object      50991       0.0            0
0           category         object    object      89522       0.0            0

Sample Rows with Unequal Values
---------------------------

In [2]:
import pandas as pd
from src.core import compare
df1 = pd.DataFrame()
df2 = pd.DataFrame()
comparison = compare.DatalyCompare(df1, df2, join_columns=['A'])
report = comparison.Regression_report()

ValueError: df1 must have all columns from join_columns

In [7]:
print(report)

Dataly Regression Test Output
--------------------
The report was generated on 2024-06-17 13:33:08

DataFrame Summary
-----------------

  DataFrame  Columns  Rows
0       df1        2     3
1       df2        2     3

Column Summary
--------------

Number of columns in common: 2
Number of columns in df1 but not in df2: 0
Number of columns in df2 but not in df1: 0

Columns in df1 Have all Null values: 
Columns in df2 Have all Null values: 

Row Summary
-----------

Matched on: a
Any duplicates on match values: No
Absolute Tolerance: 0
Relative Tolerance: 0
Number of rows in common: 3
Number of rows in df1 but not in df2: 0
Number of rows in df2 but not in df1: 0

Number of rows with some compared columns unequal: 0
Number of rows with all compared columns equal: 3




In [7]:

join_columns = ["B_Cover_ID"]

compare = Compare(
df1,
df2,
join_columns=join_columns, #You can also specify a list of columns
abs_tol=0.0001,
rel_tol=0,
df1_name="original",
df2_name="new")

print(compare.report())

DataComPy Comparison
--------------------

DataFrame Summary
-----------------

  DataFrame  Columns    Rows
0  original       73  211567
1       new       73   89522

Column Summary
--------------

Number of columns in common: 73
Number of columns in original but not in new: 0
Number of columns in new but not in original: 0

Row Summary
-----------

Matched on: b_cover_id
Any duplicates on match values: No
Absolute Tolerance: 0.0001
Relative Tolerance: 0
Number of rows in common: 89,522
Number of rows in original but not in new: 122,045
Number of rows in new but not in original: 0

Number of rows with some compared columns unequal: 0
Number of rows with all compared columns equal: 89,522

Column Comparison
-----------------

Number of columns compared with some values unequal: 0
Number of columns compared with all values equal: 73
Total number of values which compare unequal: 0

Columns with Unequal Values or Types
------------------------------------

         Column original dtype n