# A whole program to do an analysis of dataframes, complete with tests

## Setup

-- ChatGPT4, browser

# Notes

In [5]:
# -- The prompt here excludes project specific details that are not relevant here and are trivial to code (especially nowadays :) )
# -- I had to ask ChatGPT to continue its output because it had provided an incomplete code fragment in response to Prompt 1. I just asked 'continue'.
# -- I had to copy code that fell out of the code window that ChatGPT uses.
# -- I had to edit the generated text to add a closing multi line comment with triple quotes. ChatGPT output starts to pause and choke when it gets very long.

# Purpose

Use ChatGPT to produce a complete, full-featured program that solves an actual data analysis task I needed to solve.

## Background

In [2]:
# I needed to compare one set with another set of multiple CSV files. I needed to see:
# -- If the set of columns were the same
# --   If there were differences, what the differences were
# -- If the order of the columns was different
# -- If the datatype for columns in common were the same
# --   If there were differences, sample values for those differences.

## Non Functional Requirements

In [3]:
# I wanted to describe the full program upfront.
# I wanted to use functions to make the code more readable.
# I wanted to have functions with type hints, because type hints are awesome.
# I wanted to document the functions.

## Prompt 1 -- Create the program

Write a python program to compare the data profile between two pandas dataframes. Look at the column names and provide an analysis of what's not in common. If column name order is different, print out the order of the columns. For columns in common, identify if the column data types are different. When there's a difference between data types, provide 3 sample values from each dataframe column. Breakdown the program into functions with type hints, and document the functions.

## Generated code

In [None]:
import pandas as pd
from typing import List, Tuple

def compare_columns(df1: pd.DataFrame, df2: pd.DataFrame) -> Tuple[List[str], List[str]]:
    """
    Compare columns between two dataframes.

    :param df1: First dataframe
    :param df2: Second dataframe
    :return: A tuple with two lists. The first contains the columns that exist
        in only one of the dataframes (df1-only columns first, then df2-only
        columns, each in their original order). The second contains the columns
        present in both dataframes whose position differs, in df1 order.
    """
    cols1 = list(df1.columns)
    cols2 = list(df2.columns)
    in_df1 = set(cols1)
    in_df2 = set(cols2)

    # Build the result from the ordered column lists rather than from a set so
    # the output is deterministic; dict.fromkeys drops repeated labels while
    # keeping first-seen order.
    only_in_one = [col for col in cols1 if col not in in_df2]
    only_in_one += [col for col in cols2 if col not in in_df1]
    not_in_common = list(dict.fromkeys(only_in_one))

    # Only columns present in both dataframes can be "in a different order";
    # looking up a df1-only column in df2 would otherwise raise KeyError.
    position_in_df2 = {col: idx for idx, col in enumerate(cols2)}
    diff_order = [
        col
        for idx, col in enumerate(cols1)
        if col in position_in_df2 and position_in_df2[col] != idx
    ]

    return not_in_common, diff_order

def compare_data_types(df1: pd.DataFrame, df2: pd.DataFrame, common_columns: List[str]) -> List[str]:
    """
    Find the shared columns whose dtype is not the same in both dataframes.

    :param df1: First dataframe
    :param df2: Second dataframe
    :param common_columns: Names of the columns present in both dataframes
    :return: The entries of ``common_columns`` whose dtypes differ, in the
        order they were given
    """
    mismatched: List[str] = []
    for name in common_columns:
        left_dtype = df1[name].dtype
        right_dtype = df2[name].dtype
        if left_dtype != right_dtype:
            mismatched.append(name)
    return mismatched

def get_sample_values(
    df1: pd.DataFrame,
    df2: pd.DataFrame,
    diff_data_types: List[str],
    sample_size: int = 3,
) -> None:
    """
    Print sample values from each dataframe for columns with different data types.

    Up to ``sample_size`` non-null values are drawn at random from each
    dataframe's column. If a column holds fewer non-null values than that, all
    of them are printed instead of raising an error.

    :param df1: First dataframe
    :param df2: Second dataframe
    :param diff_data_types: List of columns with different data types
    :param sample_size: Maximum number of values to print per column (default 3)
    :return: None
    """
    for col in diff_data_types:
        print(f"Column '{col}' has different data types.")
        for label, frame in (("DataFrame 1", df1), ("DataFrame 2", df2)):
            non_null = frame[col].dropna()
            # Series.sample(n) raises ValueError when n exceeds the number of
            # rows, so never ask for more values than are available.
            count = min(sample_size, len(non_null))
            values = non_null.sample(count).values if count else non_null.values
            print(f"Sample values from {label}: {values}")
        print()

def data_profile_comparison(df1: pd.DataFrame, df2: pd.DataFrame) -> None:
    """
    Compare the data profile between two dataframes and print a report.

    The report covers, in order: columns that are not shared, columns whose
    position differs (together with the full column order of each dataframe),
    and sample values for shared columns whose data types differ.

    :param df1: First dataframe
    :param df2: Second dataframe
    :return: None
    """

    not_in_common, diff_order = compare_columns(df1, df2)

    # Column labels are not guaranteed to be strings, so convert before joining.
    if not_in_common:
        print(f"Columns not in common: {', '.join(map(str, not_in_common))}")
    else:
        print("Both dataframes have the same columns.")

    if diff_order:
        print(f"Columns with different order: {', '.join(map(str, diff_order))}")
        # Show the actual orderings so the reader can see how they differ.
        print(f"DataFrame 1 column order: {', '.join(map(str, df1.columns))}")
        print(f"DataFrame 2 column order: {', '.join(map(str, df2.columns))}")
    else:
        print("Both dataframes have the same column order.")

    # Keep df1's column order (instead of iterating a set intersection) so the
    # report is printed in the same order on every run.
    df2_columns = set(df2.columns)
    common_columns = [col for col in df1.columns if col in df2_columns]
    diff_data_types = compare_data_types(df1, df2, common_columns)

    if diff_data_types:
        get_sample_values(df1, df2, diff_data_types)
    else:
        print("All common columns have the same data types.")
    
# # Example usage:
# data1 = {
# 'A': [1, 2, 3],
# 'B': [4.1, 5.2, 6.3],
# 'C': ['a', 'b', 'c']
# }

# data2 = {
# 'B': [7, 8, 9],
# 'A': [1.1, 2.2, 3.3],
# 'D': ['x', 'y', 'z']
# }

# df1 = pd.DataFrame(data1)
# df2 = pd.DataFrame(data2)

# data_profile_comparison(df1, df2)


## Prompt 2 -- And write some tests for the functions

In [8]:
# Notes:
# -- First time that I executed this, I received a function with some tests. This time, I received a class-based solution.
# -- I've used pytest in the past, but not unittest-based tests. Cool. Now I see how that works.

## Code Generated

In [None]:
import unittest

class TestDataProfileComparison(unittest.TestCase):
    """Tests for the dataframe profile comparison functions."""

    def setUp(self):
        # Two frames that share columns A and B (in swapped positions and with
        # different dtypes) and each have one column the other lacks.
        self.data1 = {
            'A': [1, 2, 3],
            'B': [4.1, 5.2, 6.3],
            'C': ['a', 'b', 'c']
        }

        self.data2 = {
            'B': [7, 8, 9],
            'A': [1.1, 2.2, 3.3],
            'D': ['x', 'y', 'z']
        }

        self.df1 = pd.DataFrame(self.data1)
        self.df2 = pd.DataFrame(self.data2)

    def _common_columns(self):
        """Return the columns shared by both frames, in df1 order."""
        return [col for col in self.df1.columns if col in self.df2.columns]

    def test_compare_columns(self):
        not_in_common, diff_order = compare_columns(self.df1, self.df2)
        # assertCountEqual checks the elements regardless of order, so the test
        # does not depend on set iteration order.
        self.assertCountEqual(not_in_common, ['C', 'D'])
        self.assertCountEqual(diff_order, ['A', 'B'])

    def test_compare_data_types(self):
        diff_data_types = compare_data_types(self.df1, self.df2, self._common_columns())
        self.assertCountEqual(diff_data_types, ['A', 'B'])

    def test_get_sample_values(self):
        diff_data_types = compare_data_types(self.df1, self.df2, self._common_columns())
        get_sample_values(self.df1, self.df2, diff_data_types)  # This test is only to check if the function executes without errors

    def test_data_profile_comparison(self):
        data_profile_comparison(self.df1, self.df2)  # This test is only to check if the function executes without errors

if __name__ == "__main__":
    # Pass an explicit argv so unittest does not parse the host process's
    # command-line arguments, and disable exit so the interpreter keeps
    # running after the tests finish.
    placeholder_argv = ['first-arg-is-ignored']
    unittest.main(argv=placeholder_argv, exit=False)


### I don't like this function testing solution. I much preferred working off an earlier one. I'll add the results from the earlier assay at a later date.

In [11]:
# Observations/Recommendations:
# -- You can create a whole solution from an initial set of asks.
# -- You can ask for tests to exercise the solution.
# -- You can get wildly different solutions based on the same prompt.
# --   So, good idea to save off the solution that you generate and like
# -- Modify the prompt to get different solutions, some better than others, and explore the universe of solutions.