In [None]:
import os
import pandas as pd
import langchain
from langchain.agents import OpenAIFunctionsAgent, AgentExecutor
from langchain.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain.tools import PythonAstREPLTool
from langchain.chat_models import ChatOpenAI
from pydantic import BaseModel, Field
from langchain.memory import ConversationBufferMemory
from langchain.schema.output_parser import StrOutputParser
import json
import gradio as gr
from dotenv import load_dotenv
load_dotenv()

In [35]:
langchain.debug = True
data_dir_path = os.path.join(os.getcwd())
pd.set_option('display.max_rows', 20)
pd.set_option('display.max_columns', 20)
NUM_ROWS_TO_RETURN = 5

In [93]:
def get_data_str_from_df_for_prompt(df, use_head=True, num_rows_to_return=NUM_ROWS_TO_RETURN):
    data = df.head(num_rows_to_return) if use_head else df.tail(num_rows_to_return)
    return f'<df>\n{data.to_markdown()}\n</df>'

entries_a_df = pd.read_csv(os.path.join(data_dir_path, 'legal_entries_a.csv'))
entries_b_df = pd.read_csv(os.path.join(data_dir_path, 'legal_entries_b.csv'))
template_df = pd.read_csv(os.path.join(data_dir_path, 'legal_template.csv'))

In [30]:
analyst_prompt_str = '''
    You are a Junior Data Analyst. You are working with Data Scientists and Software Engineers to transform data from one source format to a target format.
    Here is the head of a CSV file, called source_1_csv:
    
    {source_1_csv_str}
    
    Here is the head of another, called target_csv:
    
    {target_csv_str}
    
    Your job is to generate a thorough, precise summary of exactly which columns in the source file should map to the target file, including a brief explanation for each.
    In addition, if the cell values appear to be different, you should note that as well. For example, if the source date is formatted like this 2020-01-01 and the target date is formatted like this 01/01/2020, you should note that.
    Your output should be natural language; a bulleted list with four values: 1. source column name, 2. target column name, 3. value transformations (if necessary), 4. explanation.
    Be as precise as possible. All of these values are case sensitive. Do not do any transformation or imputation yourself: your job is to explain the data, not change it.
    Do not include rows: focus on the column names and data types.
    DO NOT INCLUDE COMMAS OR QUOTES ANYWHERE THIS WILL BE USED IN CSV GENERATION.
    Instead of quotes, use backticks.
    Your response:
'''

scientist_prompt_str = '''
    You are a Senior Data Scientist. You are generating a CSV that will be used by a Software Engineer to generate python code to transform one CSV file format to another.
    
    Here is the tail of a CSV file, called source_1_csv: 
    
    {source_1_csv_str}
    
    Here are five from another, called target_csv: 
    
    {target_csv_str}
    
    Your employee wrote a document with the mappings: 
    
    {employee_script_str}
    
    Please review their work, think step by step about their proposal, and then revise it with improvements.
    Note: your employee had access to different rows than you did. Neither one of you had access to the entire dataset.
    Your final output should be a CSV with these columns: source_column_name, target_column_name, value_transformation, notes.
    Be as precise as possible. All of these values are case sensitive. Do not do any transformation or imputation yourself: your job is to explain the data, not change it.
    Do not include any commas or quotes in the output, except for the CSV delimiters.
    Instead of quotes, use backticks.
    Your response:
'''

python_prompt_str = '''
    You are a Python Engineer. Your job is to write code to convert a csv-like string into a Pandas dataframe:

    {csv_str}

    You might have to transform the data slightly to get it to work. But do not do any data type transformation. For example, the date strings are strings: do not conver them to data type.
    The int are int, do not convert them to float, etc.

    Your output should be Pandas DataFrame.
'''

csv_convert_prompt_str = '''
    Convert the following csv-like string into valid json:
    {csv_str}
    Steps: 1. infer schema, 2. remove newlines or any other characters that are not json-serializable, 3. convert to valid json, 4. double check your work and make revisions as needed. 
    Your output should be 100% valid json. Do not transform the values, just the structure.
    '''

python_prompt_str = '''
You are a Python Engineer. Your job is to write code to convert JSON string into a Pandas dataframe:

{csv_str}

Do not transform the data at all, just convert it to a csv format and then generate a pandas dataframe.

Return only python code in Markdown format, eg:

```python
....
```"""
'''
from langchain.utilities import PythonREPL

In [25]:
from langchain.agents.agent_toolkits import create_python_agent
from langchain.tools.python.tool import PythonREPLTool
from langchain.python import PythonREPL
from langchain.llms.openai import OpenAI
from langchain.agents.agent_types import AgentType
from langchain.chat_models import ChatOpenAI
import io

agent_executor = create_python_agent(
    llm=ChatOpenAI(temperature=0, model="gpt-4-0613"),
    tool=PythonREPLTool(),
    verbose=True,
    agent_type=AgentType.OPENAI_FUNCTIONS,
    agent_executor_kwargs={"handle_parsing_errors": True},
)

def _sanitize_output(text: str):
    _, after = text.split("```python")
    return after.split("```")[0]

def run_code(thing):
    model = ChatOpenAI(model_name='gpt-3.5-turbo')
    analyst_prompt = ChatPromptTemplate.from_template(analyst_prompt_str)
    scientist_prompt = ChatPromptTemplate.from_template(scientist_prompt_str)

    chain_1 = analyst_prompt | model | StrOutputParser()
    employee_script_str = chain_1.invoke({"source_1_csv_str": get_data_str_from_df_for_prompt(entries_a_df), "target_csv_str": get_data_str_from_df_for_prompt(template_df)})

    chain_2 = scientist_prompt | model | StrOutputParser()
    scientist_csv_str = chain_2.invoke({"source_1_csv_str": get_data_str_from_df_for_prompt(entries_a_df, False), "target_csv_str": get_data_str_from_df_for_prompt(template_df, False), "employee_script_str": employee_script_str})
    csv_prompt = ChatPromptTemplate.from_template(csv_convert_prompt_str)
    chain_3 = csv_prompt| model | StrOutputParser()
    # python_prompt = ChatPromptTemplate.from_template(python_prompt_str)
    # chain_4 = {"csv_string": chain_3} | python_prompt | model | StrOutputParser() | _sanitize_output | PythonREPL().run
    # csv_bytes = io.BytesIO(scientist_csv_str.encode())
    return pd.read_csv(json.loads(chain_3.invoke({"csv_str": scientist_csv_str})))
    # return chain_3.invoke({"csv_str": scientist_csv_str})
    # return agent_executor.run(f"Please convert this csv-like string into a Pandas Dataframe: {scientist_csv_str}. Do not transform the data: it should have the same column names and types. Do not convert date strings to datetimes or ints to floats. Return one dataframe and nothing else.")

In [26]:
demo = gr.Interface(fn=run_code, inputs=["file"], outputs='dataframe')
demo.launch()

Running on local URL:  http://127.0.0.1:7871

To create a public link, set `share=True` in `launch()`.




[32;1m[1;3m[chain/start][0m [1m[1:chain:RunnableSequence] Entering Chain run with input:
[0m{
  "source_1_csv_str": "<df>\n|    | case_date   | lastname   | firstname   | case_type   | case_id   |   court_fee | jurisdiction   |\n|---:|:------------|:-----------|:------------|:------------|:----------|------------:|:---------------|\n|  0 | 2023-05-12  | Kim        | Miguel      | Civil       | CR-1095   |         100 | BOSTON         |\n|  1 | 2023-04-20  | Lee        | John        | Criminl     | CR-8597   |         150 | houston        |\n|  2 | 2023-02-10  | Smith      | Dmitri      | Criminal    | CR-6833   |         200 | chicago        |\n|  3 | 2023-03-16  | Patel      | Dmitri      | Criminal    | CR-2899   |         100 | BOSTON         |\n|  4 | 2023-06-15  | Ivanov     | Jane        | Family      | CR-5997   |         200 | houston        |\n</df>",
  "target_csv_str": "<df>\n|    | CaseDate   | FullName     | CaseType   | CaseID    |   Fee | Jurisdiction   |\n|---:|:--

Traceback (most recent call last):
  File "/Users/andybryant/Desktop/projects/zero-mapper/venv/lib/python3.9/site-packages/gradio/routes.py", line 488, in run_predict
    output = await app.get_blocks().process_api(
  File "/Users/andybryant/Desktop/projects/zero-mapper/venv/lib/python3.9/site-packages/gradio/blocks.py", line 1431, in process_api
    result = await self.call_function(
  File "/Users/andybryant/Desktop/projects/zero-mapper/venv/lib/python3.9/site-packages/gradio/blocks.py", line 1109, in call_function
    prediction = await anyio.to_thread.run_sync(
  File "/Users/andybryant/Desktop/projects/zero-mapper/venv/lib/python3.9/site-packages/anyio/to_thread.py", line 33, in run_sync
    return await get_asynclib().run_sync_in_worker_thread(
  File "/Users/andybryant/Desktop/projects/zero-mapper/venv/lib/python3.9/site-packages/anyio/_backends/_asyncio.py", line 877, in run_sync_in_worker_thread
    return await future
  File "/Users/andybryant/Desktop/projects/zero-mapper/ven

In [19]:
scientist_csv_str = 'source_column_name, target_column_name, value_transformation, notes\ncase_date, CaseDate, None, The source column "case_date" maps directly to the target column "CaseDate" without any value transformations. The data represents the date of the case in both files.\nlastname, FullName, None, The source column "lastname" maps directly to the target column "FullName" without any value transformations. The data represents the last name of the individual involved in the case in both files.\nfirstname, FullName, None, The source column "firstname" maps directly to the target column "FullName" without any value transformations. The data represents the first name of the individual involved in the case in both files.\ncase_type, CaseType, None, The source column "case_type" maps directly to the target column "CaseType" without any value transformations. The data represents the type of the case in both files.\ncase_id, CaseID, None, The source column "case_id" maps directly to the target column "CaseID" without any value transformations. The data represents the unique identifier of the case in both files.\ncourt_fee, Fee, None, The source column "court_fee" maps directly to the target column "Fee" without any value transformations. The data represents the fee associated with the case in both files.\njurisdiction, Jurisdiction, "BOSTON" -> "Boston", "houston" -> "Houston", "chicago" -> "Chicago", The source column "jurisdiction" maps directly to the target column "Jurisdiction" with the following value transformations: "BOSTON" is transformed to "Boston", "houston" is transformed to "Houston", and "chicago" is transformed to "Chicago". The data represents the jurisdiction where the case is filed, and the capitalization is adjusted to match the formatting in the target file.'

In [21]:
def process_files(files):
    # Process files here    
    return scientist_csv_str

iface = gr.Interface(fn=process_files, 
                     inputs=[gr.File(label="Load files", file_types=['csv'], file_count='multiple')], 
                     outputs=gr.Markdown(interactive=True)) 

iface.launch()

Running on local URL:  http://127.0.0.1:7870

To create a public link, set `share=True` in `launch()`.




In [31]:
model = ChatOpenAI(model_name='gpt-3.5-turbo')
analyst_prompt = ChatPromptTemplate.from_template(analyst_prompt_str)
scientist_prompt = ChatPromptTemplate.from_template(scientist_prompt_str)

chain_1 = analyst_prompt | model | StrOutputParser()
employee_script_str = chain_1.invoke({"source_1_csv_str": get_data_str_from_df_for_prompt(entries_a_df), "target_csv_str": get_data_str_from_df_for_prompt(template_df)})

chain_2 = scientist_prompt | model | StrOutputParser()
scientist_csv_str = chain_2.invoke({"source_1_csv_str": get_data_str_from_df_for_prompt(entries_a_df, False), "target_csv_str": get_data_str_from_df_for_prompt(template_df, False), "employee_script_str": employee_script_str})
csv_prompt = ChatPromptTemplate.from_template(csv_convert_prompt_str)
chain_3 = csv_prompt| model | StrOutputParser()
# python_prompt = ChatPromptTemplate.from_template(python_prompt_str)
# chain_4 = {"csv_string": chain_3} | python_prompt | model | StrOutputParser() | _sanitize_output | PythonREPL().run
# csv_bytes = io.BytesIO(scientist_csv_str.encode())
res = chain_3.invoke({"csv_str": scientist_csv_str})

[32;1m[1;3m[chain/start][0m [1m[1:chain:RunnableSequence] Entering Chain run with input:
[0m{
  "source_1_csv_str": "<df>\n|    | case_date   | lastname   | firstname   | case_type   | case_id   |   court_fee | jurisdiction   |\n|---:|:------------|:-----------|:------------|:------------|:----------|------------:|:---------------|\n|  0 | 2023-05-12  | Kim        | Miguel      | Civil       | CR-1095   |         100 | BOSTON         |\n|  1 | 2023-04-20  | Lee        | John        | Criminl     | CR-8597   |         150 | houston        |\n|  2 | 2023-02-10  | Smith      | Dmitri      | Criminal    | CR-6833   |         200 | chicago        |\n|  3 | 2023-03-16  | Patel      | Dmitri      | Criminal    | CR-2899   |         100 | BOSTON         |\n|  4 | 2023-06-15  | Ivanov     | Jane        | Family      | CR-5997   |         200 | houston        |\n</df>",
  "target_csv_str": "<df>\n|    | CaseDate   | FullName     | CaseType   | CaseID    |   Fee | Jurisdiction   |\n|---:|:--

In [32]:
res

'[\n    {\n        "source_column_name": "case_date",\n        "target_column_name": "CaseDate",\n        "value_transformation": "None",\n        "notes": "The source column `case_date` should map to the target column `CaseDate` as it represents the date of the case. No value transformations are required."\n    },\n    {\n        "source_column_name": "lastname",\n        "target_column_name": "FullName",\n        "value_transformation": "None",\n        "notes": "The source column `lastname` should map to the target column `FullName` as it represents the last name of the individual involved in the case. No value transformations are required."\n    },\n    {\n        "source_column_name": "firstname",\n        "target_column_name": "FullName",\n        "value_transformation": "None",\n        "notes": "The source column `firstname` should also map to the target column `FullName` as it represents the first name of the individual involved in the case. No value transformations are requir

In [70]:
from langchain.output_parsers import PydanticOutputParser
from pydantic import BaseModel, Field, validator
from langchain.prompts import (
    ChatPromptTemplate,
)
from pydantic import BaseModel, Field, validator, ValidationError

class TableMappingEntry(BaseModel):
    source_column_name: str = Field(..., description="Name of the column in the source table.")
    target_column_name: str = Field(..., description="Name of the column in the target table.")
    value_transformations: str = Field(None, description="The transformations that need to be applied to the source values to make it match the target values, if necessary.")
    explanation: str = Field(None, description="A brief explanation of why the source column maps to the target column. Include any other relevant information.")

    @validator("source_column_name", "target_column_name", "value_transformations", "explanation", pre=True, always=True)
    def check_forbidden_characters(cls, value):
        if value:
            forbidden_chars = [",", "\"", "'"]
            for char in forbidden_chars:
                if char in value:
                    raise ValueError(f"Field contains forbidden character: {char}")
        return value

class TableMapping(BaseModel):
    table_mappings: list[TableMappingEntry] = Field(..., description="A list of table mappings.")

    @validator('table_mappings', pre=True, always=True)
    def check_unique_source_and_target_names(cls, value):
        source_columns = [entry.source_column_name for entry in value]
        target_columns = [entry.target_column_name for entry in value]
        
        if len(source_columns) != len(set(source_columns)):
            raise ValueError("Duplicate source column names detected.")
        if len(target_columns) != len(set(target_columns)):
            raise ValueError("Duplicate target column names detected.")
        
        return value

# Test the models
try:
    entry = TableMappingEntry(source_column_name="FirstName,LastName", target_column_name="FullName")
except ValidationError as e:
    print(e)
    
analyst_prompt_str = '''
    You are a Data Scientist, who specializes in generating mappings between schemas for use by Software Engineers in ETL pipelines.
    Head of `source_csv`:
    
    {source_1_csv_str}
    
    Head of `target_csv`:
    
    {target_csv_str}
    
    Your job is to generate a thorough, precise summary of exactly which columns in `source_csv` map to columns in `target_csv`.
    In addition, compare cell values and note any formatting differences. For example:
    - If date in `source_csv` is `2020-01-01` and date in `target_csv` is `01/01/2020`, you should mention that it's the same date, but structured differently.
    - If `source_csv` has columns "FirstName" and "LastName", but `target_csv` only has "FullName", you should include both of the first columns as entries, but also explain how they should be transform to adhere to the target column.

    Lastly, point out any other oddities, such as duplicate columns, erroneous columns, etc.
    
    {format_instructions}
    
    Remember:
    - Be concise: you are speaking to engineers, not customers.
    - Be precise: all of these values are case sensitive. 
    - DO NOT transform or impute values: your job is to explain the data, not change it.
    - DO NOT include commas, quotes, or any other characters that might interfere with JSON serialization or CSV generation

    Your response:
'''

table_mapping_parser = PydanticOutputParser(pydantic_object=TableMapping)
analyst_prompt = ChatPromptTemplate.from_template(
    template=analyst_prompt_str, 
    partial_variables={'format_instructions': table_mapping_parser.get_format_instructions()},
)

In [69]:
chain_1 = analyst_prompt | model | table_mapping_parser
table_mapping: TableMapping = chain_1.invoke({"source_1_csv_str": get_data_str_from_df_for_prompt(entries_a_df), "target_csv_str": get_data_str_from_df_for_prompt(template_df)})

[32;1m[1;3m[chain/start][0m [1m[1:chain:RunnableSequence] Entering Chain run with input:
[0m{
  "source_1_csv_str": "<df>\n|    | case_date   | lastname   | firstname   | case_type   | case_id   |   court_fee | jurisdiction   |\n|---:|:------------|:-----------|:------------|:------------|:----------|------------:|:---------------|\n|  0 | 2023-05-12  | Kim        | Miguel      | Civil       | CR-1095   |         100 | BOSTON         |\n|  1 | 2023-04-20  | Lee        | John        | Criminl     | CR-8597   |         150 | houston        |\n|  2 | 2023-02-10  | Smith      | Dmitri      | Criminal    | CR-6833   |         200 | chicago        |\n|  3 | 2023-03-16  | Patel      | Dmitri      | Criminal    | CR-2899   |         100 | BOSTON         |\n|  4 | 2023-06-15  | Ivanov     | Jane        | Family      | CR-5997   |         200 | houston        |\n</df>",
  "target_csv_str": "<df>\n|    | CaseDate   | FullName     | CaseType   | CaseID    |   Fee | Jurisdiction   |\n|---:|:--

[32;1m[1;3m[chain/start][0m [1m[1:chain:RunnableSequence] Entering Chain run with input:
[0m{
  "source_1_csv_str": "<df>\n|    | case_date   | lastname   | firstname   | case_type   | case_id   |   court_fee | jurisdiction   |\n|---:|:------------|:-----------|:------------|:------------|:----------|------------:|:---------------|\n|  0 | 2023-05-12  | Kim        | Miguel      | Civil       | CR-1095   |         100 | BOSTON         |\n|  1 | 2023-04-20  | Lee        | John        | Criminl     | CR-8597   |         150 | houston        |\n|  2 | 2023-02-10  | Smith      | Dmitri      | Criminal    | CR-6833   |         200 | chicago        |\n|  3 | 2023-03-16  | Patel      | Dmitri      | Criminal    | CR-2899   |         100 | BOSTON         |\n|  4 | 2023-06-15  | Ivanov     | Jane        | Family      | CR-5997   |         200 | houston        |\n</df>",
  "target_csv_str": "<df>\n|    | CaseDate   | FullName     | CaseType   | CaseID    |   Fee | Jurisdiction   |\n|---:|:--

In [71]:
employee_script_str.dict()

{'table_mappings': [{'source_column_name': 'case_date',
   'target_column_name': 'CaseDate',
   'value_transformations': '',
   'explanation': 'The `case_date` column in the source file maps to the `CaseDate` column in the target file. The values in both columns represent the date of the case. No value transformations are needed.'},
  {'source_column_name': 'lastname',
   'target_column_name': 'FullName',
   'value_transformations': '',
   'explanation': 'The `lastname` column in the source file maps to the `FullName` column in the target file. The values in both columns represent the last name of the person associated with the case. No value transformations are needed.'},
  {'source_column_name': 'firstname',
   'target_column_name': 'FullName',
   'value_transformations': '',
   'explanation': 'The `firstname` column in the source file also maps to the `FullName` column in the target file. The values in both columns represent the first name of the person associated with the case. No 

In [102]:
from langchain.utilities import PythonREPL
from langchain.prompts import SystemMessagePromptTemplate, HumanMessagePromptTemplate

template = """Write some python code to solve the user's problem. 

Return only python code in Markdown format, eg:

```python
....
```"""
python_prompt = ChatPromptTemplate(messages=[
    SystemMessagePromptTemplate.from_template(template),
    HumanMessagePromptTemplate.from_template("{input}")
])

# Input is sanitized by default
python_chain = python_prompt | model | StrOutputParser()

In [103]:
question = f'''
    You are an expert Python Engineer, working on an ETL pipeline. Your colleague has written a pydantic object that describes the mapping between a source table and a target table.
    This source table data is now in a Pandas DataFrame called `source_df`. Your job is to write valid python code to copy this Pandas DataFrame and transform it to the target schema.
    Here is a stringified version of the mapping object:
    
    {str(table_mapping)}

    Your code should do the following:
    - Review the mapping object and understand the transformation logic
    - Copy the source_df to a new DataFrame called `target_df`
    - DO NOT MODIFY the `source_df`
    - Transform the `target_df` to the target schema
    - Validate that it matches the target schema
    - Return the `target_df`

    Once executed, this code must return a Pandas DataFrame that matches the target schema.

    Return only python code in Markdown format, eg:
    ```python
    ....
    ```"""
'''
res = python_chain.invoke({"input": question})

[32;1m[1;3m[chain/start][0m [1m[1:chain:RunnableSequence] Entering Chain run with input:
[0m{
  "input": "\n    You are an expert Python Engineer, working on an ETL pipeline. Your colleague has written a pydantic object that describes the mapping between a source table and a target table.\n    This source table data is now in a Pandas DataFrame called `source_df`. Your job is to write valid python code to copy this Pandas DataFrame and transform it to the target schema.\n    Here is a stringified version of the mapping object:\n    \n    table_mappings=[TableMappingEntry(source_column_name='case_date', target_column_name='CaseDate', value_transformations='', explanation=\"The source column 'case_date' maps to the target column 'CaseDate' without any value transformations.\"), TableMappingEntry(source_column_name='lastname', target_column_name='FullName', value_transformations='', explanation=\"The source column 'lastname' maps to the target column 'FullName' without any value tran

In [104]:
res

'```python\nimport pandas as pd\n\n# Define the mapping object\nclass TableMappingEntry:\n    def __init__(self, source_column_name, target_column_name, value_transformations, explanation):\n        self.source_column_name = source_column_name\n        self.target_column_name = target_column_name\n        self.value_transformations = value_transformations\n        self.explanation = explanation\n\n# Define the source DataFrame\nsource_df = pd.DataFrame({\n    \'case_date\': [\'2021-01-01\', \'2021-01-02\'],\n    \'lastname\': [\'Doe\', \'Smith\'],\n    \'firstname\': [\'John\', \'Jane\'],\n    \'case_type\': [\'Criminal\', \'Civil\'],\n    \'case_id\': [1, 2],\n    \'court_fee\': [100, 200],\n    \'jurisdiction\': [\'State\', \'Federal\']\n})\n\n# Define the target schema\ntarget_df_schema = pd.DataFrame({\n    \'CaseDate\': pd.Series(dtype=\'datetime64[ns]\'),\n    \'FullName\': pd.Series(dtype=\'object\'),\n    \'CaseType\': pd.Series(dtype=\'object\'),\n    \'CaseID\': pd.Series(dty

In [90]:
from langchain.agents.agent_toolkits import create_python_agent
from langchain.tools.python.tool import PythonREPLTool
from langchain.python import PythonREPL
from langchain.llms.openai import OpenAI
from langchain.agents.agent_types import AgentType
from langchain.chat_models import ChatOpenAI
import io
from langchain.agents import create_pandas_dataframe_agent

agent = create_pandas_dataframe_agent(
    ChatOpenAI(temperature=0, model="gpt-4-0613"),
    # ChatOpenAI(temperature=0, model="gpt-3.5-turbo-0613"),
    entries_a_df,
    verbose=True,
    agent_type=AgentType.OPENAI_FUNCTIONS,
    handle_parsing_errors=True
)

question = f'''
    df is the source data. Here is the mapping between the source schema and the target schema:
    {str(table_mapping)}
    Transform df to the target schema.
    Do not query the whole df. Use the head, generate the logic, apply it, validate the results.
'''

res = agent.run(input=question, verbose=True)

[32;1m[1;3m[chain/start][0m [1m[1:chain:AgentExecutor] Entering Chain run with input:
[0m{
  "input": "\n    df is the source data. Here is the mapping between the source schema and the target schema:\n    table_mappings=[TableMappingEntry(source_column_name='case_date', target_column_name='CaseDate', value_transformations='', explanation=\"The source column 'case_date' maps to the target column 'CaseDate' without any value transformations.\"), TableMappingEntry(source_column_name='lastname', target_column_name='FullName', value_transformations='', explanation=\"The source column 'lastname' maps to the target column 'FullName' without any value transformations.\"), TableMappingEntry(source_column_name='firstname', target_column_name='', value_transformations='', explanation=\"The source column 'firstname' does not have a direct mapping in the target schema. The values from 'firstname' need to be combined with 'lastname' to form 'FullName' in the target schema.\"), TableMappingEntr

In [94]:
entries_a_df

Unnamed: 0,case_date,lastname,firstname,case_type,case_id,court_fee,jurisdiction
0,2023-05-12,Kim,Miguel,Civil,CR-1095,100,BOSTON
1,2023-04-20,Lee,John,Criminl,CR-8597,150,houston
2,2023-02-10,Smith,Dmitri,Criminal,CR-6833,200,chicago
3,2023-03-16,Patel,Dmitri,Criminal,CR-2899,100,BOSTON
4,2023-06-15,Ivanov,Jane,Family,CR-5997,200,houston
...,...,...,...,...,...,...,...
95,2023-08-13,Rodriguez,Jane,Familly,CR-8883,200,chicago
96,2023-03-27,Patel,John,Familly,CR-2838,100,BOSTON
97,2023-07-27,Okafor,Miguel,Criminal,CR-3885,250,new York
98,2023-01-14,Ivanov,Alan,Family,CR-1066,250,houston
