In [81]:
# load raw data and cleaned data
import pandas as pd
import numpy as np
import re
import swifter
import warnings
warnings.filterwarnings("ignore")

raw_data = pd.read_excel("../raw data/teak.xlsx")
cleaned_data = pd.read_csv("../cleaned data/teak.csv")

In [134]:
def count_diff(old, new, columns):
  old_df = old.copy().replace('', np.nan).fillna('nan')
  new_df = new.copy().replace('', np.nan).fillna('nan')
  correct = 0
  for col in columns:
    if col not in old_df.columns:
      continue
    for i in range(len(new_df[col])):
      if new_df[col][i] == old_df[col][i]:
        correct += 1
  return len(old_df)*len(columns) - correct

In [83]:
cleaned_data.columns

Index(['Bill of Lading Number', 'Arrival Date', 'Consignee',
       'Consignee Country', 'Consignee Full Address', 'Consignee Profile',
       'Shipper', 'Shipper Country', 'Shipper Full Address',
       'Shipper Trade Roles', 'Shipper (Original Format)', 'Carrier',
       'Shipment Origin', 'Shipment Destination', 'Port of Unlading',
       'Port of Lading', 'Vessel', 'Vessel IMO', 'Volume (TEU)', 'Quantity',
       'Weight (kg)', 'Manifest Number', 'Container Numbers', 'HS Code',
       'Goods Shipped', 'HS4'],
      dtype='object')

In [84]:
# num_cols: columns that should be numerical -> check for incorrect format
num_cols = ['Bill of Lading Number', 'Arrival Date', 'Vessel IMO', 'Volume (TEU)', 'Quantity', 'Weight (kg)', 'Manifest Number', 'Container Numbers', 'HS Code', 'HS4']

# str_cols: columns that should be string -> check for misspellings
str_cols = list(cleaned_data.columns)
for col in num_cols:
  str_cols.remove(col)

In [96]:
print(f"In raw data:")
print(f"{len(list(raw_data.columns))} columns, {len(raw_data)} rows")
print(f"{100*raw_data.isna().sum().sum()/(len(raw_data)*len(raw_data.columns))} % of missing values.")
print(f"{100*count_diff(raw_data, cleaned_data, num_cols)/(len(raw_data)*len(num_cols))} % of incorrect formats.")
print(f"{100*count_diff(raw_data, cleaned_data, str_cols)/(len(raw_data)*len(str_cols))} % of misspellings.")
print(f"\n{100-100*count_diff(raw_data, cleaned_data, list(cleaned_data.columns))/(len(raw_data)*len(list(cleaned_data.columns)))} % of correct values.")

In raw data:
127 columns, 69134 rows
48.769774731669116 % of missing values.
19.98900685625018 % of incorrect formats.
9.288483235455782 % of misspellings.

86.59593075654638 % of correct values.


In [97]:
print(f"In cleaned data:")
print(f"{len(cleaned_data.columns)} columns, {len(cleaned_data)} rows")
print(f"{100*cleaned_data.isna().sum().sum()/(len(cleaned_data)*len(cleaned_data.columns))} % of missing values.")

In cleaned data:
26 columns, 69134 rows
7.572306624147975 % of missing values.


In [87]:
import datetime
import dateutil
from gensim.models import KeyedVectors
import nltk
from sklearn.cluster import KMeans
import warnings
warnings.filterwarnings("ignore")
import langdetect

## TradeSleuth

In [98]:
df = pd.read_csv("../tradesleuth/teak.csv")

In [99]:
print("\n===================== Final Version Code =====================\n")

print(f"\nColumns dropped correctly: {list(df.columns)==list(cleaned_data.columns)}")

print(f"\nBefore cleaning: {100*count_diff(raw_data, cleaned_data, ['Arrival Date'])/len(raw_data)} % of incorrect dates.")
print(f"Clean Dates: {100*count_diff(df, cleaned_data, ['Arrival Date'])/len(df)} % of incorrect dates.")

print(f"\nBefore cleaning: {100*count_diff(raw_data, cleaned_data, ['HS Code'])/len(raw_data)} % of incorrect HS Codes.")
print(f"Clean HS Codes: {100*count_diff(df, cleaned_data, ['HS Code'])/len(df)} % of incorrect HS Codes.")

print(f"\nAdd HS4 column: {100*count_diff(df, cleaned_data, ['HS4'])/len(df)} % of incorrect values.")

print(f"\n# of missing values in Consignee before filling: {raw_data['Consignee'].isna().sum()}")
print(f"# of missing values in Consignee after filling: {df['Consignee'].isna().sum()}")

print(f"\nBefore cleaning, {100*count_diff(raw_data, cleaned_data, ['Consignee'])/len(raw_data)} % of misspelled Consignee.")
print(f"Correct misspelling in Consignee : {100*count_diff(df, cleaned_data, ['Consignee'])/len(df)} % of misspelled Consignee.")

print(f"\n# of missing values in Consignee Country before filling: {raw_data['Consignee Country'].isna().sum()}")
print(f"# of missing values in Consignee after filling: {df['Consignee Country'].isna().sum()}")

print(f"\nBefore cleaning, {100*count_diff(raw_data, cleaned_data, ['Consignee Country'])/len(raw_data)} % of misspelled Consignee Country.")
print(f"Correct misspelling in Consignee Country: {100*count_diff(df, cleaned_data, ['Consignee Country'])/len(df)} % of misspelled Consignee Country.")

print(f"\n# of missing values in Shipper before filling: {raw_data['Shipper'].isna().sum()}")
print(f"# of missing values in Shipper after filling: {df['Shipper'].isna().sum()}")

print(f"\nBefore cleaning, {100*count_diff(raw_data, cleaned_data, ['Shipper'])/len(raw_data)} % of misspelled Shipper.")
print(f"Correct misspelling in Shipper : {100*count_diff(df, cleaned_data, ['Shipper'])/len(df)} % of misspelled Shipper.")

print(f"\n# of missing values in Shipper Country before filling: {raw_data['Shipper Country'].isna().sum()}")
print(f"# of missing values in Shipper Country after filling: {df['Shipper Country'].isna().sum()}")

print(f"\nBefore cleaning, {100*count_diff(raw_data, cleaned_data, ['Shipper Country'])/len(raw_data)} % of misspelled Shipper Country.")
print(f"Correct misspelling in Shipper Country: {100*count_diff(df, cleaned_data, ['Shipper Country'])/len(df)} % of misspelled Shipper Country.")




Columns dropped correctly: True

Before cleaning: 0.0 % of incorrect dates.
Clean Dates: 0.0 % of incorrect dates.

Before cleaning: 99.89006856250181 % of incorrect HS Codes.
Clean HS Codes: 4.72415887985651 % of incorrect HS Codes.

Add HS4 column: 5.564555790204531 % of incorrect values.

# of missing values in Consignee before filling: 14787
# of missing values in Consignee after filling: 0

Before cleaning, 33.00257470998351 % of misspelled Consignee.
Correct misspelling in Consignee : 22.329100008678797 % of misspelled Consignee.

# of missing values in Consignee Country before filling: 20939
# of missing values in Consignee after filling: 0

Before cleaning, 30.578297219891805 % of misspelled Consignee Country.
Correct misspelling in Consignee Country: 6.4772760146960975 % of misspelled Consignee Country.

# of missing values in Shipper before filling: 19933
# of missing values in Shipper after filling: 0

Before cleaning, 39.0820724968901 % of misspelled Shipper.
Correct mis

In [100]:
print(f"Clean data by TradeSleuth:\n")
print(f"{len(list(df.columns))} columns, {len(df)} rows")
print(f"{100*df.isna().sum().sum()/(len(df)*len(df.columns))} % of missing values.")
print(f"{100*count_diff(df, cleaned_data, num_cols)/(len(df)*len(num_cols))} % of incorrect formats.")
print(f"{100*count_diff(df, cleaned_data, str_cols)/(len(df)*len(str_cols))} % of misspellings.")
print(f"\n{100-100*count_diff(df, cleaned_data, list(cleaned_data.columns))/(len(df)*len(cleaned_data.columns))} % of correct values.")

Time = [159.78045763168484, 135.25817073043436, 89.18256803043187, 240.36319252476096, 136.9293821733445, 244.18257671501487, 65.64140693936497, 120.78819172643125, 68.4884757520631, 464.43246026337147, 142.7786153415218, 142.7786153415218]

Iteration = [1, 1, 1, 2, 1, 1, 1, 1, 1, 2, 1, 1]

total_time = np.sum(Time)
total_iteration = np.sum(Iteration)
first_time_cnt = 0
for i in Iteration:
  if Iteration[i] == 1:
    first_time_cnt += 1
print(f"\nAverage time to generate correct code: {total_time/len(Time)} s.")
print(f"Average # of revises to generate correct code: {total_iteration/len(Iteration)}.")
print(f"Accepted first-time codes: {first_time_cnt}/{len(Iteration)}.")

Clean data by TradeSleuth:

26 columns, 69134 rows
7.093860084429124 % of missing values.
1.028871467006104 % of incorrect formats.
3.916488269158446 % of misspellings.

97.19413357782322 % of correct values.

Average time to generate correct code: 167.55034276416214 s.
Average # of revises to generate correct code: 1.1666666666666667.
Accepted first-time codes: 10/12.


## Baseline 1

In [107]:
df = pd.read_csv("../baseline_1/teak.csv")

In [108]:
print("\n===================== Final Version Code =====================\n")

print(f"\nColumns dropped correctly: {list(df.columns)==list(cleaned_data.columns)}")

print(f"\nBefore cleaning: {100*count_diff(raw_data, cleaned_data, ['Arrival Date'])/len(raw_data)} % of incorrect dates.")
print(f"Clean Dates: {100*count_diff(df, cleaned_data, ['Arrival Date'])/len(df)} % of incorrect dates.")

print(f"\nBefore cleaning: {100*count_diff(raw_data, cleaned_data, ['HS Code'])/len(raw_data)} % of incorrect HS Codes.")
print(f"Clean HS Codes: {100*count_diff(df, cleaned_data, ['HS Code'])/len(df)} % of incorrect HS Codes.")

print(f"\nAdd HS4 column: {100*count_diff(df, cleaned_data, ['HS4'])/len(df)} % of incorrect values.")

print(f"\n# of missing values in Consignee before filling: {raw_data['Consignee'].isna().sum()}")
print(f"# of missing values in Consignee after filling: {df['Consignee'].isna().sum()}")

print(f"\nBefore cleaning, {100*count_diff(raw_data, cleaned_data, ['Consignee'])/len(raw_data)} % of misspelled Consignee.")
print(f"Correct misspelling in Consignee : {100*count_diff(df, cleaned_data, ['Consignee'])/len(df)} % of misspelled Consignee.")

print(f"\n# of missing values in Consignee Country before filling: {raw_data['Consignee Country'].isna().sum()}")
print(f"# of missing values in Consignee after filling: {df['Consignee Country'].isna().sum()}")

print(f"\nBefore cleaning, {100*count_diff(raw_data, cleaned_data, ['Consignee Country'])/len(raw_data)} % of misspelled Consignee Country.")
print(f"Correct misspelling in Consignee Country: {100*count_diff(df, cleaned_data, ['Consignee Country'])/len(df)} % of misspelled Consignee Country.")

print(f"\n# of missing values in Shipper before filling: {raw_data['Shipper'].isna().sum()}")
print(f"# of missing values in Shipper after filling: {df['Shipper'].isna().sum()}")

print(f"\nBefore cleaning, {100*count_diff(raw_data, cleaned_data, ['Shipper'])/len(raw_data)} % of misspelled Shipper.")
print(f"Correct misspelling in Shipper : {100*count_diff(df, cleaned_data, ['Shipper'])/len(df)} % of misspelled Shipper.")

print(f"\n# of missing values in Shipper Country before filling: {raw_data['Shipper Country'].isna().sum()}")
print(f"# of missing values in Shipper Country after filling: {df['Shipper Country'].isna().sum()}")

print(f"\nBefore cleaning, {100*count_diff(raw_data, cleaned_data, ['Shipper Country'])/len(raw_data)} % of misspelled Shipper Country.")
print(f"Correct misspelling in Shipper Country: {100*count_diff(df, cleaned_data, ['Shipper Country'])/len(df)} % of misspelled Shipper Country.")




Columns dropped correctly: True

Before cleaning: 0.0 % of incorrect dates.
Clean Dates: 100.0 % of incorrect dates.

Before cleaning: 99.89006856250181 % of incorrect HS Codes.
Clean HS Codes: 99.89006856250181 % of incorrect HS Codes.

Add HS4 column: 99.89006856250181 % of incorrect values.

# of missing values in Consignee before filling: 14787
# of missing values in Consignee after filling: 0

Before cleaning, 33.00257470998351 % of misspelled Consignee.
Correct misspelling in Consignee : 100.0 % of misspelled Consignee.

# of missing values in Consignee Country before filling: 20939
# of missing values in Consignee after filling: 0

Before cleaning, 30.578297219891805 % of misspelled Consignee Country.
Correct misspelling in Consignee Country: 100.0 % of misspelled Consignee Country.

# of missing values in Shipper before filling: 19933
# of missing values in Shipper after filling: 0

Before cleaning, 39.0820724968901 % of misspelled Shipper.
Correct misspelling in Shipper : 1

In [109]:
print(f"Clean data by baseline-1:\n")
print(f"{len(list(df.columns))} columns, {len(df)} rows")
print(f"{100*df.isna().sum().sum()/(len(df)*len(df.columns))} % of missing values.")
print(f"{100*count_diff(df, cleaned_data, num_cols)/(len(df)*len(num_cols))} % of incorrect formats.")
print(f"{100*count_diff(df, cleaned_data, str_cols)/(len(df)*len(str_cols))} % of misspellings.")
print(f"\n{100-100*count_diff(df, cleaned_data, list(cleaned_data.columns))/(len(df)*len(cleaned_data.columns))} % of correct values.")

Time = [104.81041465420276, 87.70626959111542, 232.86481264792383, 179.2158875199966, 179.77272921567783, 193.6706806346774, 242.82476404868066, 9.17406000988558, 99.5148379993625, 93.98358555557206, 99.45217250939459, 93.60977039812133]
Iteration = [1, 1, 6, 2, 2, 2, 2, 1, 2, 1, 2, 1]

total_time = np.sum(Time)
total_iteration = np.sum(Iteration)
first_time_cnt = 0
for i in Iteration:
  if Iteration[i] == 1:
    first_time_cnt += 1
print(f"\nAverage time to generate correct code: {total_time/len(Time)} s.")
print(f"Average # of revises to generate correct code: {total_iteration/len(Iteration)}.")
print(f"Accepted first-time codes: {first_time_cnt}/{len(Iteration)}.")


Clean data by baseline-1:

26 columns, 69134 rows
10.940013930582971 % of missing values.
29.97801371250036 % of incorrect formats.
25.0 % of misspellings.

73.08537934134601 % of correct values.

Average time to generate correct code: 134.71666539871754 s.
Average # of revises to generate correct code: 1.9166666666666667.
Accepted first-time codes: 5/12.


## Baseline 2

In [103]:
df = pd.read_csv("../baseline_2/teak.csv")

In [104]:
print("\n===================== Final Version Code =====================\n")

print(f"\nColumns dropped correctly: {list(df.columns)==list(cleaned_data.columns)}")

print(f"\nBefore cleaning: {100*count_diff(raw_data, cleaned_data, ['Arrival Date'])/len(raw_data)} % of incorrect dates.")
print(f"Clean Dates: {100*count_diff(df, cleaned_data, ['Arrival Date'])/len(df)} % of incorrect dates.")

print(f"\nBefore cleaning: {100*count_diff(raw_data, cleaned_data, ['HS Code'])/len(raw_data)} % of incorrect HS Codes.")
print(f"Clean HS Codes: {100*count_diff(df, cleaned_data, ['HS Code'])/len(df)} % of incorrect HS Codes.")

print(f"\nAdd HS4 column: {100*count_diff(df, cleaned_data, ['HS4'])/len(df)} % of incorrect values.")

print(f"\n# of missing values in Consignee before filling: {raw_data['Consignee'].isna().sum()}")
print(f"# of missing values in Consignee after filling: {df['Consignee'].isna().sum()}")

print(f"\nBefore cleaning, {100*count_diff(raw_data, cleaned_data, ['Consignee'])/len(raw_data)} % of misspelled Consignee.")
print(f"Correct misspelling in Consignee : {100*count_diff(df, cleaned_data, ['Consignee'])/len(df)} % of misspelled Consignee.")

print(f"\n# of missing values in Consignee Country before filling: {raw_data['Consignee Country'].isna().sum()}")
print(f"# of missing values in Consignee after filling: {df['Consignee Country'].isna().sum()}")

print(f"\nBefore cleaning, {100*count_diff(raw_data, cleaned_data, ['Consignee Country'])/len(raw_data)} % of misspelled Consignee Country.")
print(f"Correct misspelling in Consignee Country: {100*count_diff(df, cleaned_data, ['Consignee Country'])/len(df)} % of misspelled Consignee Country.")

print(f"\n# of missing values in Shipper before filling: {raw_data['Shipper'].isna().sum()}")
print(f"# of missing values in Shipper after filling: {df['Shipper'].isna().sum()}")

print(f"\nBefore cleaning, {100*count_diff(raw_data, cleaned_data, ['Shipper'])/len(raw_data)} % of misspelled Shipper.")
print(f"Correct misspelling in Shipper : {100*count_diff(df, cleaned_data, ['Shipper'])/len(df)} % of misspelled Shipper.")

print(f"\n# of missing values in Shipper Country before filling: {raw_data['Shipper Country'].isna().sum()}")
print(f"# of missing values in Shipper Country after filling: {df['Shipper Country'].isna().sum()}")

print(f"\nBefore cleaning, {100*count_diff(raw_data, cleaned_data, ['Shipper Country'])/len(raw_data)} % of misspelled Shipper Country.")
print(f"Correct misspelling in Shipper Country: {100*count_diff(df, cleaned_data, ['Shipper Country'])/len(df)} % of misspelled Shipper Country.")




Columns dropped correctly: True

Before cleaning: 0.0 % of incorrect dates.
Clean Dates: 0.0 % of incorrect dates.

Before cleaning: 99.89006856250181 % of incorrect HS Codes.
Clean HS Codes: 4.883270170972315 % of incorrect HS Codes.

Add HS4 column: 93.67026354615673 % of incorrect values.

# of missing values in Consignee before filling: 14787
# of missing values in Consignee after filling: 0

Before cleaning, 33.00257470998351 % of misspelled Consignee.
Correct misspelling in Consignee : 22.329100008678797 % of misspelled Consignee.

# of missing values in Consignee Country before filling: 20939
# of missing values in Consignee after filling: 0

Before cleaning, 30.578297219891805 % of misspelled Consignee Country.
Correct misspelling in Consignee Country: 6.4772760146960975 % of misspelled Consignee Country.

# of missing values in Shipper before filling: 19933
# of missing values in Shipper after filling: 0

Before cleaning, 39.0820724968901 % of misspelled Shipper.
Correct mi

In [105]:
print(f"Clean data by baseline-2:\n")
print(f"{len(list(df.columns))} columns, {len(df)} rows")
print(f"{100*df.isna().sum().sum()/(len(df)*len(df.columns))} % of missing values.")
print(f"{100*count_diff(df, cleaned_data, num_cols)/(len(df)*len(num_cols))} % of incorrect formats.")
print(f"{100*count_diff(df, cleaned_data, str_cols)/(len(df)*len(str_cols))} % of misspellings.")
print(f"\n{100-100*count_diff(df, cleaned_data, list(cleaned_data.columns))/(len(df)*len(cleaned_data.columns))} % of correct values.")

Time = [195.07452119421214, 84.27515762858093, 168.22918214648962, 205.53875258378685, 6.205323563888669, 169.4208493353799, 112.87846516631544, 110.88265183288604, 65.2184274001047, 375.4974009199068, 89.55069715250283, 111.01066416408867]
Iteration = [2, 2, 3, 2, 2, 1, 3, 2, 2, 2, 2, 2]

total_time = np.sum(Time)
total_iteration = np.sum(Iteration)
first_time_cnt = 0
for i in Iteration:
  if Iteration[i] == 1:
    first_time_cnt += 1
print(f"\nAverage time to generate correct code: {total_time/len(Time)} s.")
print(f"Average # of revises to generate correct code: {total_iteration/len(Iteration)}.")
print(f"Accepted first-time codes: {first_time_cnt}/{len(Iteration)}.")

Clean data by baseline-2:

26 columns, 69134 rows
7.089631952217656 % of missing values.


9.855353371712905 % of incorrect formats.
3.916488269158446 % of misspellings.

93.79933284524368 % of correct values.

Average time to generate correct code: 141.14850775734521 s.
Average # of revises to generate correct code: 2.0833333333333335.
Accepted first-time codes: 1/12.


## Baseline 3

In [144]:
df = pd.read_csv("../baseline_3/teak.csv")

In [145]:
print("\n===================== Final Version Code =====================\n")

print(f"\nColumns dropped correctly: {list(df.columns)==list(cleaned_data.columns)}")

print(f"\nBefore cleaning: {100*count_diff(raw_data, cleaned_data, ['Arrival Date'])/len(raw_data)} % of incorrect dates.")
print(f"Clean Dates: {100*count_diff(df, cleaned_data, ['Arrival Date'])/len(df)} % of incorrect dates.")

print(f"\nBefore cleaning: {100*count_diff(raw_data, cleaned_data, ['HS Code'])/len(raw_data)} % of incorrect HS Codes.")
print(f"Clean HS Codes: {100*count_diff(df, cleaned_data, ['HS Code'])/len(df)} % of incorrect HS Codes.")

print(f"\nAdd HS4 column: {100*count_diff(df, cleaned_data, ['HS4'])/len(df)} % of incorrect values.")

print(f"\n# of missing values in Consignee before filling: {raw_data['Consignee'].isna().sum()}")
print(f"# of missing values in Consignee after filling: {df['Consignee'].isna().sum()}")

print(f"\nBefore cleaning, {100*count_diff(raw_data, cleaned_data, ['Consignee'])/len(raw_data)} % of misspelled Consignee.")
print(f"Correct misspelling in Consignee : {100*count_diff(df, cleaned_data, ['Consignee'])/len(df)} % of misspelled Consignee.")

print(f"\n# of missing values in Consignee Country before filling: {raw_data['Consignee Country'].isna().sum()}")
print(f"# of missing values in Consignee after filling: {df['Consignee Country'].isna().sum()}")

print(f"\nBefore cleaning, {100*count_diff(raw_data, cleaned_data, ['Consignee Country'])/len(raw_data)} % of misspelled Consignee Country.")
print(f"Correct misspelling in Consignee Country: {100*count_diff(df, cleaned_data, ['Consignee Country'])/len(df)} % of misspelled Consignee Country.")

print(f"\n# of missing values in Shipper before filling: {raw_data['Shipper'].isna().sum()}")
print(f"# of missing values in Shipper after filling: {df['Shipper'].isna().sum()}")

print(f"\nBefore cleaning, {100*count_diff(raw_data, cleaned_data, ['Shipper'])/len(raw_data)} % of misspelled Shipper.")
print(f"Correct misspelling in Shipper : {100*count_diff(df, cleaned_data, ['Shipper'])/len(df)} % of misspelled Shipper.")

print(f"\n# of missing values in Shipper Country before filling: {raw_data['Shipper Country'].isna().sum()}")
print(f"# of missing values in Shipper Country after filling: {df['Shipper Country'].isna().sum()}")

print(f"\nBefore cleaning, {100*count_diff(raw_data, cleaned_data, ['Shipper Country'])/len(raw_data)} % of misspelled Shipper Country.")
print(f"Correct misspelling in Shipper Country: {100*count_diff(df, cleaned_data, ['Shipper Country'])/len(df)} % of misspelled Shipper Country.")




Columns dropped correctly: True

Before cleaning: 0.0 % of incorrect dates.
Clean Dates: 0.0 % of incorrect dates.

Before cleaning: 99.89006856250181 % of incorrect HS Codes.
Clean HS Codes: 4.72415887985651 % of incorrect HS Codes.

Add HS4 column: 5.564555790204531 % of incorrect values.

# of missing values in Consignee before filling: 14787
# of missing values in Consignee after filling: 0

Before cleaning, 33.00257470998351 % of misspelled Consignee.
Correct misspelling in Consignee : 22.329100008678797 % of misspelled Consignee.

# of missing values in Consignee Country before filling: 20939
# of missing values in Consignee after filling: 0

Before cleaning, 30.578297219891805 % of misspelled Consignee Country.
Correct misspelling in Consignee Country: 6.4772760146960975 % of misspelled Consignee Country.

# of missing values in Shipper before filling: 19933
# of missing values in Shipper after filling: 0

Before cleaning, 39.0820724968901 % of misspelled Shipper.
Correct mis

In [3]:
print(f"Clean data by baseline-2:\n")
print(f"{len(list(df.columns))} columns, {len(df)} rows")
print(f"{100*df.isna().sum().sum()/(len(df)*len(df.columns))} % of missing values.")
print(f"{100*count_diff(df, cleaned_data, num_cols)/(len(df)*len(num_cols))} % of incorrect formats.")
print(f"{100*count_diff(df, cleaned_data, str_cols)/(len(df)*len(str_cols))} % of misspellings.")
print(f"\n{100-100*count_diff(df, cleaned_data, list(cleaned_data.columns))/(len(df)*len(cleaned_data.columns))} % of correct values.")

Time = [514.66955789450556, 116.55049728695303, 481.37174175400287, 481.37174175400287, 201.22372320853174, 98.16172056639232, 466.15070746745914, 134.13488231692463, 840.76356662251055, 346.8426979202777, 220.65704703330994, 2760.7108397465199]
Iteration = [2, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 4]

total_time = np.sum(Time)
total_iteration = np.sum(Iteration)
first_time_cnt = 0
for i in Iteration:
  if Iteration[i] == 1:
    first_time_cnt += 1
print(f"\nAverage time to generate correct code: {total_time/len(Time)} s.")
print(f"Average # of revises to generate correct code: {total_iteration/len(Iteration)}.")
print(f"Accepted first-time codes: {first_time_cnt}/{len(Iteration)}.")

Clean data by baseline-3:

26 columns, 69134 rows
7.089631952217656 % of missing values.
1.028871467006104 % of incorrect formats.
3.916488269158446 % of misspellings.

Average time to generate correct code: 555.2173936309492 s.
Average # of revises to generate correct code: 1.4166666666666667.
Accepted first-time codes: 12/12.
