In [1]:
# load raw data and cleaned data
import pandas as pd
import numpy as np
import re
import swifter
import warnings
warnings.filterwarnings("ignore")

# Load the raw export (Excel) and the cleaned table (CSV) that the rest of the
# notebook passes to count_diff as the reference. Both paths are relative, so
# they resolve against the notebook's working directory.
raw_data = pd.read_excel("../raw data/grain.xlsx")
cleaned_data = pd.read_csv("../cleaned data/grain.csv")

In [2]:
def count_diff(old, new, columns):
  """Count the cells of ``old`` that do not match ``new`` over ``columns``.

  Before comparing, empty strings and missing values in both frames are
  replaced by the same placeholder ('redacted'), so two empty/missing cells
  count as a match. Rows are paired by position, not by index label, so the
  result does not depend on either frame having a default RangeIndex.

  Args:
    old: DataFrame being evaluated.
    new: DataFrame used as the reference.
    columns: iterable of column names to compare.

  Returns:
    int: ``len(old) * len(columns)`` minus the number of matching cells.
    A column that is absent from ``old`` is skipped while matching and
    therefore counts as entirely different; rows of ``old`` with no
    positional counterpart in ``new`` also count as different.

  Raises:
    KeyError: if a column that exists in ``old`` is missing from ``new``.
  """
  def normalized(series):
    # Same normalisation as before, applied to a single column instead of
    # copying and rewriting the whole frame on every call.
    return series.replace('', np.nan).fillna('redacted').to_numpy(dtype=object)

  columns = list(columns)
  correct = 0
  for col in columns:
    if col not in old.columns:
      continue
    old_vals = normalized(old[col])
    new_vals = normalized(new[col])
    # Object arrays are compared element by element with Python's ==, which
    # keeps the semantics of the former per-row loop without the very slow
    # (and label-dependent) df[col][i] lookups.
    overlap = min(len(old_vals), len(new_vals))
    correct += int(np.count_nonzero(old_vals[:overlap] == new_vals[:overlap]))
  return len(old) * len(columns) - correct

In [3]:
# Display the column labels of cleaned_data (the table read from
# "../cleaned data/grain.csv"); num_cols / str_cols below are built from them.
cleaned_data.columns

Index(['DataSource', 'Date', 'CustomsCode', 'SeqNum', 'SenderTaxID',
       'SenderNameEng', 'SenderNameRus', 'SenderAddressEng',
       'SenderAddressRus', 'SenderRegionCode', 'RecipientNameEng',
       'RecipientNameRus', 'RecipientAddressEng', 'RecipientAddressRus',
       'ContractHolderTaxID', 'ContractHolderNameEng', 'ContractHolderNameRus',
       'ContractHolderAddressEng', 'ContractHolderAddressRus',
       'ContracHolderRegionCode', 'TradingCountryCode', 'DeclarantTaxID',
       'DeclarantNameEng', 'DeclarantNameRU', 'DeclarantAddressEng',
       'DeclarantAddressRU', 'DeclarantRegionCode', 'DepartureCountryCode',
       'CountryOrigin', 'DestinationCountryCode', 'DestinationCountryEng',
       'DestinationCountryRus', 'Incoterms', 'PortUnladingEng',
       'PortLadingRus', 'ContractCurrency', 'ShipmentDescriptionEng',
       'ShipmentDescriptionEngRus', 'ManufactNameEng', 'ManufactNameRus',
       'TrademarkEng', 'TrademarkRus', 'CountryOriginShort', 'HS',
       'CountryOri

In [4]:
# num_cols: columns expected to hold numerical values; mismatches in these
# are reported as "incorrect formats".
num_cols = [
  'DataSource', 'Date', 'CustomsCode', 'SeqNum',
  'SenderTaxID', 'SenderRegionCode',
  'ContractHolderTaxID', 'ContracHolderRegionCode',
  'DeclarantTaxID', 'DeclarantRegionCode',
  'HS', 'GrossWeightKG', 'NetWeightKG',
  'InvoiceValue', 'StatValue_USD', 'Year',
]

# str_cols: every other column of cleaned_data; mismatches in these are
# reported as "misspellings". Start from the full column list and strike out
# each numerical column in turn.
str_cols = cleaned_data.columns.tolist()
for numeric_name in num_cols:
  str_cols.remove(numeric_name)

In [5]:
# Summary of the raw table, scored against cleaned_data.
raw_row_total = len(raw_data)
raw_column_total = len(raw_data.columns)
print("In raw data:")
print(f"{raw_column_total} columns, {raw_row_total} rows")

raw_missing_cells = raw_data.isna().sum().sum()
print(f"{100*raw_missing_cells/(raw_row_total*raw_column_total)} % of missing values.")

# Cells of the numerical columns that differ from the reference.
raw_format_errors = count_diff(raw_data, cleaned_data, num_cols)
print(f"{100*raw_format_errors/(raw_row_total*len(num_cols))} % of incorrect formats.")

# Cells of the remaining (string) columns that differ from the reference.
raw_spelling_errors = count_diff(raw_data, cleaned_data, str_cols)
print(f"{100*raw_spelling_errors/(raw_row_total*len(str_cols))} % of misspellings.")

# Overall share of matching cells across all reference columns.
reference_columns = list(cleaned_data.columns)
raw_all_errors = count_diff(raw_data, cleaned_data, reference_columns)
print(f"\n{100-100*raw_all_errors/(raw_row_total*len(reference_columns))} % of correct values.")

In raw data:
56 columns, 145217 rows
66.9416533286638 % of missing values.
2.841695531514905 % of incorrect formats.
18.05533940394184 % of misspellings.

85.93512456259637 % of correct values.


In [6]:
# Shape and share of missing cells in the cleaned reference table.
cleaned_row_total = len(cleaned_data)
cleaned_column_total = len(cleaned_data.columns)
cleaned_missing_cells = cleaned_data.isna().sum().sum()

print("In cleaned data:")
print(f"{cleaned_column_total} columns, {cleaned_row_total} rows")
print(f"{100*cleaned_missing_cells/(cleaned_row_total*cleaned_column_total)} % of missing values.")

In cleaned data:
61 columns, 145217 rows
62.10466032913773 % of missing values.


In [5]:
import datetime
import dateutil
from gensim.models import KeyedVectors
import nltk
from sklearn.cluster import KMeans
import warnings
warnings.filterwarnings("ignore")
import langdetect
import string
import re
import pycountry

## TradeSleuth

In [8]:
# Load the TradeSleuth output into `df`, the frame scored against cleaned_data
# by the cells of this section.
df = pd.read_csv("../tradesleuth/grain.csv")

In [9]:
print(f"\nColumns dropped correctly: {list(df.columns)==list(cleaned_data.columns)}")

# One row per compared column:
#   (column name, label printed for the cleaned result, description of the error).
checks = [
  ('Date', 'Clean Dates', 'incorrect dates'),
  ('SenderTaxID', 'Clean strings', 'incorrect SenderTaxID'),
  ('SenderNameEng', 'Clean strings', 'misspelled SenderNameEng'),
  ('SenderCompanyKeywordsEng', 'Clean strings', 'misspelled SenderCompanyKeywordsEng'),
  ('SenderNameRus', 'Clean strings', 'misspelled SenderNameRus'),
  ('SenderCompanyKeywordsRus', 'Clean strings', 'misspelled SenderCompanyKeywordsRus'),
  ('RecipientNameEng', 'Clean strings', 'misspelled RecipientNameEng'),
  ('RecipientCompanyKeywordsEng', 'Clean strings', 'misspelled RecipientCompanyKeywordsEng'),
  ('RecipientNameRus', 'Clean strings', 'misspelled RecipientNameRus'),
  ('RecipientCompanyKeywordsRus', 'Clean strings', 'misspelled RecipientCompanyKeywordsRus'),
  ('TradingCountryCode', 'Clean TradingCountryCode', 'misspelled TradingCountryCode'),
]
for column, label, problem in checks:
  # The "before" figure counts mismatches in raw_data, so it is always
  # normalised by len(raw_data); several of the former hand-written lines
  # divided it by len(df) instead.
  print(f"\nBefore cleaning: {100*count_diff(raw_data, cleaned_data, [column])/len(raw_data)} % of {problem}.")
  print(f"{label}: {100*count_diff(df, cleaned_data, [column])/len(df)} % of {problem}.")

# TradingCountryEng is reported for the cleaned output only (no "before" line).
print(f"Add new TradingCountryEng: {100*count_diff(df, cleaned_data, ['TradingCountryEng'])/len(df)} % of misspelled TradingCountryEng.")



Columns dropped correctly: True

Before cleaning: 0.0 % of incorrect dates.
Clean Dates: 0.0 % of incorrect dates.

Before cleaning: 45.46712850423848 % of incorrect SenderTaxID.
Clean strings: 1.3097640083461304 % of incorrect SenderTaxID.

Before cleaning: 58.101324225125154 % of misspelled SenderNameEng.
Clean strings: 16.05046241142566 % of misspelled SenderNameEng.

Before cleaning: 100.0 % of misspelled SenderCompanyKeywordsEng.
Clean strings: 46.34305900824284 % of misspelled SenderCompanyKeywordsEng.

Before cleaning: 99.08550651783193 % of misspelled SenderNameRus.
Clean strings: 0.0 % of misspelled SenderNameRus.

Before cleaning: 100.0 % of misspelled SenderCompanyKeywordsRus.
Clean strings: 2.114766177513652 % of misspelled SenderCompanyKeywordsRus.

Before cleaning: 53.00756798446463 % of misspelled RecipientNameEng.
Clean strings: 30.625890908089275 % of misspelled RecipientNameEng.

Before cleaning: 100.0 % of misspelled RecipientCompanyKeywordsEng.
Clean strings: 29.57

In [10]:
# Summary of the TradeSleuth output held in `df`, scored against cleaned_data.
row_total = len(df)
column_total = len(df.columns)
print("Clean data by TradeSleuth:\n")
print(f"{column_total} columns, {row_total} rows")

missing_cells = df.isna().sum().sum()
print(f"{100*missing_cells/(row_total*column_total)} % of missing values.")

# Cells of the numerical columns that differ from the reference.
format_errors = count_diff(df, cleaned_data, num_cols)
print(f"{100*format_errors/(row_total*len(num_cols))} % of incorrect formats.")

# Cells of the string columns that differ from the reference.
spelling_errors = count_diff(df, cleaned_data, str_cols)
print(f"{100*spelling_errors/(row_total*len(str_cols))} % of misspellings.")

# Overall share of matching cells across all reference columns.
all_errors = count_diff(df, cleaned_data, list(cleaned_data.columns))
print(f"\n{100-100*all_errors/(row_total*len(cleaned_data.columns))} % of correct values.")

# Time: seconds needed to generate each piece of correct code.
# Iteration: number of revisions recorded for each piece of code; a value of 1
# is reported below as "accepted first-time".
Time = [93.67544879950583, 139.763936647214, 295.38386161159724, 131.8704272089526, 80.32765847630799, 98.21110245306045, 24.688365072011948, 132.6472505601123, 216.22981783840805, 109.28068511374295, 95.59803623426706, 25.8256891714409, 72.3488785456866, 17.52883837558329, 112.04637297987938, 135.24875590577722, 84.50601254869252, 248.19090705085546]
Iteration = [1, 2, 2, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 2, 1]
# NOTE(review): Iteration has 17 entries while Time has 18, and the recorded
# output of this cell reports ".../18" - one entry looks to be missing; confirm
# against the experiment log.

total_time = np.sum(Time)
total_iteration = np.sum(Iteration)
# Compare each recorded value with 1. The previous loop used the values as
# list indices (Iteration[i]), which tests the wrong entries.
first_time_cnt = sum(1 for revisions in Iteration if revisions == 1)
print(f"\nAverage time to generate correct code: {total_time/len(Time)} s.")
print(f"Average # of revises to generate correct code: {total_iteration/len(Iteration)}.")
print(f"Accepted first-time codes: {first_time_cnt}/{len(Iteration)}.")


Clean data by TradeSleuth:

61 columns, 145217 rows
63.5698390097262 % of missing values.
0.08186025052163315 % of incorrect formats.
3.2085928109120987 % of misspellings.

97.61153376230507 % of correct values.

Average time to generate correct code: 117.40955803294976 s.
Average # of revises to generate correct code: 1.3333333333333333.
Accepted first-time codes: 12/18.


## Baseline 1

In [11]:
# Load the baseline-1 output into `df` (rebinding the name), so the cells of
# this section score baseline 1 against cleaned_data.
df = pd.read_csv("../baseline_1/grain.csv")

In [12]:
print(f"\nColumns dropped correctly: {list(df.columns)==list(cleaned_data.columns)}")

# One row per compared column:
#   (column name, label printed for the cleaned result, description of the error).
checks = [
  ('Date', 'Clean Dates', 'incorrect dates'),
  ('SenderTaxID', 'Clean strings', 'incorrect SenderTaxID'),
  ('SenderNameEng', 'Clean strings', 'misspelled SenderNameEng'),
  ('SenderCompanyKeywordsEng', 'Clean strings', 'misspelled SenderCompanyKeywordsEng'),
  ('SenderNameRus', 'Clean strings', 'misspelled SenderNameRus'),
  ('SenderCompanyKeywordsRus', 'Clean strings', 'misspelled SenderCompanyKeywordsRus'),
  ('RecipientNameEng', 'Clean strings', 'misspelled RecipientNameEng'),
  ('RecipientCompanyKeywordsEng', 'Clean strings', 'misspelled RecipientCompanyKeywordsEng'),
  ('RecipientNameRus', 'Clean strings', 'misspelled RecipientNameRus'),
  ('RecipientCompanyKeywordsRus', 'Clean strings', 'misspelled RecipientCompanyKeywordsRus'),
  ('TradingCountryCode', 'Clean TradingCountryCode', 'misspelled TradingCountryCode'),
]
for column, label, problem in checks:
  # The "before" figure counts mismatches in raw_data, so it is always
  # normalised by len(raw_data); several of the former hand-written lines
  # divided it by len(df) instead.
  print(f"\nBefore cleaning: {100*count_diff(raw_data, cleaned_data, [column])/len(raw_data)} % of {problem}.")
  print(f"{label}: {100*count_diff(df, cleaned_data, [column])/len(df)} % of {problem}.")

# TradingCountryEng is reported for the cleaned output only (no "before" line).
print(f"Add new TradingCountryEng: {100*count_diff(df, cleaned_data, ['TradingCountryEng'])/len(df)} % of misspelled TradingCountryEng.")



Columns dropped correctly: False

Before cleaning: 0.0 % of incorrect dates.
Clean Dates: 100.0 % of incorrect dates.

Before cleaning: 45.46712850423848 % of incorrect SenderTaxID.
Clean strings: 54.86341130859335 % of incorrect SenderTaxID.

Before cleaning: 58.101324225125154 % of misspelled SenderNameEng.
Clean strings: 50.79295123849136 % of misspelled SenderNameEng.

Before cleaning: 100.0 % of misspelled SenderCompanyKeywordsEng.
Clean strings: 48.09974038852201 % of misspelled SenderCompanyKeywordsEng.

Before cleaning: 99.08550651783193 % of misspelled SenderNameRus.
Clean strings: 97.82463485680051 % of misspelled SenderNameRus.

Before cleaning: 100.0 % of misspelled SenderCompanyKeywordsRus.
Clean strings: 6.936515697197986 % of misspelled SenderCompanyKeywordsRus.

Before cleaning: 53.00756798446463 % of misspelled RecipientNameEng.
Clean strings: 52.68391441773346 % of misspelled RecipientNameEng.

Before cleaning: 100.0 % of misspelled RecipientCompanyKeywordsEng.
Clean

In [13]:
# Summary of the baseline-1 output held in `df`, scored against cleaned_data.
row_total = len(df)
column_total = len(df.columns)
print("Clean data by baseline-1:\n")
print(f"{column_total} columns, {row_total} rows")

missing_cells = df.isna().sum().sum()
print(f"{100*missing_cells/(row_total*column_total)} % of missing values.")

# Cells of the numerical columns that differ from the reference.
format_errors = count_diff(df, cleaned_data, num_cols)
print(f"{100*format_errors/(row_total*len(num_cols))} % of incorrect formats.")

# Cells of the string columns that differ from the reference.
spelling_errors = count_diff(df, cleaned_data, str_cols)
print(f"{100*spelling_errors/(row_total*len(str_cols))} % of misspellings.")

# Overall share of matching cells across all reference columns.
all_errors = count_diff(df, cleaned_data, list(cleaned_data.columns))
print(f"\n{100-100*all_errors/(row_total*len(cleaned_data.columns))} % of correct values.")

# Time: seconds needed to generate each piece of correct code.
# Iteration: number of revisions recorded for each piece of code; a value of 1
# is reported below as "accepted first-time".
Time = [42.153190053999424, 122.69757836963981, 297.48250592593104, 95.18260069657117, 65.96513758599758, 194.2544194124639, 137.84166275989264, 44.167533341795206, 62.70578438322991, 70.22155686840415, 159.21151913143694, 69.0654486157, 41.72477652877569, 100.52077276725322, 231.55241945479065, 96.9347566459328, 42.15626517217606, 13.758710413239896]
Iteration = [1, 3, 4, 2, 2, 4, 3, 1, 2, 2, 3, 2, 1, 2, 2, 2, 1, 1]

total_time = np.sum(Time)
total_iteration = np.sum(Iteration)
# Compare each recorded value with 1. The previous loop used the values as
# list indices (Iteration[i]), which tests the wrong entries.
first_time_cnt = sum(1 for revisions in Iteration if revisions == 1)
print(f"\nAverage time to generate correct code: {total_time/len(Time)} s.")
print(f"Average # of revises to generate correct code: {total_iteration/len(Iteration)}.")
print(f"Accepted first-time codes: {first_time_cnt}/{len(Iteration)}.")

Clean data by baseline-1:

61 columns, 145217 rows
3.7596081477612304 % of missing values.
9.678963206787085 % of incorrect formats.
14.430327639938085 % of misspellings.

86.81593188351135 % of correct values.

Average time to generate correct code: 104.86647989595723 s.
Average # of revises to generate correct code: 2.111111111111111.
Accepted first-time codes: 5/18.


## Baseline 2

In [14]:
# Load the baseline-2 output into `df` (rebinding the name), so the cells of
# this section score baseline 2 against cleaned_data.
df = pd.read_csv("../baseline_2/grain.csv")

In [15]:
print(f"\nColumns dropped correctly: {list(df.columns)==list(cleaned_data.columns)}")

# One row per compared column:
#   (column name, label printed for the cleaned result, description of the error).
checks = [
  ('Date', 'Clean Dates', 'incorrect dates'),
  ('SenderTaxID', 'Clean strings', 'incorrect SenderTaxID'),
  ('SenderNameEng', 'Clean strings', 'misspelled SenderNameEng'),
  ('SenderCompanyKeywordsEng', 'Clean strings', 'misspelled SenderCompanyKeywordsEng'),
  ('SenderNameRus', 'Clean strings', 'misspelled SenderNameRus'),
  ('SenderCompanyKeywordsRus', 'Clean strings', 'misspelled SenderCompanyKeywordsRus'),
  ('RecipientNameEng', 'Clean strings', 'misspelled RecipientNameEng'),
  ('RecipientCompanyKeywordsEng', 'Clean strings', 'misspelled RecipientCompanyKeywordsEng'),
  ('RecipientNameRus', 'Clean strings', 'misspelled RecipientNameRus'),
  ('RecipientCompanyKeywordsRus', 'Clean strings', 'misspelled RecipientCompanyKeywordsRus'),
  ('TradingCountryCode', 'Clean TradingCountryCode', 'misspelled TradingCountryCode'),
]
for column, label, problem in checks:
  # The "before" figure counts mismatches in raw_data, so it is always
  # normalised by len(raw_data); several of the former hand-written lines
  # divided it by len(df) instead.
  print(f"\nBefore cleaning: {100*count_diff(raw_data, cleaned_data, [column])/len(raw_data)} % of {problem}.")
  print(f"{label}: {100*count_diff(df, cleaned_data, [column])/len(df)} % of {problem}.")

# TradingCountryEng is reported for the cleaned output only (no "before" line).
print(f"Add new TradingCountryEng: {100*count_diff(df, cleaned_data, ['TradingCountryEng'])/len(df)} % of misspelled TradingCountryEng.")



Columns dropped correctly: True

Before cleaning: 0.0 % of incorrect dates.
Clean Dates: 0.0 % of incorrect dates.

Before cleaning: 45.46712850423848 % of incorrect SenderTaxID.
Clean strings: 1.4619500471707858 % of incorrect SenderTaxID.

Before cleaning: 58.101324225125154 % of misspelled SenderNameEng.
Clean strings: 5.219774544302664 % of misspelled SenderNameEng.

Before cleaning: 100.0 % of misspelled SenderCompanyKeywordsEng.
Clean strings: 47.460696750380464 % of misspelled SenderCompanyKeywordsEng.

Before cleaning: 99.08550651783193 % of misspelled SenderNameRus.
Clean strings: 1.0274279182189414 % of misspelled SenderNameRus.

Before cleaning: 100.0 % of misspelled SenderCompanyKeywordsRus.
Clean strings: 2.114766177513652 % of misspelled SenderCompanyKeywordsRus.

Before cleaning: 53.00756798446463 % of misspelled RecipientNameEng.
Clean strings: 1.6444355688383592 % of misspelled RecipientNameEng.

Before cleaning: 100.0 % of misspelled RecipientCompanyKeywordsEng.
Clea

In [16]:
# Summary of the baseline-2 output held in `df`, scored against cleaned_data.
row_total = len(df)
column_total = len(df.columns)
print("Clean data by baseline-2:\n")
print(f"{column_total} columns, {row_total} rows")

missing_cells = df.isna().sum().sum()
print(f"{100*missing_cells/(row_total*column_total)} % of missing values.")

# Cells of the numerical columns that differ from the reference.
format_errors = count_diff(df, cleaned_data, num_cols)
print(f"{100*format_errors/(row_total*len(num_cols))} % of incorrect formats.")

# Cells of the string columns that differ from the reference.
spelling_errors = count_diff(df, cleaned_data, str_cols)
print(f"{100*spelling_errors/(row_total*len(str_cols))} % of misspellings.")

# Overall share of matching cells across all reference columns.
all_errors = count_diff(df, cleaned_data, list(cleaned_data.columns))
print(f"\n{100-100*all_errors/(row_total*len(cleaned_data.columns))} % of correct values.")

# Time: seconds needed to generate each piece of correct code.
# Iteration: number of revisions recorded for each piece of code; a value of 1
# is reported below as "accepted first-time".
Time = [82.79690059833229, 74.76355464849621, 341.520237961784, 278.81876710243523, 155.45799873024225, 197.6665946384892, 269.72608721535653, 76.24833997152746, 90.93382281530648, 173.23573657125235, 152.10720858257264, 20.26302487310022, 147.44158242456615, 106.10498212091625, 29.529840392060578, 100.52237946912646, 62.34795117378235, 114.19855739269406]
Iteration = [2, 1, 7, 3, 2, 3, 2, 1, 1, 2, 2, 3, 2, 2, 2, 1, 2, 1]

total_time = np.sum(Time)
total_iteration = np.sum(Iteration)
# Compare each recorded value with 1. The previous loop used the values as
# list indices (Iteration[i]), which tests the wrong entries.
first_time_cnt = sum(1 for revisions in Iteration if revisions == 1)
print(f"\nAverage time to generate correct code: {total_time/len(Time)} s.")
print(f"Average # of revises to generate correct code: {total_iteration/len(Iteration)}.")
print(f"Accepted first-time codes: {first_time_cnt}/{len(Iteration)}.")

Clean data by baseline-2:

61 columns, 145217 rows
63.06803486969247 % of missing values.
0.09137187794817411 % of incorrect formats.
2.3714701293772613 % of misspellings.

98.22658842837463 % of correct values.

Average time to generate correct code: 137.42686481566892 s.
Average # of revises to generate correct code: 2.1666666666666665.
Accepted first-time codes: 5/18.


## Baseline 3

In [25]:
# Load the baseline-3 output into `df` (rebinding the name), so the cells of
# this section score baseline 3 against cleaned_data.
df = pd.read_csv("../baseline_3/grain.csv")

In [26]:
print(f"\nColumns dropped correctly: {list(df.columns)==list(cleaned_data.columns)}")

# One row per compared column:
#   (column name, label printed for the cleaned result, description of the error).
checks = [
  ('Date', 'Clean Dates', 'incorrect dates'),
  ('SenderTaxID', 'Clean strings', 'incorrect SenderTaxID'),
  ('SenderNameEng', 'Clean strings', 'misspelled SenderNameEng'),
  ('SenderCompanyKeywordsEng', 'Clean strings', 'misspelled SenderCompanyKeywordsEng'),
  ('SenderNameRus', 'Clean strings', 'misspelled SenderNameRus'),
  ('SenderCompanyKeywordsRus', 'Clean strings', 'misspelled SenderCompanyKeywordsRus'),
  ('RecipientNameEng', 'Clean strings', 'misspelled RecipientNameEng'),
  ('RecipientCompanyKeywordsEng', 'Clean strings', 'misspelled RecipientCompanyKeywordsEng'),
  ('RecipientNameRus', 'Clean strings', 'misspelled RecipientNameRus'),
  ('RecipientCompanyKeywordsRus', 'Clean strings', 'misspelled RecipientCompanyKeywordsRus'),
  ('TradingCountryCode', 'Clean TradingCountryCode', 'misspelled TradingCountryCode'),
]
for column, label, problem in checks:
  # The "before" figure counts mismatches in raw_data, so it is always
  # normalised by len(raw_data); several of the former hand-written lines
  # divided it by len(df) instead.
  print(f"\nBefore cleaning: {100*count_diff(raw_data, cleaned_data, [column])/len(raw_data)} % of {problem}.")
  print(f"{label}: {100*count_diff(df, cleaned_data, [column])/len(df)} % of {problem}.")

# TradingCountryEng is reported for the cleaned output only (no "before" line).
print(f"Add new TradingCountryEng: {100*count_diff(df, cleaned_data, ['TradingCountryEng'])/len(df)} % of misspelled TradingCountryEng.")



Columns dropped correctly: True

Before cleaning: 0.0 % of incorrect dates.
Clean Dates: 0.0 % of incorrect dates.

Before cleaning: 45.46712850423848 % of incorrect SenderTaxID.
Clean strings: 1.3097640083461304 % of incorrect SenderTaxID.

Before cleaning: 58.101324225125154 % of misspelled SenderNameEng.
Clean strings: 1.5969204707437834 % of misspelled SenderNameEng.

Before cleaning: 100.0 % of misspelled SenderCompanyKeywordsEng.
Clean strings: 46.34305900824284 % of misspelled SenderCompanyKeywordsEng.

Before cleaning: 99.08550651783193 % of misspelled SenderNameRus.
Clean strings: 0.0 % of misspelled SenderNameRus.

Before cleaning: 100.0 % of misspelled SenderCompanyKeywordsRus.
Clean strings: 2.114766177513652 % of misspelled SenderCompanyKeywordsRus.

Before cleaning: 53.00756798446463 % of misspelled RecipientNameEng.
Clean strings: 14.472823429763732 % of misspelled RecipientNameEng.

Before cleaning: 100.0 % of misspelled RecipientCompanyKeywordsEng.
Clean strings: 29.5

In [27]:
# Summary of the baseline-3 output held in `df`, scored against cleaned_data.
row_total = len(df)
column_total = len(df.columns)
print("Clean data by baseline-3:\n")
print(f"{column_total} columns, {row_total} rows")

missing_cells = df.isna().sum().sum()
print(f"{100*missing_cells/(row_total*column_total)} % of missing values.")

# Cells of the numerical columns that differ from the reference.
format_errors = count_diff(df, cleaned_data, num_cols)
print(f"{100*format_errors/(row_total*len(num_cols))} % of incorrect formats.")

# Cells of the string columns that differ from the reference.
spelling_errors = count_diff(df, cleaned_data, str_cols)
print(f"{100*spelling_errors/(row_total*len(str_cols))} % of misspellings.")

# Overall share of matching cells across all reference columns.
all_errors = count_diff(df, cleaned_data, list(cleaned_data.columns))
print(f"\n{100-100*all_errors/(row_total*len(cleaned_data.columns))} % of correct values.")

# Time: seconds needed to generate each piece of correct code.
# Iteration: number of revisions recorded for each piece of code; a value of 1
# is reported below as "accepted first-time".
Time = [770.5227210589801, 38.336533748020884, 245.21183773397934, 544.7312183660106, 279.3751202769927, 663.6051533879945, 38.2295560730272, 147.3523692509625, 482.5904520439799, 394.9338829119806, 329.95859223697335, 28.52144992700778, 127.52958833199227, 278.80991287797224, 512.4843540079892, 952.2394041509833, 173.6739858740475, 651.7018963759765]
Iteration = [1, 1, 2, 2, 1, 1, 1, 3, 2, 1, 1, 1, 2, 1, 1, 2, 1, 1]

total_time = np.sum(Time)
total_iteration = np.sum(Iteration)
# Compare each recorded value with 1. The previous loop used the values as
# list indices (Iteration[i]), which tests the wrong entries and only
# happens to give the right total for this particular list.
first_time_cnt = sum(1 for revisions in Iteration if revisions == 1)
print(f"\nAverage time to generate correct code: {total_time/len(Time)} s.")
print(f"Average # of revises to generate correct code: {total_iteration/len(Iteration)}.")
print(f"Accepted first-time codes: {first_time_cnt}/{len(Iteration)}.")

Clean data by baseline-3:

61 columns, 145217 rows
66.22382083477784 % of missing values.
0.08186025052163315 % of incorrect formats.
6.645931414519114 % of misspellings.

95.07579216948022 % of correct values.

Average time to generate correct code: 369.9893349241595 s.
Average # of revises to generate correct code: 1.3888888888888888.
Accepted first-time codes: 12/18.
