Before running the evaluation experiments, install lux and import necessary libraries.

In [1]:
!pip install lux-api



In [68]:
from google.colab import output
output.enable_custom_widget_manager()

In [69]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [70]:
import lux
import pandas as pd
import numpy as np
import random

# Evaluation Metrics

First, define several helper functions to convert Lux visual recommendations to matplotlib plots, to calculate numpy array representations of these plots, and compute evaluation metrics between these arrays. Use functions in the sewar library to compute mse, rmse, and ssim between two images.

In [71]:
!pip install sewar



In [72]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from math import nan
from PIL import Image
from io import BytesIO

def get_image_from_figure(fig, dpi=180):
    buf = BytesIO()
    fig.savefig(buf, format="png", dpi=dpi)
    buf.seek(0)
    img_arr = np.frombuffer(buf.getvalue(), dtype=np.uint8)
    buf.close()
    img = Image.open(BytesIO(img_arr))
    return np.array(img)

def get_image_from_visual(visual):
    # Execute the code string
    code_str = visual.to_matplotlib()
    new_str = "import pandas as pd\n" + code_str
    exec_globals = {}
    exec(new_str, exec_globals)

    # Extract the fig object
    fig = exec_globals['fig']
    image = get_image_from_figure(fig)
    return image

In [73]:
from sewar.full_ref import mse, rmse, psnr, ssim

def compute_metrics_between_two_visuals(visual1, visual2):
    image1 = get_image_from_visual(visual1)
    image2 = get_image_from_visual(visual2)

    mse_value = mse(image1, image2)
    rmse_value = rmse(image1, image2)
    ssim_value = ssim(image1, image2)

    return mse_value, rmse_value, ssim_value

def compute_metrics_between_two_visual_lists(visual_list1, visual_list2):
    mse_values = []
    rmse_values = []
    ssim_values = []

    for i in range(len(visual_list1)):
        mse_value, rmse_value, ssim_value = compute_metrics_between_two_visuals(visual_list1[i], visual_list2[i])
        mse_values.append(mse_value)
        rmse_values.append(rmse_value)
        ssim_values.append(ssim_value)
        print(f"MSE= {mse_value}, RMSE= {rmse_value}, SSIM= {ssim_value}")

    return mse_values, rmse_values, ssim_values

def compute_average_metrics(mse_values, rmse_values, ssim_values):
    avg_mse_value = sum(mse_values) / len(mse_values)
    avg_rmse_value = sum(rmse_values) / len(rmse_values)
    avg_lower_ssim = sum(ssim[0] for ssim in ssim_values) / len(ssim_values)
    avg_upper_ssim = sum(ssim[1] for ssim in ssim_values) / len(ssim_values)

    return avg_mse_value, avg_rmse_value, (avg_lower_ssim, avg_upper_ssim)

# Dirty Dataset

Load the Airbnb dirty dataset, which consists of missing values, outliers, and duplicates.

In [74]:
DATA_PATH = "/content/drive/MyDrive/LuxVisualization/datasets/Airbnb/"
dirty_df = pd.read_csv(DATA_PATH + "dirty_train.csv")
dirty_df

Button(description='Toggle Pandas/Lux', layout=Layout(top='5px', width='140px'), style=ButtonStyle())

Output()

In [75]:
dirty_visuals = dirty_df.recommendation

In [76]:
dirty_df.isna().sum()

Button(description='Toggle Pandas/Lux', layout=Layout(top='5px', width='140px'), style=ButtonStyle())

Output()

# Cleaned Datasets

## Missing Values

We used five different cleaning methods to impute missing values: (1)-(4) mean, mediam, mode, KNN imputation for numeric missing values, respectively and mode imputation for categorical missing values, (5) Holoclean automatic cleaning.

### 1. Mean_Mode

In [77]:
import pandas as pd
from pandas.api.types import is_numeric_dtype, is_categorical_dtype

def impute_mv_mean_mode(df):
    """
    Impute missing numeric values with the mean and categorical missing values with the mode.

    :param df: Pandas DataFrame.
    :return: DataFrame with missing values imputed.
    """
    df_copy = df.copy()
    for column in df_copy.columns:
        if is_numeric_dtype(df_copy[column]):
            mean = df_copy[column].mean()
            df_copy[column].fillna(mean, inplace=True)
        elif is_categorical_dtype(df_copy[column]) or df_copy[column].dtype == 'object':
            mode = df_copy[column].mode()[0]
            df_copy[column].fillna(mode, inplace=True)
    return df_copy

In [78]:
mv_mean_mode_df = impute_mv_mean_mode(dirty_df)
mv_mean_mode_df

Button(description='Toggle Pandas/Lux', layout=Layout(top='5px', width='140px'), style=ButtonStyle())

Output()

In [79]:
mv_mean_mode_visuals = mv_mean_mode_df.recommendation

Compute the evaluation metrics between visualizations of dirty and cleaned datasets. The `rank` in this case is the rank of corresponding plot after data cleaning. For instance, if the plot has rank i before data cleaning (i.e. at the i^th position), then rank[i] is the new rank of that plot after data cleaning.

In [80]:
mse11 = []
rmse11 = []
ssim11 = []

rank = [1, 0, 2, 3, 6, 7, 4, 5, 8, 9, 10, 11, 12, 13, 14]
for i in range(15):
  print(f"============================ image_rank_in_dirty_dataset = {i} ============================")
  mse_value, rmse_value, ssim_value = compute_metrics_between_two_visuals(dirty_visuals['Correlation'][i], mv_mean_mode_visuals['Correlation'][rank[i]])
  print(f"MSE= {mse_value}, RMSE= {rmse_value}, SSIM= {ssim_value}" + "\n")
  mse11.append(mse_value)
  rmse11.append(rmse_value)
  ssim11.append(ssim_value)

MSE= 0.0, RMSE= 0.0, SSIM= (1.0, 1.0)

MSE= 0.0, RMSE= 0.0, SSIM= (1.0, 1.0)

MSE= 0.0005123939043209877, RMSE= 0.022636119462509197, SSIM= (0.9999997342177593, 0.9999997460082711)

MSE= 0.0005123939043209877, RMSE= 0.022636119462509197, SSIM= (0.9999997342177593, 0.9999997460082711)

MSE= 2.12817684220679, RMSE= 1.4588272146511354, SSIM= (0.9996267839276831, 0.9996376861556007)

MSE= 2.12817684220679, RMSE= 1.4588272146511354, SSIM= (0.9996267839276831, 0.9996376861556007)

MSE= 1.7046064211998457, RMSE= 1.3056057679099942, SSIM= (0.9996528440323982, 0.9996598921058859)

MSE= 1.8335262345679013, RMSE= 1.354077632400706, SSIM= (0.9996676607332482, 0.9996764708563354)

MSE= 1.7477816358024691, RMSE= 1.322036926792315, SSIM= (0.9996706002579632, 0.9996789773580725)

MSE= 1.3966411072530864, RMSE= 1.1817957129948842, SSIM= (0.9996517977191584, 0.9996587403568292)

MSE= 0.0, RMSE= 0.0, SSIM= (1.0, 1.0)

MSE= 1.8335262345679013, RMSE= 1.354077632400706, SSIM= (0.9996676607332482, 0.99967647

In [81]:
mse12 = []
rmse12 = []
ssim12 = []

rank = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 15, 16, 17, 20, 18, 21,
        19, 23, 24, 14, 25, 26, 27, 28, 22, 29, 30, 31, 32]
for i in range(33):
  print(f"============================ image_rank_in_dirty_dataset = {i} ============================")
  mse_value, rmse_value, ssim_value = compute_metrics_between_two_visuals(dirty_visuals['Distribution'][i], mv_mean_mode_visuals['Distribution'][rank[i]])
  print(f"MSE= {mse_value}, RMSE= {rmse_value}, SSIM= {ssim_value}" + "\n")
  mse12.append(mse_value)
  rmse12.append(rmse_value)
  ssim12.append(ssim_value)

MSE= 0.0, RMSE= 0.0, SSIM= (1.0, 1.0)

MSE= 252.5175723480099, RMSE= 15.890801501120386, SSIM= (0.9822303731678863, 0.9822608917063255)

MSE= 1.2876390998746141, RMSE= 1.1347418648638175, SSIM= (0.9997824343925665, 0.9997846184983438)

MSE= 0.0, RMSE= 0.0, SSIM= (1.0, 1.0)

MSE= 0.0, RMSE= 0.0, SSIM= (1.0, 1.0)

MSE= 5.1505563994984565, RMSE= 2.269483729727635, SSIM= (0.9993621868708984, 0.9993713902409065)

MSE= 0.0, RMSE= 0.0, SSIM= (1.0, 1.0)

MSE= 0.0, RMSE= 0.0, SSIM= (1.0, 1.0)

MSE= 0.0, RMSE= 0.0, SSIM= (1.0, 1.0)

MSE= 25.752781997492285, RMSE= 5.0747198935007525, SSIM= (0.9985887461903799, 0.9987519019588331)

MSE= 0.0, RMSE= 0.0, SSIM= (1.0, 1.0)

MSE= 0.0, RMSE= 0.0, SSIM= (1.0, 1.0)

MSE= 0.0, RMSE= 0.0, SSIM= (1.0, 1.0)

MSE= 0.0, RMSE= 0.0, SSIM= (1.0, 1.0)

MSE= 0.0, RMSE= 0.0, SSIM= (1.0, 1.0)

MSE= 0.0, RMSE= 0.0, SSIM= (1.0, 1.0)

MSE= 3.806932990933642, RMSE= 1.9511363332513805, SSIM= (0.9994850813273203, 0.9994928920691588)

MSE= 0.0, RMSE= 0.0, SSIM= (1.0, 1.0)

M

In [82]:
import json

mv_mean_mode_dict1 = {"mse": mse11, "rmse": rmse11, "ssim": ssim11}
file_path = '/content/drive/MyDrive/LuxVisualization/metrics_airbnb/mv_mean_mode_metrics1.json'
with open(file_path, 'w') as file:
    json.dump(mv_mean_mode_dict1, file)

mv_mean_mode_dict2 = {"mse": mse12, "rmse": rmse12, "ssim": ssim12}
file_path = '/content/drive/MyDrive/LuxVisualization/metrics_airbnb/mv_mean_mode_metrics2.json'
with open(file_path, 'w') as file:
    json.dump(mv_mean_mode_dict2, file)

print("Dictionary saved to Google Drive successfully.")

Dictionary saved to Google Drive successfully.


### 2. Median_Mode

In [83]:
import pandas as pd
from pandas.api.types import is_numeric_dtype, is_categorical_dtype

def impute_mv_median_mode(df):
    """
    Impute missing numeric values with the median and categorical missing values with the mode.

    :param df: Pandas DataFrame.
    :return: DataFrame with missing values imputed.
    """
    df_copy = df.copy()
    for column in df_copy.columns:
        if is_numeric_dtype(df_copy[column]):
            median = df_copy[column].median()
            df_copy[column].fillna(median, inplace=True)
        elif is_categorical_dtype(df_copy[column]) or df_copy[column].dtype == 'object':
            mode = df_copy[column].mode()[0]
            df_copy[column].fillna(mode, inplace=True)
    return df_copy

In [84]:
mv_median_mode_df = impute_mv_median_mode(dirty_df)
mv_median_mode_df

Button(description='Toggle Pandas/Lux', layout=Layout(top='5px', width='140px'), style=ButtonStyle())

Output()

In [85]:
mv_median_mode_visuals = mv_median_mode_df.recommendation

In [86]:
mse21 = []
rmse21 = []
ssim21 = []

rank = [1, 0, 2, 3, 6, 7, 4, 5, 8, 9, 10, 11, 12, 13, 14]
for i in range(15):
  print(f"============================ image_rank_in_dirty_dataset = {i} ============================")
  mse_value, rmse_value, ssim_value = compute_metrics_between_two_visuals(dirty_visuals['Correlation'][i], mv_median_mode_visuals['Correlation'][rank[i]])
  print(f"MSE= {mse_value}, RMSE= {rmse_value}, SSIM= {ssim_value}" + "\n")
  mse21.append(mse_value)
  rmse21.append(rmse_value)
  ssim21.append(ssim_value)

MSE= 0.0, RMSE= 0.0, SSIM= (1.0, 1.0)

MSE= 0.0, RMSE= 0.0, SSIM= (1.0, 1.0)

MSE= 0.0005123939043209877, RMSE= 0.022636119462509197, SSIM= (0.9999997342177593, 0.9999997460082711)

MSE= 0.0005123939043209877, RMSE= 0.022636119462509197, SSIM= (0.9999997342177593, 0.9999997460082711)

MSE= 2.12817684220679, RMSE= 1.4588272146511354, SSIM= (0.9996267839276831, 0.9996376861556007)

MSE= 2.12817684220679, RMSE= 1.4588272146511354, SSIM= (0.9996267839276831, 0.9996376861556007)

MSE= 1.7046064211998457, RMSE= 1.3056057679099942, SSIM= (0.9996528440323982, 0.9996598921058859)

MSE= 1.8335262345679013, RMSE= 1.354077632400706, SSIM= (0.9996676607332482, 0.9996764708563354)

MSE= 1.7477816358024691, RMSE= 1.322036926792315, SSIM= (0.9996706002579632, 0.9996789773580725)

MSE= 1.3966411072530864, RMSE= 1.1817957129948842, SSIM= (0.9996517977191584, 0.9996587403568292)

MSE= 0.0, RMSE= 0.0, SSIM= (1.0, 1.0)

MSE= 1.8335262345679013, RMSE= 1.354077632400706, SSIM= (0.9996676607332482, 0.99967647

In [87]:
mse22 = []
rmse22 = []
ssim22 = []

rank = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 16, 18, 19, 21, 20, 22,
        17, 24, 15, 14, 25, 26, 27, 28, 23, 29, 30, 31, 32]
for i in range(33):
  print(f"============================ image_rank_in_dirty_dataset = {i} ============================")
  mse_value, rmse_value, ssim_value = compute_metrics_between_two_visuals(dirty_visuals['Distribution'][i], mv_median_mode_visuals['Distribution'][rank[i]])
  print(f"MSE= {mse_value}, RMSE= {rmse_value}, SSIM= {ssim_value}" + "\n")
  mse22.append(mse_value)
  rmse22.append(rmse_value)
  ssim22.append(ssim_value)

MSE= 0.0, RMSE= 0.0, SSIM= (1.0, 1.0)

MSE= 252.5175723480099, RMSE= 15.890801501120386, SSIM= (0.9822303731678863, 0.9822608917063255)

MSE= 34.917429982880016, RMSE= 5.909097222324237, SSIM= (0.9970452411625828, 0.997054157778495)

MSE= 0.0, RMSE= 0.0, SSIM= (1.0, 1.0)

MSE= 0.0, RMSE= 0.0, SSIM= (1.0, 1.0)

MSE= 118.75486020688658, RMSE= 10.897470358155905, SSIM= (0.9913784966235689, 0.9914014627694159)

MSE= 0.0, RMSE= 0.0, SSIM= (1.0, 1.0)

MSE= 0.0, RMSE= 0.0, SSIM= (1.0, 1.0)

MSE= 0.0, RMSE= 0.0, SSIM= (1.0, 1.0)

MSE= 348.6508688593107, RMSE= 18.672195073405558, SSIM= (0.9760090542386525, 0.9760542525303011)

MSE= 0.0, RMSE= 0.0, SSIM= (1.0, 1.0)

MSE= 0.0, RMSE= 0.0, SSIM= (1.0, 1.0)

MSE= 0.0, RMSE= 0.0, SSIM= (1.0, 1.0)

MSE= 0.0, RMSE= 0.0, SSIM= (1.0, 1.0)

MSE= 0.0, RMSE= 0.0, SSIM= (1.0, 1.0)

MSE= 0.0, RMSE= 0.0, SSIM= (1.0, 1.0)

MSE= 69.98203255610211, RMSE= 8.365526436280152, SSIM= (0.994582325745882, 0.9946079503590893)

MSE= 0.0, RMSE= 0.0, SSIM= (1.0, 1.0)

MSE= 

In [88]:
import json

mv_median_mode_dict1 = {"mse": mse21, "rmse": rmse21, "ssim": ssim21}
file_path = '/content/drive/MyDrive/LuxVisualization/metrics_airbnb/mv_median_mode_metrics1.json'
with open(file_path, 'w') as file:
    json.dump(mv_median_mode_dict1, file)

mv_median_mode_dict2 = {"mse": mse22, "rmse": rmse22, "ssim": ssim22}
file_path = '/content/drive/MyDrive/LuxVisualization/metrics_airbnb/mv_median_mode_metrics2.json'
with open(file_path, 'w') as file:
    json.dump(mv_median_mode_dict2, file)

print("Dictionary saved to Google Drive successfully.")

Dictionary saved to Google Drive successfully.


### 3. Mode_Mode

In [89]:
import pandas as pd

def impute_mv_mode_mode(df):
    """
    Impute missing numeric values with the mode and categorical missing values with the mode.

    :param df: Pandas DataFrame.
    :return: DataFrame with missing values imputed.
    """
    df_copy = df.copy()
    for column in df_copy.columns:
        mode = df_copy[column].mode()[0]
        df_copy[column].fillna(mode, inplace=True)
    return df_copy

In [90]:
mv_mode_mode_df = impute_mv_mode_mode(dirty_df)
mv_mode_mode_df

Button(description='Toggle Pandas/Lux', layout=Layout(top='5px', width='140px'), style=ButtonStyle())

Output()

In [91]:
mv_mode_mode_visuals = mv_mode_mode_df.recommendation

In [94]:
mse31 = []
rmse31 = []
ssim31 = []

rank = [1, 0, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]
for i in range(15):
  print(f"============================ image_rank_in_dirty_dataset = {i} ============================")
  mse_value, rmse_value, ssim_value = compute_metrics_between_two_visuals(dirty_visuals['Correlation'][i], mv_mode_mode_visuals['Correlation'][rank[i]])
  print(f"MSE= {mse_value}, RMSE= {rmse_value}, SSIM= {ssim_value}" + "\n")
  mse31.append(mse_value)
  rmse31.append(rmse_value)
  ssim31.append(ssim_value)

MSE= 0.0, RMSE= 0.0, SSIM= (1.0, 1.0)

MSE= 0.0038150740258487653, RMSE= 0.0617662855111813, SSIM= (0.9999993808153534, 0.9999995281779429)

MSE= 0.0061773606288580245, RMSE= 0.07859618711399444, SSIM= (0.9999960781508699, 0.9999964382371348)

MSE= 0.0061773606288580245, RMSE= 0.07859618711399444, SSIM= (0.9999960781508699, 0.9999964382371348)

MSE= 0.2838179976851852, RMSE= 0.5327457157830414, SSIM= (0.9999804165798292, 0.9999867496322451)

MSE= 0.2838179976851852, RMSE= 0.5327457157830414, SSIM= (0.9999804165798292, 0.9999867496322451)

MSE= 0.006580745241769547, RMSE= 0.08112179264395941, SSIM= (0.9999947660183919, 0.9999953387781966)

MSE= 0.006499616206918724, RMSE= 0.08062019726420126, SSIM= (0.9999942548147982, 0.9999947784262551)

MSE= 0.009127644354423868, RMSE= 0.09553870605374487, SSIM= (0.9999942924183539, 0.9999946930070324)

MSE= 0.00251977237654321, RMSE= 0.05019733435694778, SSIM= (0.9999989287648916, 0.9999992241934281)

MSE= 0.0, RMSE= 0.0, SSIM= (1.0, 1.0)

MSE= 0.00

In [95]:
mse32 = []
rmse32 = []
ssim32 = []

rank = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 15, 17, 18, 20, 16, 21,
        19, 22, 23, 14, 24, 26, 27, 28, 32, 25, 29, 30, 31]
for i in range(33):
  print(f"============================ image_rank_in_dirty_dataset = {i} ============================")
  mse_value, rmse_value, ssim_value = compute_metrics_between_two_visuals(dirty_visuals['Distribution'][i], mv_mode_mode_visuals['Distribution'][rank[i]])
  print(f"MSE= {mse_value}, RMSE= {rmse_value}, SSIM= {ssim_value}" + "\n")
  mse32.append(mse_value)
  rmse32.append(rmse_value)
  ssim32.append(ssim_value)

MSE= 0.0, RMSE= 0.0, SSIM= (1.0, 1.0)

MSE= 252.5175723480099, RMSE= 15.890801501120386, SSIM= (0.9822303731678863, 0.9822608917063255)

MSE= 0.0, RMSE= 0.0, SSIM= (1.0, 1.0)

MSE= 0.0, RMSE= 0.0, SSIM= (1.0, 1.0)

MSE= 0.0, RMSE= 0.0, SSIM= (1.0, 1.0)

MSE= 5.1505563994984565, RMSE= 2.269483729727635, SSIM= (0.9993621868708984, 0.9993713902409065)

MSE= 0.0, RMSE= 0.0, SSIM= (1.0, 1.0)

MSE= 0.0, RMSE= 0.0, SSIM= (1.0, 1.0)

MSE= 0.0, RMSE= 0.0, SSIM= (1.0, 1.0)

MSE= 348.6508688593107, RMSE= 18.672195073405558, SSIM= (0.9760090542386525, 0.9760542525303011)

MSE= 0.0, RMSE= 0.0, SSIM= (1.0, 1.0)

MSE= 0.0, RMSE= 0.0, SSIM= (1.0, 1.0)

MSE= 0.0, RMSE= 0.0, SSIM= (1.0, 1.0)

MSE= 0.0, RMSE= 0.0, SSIM= (1.0, 1.0)

MSE= 0.0, RMSE= 0.0, SSIM= (1.0, 1.0)

MSE= 0.0, RMSE= 0.0, SSIM= (1.0, 1.0)

MSE= 69.98203255610211, RMSE= 8.365526436280152, SSIM= (0.994582325745882, 0.9946079503590893)

MSE= 0.0, RMSE= 0.0, SSIM= (1.0, 1.0)

MSE= 745.3578076774692, RMSE= 27.301241870608546, SSIM= (0.96415

In [96]:
import json

mv_mode_mode_dict1 = {"mse": mse31, "rmse": rmse31, "ssim": ssim31}
file_path = '/content/drive/MyDrive/LuxVisualization/metrics_airbnb/mv_mode_mode_metrics1.json'
with open(file_path, 'w') as file:
    json.dump(mv_mode_mode_dict1, file)

mv_mode_mode_dict2 = {"mse": mse32, "rmse": rmse32, "ssim": ssim32}
file_path = '/content/drive/MyDrive/LuxVisualization/metrics_airbnb/mv_mode_mode_metrics2.json'
with open(file_path, 'w') as file:
    json.dump(mv_mode_mode_dict2, file)

print("Dictionary saved to Google Drive successfully.")

Dictionary saved to Google Drive successfully.


### 4. KNNImputer_Mode

In [97]:
from sklearn.impute import KNNImputer
from sklearn.preprocessing import LabelEncoder

def impute_mv_knn_mode(df):
    """
    Impute missing numeric values with knn and categorical missing values with the mode.

    :param df: Pandas DataFrame.
    :return: DataFrame with missing values imputed.
    """
    df_copy = df.copy()

    # Separate numeric and categorical columns
    numeric_cols = df_copy.select_dtypes(include=[np.number]).columns
    categorical_cols = df_copy.select_dtypes(include=['object']).columns

    # Impute numeric columns with KNNImputer
    imputer = KNNImputer(n_neighbors=5)
    df_copy[numeric_cols] = imputer.fit_transform(df_copy[numeric_cols])

    # Impute categorical columns with mode
    for col in categorical_cols:
        mode = df_copy[col].mode()[0]
        df_copy[col].fillna(mode, inplace=True)

    return df_copy

In [98]:
mv_knn_mode_df = impute_mv_knn_mode(dirty_df)
mv_knn_mode_df

Button(description='Toggle Pandas/Lux', layout=Layout(top='5px', width='140px'), style=ButtonStyle())

Output()

In [99]:
mv_knn_mode_visuals = mv_knn_mode_df.recommendation

In [100]:
mse41 = []
rmse41 = []
ssim41 = []

for i in range(15):
  print(f"============================ image_rank_in_dirty_dataset = {i} ============================")
  mse_value, rmse_value, ssim_value = compute_metrics_between_two_visuals(dirty_visuals['Correlation'][i], mv_knn_mode_visuals['Correlation'][i])
  print(f"MSE= {mse_value}, RMSE= {rmse_value}, SSIM= {ssim_value}" + "\n")
  mse41.append(mse_value)
  rmse41.append(rmse_value)
  ssim41.append(ssim_value)

MSE= 0.0, RMSE= 0.0, SSIM= (1.0, 1.0)

MSE= 0.007694950810185185, RMSE= 0.08772086872680403, SSIM= (0.9999991300958958, 0.9999992873039215)

MSE= 0.0022876880787037037, RMSE= 0.04782978234012469, SSIM= (0.9999996429902887, 0.999999759162714)

MSE= 0.0022876880787037037, RMSE= 0.04782978234012469, SSIM= (0.9999996429902887, 0.999999759162714)

MSE= 2.0395028071148404, RMSE= 1.4281116227784298, SSIM= (0.9996752990287001, 0.9996855760215934)

MSE= 2.0395028071148404, RMSE= 1.4281116227784298, SSIM= (0.9996752990287001, 0.9996855760215934)

MSE= 1.8048773871527777, RMSE= 1.3434572517027765, SSIM= (0.9997709980107814, 0.9997842198028541)

MSE= 1.757129308127572, RMSE= 1.3255675418957618, SSIM= (0.9997633636468473, 0.9997748518786046)

MSE= 1.6749574009773662, RMSE= 1.2942014530116115, SSIM= (0.9997655796219231, 0.9997764967793881)

MSE= 1.3384477277842077, RMSE= 1.1569130165160246, SSIM= (0.9997643540005653, 0.9997731639364724)

MSE= 0.0, RMSE= 0.0, SSIM= (1.0, 1.0)

MSE= 0.0742227687757201

In [101]:
mse42 = []
rmse42 = []
ssim42 = []

rank = [0, 1, 2, 4, 6, 7, 8, 9, 10, 11, 12, 13, 16, 17, 18, 21, 22, 24, 27, 25,
        23, 28, 20, 19, 29, 31, 32, 33, 26, 30, 34, 35, 36]
for i in range(33):
  print(f"============================ image_rank_in_dirty_dataset = {i} ============================")
  mse_value, rmse_value, ssim_value = compute_metrics_between_two_visuals(dirty_visuals['Distribution'][i], mv_knn_mode_visuals['Distribution'][rank[i]])
  print(f"MSE= {mse_value}, RMSE= {rmse_value}, SSIM= {ssim_value}" + "\n")
  mse42.append(mse_value)
  rmse42.append(rmse_value)
  ssim42.append(ssim_value)

MSE= 0.0, RMSE= 0.0, SSIM= (1.0, 1.0)

MSE= 245.3955525213799, RMSE= 15.66510620842961, SSIM= (0.9828225132400444, 0.9828516584783495)

MSE= 34.917429982880016, RMSE= 5.909097222324237, SSIM= (0.9970452411625828, 0.997054157778495)

MSE= 0.0, RMSE= 0.0, SSIM= (1.0, 1.0)

MSE= 0.0, RMSE= 0.0, SSIM= (1.0, 1.0)

MSE= 93.42363823784723, RMSE= 9.665590423654793, SSIM= (0.9930079415383601, 0.9930278638942013)

MSE= 0.0, RMSE= 0.0, SSIM= (1.0, 1.0)

MSE= 0.0, RMSE= 0.0, SSIM= (1.0, 1.0)

MSE= 0.0, RMSE= 0.0, SSIM= (1.0, 1.0)

MSE= 267.3633056138278, RMSE= 16.351247830481555, SSIM= (0.9817314077035947, 0.9817682305919553)

MSE= 0.0, RMSE= 0.0, SSIM= (1.0, 1.0)

MSE= 0.0, RMSE= 0.0, SSIM= (1.0, 1.0)

MSE= 0.0, RMSE= 0.0, SSIM= (1.0, 1.0)

MSE= 0.0, RMSE= 0.0, SSIM= (1.0, 1.0)

MSE= 0.0, RMSE= 0.0, SSIM= (1.0, 1.0)

MSE= 0.0, RMSE= 0.0, SSIM= (1.0, 1.0)

MSE= 3.8629172996238426, RMSE= 1.9654305634195888, SSIM= (0.9995725891054347, 0.9995784834905741)

MSE= 0.0, RMSE= 0.0, SSIM= (1.0, 1.0)

MSE= 

In [102]:
import json

mv_knn_mode_dict1 = {"mse": mse41, "rmse": rmse41, "ssim": ssim41}
file_path = '/content/drive/MyDrive/LuxVisualization/metrics_airbnb/mv_knn_mode_metrics1.json'
with open(file_path, 'w') as file:
    json.dump(mv_knn_mode_dict1, file)

mv_knn_mode_dict2 = {"mse": mse42, "rmse": rmse42, "ssim": ssim42}
file_path = '/content/drive/MyDrive/LuxVisualization/metrics_airbnb/mv_knn_mode_metrics2.json'
with open(file_path, 'w') as file:
    json.dump(mv_knn_mode_dict2, file)

print("Dictionary saved to Google Drive successfully.")

Dictionary saved to Google Drive successfully.


### 5. HoloClean

In [103]:
mv_hc_df = pd.read_csv(DATA_PATH + "Holoclean_mv_clean.csv")
mv_hc_df

Button(description='Toggle Pandas/Lux', layout=Layout(top='5px', width='140px'), style=ButtonStyle())

Output()

In [104]:
mv_hc_visuals = mv_hc_df.recommendation

In [105]:
mse51 = []
rmse51 = []
ssim51 = []

rank = [0, 1, 2, 3, 7, 5, 4, 6, 8, 9, 10, 11, 12, 14, 3]
for i in range(15):
  print(f"============================ image_rank_in_dirty_dataset = {i} ============================")
  mse_value, rmse_value, ssim_value = compute_metrics_between_two_visuals(dirty_visuals['Correlation'][i], mv_hc_visuals['Correlation'][i])
  print(f"MSE= {mse_value}, RMSE= {rmse_value}, SSIM= {ssim_value}" + "\n")
  mse51.append(mse_value)
  rmse51.append(rmse_value)
  ssim51.append(ssim_value)

MSE= 0.3906154554076646, RMSE= 0.6249923642794883, SSIM= (0.9998506993438807, 0.9998581058367573)

MSE= 0.10928884749549897, RMSE= 0.3305886378802196, SSIM= (0.9998362012763216, 0.9998380892663545)

MSE= 4.132222493489583, RMSE= 2.0327868785215983, SSIM= (0.998531051286949, 0.9985742683491654)

MSE= 3.6741664556809415, RMSE= 1.916811533688417, SSIM= (0.9989331016222781, 0.9989746996127532)

MSE= 1195.137135195634, RMSE= 34.57075549066919, SSIM= (0.9012393983460946, 0.9020518469938034)

MSE= 3.01092730235661, RMSE= 1.7352023808065185, SSIM= (0.999127656226533, 0.999171670871042)

MSE= 1189.3918625317483, RMSE= 34.487560982646315, SSIM= (0.9053668649716836, 0.906428761631746)

MSE= 803.6542408633134, RMSE= 28.34879610959367, SSIM= (0.9304196974157842, 0.9318459136387895)

MSE= 4.04255054615162, RMSE= 2.010609496185577, SSIM= (0.9988378661724677, 0.9988842631603067)

MSE= 14.40076386879501, RMSE= 3.794833839418402, SSIM= (0.9974064657563446, 0.9977236253438171)

MSE= 1.7662951308513375, R

In [107]:
mse52 = []
rmse52 = []
ssim52 = []

rank = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 ,16, 17, 18, 20, 19,
          15, 21, 29, 23, 24, 25, 28, 26, 22, 27, 30, 31, 32]
for i in range(33):
  print(f"============================ image_rank_in_dirty_dataset = {i} ============================")
  mse_value, rmse_value, ssim_value = compute_metrics_between_two_visuals(dirty_visuals['Distribution'][i], mv_hc_visuals['Distribution'][rank[i]])
  print(f"MSE= {mse_value}, RMSE= {rmse_value}, SSIM= {ssim_value}" + "\n")
  mse52.append(mse_value)
  rmse52.append(rmse_value)
  ssim52.append(ssim_value)

MSE= 371.3743564935378, RMSE= 19.27107564443505, SSIM= (0.9724447833779476, 0.9724617350227834)

MSE= 434.2161141854745, RMSE= 20.837852916878806, SSIM= (0.9680650935926617, 0.9680886230925506)

MSE= 377.18439022111306, RMSE= 19.421235548262963, SSIM= (0.9719045437126577, 0.9719243099934672)

MSE= 335.64103114752123, RMSE= 18.320508484960815, SSIM= (0.9748796157629418, 0.9748975839747772)

MSE= 343.9570684236754, RMSE= 18.546079597146008, SSIM= (0.9746458667446636, 0.9746649246280328)

MSE= 398.6187957638085, RMSE= 19.965440034314508, SSIM= (0.9703592784156525, 0.9703928989319457)

MSE= 412.1871639298804, RMSE= 20.30239305919084, SSIM= (0.969426984543339, 0.9694654326277466)

MSE= 403.0320512434092, RMSE= 20.07565817709121, SSIM= (0.9706034047670465, 0.9706324547830936)

MSE= 407.2287401740934, RMSE= 20.179909320264386, SSIM= (0.9704554464794581, 0.9704840783574662)

MSE= 405.26978393052343, RMSE= 20.131313517267657, SSIM= (0.9708136978214352, 0.9708435111189451)

MSE= 399.023768044303

In [108]:
import json

mv_hc_dict1 = {"mse": mse51, "rmse": rmse51, "ssim": ssim51}
file_path = '/content/drive/MyDrive/LuxVisualization/metrics_airbnb/mv_hc_metrics1.json'
with open(file_path, 'w') as file:
    json.dump(mv_hc_dict1, file)

mv_hc_dict2 = {"mse": mse52, "rmse": rmse52, "ssim": ssim52}
file_path = '/content/drive/MyDrive/LuxVisualization/metrics_airbnb/mv_hc_metrics2.json'
with open(file_path, 'w') as file:
    json.dump(mv_hc_dict2, file)

print("Dictionary saved to Google Drive successfully.")

Dictionary saved to Google Drive successfully.


## Outliers

Next, we detect outliers with the Z-score method (i.e. those values that are >=3 standard deviation away from mean) and replace outliers with mean, median, mode, respectively. We also used Holoclean for automatic data cleaning.

In [109]:
import pandas as pd
import numpy as np
from scipy.stats import mode

def replace_outliers_with_mean(df):
    """
    Detect and replace outliers in numeric columns of a DataFrame.
    Outliers are defined as values more than 3 standard deviations from the mean.

    :param df: Pandas DataFrame
    :return: DataFrame with outliers replaced by mean
    """
    df_copy = df.copy()
    for col in df_copy.select_dtypes(include=[np.number]):
        mean = df_copy[col].mean()
        std = df_copy[col].std()
        outliers = (df_copy[col] - mean).abs() >= 3 * std
        df_copy.loc[outliers, col] = mean
    return df_copy

def replace_outliers_with_median(df):
    """
    Detect and replace outliers in numeric columns of a DataFrame.
    Outliers are defined as values more than 3 standard deviations from the mean.

    :param df: Pandas DataFrame
    :return: DataFrame with outliers replaced by median
    """
    df_copy = df.copy()
    for col in df_copy.select_dtypes(include=[np.number]):
        mean = df_copy[col].mean()
        std = df_copy[col].std()
        median = df_copy[col].median()
        outliers = (df_copy[col] - mean).abs() >= 3 * std
        df_copy.loc[outliers, col] = median
    return df_copy

def replace_outliers_with_mode(df):
    """
    Detect and replace outliers in numeric columns of a DataFrame.
    Outliers are defined as values more than 3 standard deviations from the mean.

    :param df: Pandas DataFrame
    :return: DataFrame with outliers replaced by mode
    """
    df_copy = df.copy()
    for col in df_copy.select_dtypes(include=[np.number]):
        mean = df_copy[col].mean()
        std = df_copy[col].std()
        col_mode = mode(df_copy[col])[0]  # Mode of the column
        outliers = (df_copy[col] - mean).abs() >= 3 * std
        df_copy.loc[outliers, col] = col_mode
    return df_copy

### 6. Mean

In [110]:
outlier_mean_df = replace_outliers_with_mean(dirty_df)
outlier_mean_df

Button(description='Toggle Pandas/Lux', layout=Layout(top='5px', width='140px'), style=ButtonStyle())

Output()

In [111]:
outlier_mean_visuals = outlier_mean_df.recommendation

In [112]:
mse61 = []
rmse61 = []
ssim61 = []

rank = [0, 1, 5, 6, 3, 2, 8, 9, 4, 10, 12, 7, 11, 13, 14]
for i in range(15):
  print(f"============================ image_rank_in_dirty_dataset = {i} ============================")
  mse_value, rmse_value, ssim_value = compute_metrics_between_two_visuals(dirty_visuals['Correlation'][i], outlier_mean_visuals['Correlation'][rank[i]])
  print(f"MSE= {mse_value}, RMSE= {rmse_value}, SSIM= {ssim_value}" + "\n")
  mse61.append(mse_value)
  rmse61.append(rmse_value)
  ssim61.append(ssim_value)

MSE= 540.3897956251608, RMSE= 23.246285630723047, SSIM= (0.9513048530008796, 0.9518831290236542)

MSE= 454.26256595815653, RMSE= 21.313436277572805, SSIM= (0.9544658402421587, 0.9546201176158)

MSE= 599.1855870627572, RMSE= 24.47826764831934, SSIM= (0.9263591345574678, 0.92709423608935)

MSE= 599.1855870627572, RMSE= 24.47826764831934, SSIM= (0.9263591345574678, 0.92709423608935)

MSE= 632.9622453603234, RMSE= 25.15874093352693, SSIM= (0.9222663773913151, 0.9232308020337507)

MSE= 661.3553027243281, RMSE= 25.716829173215118, SSIM= (0.9202720179347603, 0.9212657176764356)

MSE= 428.24997588734567, RMSE= 20.694201503980423, SSIM= (0.945182911104629, 0.9461809288916527)

MSE= 427.5154697747878, RMSE= 20.676447223224493, SSIM= (0.9466508479573715, 0.947729991024049)

MSE= 526.4303468303916, RMSE= 22.94406997091823, SSIM= (0.9336406139478471, 0.9349386179521237)

MSE= 713.7439226064172, RMSE= 26.71598627425941, SSIM= (0.9139575943954275, 0.9151355714935823)

MSE= 620.1125277295524, RMSE= 24

In [113]:
mse62 = []
rmse62 = []
ssim62 = []

rank = [0, 3, 1, 17, 10, 7, 6, 2, 10, 8, 11, 12, 5, 9, 15, 4, 16, 13, 29, 21,
        18, 19, 31, 20, 26, 27, 22, 23, 24, 25, 28, 32, 30]
for i in range(33):
  print(f"============================ image_rank_in_dirty_dataset = {i} ============================")
  mse_value, rmse_value, ssim_value = compute_metrics_between_two_visuals(dirty_visuals['Distribution'][i], outlier_mean_visuals['Distribution'][rank[i]])
  print(f"MSE= {mse_value}, RMSE= {rmse_value}, SSIM= {ssim_value}" + "\n")
  mse62.append(mse_value)
  rmse62.append(rmse_value)
  ssim62.append(ssim_value)

MSE= 753.6436604315361, RMSE= 27.452571107849554, SSIM= (0.948109946425723, 0.9493135655424129)

MSE= 1370.0734747741449, RMSE= 37.0145035732501, SSIM= (0.9326560563625932, 0.9413434093884346)

MSE= 809.2778779959973, RMSE= 28.447809722296675, SSIM= (0.954523409985059, 0.9572796966060304)

MSE= 2567.009823394901, RMSE= 50.66566710697592, SSIM= (0.903734648422047, 0.9250463058712453)

MSE= 3056.7908940570346, RMSE= 55.28825276726544, SSIM= (0.8876229225250968, 0.9140608786351893)

MSE= 1551.4022475505562, RMSE= 39.38784390583669, SSIM= (0.9385191677326228, 0.949931021963621)

MSE= 1513.7296541943963, RMSE= 38.90667878648082, SSIM= (0.9272801835288286, 0.9339078749902053)

MSE= 0.0, RMSE= 0.0, SSIM= (1.0, 1.0)

MSE= 2359.7362336837705, RMSE= 48.577116358258344, SSIM= (0.9072872801006745, 0.9256138542654789)

MSE= 2823.6954084482704, RMSE= 53.138455081496964, SSIM= (0.8917065533051453, 0.9140822831933304)

MSE= 4071.917416921859, RMSE= 63.81157745207259, SSIM= (0.8643241884393766, 0.90354

In [114]:
import json

outlier_mean_dict1 = {"mse": mse61, "rmse": rmse61, "ssim": ssim61}
file_path = '/content/drive/MyDrive/LuxVisualization/metrics_airbnb/outlier_mean_metrics1.json'
with open(file_path, 'w') as file:
    json.dump(outlier_mean_dict1, file)

outlier_mean_dict2 = {"mse": mse62, "rmse": rmse62, "ssim": ssim62}
file_path = '/content/drive/MyDrive/LuxVisualization/metrics_airbnb/outlier_mean_metrics2.json'
with open(file_path, 'w') as file:
    json.dump(outlier_mean_dict2, file)

print("Dictionary saved to Google Drive successfully.")

Dictionary saved to Google Drive successfully.


### 7. Median

In [115]:
outlier_median_df = replace_outliers_with_median(dirty_df)
outlier_median_df

Button(description='Toggle Pandas/Lux', layout=Layout(top='5px', width='140px'), style=ButtonStyle())

Output()

In [116]:
outlier_median_visuals = outlier_median_df.recommendation

In [117]:
mse71 = []
rmse71 = []
ssim71 = []

rank = [0, 1, 5, 6, 3, 2, 8, 9, 4, 10, 14, 7, 11, 12, 13]
for i in range(15):
  print(f"============================ image_rank_in_dirty_dataset = {i} ============================")
  mse_value, rmse_value, ssim_value = compute_metrics_between_two_visuals(dirty_visuals['Correlation'][i], outlier_median_visuals['Correlation'][rank[i]])
  print(f"MSE= {mse_value}, RMSE= {rmse_value}, SSIM= {ssim_value}" + "\n")
  mse71.append(mse_value)
  rmse71.append(rmse_value)
  ssim71.append(ssim_value)

MSE= 540.3646144788452, RMSE= 23.24574400785755, SSIM= (0.9513052837623241, 0.9518831858311644)

MSE= 454.26256595815653, RMSE= 21.313436277572805, SSIM= (0.9544658402421587, 0.9546201176158)

MSE= 599.1855870627572, RMSE= 24.47826764831934, SSIM= (0.9263591345574678, 0.92709423608935)

MSE= 599.1855870627572, RMSE= 24.47826764831934, SSIM= (0.9263591345574678, 0.92709423608935)

MSE= 632.9622453603234, RMSE= 25.15874093352693, SSIM= (0.9222663773913151, 0.9232308020337507)

MSE= 661.3553027243281, RMSE= 25.716829173215118, SSIM= (0.9202720179347603, 0.9212657176764356)

MSE= 428.2485668041088, RMSE= 20.694167458588634, SSIM= (0.9451598791418433, 0.9461585284854941)

MSE= 427.37506128632975, RMSE= 20.673051571703915, SSIM= (0.9466349154342988, 0.9477140203739061)

MSE= 526.145665549447, RMSE= 22.937865322419324, SSIM= (0.9336756331825081, 0.9349726016124407)

MSE= 713.7439226064172, RMSE= 26.71598627425941, SSIM= (0.9139575943954275, 0.9151355714935823)

MSE= 620.0676636244534, RMSE= 2

In [118]:
mse72 = []
rmse72 = []
ssim72 = []

rank = [31, 2, 0, 15, 13, 6, 5, 1, 9, 7, 10, 11, 4, 8, 14, 3, 16, 12, 28, 19,
        17, 18, 30, 20, 25, 26, 21, 22, 23, 24, 27, 31, 29]
for i in range(1,33):
  print(f"============================ image_rank_in_dirty_dataset = {i} ============================")
  mse_value, rmse_value, ssim_value = compute_metrics_between_two_visuals(dirty_visuals['Distribution'][i], outlier_median_visuals['Distribution'][rank[i]])
  print(f"MSE= {mse_value}, RMSE= {rmse_value}, SSIM= {ssim_value}" + "\n")
  mse72.append(mse_value)
  rmse72.append(rmse_value)
  ssim72.append(ssim_value)

MSE= 1370.5773335523565, RMSE= 37.021309182042124, SSIM= (0.9326487779231207, 0.9413434093884345)

MSE= 822.1540524832016, RMSE= 28.673228846490268, SSIM= (0.9464693885706312, 0.9480374286371055)

MSE= 2567.868249461484, RMSE= 50.67413787585817, SSIM= (0.9037222481177606, 0.9250463058712453)

MSE= 3022.5389145588188, RMSE= 54.97762194346731, SSIM= (0.890846641044238, 0.9171852006263717)

MSE= 1551.7381534026974, RMSE= 39.39210775526866, SSIM= (0.9385143154396411, 0.9499310219636211)

MSE= 1514.513434516059, RMSE= 38.91675005079508, SSIM= (0.9272688615118712, 0.9339078749902054)

MSE= 0.0, RMSE= 0.0, SSIM= (1.0, 1.0)

MSE= 2359.7362336837705, RMSE= 48.577116358258344, SSIM= (0.9072872801006745, 0.9256138542654789)

MSE= 2698.4731420195153, RMSE= 51.946829951591035, SSIM= (0.8967781944779204, 0.9180590682297638)

MSE= 4071.917416921859, RMSE= 63.81157745207259, SSIM= (0.8643241884393766, 0.903541475732726)

MSE= 4071.917416921859, RMSE= 63.81157745207259, SSIM= (0.8643241884393766, 0.903

In [119]:
import json

outlier_median_dict1 = {"mse": mse71, "rmse": rmse71, "ssim": ssim71}
file_path = '/content/drive/MyDrive/LuxVisualization/metrics_airbnb/outlier_median_metrics1.json'
with open(file_path, 'w') as file:
    json.dump(outlier_median_dict1, file)

outlier_median_dict2 = {"mse": mse72, "rmse": rmse72, "ssim": ssim72}
file_path = '/content/drive/MyDrive/LuxVisualization/metrics_airbnb/outlier_median_metrics2.json'
with open(file_path, 'w') as file:
    json.dump(outlier_median_dict2, file)

print("Dictionary saved to Google Drive successfully.")

Dictionary saved to Google Drive successfully.


### 8. Mode

In [120]:
outlier_mode_df = replace_outliers_with_mode(dirty_df)
outlier_mode_df

Button(description='Toggle Pandas/Lux', layout=Layout(top='5px', width='140px'), style=ButtonStyle())

Output()

In [121]:
outlier_mode_visuals = outlier_mode_df.recommendation

In [122]:
mse81 = []
rmse81 = []
ssim81 = []

rank = [0, 1, 7, 8, 4, 3, 10, 11, 6, 13, 2, 5, 12, 9, 14]
for i in range(15):
  print(f"============================ image_rank_in_dirty_dataset = {i} ============================")
  mse_value, rmse_value, ssim_value = compute_metrics_between_two_visuals(dirty_visuals['Correlation'][i], outlier_mode_visuals['Correlation'][rank[i]])
  print(f"MSE= {mse_value}, RMSE= {rmse_value}, SSIM= {ssim_value}" + "\n")
  mse81.append(mse_value)
  rmse81.append(rmse_value)
  ssim81.append(ssim_value)

MSE= 540.0437463831018, RMSE= 23.238841330477342, SSIM= (0.9513098542258513, 0.9518893221281177)

MSE= 451.4864042305652, RMSE= 21.248209435869303, SSIM= (0.9544945230434383, 0.9546472120432911)

MSE= 597.0572233474794, RMSE= 24.43475441553443, SSIM= (0.9264236192707143, 0.9271553011488028)

MSE= 597.0572233474794, RMSE= 24.43475441553443, SSIM= (0.9264236192707143, 0.9271553011488028)

MSE= 631.0236007627636, RMSE= 25.12018313553394, SSIM= (0.9223231468464643, 0.9232800266305382)

MSE= 659.4166581267683, RMSE= 25.679109371759143, SSIM= (0.9203287873899093, 0.9213149422732235)

MSE= 425.75325721772117, RMSE= 20.633789211332978, SSIM= (0.9452341756840943, 0.9462210226427643)

MSE= 425.42834246600114, RMSE= 20.62591434254494, SSIM= (0.9466736822019033, 0.9477447790845297)

MSE= 523.2240143450199, RMSE= 22.8740904594045, SSIM= (0.9337228385186497, 0.9350111126012831)

MSE= 712.443632651749, RMSE= 26.691639752022525, SSIM= (0.9139921294849603, 0.9151679370373706)

MSE= 2802.0650411221227, 

In [123]:
mse82 = []
rmse82 = []
ssim82 = []

rank = [31, 2, 0, 12, 16, 6, 3, 1, 11, 7, 8, 9, 5, 10, 15, 4, 14, 13, 27, 19,
        17, 18, 31, 20, 25, 26, 23, 24, 21, 22, 29, 30, 28]
for i in range(1,33):
  print(f"============================ image_rank_in_dirty_dataset = {i} ============================")
  mse_value, rmse_value, ssim_value = compute_metrics_between_two_visuals(dirty_visuals['Distribution'][i], outlier_mode_visuals['Distribution'][rank[i]])
  print(f"MSE= {mse_value}, RMSE= {rmse_value}, SSIM= {ssim_value}" + "\n")
  mse82.append(mse_value)
  rmse82.append(rmse_value)
  ssim82.append(ssim_value)

MSE= 1309.7763480983153, RMSE= 36.190832376422556, SSIM= (0.9347301916210556, 0.9425258640697745)

MSE= 0.0, RMSE= 0.0, SSIM= (1.0, 1.0)

MSE= 2287.7871987927597, RMSE= 47.83081850431539, SSIM= (0.9168707394711213, 0.9358131646569896)

MSE= 3014.7439060289676, RMSE= 54.90668361892719, SSIM= (0.8919426874934737, 0.9183583977883163)

MSE= 1553.025792502572, RMSE= 39.408448237688475, SSIM= (0.9384834662364019, 0.9499160999597247)

MSE= 1190.3209354102366, RMSE= 34.501028034106994, SSIM= (0.9357197435777493, 0.9407745856878095)

MSE= 0.0, RMSE= 0.0, SSIM= (1.0, 1.0)

MSE= 2359.7362336837705, RMSE= 48.577116358258344, SSIM= (0.9072872801006745, 0.9256138542654789)

MSE= 2698.4731420195153, RMSE= 51.946829951591035, SSIM= (0.8967781944779204, 0.9180590682297638)

MSE= 4052.9746643819926, RMSE= 63.66297718754592, SSIM= (0.8639905315381438, 0.9028515244356357)

MSE= 4052.9746643819926, RMSE= 63.66297718754592, SSIM= (0.8639905315381438, 0.9028515244356357)

MSE= 0.0, RMSE= 0.0, SSIM= (1.0, 1.0

In [124]:
import json

outlier_mode_dict1 = {"mse": mse81, "rmse": rmse81, "ssim": ssim81}
file_path = '/content/drive/MyDrive/LuxVisualization/metrics_airbnb/outlier_mode_metrics1.json'
with open(file_path, 'w') as file:
    json.dump(outlier_mode_dict1, file)

outlier_mode_dict2 = {"mse": mse82, "rmse": rmse82, "ssim": ssim82}
file_path = '/content/drive/MyDrive/LuxVisualization/metrics_airbnb/outlier_mode_metrics2.json'
with open(file_path, 'w') as file:
    json.dump(outlier_mode_dict2, file)

print("Dictionary saved to Google Drive successfully.")

Dictionary saved to Google Drive successfully.


### 9. HoloClean

In [125]:
outlier_hc_df = pd.read_csv(DATA_PATH + "Holoclean_outlier_clean.csv")
outlier_hc_df

Button(description='Toggle Pandas/Lux', layout=Layout(top='5px', width='140px'), style=ButtonStyle())

Output()

In [126]:
outlier_hc_visuals = outlier_hc_df.recommendation

In [127]:
mse91 = []
rmse91 = []
ssim91 = []

rank = [0, 1, 8, 3, 2, 6, 7, 5, 4, 9, 13, 10, 11, 12, 14]
for i in range(15):
  print(f"============================ image_rank_in_dirty_dataset = {i} ============================")
  mse_value, rmse_value, ssim_value = compute_metrics_between_two_visuals(dirty_visuals['Correlation'][i], outlier_hc_visuals['Correlation'][rank[i]])
  print(f"MSE= {mse_value}, RMSE= {rmse_value}, SSIM= {ssim_value}" + "\n")
  mse91.append(mse_value)
  rmse91.append(rmse_value)
  ssim91.append(ssim_value)

MSE= 570.4986680270223, RMSE= 23.88511394209838, SSIM= (0.9483543648162842, 0.9489992880576481)

MSE= 515.3083104263118, RMSE= 22.700403309772092, SSIM= (0.9400762702266223, 0.9404125971187292)

MSE= 684.1393540621785, RMSE= 26.156057693432672, SSIM= (0.9136015606790513, 0.9146789875796206)

MSE= 685.5593010545267, RMSE= 26.18318737385742, SSIM= (0.9141709396485139, 0.9152558175839516)

MSE= 680.6610830801504, RMSE= 26.08948223097098, SSIM= (0.910714489107084, 0.9117025583127955)

MSE= 713.4333247432003, RMSE= 26.710172682766398, SSIM= (0.9096173665481484, 0.9106602165253275)

MSE= 435.39502028477045, RMSE= 20.866121352200807, SSIM= (0.9341894984418124, 0.9352109674684662)

MSE= 434.20998932010355, RMSE= 20.837705951474206, SSIM= (0.9370661490434192, 0.9381956307663546)

MSE= 544.4861206557034, RMSE= 23.334226377913268, SSIM= (0.9270314131242602, 0.9285261549329826)

MSE= 733.4732492705922, RMSE= 27.082711261441165, SSIM= (0.9029654599336573, 0.904252427759891)

MSE= 659.9903097270447,

In [128]:
mse92 = []
rmse92 = []
ssim92 = []

rank = [0, 1, 2, 8, 5, 4, 3, 10, 5, 7, 23, 22, 6, 27, 15, 11, 29, 9, 26, 12,
        14, 25, 30, 17, 13, 16, 20, 19, 21, 31, 24, 28, 32]
for i in range(33):
  print(f"============================ image_rank_in_dirty_dataset = {i} ============================")
  mse_value, rmse_value, ssim_value = compute_metrics_between_two_visuals(dirty_visuals['Distribution'][i], outlier_hc_visuals['Distribution'][rank[i]])
  print(f"MSE= {mse_value}, RMSE= {rmse_value}, SSIM= {ssim_value}" + "\n")
  mse92.append(mse_value)
  rmse92.append(rmse_value)
  ssim92.append(ssim_value)

MSE= 742.2349893703382, RMSE= 27.243989967887195, SSIM= (0.946450734573672, 0.9473523852058703)

MSE= 1341.3839809590406, RMSE= 36.62490929625684, SSIM= (0.9327022147967639, 0.9406758398368952)

MSE= 1464.7257333763343, RMSE= 38.27173543721704, SSIM= (0.9303005029293383, 0.9401468522975875)

MSE= 2593.002815654739, RMSE= 50.92153587289703, SSIM= (0.9044112697606689, 0.9264900036500857)

MSE= 3362.561702021846, RMSE= 57.98759955388606, SSIM= (0.8822530914409884, 0.9124426843343432)

MSE= 1857.0924891091179, RMSE= 43.09399597518334, SSIM= (0.9228325757027653, 0.9361139762531969)

MSE= 1380.150325068721, RMSE= 37.15037449432672, SSIM= (0.930277243450078, 0.9375115566061671)

MSE= 2322.67276139122, RMSE= 48.194115422852406, SSIM= (0.9055682028031479, 0.921542593067976)

MSE= 2226.1898246105807, RMSE= 47.182516090291124, SSIM= (0.9066297709510707, 0.922524464480813)

MSE= 2676.6688923148954, RMSE= 51.736533439291186, SSIM= (0.8997770449390163, 0.9215339824647887)

MSE= 4681.697910136156, RM

In [129]:
import json

outlier_hc_dict1 = {"mse": mse91, "rmse": rmse91, "ssim": ssim91}
file_path = '/content/drive/MyDrive/LuxVisualization/metrics_airbnb/outlier_hc_metrics1.json'
with open(file_path, 'w') as file:
    json.dump(outlier_hc_dict1, file)

outlier_hc_dict2 = {"mse": mse92, "rmse": rmse92, "ssim": ssim92}
file_path = '/content/drive/MyDrive/LuxVisualization/metrics_airbnb/outlier_hc_metrics2.json'
with open(file_path, 'w') as file:
    json.dump(outlier_hc_dict2, file)

print("Dictionary saved to Google Drive successfully.")

Dictionary saved to Google Drive successfully.


## Duplicates

Finally, we need to deal with duplicates in the Airbnb dataset. For duplicates that have the same value, we just used `df.drop_duplicates()` to remove duplicates. For duplicates that have alias name, we used the `fuzzywuzzy` library to detect similarities in a particular columns and merge duplicated records.

### 10. Drop Same Value

In [130]:
dirty_df.duplicated().sum()

2212

In [131]:
no_duplicates_df = dirty_df.drop_duplicates()
no_duplicates_df

Button(description='Toggle Pandas/Lux', layout=Layout(top='5px', width='140px'), style=ButtonStyle())

Output()

In [132]:
no_duplicates_visuals = no_duplicates_df.recommendation

In [133]:
no_duplicates_df.duplicated().sum()

0

In [134]:
mse101 = []
rmse101 = []
ssim101 = []

for i in range(15):
  print(f"============================ image_rank_in_dirty_dataset = {i} ============================")
  mse_value, rmse_value, ssim_value = compute_metrics_between_two_visuals(dirty_visuals['Correlation'][i], no_duplicates_visuals['Correlation'][i])
  print(f"MSE= {mse_value}, RMSE= {rmse_value}, SSIM= {ssim_value}" + "\n")
  mse101.append(mse_value)
  rmse101.append(rmse_value)
  ssim101.append(ssim_value)

MSE= 0.08273579161844136, RMSE= 0.28763829998531376, SSIM= (0.9999900148007003, 0.99999575824377)

MSE= 0.29813111858603397, RMSE= 0.5460138446834787, SSIM= (0.9999631249328532, 0.9999774291795496)

MSE= 0.5187661755722737, RMSE= 0.7202542437030647, SSIM= (0.999871255770812, 0.9999038402167234)

MSE= 0.5187661755722737, RMSE= 0.7202542437030647, SSIM= (0.999871255770812, 0.9999038402167234)

MSE= 0.5337825822241512, RMSE= 0.7306042582849838, SSIM= (0.9998588616960834, 0.9998973115787437)

MSE= 0.5337825822241512, RMSE= 0.7306042582849838, SSIM= (0.9998588616960834, 0.9998973115787437)

MSE= 0.6385879830568416, RMSE= 0.7991170021072268, SSIM= (0.9998064852081007, 0.9998519577711007)

MSE= 0.6389436447080762, RMSE= 0.7993395052842541, SSIM= (0.999804442158764, 0.9998489474009123)

MSE= 0.7743608137217078, RMSE= 0.8799777347874819, SSIM= (0.9998150586294086, 0.9998678901518876)

MSE= 0.6984340840406379, RMSE= 0.8357236888114623, SSIM= (0.9997559128383444, 0.9997944832213939)

MSE= 0.85878

In [135]:
mse102 = []
rmse102 = []
ssim102 = []

rank = [0, 2, 1, 3, 4, 5, 7, 6, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
        20, 22, 23, 21, 27, 28, 24, 25, 30, 26, 29, 31, 32]
for i in range(33):
  print(f"============================ image_rank_in_dirty_dataset = {i} ============================")
  mse_value, rmse_value, ssim_value = compute_metrics_between_two_visuals(dirty_visuals['Distribution'][i], no_duplicates_visuals['Distribution'][rank[i]])
  print(f"MSE= {mse_value}, RMSE= {rmse_value}, SSIM= {ssim_value}" + "\n")
  mse102.append(mse_value)
  rmse102.append(rmse_value)
  ssim102.append(ssim_value)

MSE= 469.544971858523, RMSE= 21.66898640588717, SSIM= (0.964198942277625, 0.9642353504531062)

MSE= 460.2592225879308, RMSE= 21.45365289613708, SSIM= (0.9651995175341118, 0.9652285289328626)

MSE= 475.54386996045525, RMSE= 21.806968380782674, SSIM= (0.9638278579975148, 0.9638860544715577)

MSE= 456.34615624196243, RMSE= 21.362260092086757, SSIM= (0.9664808503303762, 0.9665171795286028)

MSE= 456.2529939879115, RMSE= 21.360079447134822, SSIM= (0.9682808866213044, 0.968504328931045)

MSE= 408.28014006438076, RMSE= 20.205943186705756, SSIM= (0.9703463534187781, 0.9705685641693489)

MSE= 438.4403706617316, RMSE= 20.93896775540121, SSIM= (0.9669707648536459, 0.9670263190143402)

MSE= 433.01766025872877, RMSE= 20.809076391294468, SSIM= (0.9708464634384237, 0.971277044164067)

MSE= 429.86762152777777, RMSE= 20.73324917922364, SSIM= (0.9688111089065884, 0.9689596805017671)

MSE= 410.7154239607446, RMSE= 20.266115166966376, SSIM= (0.9696622124369279, 0.9696850542118549)

MSE= 374.9218511385192,

In [136]:
import json

no_duplicates_dict1 = {"mse": mse101, "rmse": rmse101, "ssim": ssim101}
file_path = '/content/drive/MyDrive/LuxVisualization/metrics_airbnb/no_duplicates_metrics1.json'
with open(file_path, 'w') as file:
    json.dump(no_duplicates_dict1, file)

no_duplicates_dict2 = {"mse": mse102, "rmse": rmse102, "ssim": ssim102}
file_path = '/content/drive/MyDrive/LuxVisualization/metrics_airbnb/no_duplicates_metrics2.json'
with open(file_path, 'w') as file:
    json.dump(no_duplicates_dict2, file)

print("Dictionary saved to Google Drive successfully.")

Dictionary saved to Google Drive successfully.


### 11. Merge Alias Name

In [19]:
!pip install fuzzywuzzy
!pip install python-Levenshtein

Collecting python-Levenshtein
  Downloading python_Levenshtein-0.23.0-py3-none-any.whl (9.4 kB)
Collecting Levenshtein==0.23.0 (from python-Levenshtein)
  Downloading Levenshtein-0.23.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (169 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m169.4/169.4 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting rapidfuzz<4.0.0,>=3.1.0 (from Levenshtein==0.23.0->python-Levenshtein)
  Downloading rapidfuzz-3.5.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m21.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rapidfuzz, Levenshtein, python-Levenshtein
Successfully installed Levenshtein-0.23.0 python-Levenshtein-0.23.0 rapidfuzz-3.5.2


In [40]:
process.extract('San Diego', dirty_df['LocationName'].unique(), scorer=fuzz.token_sort_ratio)

[('San Diego', 100),
 ('SAN DIEGO', 100),
 ('San diego', 100),
 ('San Mateo', 67),
 ('Gas lamp San Diego', 67)]

In [140]:
import pandas as pd
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

def repair_duplicates(df, column, threshold=90):
    df_copy = df.copy()

    # Get unique names in the DataFrame
    unique_names = df_copy[column].unique()
    for i in range(len(unique_names)):
        # Find similar names in the DataFrame
        matches = process.extract(unique_names[i], unique_names, scorer=fuzz.token_sort_ratio)

        # Loop through matches and merge duplicates
        for potential_match in matches:
            if potential_match[1] >= threshold and potential_match[0] != unique_names[i]:
                # Replace the duplicate with the original name
                df_copy[column] = df_copy[column].replace(potential_match[0], unique_names[i])

    return df_copy

In [47]:
process.extract('Brooklyn', unique_names, scorer=fuzz.token_sort_ratio)

[('Brooklyn', 100),
 ('brooklyn', 100),
 ('Brookyln', 88),
 ('Brookline', 82),
 ('Brooklyn Heights', 67)]

In [None]:
df_copy = dirty_df
column = 'LocationName'
threshold = 80
unique_names = df_copy[column].unique()
# for name in unique_names:
for i in range(len(unique_names)):
    # Find similar names in the DataFrame
    # matches = process.extract(name, unique_names, scorer=fuzz.token_sort_ratio)
    matches = process.extract(unique_names[i], unique_names, scorer=fuzz.token_sort_ratio)

    # Loop through matches and merge duplicates
    for potential_match in matches:
        if potential_match[1] >= threshold and potential_match[0] != unique_names[i]:
            # Replace the duplicate with the original name
            df_copy[column] = df_copy[column].replace(potential_match[0], unique_names[i])

In [None]:
alias_repaired_df = repair_duplicates(dirty_df, 'LocationName', threshold=80)
alias_repaired_df