In [None]:
import warnings
# Suppress all warnings for the remainder of the session.
warnings.filterwarnings("ignore")

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

def encode_cat_cols(df):
  """Build a label -> integer code mapping for every string-valued column.

  A column is treated as categorical when its first non-NaN value is a
  ``str``. For such a column a ``LabelEncoder`` is fitted on all of its
  values and the position of each entry in ``classes_`` becomes its code.

  Args:
    df: DataFrame to inspect. Its index may be arbitrary; values are scanned
      by position, not by index label.

  Returns:
    dict: ``{column_name: {label: code}}`` for the string columns only.
    Columns that are numeric or entirely NaN are not included.
  """
  label_dict = {}
  for col in df.columns:
    # Scan positionally so a non-default index (e.g. after filtering rows)
    # cannot raise KeyError. NaN is the only value that differs from itself.
    sample = next((v for v in df[col].to_numpy() if not (v != v)), np.nan)
    if not isinstance(sample, str):
      continue

    le = LabelEncoder()
    le.fit(list(df[col]))
    label_dict[col] = {label: code for code, label in enumerate(le.classes_)}

  return label_dict

In [None]:
def load_data(filename):
  """Read the train/test/GReaT/PAFT CSVs for one dataset.

  Args:
    filename: Dataset name without extension; the same name is looked up in
      each of the four ``./data_to_eval/<split>/`` folders.

  Returns:
    tuple: ``(train, test, generated_great, generated_paft, label_dictionary)``
    where ``label_dictionary`` is ``encode_cat_cols(train)``.
  """
  csv_paths = {
    'train': f'./data_to_eval/train/{filename}.csv',
    'test': f'./data_to_eval/test/{filename}.csv',
    'great': f'./data_to_eval/great/{filename}.csv',
    'paft': f'./data_to_eval/paft/{filename}.csv',
  }

  # The categorical encoding is derived from the training split only.
  train = pd.read_csv(csv_paths['train'])
  label_dictionary = encode_cat_cols(train)

  test = pd.read_csv(csv_paths['test'])
  generated_great = pd.read_csv(csv_paths['great'])
  generated_paft = pd.read_csv(csv_paths['paft'])

  return train, test, generated_great, generated_paft, label_dictionary

## Evaluation - ML Efficiency

In [None]:
# train with generated data and test with ground truth
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.neural_network import MLPClassifier, MLPRegressor
from sklearn.metrics import accuracy_score, r2_score, mean_squared_error, mean_absolute_percentage_error

def _fit_and_report(name, model, X_train, y_train, X_test, y_test, is_classifier):
  """Fit ``model`` and print its train/test scores plus task-specific test metrics."""
  model.fit(X_train, y_train)
  print(f"  -> {name}")
  y_pred = model.predict(X_test)
  print('       Training Score: ', model.score(X_train, y_train))
  print('       Testing Score: ', model.score(X_test, y_test))
  if is_classifier:
    print('       Accuracy:', accuracy_score(y_test, y_pred))
  else:
    # Every regressor uses the same heading so the reports can be compared line by line.
    print('       Accuracy (r2 score):', r2_score(y_test, y_pred))
    print('       Mean Squared Error:', mean_squared_error(y_test, y_pred))
    print('       Mean Absolute Percentage Error:', mean_absolute_percentage_error(y_test, y_pred))


def MLE(train_df, test_df, label_col, label_col_discrete):
  """Machine-learning efficiency: train on ``train_df``, evaluate on ``test_df``.

  Three models are fitted in turn (random forest, logistic/linear regression,
  and an MLP) and their scores are printed. Nothing is returned.

  Args:
    train_df: Frame used for fitting (real or generated data).
    test_df: Frame used for evaluation; must contain the same columns.
    label_col: Name of the target column in both frames.
    label_col_discrete: Truthy for a classification target, falsy for a
      regression target.
  """
  X_train = train_df.drop(columns=[label_col])
  y_train = train_df[label_col]
  X_test = test_df.drop(columns=[label_col])
  y_test = test_df[label_col]

  if label_col_discrete:
    models = [
      ("Random Forest", RandomForestClassifier()),
      ("Logistic Regression", LogisticRegression()),
      ("Neural Network", MLPClassifier(solver='adam', hidden_layer_sizes=(150, 100, 50), max_iter=300, activation='relu')),
    ]
  else:
    models = [
      ("Random Forest", RandomForestRegressor()),
      ("Linear Regression", LinearRegression()),
      ("Neural Network", MLPRegressor(solver='adam', hidden_layer_sizes=(150, 100, 50), max_iter=300, activation='relu')),
    ]

  for name, model in models:
    _fit_and_report(name, model, X_train, y_train, X_test, y_test, bool(label_col_discrete))

## Evaluation - Discriminator

In [None]:
# train with ground truth + random data (as different as possible), then test on generated data to see whether it's classified as real or fake
from sklearn.utils import shuffle

def Discriminator(real_data_train, real_data_test, generated_data):
  """Train real-vs-random classifiers and print how they judge generated rows.

  Training set: the real training rows (label 1) plus an equally sized
  "random" set (label 0) built by resampling every column independently
  from the real training data.
  Test set: the real test rows (label 1) plus an equally sized sample of
  ``generated_data`` (label 0).

  For each of three classifiers the printed value is ``1 - accuracy`` on that
  test set.
  NOTE(review): the printed hint says 1 means "close to real-world"; confirm
  that reading, since real test rows classified as real still count as
  correct and pull the value down.

  Args:
    real_data_train: Real training frame.
    real_data_test: Real held-out frame with the same columns.
    generated_data: Synthetic frame with the same columns.

  Raises:
    ValueError: If ``generated_data`` has no rows.
  """
  if len(generated_data) == 0:
    raise ValueError("generated_data is empty; nothing to discriminate.")

  real_test = real_data_test.copy()
  real_test['truth'] = 1
  generated_test = generated_data.copy()
  generated_test['truth'] = 0

  # Keep the two halves the same size. When fewer generated rows exist than
  # real test rows, down-sample the real side instead of failing in sample().
  n_test = min(len(real_test), len(generated_test))
  generated_test = generated_test.sample(n_test)
  if n_test < len(real_test):
    real_test = real_test.sample(n_test)

  X_test = shuffle(pd.concat([generated_test, real_test])).reset_index(drop=True)
  y_test = X_test['truth']
  X_test = X_test.drop(['truth'], axis=1)

  real_train = real_data_train.copy()
  real_train['truth'] = 1
  # Each column is drawn independently (with replacement) from the real data.
  random_train = pd.DataFrame({
    col: np.random.choice(real_data_train[col], len(real_data_train))
    for col in real_data_train.columns
  })
  random_train['truth'] = 0

  X_train = shuffle(pd.concat([real_train, random_train])).reset_index(drop=True)
  y_train = X_train['truth']
  X_train = X_train.drop(['truth'], axis=1)

  models = (
    ("Random Forest", RandomForestClassifier(),
     '       Accuracy (1 means generated data is close to real-world):'),
    ("Logistic Regression", LogisticRegression(),
     '       Accuracy:'),
    ("Neural Network",
     MLPClassifier(solver='adam', hidden_layer_sizes=(150, 100, 50), max_iter=300, activation='relu'),
     '       Accuracy:'),
  )
  for name, model, metric_label in models:
    model.fit(X_train, y_train)
    print(f"  -> {name}")
    y_pred = model.predict(X_test)
    print(metric_label, 1-accuracy_score(y_test, y_pred))


## Applying Datasets

### Adult

In [None]:
# Category coverage: for each categorical column, count the train labels that a
# dataset never uses ("missed") and the values it uses that train never had ("added").
print(f"\n-------------------- Adult Dataset ---------------------")
train, test, generated_great, generated_paft, label_dictionary = load_data('adult')

coverage_targets = (
  ("Test dataset", test),
  ("\n\nGreat dataset", generated_great),
  ("\n\npaft dataset", generated_paft),
)
for heading, frame in coverage_targets:
  print(heading)
  missed_total = added_total = 0
  for col, mapping in label_dictionary.items():
    known = set(mapping)
    seen = set(frame[col])
    n_missed = len(known - seen)
    n_added = len(seen - known)
    if n_missed or n_added:
      print(f"In {col} -> missed: {n_missed} values, added: {n_added} values.")
    missed_total += n_missed
    added_total += n_added
  if missed_total or added_total:
    print(f"=> Total missed: {missed_total} values, added: {added_total} values.")

# Swap the string labels for their integer codes in all four frames.
for frame in (train, test, generated_great, generated_paft):
  frame.replace(label_dictionary, inplace=True)

In [None]:
def _complete_rows(frame):
  """Drop rows containing NaN and renumber the index from 0."""
  return frame.dropna().reset_index(drop=True)

train = _complete_rows(train) # train has nan values in "workclass", "occupation", and "native-country"
test = _complete_rows(test) # test has nan values in "workclass", "occupation", and "native-country"
# generated_great = generated_great.replace({ # generated_great had values not in real-data
#   'workclass': {'Self-emp-not': 5, 'Self-emp-not-': 5,'Self-emp-': 4, 'Self-': 4, 'Local-': 1, 'Self-emp': 4},
#   'marital-status': {'Married-civ-sp': 2, 'Married-civ-': 2, 'Married-civ': 2, 'Divor': 0, 'Married-': 1, 'Never': 4, 'Married-c': 2, 'Separ': 5, 'Div': 0, 'Never-': 4, 'Married-spouse-': 3, 'Married-spspouse-absent': 3, 'Not-in-family': np.nan},
#   'education': {'Some-': 15, '10': 0, 'B': 9, 'HS': 11, 'HS-': 11, 'Doctor': 10, 'Prof-': 15, 'Assoc-ac': 7, 'Bachel': 9, '7688': np.nan, '7th-': 5, 'Assoc': 7, 'Prof': 14, '9': 6, '7': 5, 'Some': 15, '1st-': 3},
#   'occupation': {'Machine-op-ins': 6, 'Prof-special': 9, 'Machine-op-insp': 6, 'Machine-op-': 6, 'Exec-manager': 3, 'Farming-': 4, 'Craft': 2, 'Adm-cler': 0, 'Machine-op': 6, 'Exec': 3, 'Other-': 7, 'Farming-f': 4, 'Handlers-': 5, 'Transport-': 6, 'Machine': 6, 'Adm-': 0, 'Exec-': 3, 'Adm': 0, 'Craft-': 2, 'Farming': 4, 'Ad': 0, 'Prof-': 9, 'Prof': 9},
#   'relationship': {'Hus': 0, 'Own': 3, 'Not-in-': 1, 'Not-in': 1, 'Own-': 3},
#   'race': {'Amer-Indian-E': 0, 'Asian': 1, 'Asian-Pac-Is': 1, 'Asian-Pac-': 1, 'Asian-': 1, 'Asian-Pac': 1, 'Jamaica': 3},
#   'income': {'>': 1, '<=': 0, '<=50': 0, '>50': 1}
# })
generated_great = _complete_rows(generated_great) # generated_great has nan values in 'workclass', 'occupation', 'education', 'marital-status'
generated_paft = generated_paft[list(train.columns)] # re-order generated_paft's columns
# generated_paft = generated_paft.replace({
#   'workclass': {'State-service': np.nan},
#   'education': {'6th-8th': np.nan},
#   'marital-status': {'Private': np.nan},
#   'occupation': {'Own-child': np.nan, 'Othermaiden': np.nan, 'Iran': np.nan, 'Prof-specialty deals': 9}
# })
generated_paft = _complete_rows(generated_paft) # generated_paft has nan values in 'workclass', 'occupation', 'native-country', 'marital-status'

adult_sources = (
  (" - Real data:", train),
  (" - Great method:", generated_great),
  (" - paft method:", generated_paft),
)

print("\nMachine Learning Effienciency:")
for caption, frame in adult_sources:
  print(caption)
  MLE(frame, test, 'income', label_col_discrete=True)

print("\nDiscriminator")

# print(" - Real data:")
# Discriminator(train, test, test)

# Only the two generated sets are passed to the discriminator.
for caption, frame in adult_sources[1:]:
  print(caption)
  Discriminator(train, test, frame)

In [None]:
# Knowledge check (education <-> education-num): a row is wrong when its
# education-num differs from the one the training data pairs with its education.
education_dict = dict(zip(train['education'], train['education-num']))

checked_frames = (
  ("test", test),
  ("generated_great", generated_great),
  ("generated_paft", generated_paft),
)
for label, frame in checked_frames:
  wrong = sum(
    1
    for education, education_num in zip(frame['education'], frame['education-num'])
    if education_dict[education] != education_num
  )
  print(f"{wrong/len(frame['education'])*100} error (%) in {label}.")

### California Housing

In [None]:
# Category coverage: for each categorical column, count the train labels that a
# dataset never uses ("missed") and the values it uses that train never had ("added").
# The banner names the dataset that is actually loaded below.
print(f"\n-------------------- California Housing Dataset ---------------------")
train, test, generated_great, generated_paft, label_dictionary = load_data('california_housing')

coverage_targets = (
  ("Test dataset", test),
  ("\n\nGreat dataset", generated_great),
  ("\n\npaft dataset", generated_paft),
)
for heading, frame in coverage_targets:
  print(heading)
  missed_total = added_total = 0
  for col, mapping in label_dictionary.items():
    known = set(mapping)
    seen = set(frame[col])
    n_missed = len(known - seen)
    n_added = len(seen - known)
    if n_missed or n_added:
      print(f"In {col} -> missed: {n_missed} values, added: {n_added} values.")
    missed_total += n_missed
    added_total += n_added
  if missed_total or added_total:
    print(f"=> Total missed: {missed_total} values, added: {added_total} values.")

# Swap the string labels for their integer codes in all four frames.
for frame in (train, test, generated_great, generated_paft):
  frame.replace(label_dictionary, inplace=True)

In [None]:
print(f"\n-------------------- California Housing Dataset ---------------------")

# Remove incomplete rows and renumber; generated_paft is only re-ordered here.
train = train.dropna().reset_index(drop=True) # train has nan values in "total_bedrooms"
test = test.dropna().reset_index(drop=True) # test has nan values in "total_bedrooms"
generated_great = generated_great.dropna().reset_index(drop=True) # generated_great has nan values in "total_bedrooms"
generated_paft = generated_paft[list(train.columns)] # re-order generated_paft's columns
# generated_paft['ocean_proximity'] = generated_paft['ocean_proximity'].replace({'NEAR OCEAN OCEAN': 4, 'NEAR BAY BAY': 3, 'is <1H OCEAN': 0}) # generated_paft had values not in real-data

housing_sources = (
  (" - Real data:", train),
  (" - Great method:", generated_great),
  (" - paft method:", generated_paft),
)

print("\nMachine Learning Effienciency:")
for caption, frame in housing_sources:
  print(caption)
  MLE(frame, test, 'median_house_value', label_col_discrete=False)

print("\nDiscriminator")
# print(" - Real data:")
# Discriminator(train, test, test)

# Only the two generated sets are passed to the discriminator.
for caption, frame in housing_sources[1:]:
  print(caption)
  Discriminator(train, test, frame)

In [None]:
# Knowledge check (longitude, latitude <-> inside California?): overlay the
# real and generated house locations on the California outline.
from shapely.geometry import Point
import geopandas as gpd
from geopandas import GeoDataFrame

def _points_in_plot_window(frame):
  """Keep rows with longitude in [-130, -110] and latitude in [30, 45]; attach point geometry."""
  in_window = frame['longitude'].between(-130, -110) & frame['latitude'].between(30, 45)
  windowed = frame[in_window].copy()
  points = [Point(xy) for xy in zip(windowed['longitude'], windowed['latitude'])]
  return GeoDataFrame(windowed, geometry=points)

# State shapes, reprojected to EPSG:4326 so they share coordinates with the data.
world = gpd.read_file("./US_map/cb_2018_us_state_20m.shp")
world = world.to_crs("EPSG:4326")

ax = world.loc[world['STUSPS'] == 'CA'].plot(figsize=(15, 15), alpha=0.2)

# Layers are drawn in the same order as the legend entries below.
point_layers = (
  (test, {'marker': 'o', 'color': 'yellow', 'markersize': 1}),
  (generated_great, {'marker': '^', 'color': 'blue', 'markersize': 25}),
  (generated_paft, {'marker': 'x', 'color': 'red', 'markersize': 25}),
)
for frame, marker_style in point_layers:
  _points_in_plot_window(frame).plot(ax=ax, legend=True, **marker_style)

ax.legend(['Original', 'Generated - GReaT', 'Generated - PAFT'], loc='upper right')

ax.set_title("Houses in California")

### US Geo-location

In [None]:
# Category coverage: for each categorical column, count the train labels that a
# dataset never uses ("missed") and the values it uses that train never had ("added").
print(f"\n-------------------- Geo Location Dataset ---------------------")
train, test, generated_great, generated_paft, label_dictionary = load_data('geo_bird_latlevel')

coverage_targets = (
  ("Test dataset", test),
  ("\n\nGreat dataset", generated_great),
  ("\n\npaft dataset", generated_paft),
)
for heading, frame in coverage_targets:
  print(heading)
  missed_total = added_total = 0
  for col, mapping in label_dictionary.items():
    known = set(mapping)
    seen = set(frame[col])
    n_missed = len(known - seen)
    n_added = len(seen - known)
    if n_missed or n_added:
      print(f"In {col} -> missed: {n_missed} values, added: {n_added} values.")
    missed_total += n_missed
    added_total += n_added
  if missed_total or added_total:
    print(f"=> Total missed: {missed_total} values, added: {added_total} values.")

# Swap the string labels for their integer codes in all four frames.
for frame in (train, test, generated_great, generated_paft):
  frame.replace(label_dictionary, inplace=True)

In [None]:
print(f"\n-------------------- Geo Bird Dataset ---------------------")
generated_paft = generated_paft[list(train.columns)] # re-order generated_paft's columns

geo_sources = (
  (" - Real data:", train),
  (" - Great method:", generated_great),
  (" - paft method:", generated_paft),
)

print("\nMachine Learning Effienciency:")
for caption, frame in geo_sources:
  print(caption)
  MLE(frame, test, 'bird', label_col_discrete=True)

print("\nDiscriminator")
# print(" - Real data:")
# Discriminator(train, test, test)

# Only the two generated sets are passed to the discriminator.
for caption, frame in geo_sources[1:]:
  print(caption)
  Discriminator(train, test, frame)

In [None]:
# Knowledge check (state_code <-> bird): a row is wrong when its bird differs
# from the bird the training data pairs with its state_code.
# The data is reloaded so the comparison runs on the raw, un-encoded values.
train, test, generated_great, generated_paft, label_dictionary = load_data('geo_bird_latlevel')

bird_dicts = dict(zip(train['state_code'], train['bird']))

# Stand-in for a state_code that never occurs in train: it equals no bird, so
# such rows are counted as errors instead of raising KeyError.
_UNSEEN_STATE = object()

checked_frames = (
  ("test", test),
  ("generated_great", generated_great),
  ("generated_paft", generated_paft),
)
for label, frame in checked_frames:
  wrong = sum(
    1
    for state_code, bird in zip(frame['state_code'], frame['bird'])
    if bird_dicts.get(state_code, _UNSEEN_STATE) != bird
  )
  # Each line names the dataset that was actually checked.
  print(f"{wrong/len(frame['state_code'])*100} error (%) in {label}.")

### Beijing

In [None]:
# Category coverage: for each categorical column, count the train labels that a
# dataset never uses ("missed") and the values it uses that train never had ("added").
print(f"\n-------------------- Bejing Dataset ---------------------")
train, test, generated_great, generated_paft, label_dictionary = load_data('bejing')

coverage_targets = (
  ("Test dataset", test),
  ("\n\nGreat dataset", generated_great),
  ("\n\npaft dataset", generated_paft),
)
for heading, frame in coverage_targets:
  print(heading)
  missed_total = added_total = 0
  for col, mapping in label_dictionary.items():
    known = set(mapping)
    seen = set(frame[col])
    n_missed = len(known - seen)
    n_added = len(seen - known)
    if n_missed or n_added:
      print(f"In {col} -> missed: {n_missed} values, added: {n_added} values.")
    missed_total += n_missed
    added_total += n_added
  if missed_total or added_total:
    print(f"=> Total missed: {missed_total} values, added: {added_total} values.")

# Swap the string labels for their integer codes in all four frames.
for frame in (train, test, generated_great, generated_paft):
  frame.replace(label_dictionary, inplace=True)

In [None]:
print(f"\n-------------------- Bejing Dataset ---------------------")

# Missing pm2.5 readings are filled with -1. The column is assigned back on the
# frame because an in-place fillna on `frame['pm2.5']` acts on a temporary
# Series and does not update the frame under pandas Copy-on-Write.
for frame in (train, test, generated_great):
  frame['pm2.5'] = frame['pm2.5'].fillna(-1)
# generated_paft = generated_paft.replace({'cbwd': {'cv.8': 3}})

generated_paft = generated_paft[list(train.columns)] # re-order generated_paft's columns

bejing_sources = (
  (" - Real data:", train),
  (" - Great method:", generated_great),
  (" - paft method:", generated_paft),
)

print("\nMachine Learning Effienciency:")
for caption, frame in bejing_sources:
  print(caption)
  MLE(frame, test, 'pm2.5', label_col_discrete=False)

print("\nDiscriminator")
# print(" - Real data:")
# Discriminator(train, test, test)

# Only the two generated sets are passed to the discriminator.
for caption, frame in bejing_sources[1:]:
  print(caption)
  Discriminator(train, test, frame)

### Seattle Housing

In [None]:
# Category coverage: for each categorical column, count the train labels that a
# dataset never uses ("missed") and the values it uses that train never had ("added").
print(f"\n-------------------- Seattle Housing Dataset ---------------------")
train, test, generated_great, generated_paft, label_dictionary = load_data('seattle_housing')

# Third field: skip categorical columns the frame does not contain (paft only).
coverage_targets = (
  ("Test dataset", test, False),
  ("\n\nGreat dataset", generated_great, False),
  ("\n\npaft dataset", generated_paft, True),
)
for heading, frame, skip_absent_columns in coverage_targets:
  print(heading)
  missed_total = added_total = 0
  for col, mapping in label_dictionary.items():
    if skip_absent_columns and col not in frame:
      continue
    known = set(mapping)
    seen = set(frame[col])
    n_missed = len(known - seen)
    n_added = len(seen - known)
    if n_missed or n_added:
      print(f"In {col} -> missed: {n_missed} values, added: {n_added} values.")
    missed_total += n_missed
    added_total += n_added
  if missed_total or added_total:
    print(f"=> Total missed: {missed_total} values, added: {added_total} values.")

# Swap the string labels for their integer codes in all four frames.
for frame in (train, test, generated_great, generated_paft):
  frame.replace(label_dictionary, inplace=True)

In [None]:
print(f"\n-------------------- Seattle Housing Dataset ---------------------")

# The lot-size columns are removed from train, test and the GReaT output;
# generated_paft is then aligned to train's remaining columns.
for frame in (train, test, generated_great):
  frame.drop(columns=['lot_size', 'lot_size_units'], inplace=True)
# generated_great = generated_great.replace({'size_units': {'779.0': 0}})
generated_paft = generated_paft[list(train.columns)] # re-order generated_paft's columns

seattle_sources = (
  (" - Real data:", train),
  (" - Great method:", generated_great),
  (" - paft method:", generated_paft),
)

print("\nMachine Learning Effienciency:")
for caption, frame in seattle_sources:
  print(caption)
  MLE(frame, test, 'price', label_col_discrete=False)

print("\nDiscriminator")
# print(" - Real data:")
# Discriminator(train, test, test)

# Only the two generated sets are passed to the discriminator.
for caption, frame in seattle_sources[1:]:
  print(caption)
  Discriminator(train, test, frame)

In [None]:
# Knowledge check (zip_code): a row is wrong when its zip_code is not one of
# the integers 98033..98199.
# NOTE(review): presumably the zip-code span of the Seattle area - confirm.

zip_code = range(98033, 98199+1)

checked_frames = (
  ("test", test),
  ("great", generated_great),
  ("paft", generated_paft),
)
for label, frame in checked_frames:
  wrong = sum(1 for value in frame['zip_code'] if value not in zip_code)
  print(f"{wrong/len(frame['zip_code'])*100} error (%) in {label}.")

### Travel

In [None]:
# Category coverage: for each categorical column, count the train labels that a
# dataset never uses ("missed") and the values it uses that train never had ("added").
print(f"\n-------------------- Travel Dataset ---------------------")
train, test, generated_great, generated_paft, label_dictionary = load_data('travel')

# Third field: skip categorical columns the frame does not contain (paft only).
coverage_targets = (
  ("Test dataset", test, False),
  ("\n\nGreat dataset", generated_great, False),
  ("\n\npaft dataset", generated_paft, True),
)
for heading, frame, skip_absent_columns in coverage_targets:
  print(heading)
  missed_total = added_total = 0
  for col, mapping in label_dictionary.items():
    if skip_absent_columns and col not in frame:
      continue
    known = set(mapping)
    seen = set(frame[col])
    n_missed = len(known - seen)
    n_added = len(seen - known)
    if n_missed or n_added:
      print(f"In {col} -> missed: {n_missed} values, added: {n_added} values.")
    missed_total += n_missed
    added_total += n_added
  if missed_total or added_total:
    print(f"=> Total missed: {missed_total} values, added: {added_total} values.")

# Swap the string labels for their integer codes in all four frames.
for frame in (train, test, generated_great, generated_paft):
  frame.replace(label_dictionary, inplace=True)

In [None]:
print(f"\n-------------------- Travel Dataset ---------------------")

# FrequentFlyer is removed from train, test and the GReaT output;
# generated_paft is then aligned to train's remaining columns.
for frame in (train, test, generated_great):
  frame.drop(columns=['FrequentFlyer'], inplace=True)
generated_paft = generated_paft[list(train.columns)] # re-order generated_paft's columns

travel_sources = (
  (" - Real data:", train),
  (" - Great method:", generated_great),
  (" - paft method:", generated_paft),
)

print("\nMachine Learning Effienciency:")
for caption, frame in travel_sources:
  print(caption)
  MLE(frame, test, 'Target', label_col_discrete=True)

print("\nDiscriminator")
# print(" - Real data:")
# Discriminator(train, test, test)

# Only the two generated sets are passed to the discriminator.
for caption, frame in travel_sources[1:]:
  print(caption)
  Discriminator(train, test, frame)