In [3]:
import subprocess
import pandas as pd
import chardet
import numpy as np
import matplotlib.pyplot as plt
import os
import glob
import argparse
import hashlib
from datetime import datetime
from configparser import ConfigParser
import sys
import warnings
import re
warnings.filterwarnings('ignore')
from numpy import random
from ydata_profiling import ProfileReport

In [4]:
#!pip install chardet

In [5]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 50)

# **Functions**

In [6]:
def col_eg(df, col):
   return df[(~df[col].isna()) & (df[col]!="nan")][col][1:10]

In [7]:
def convert_to_snake_case(col_name):
    # Remove '__c' from the column names
    col_name = col_name.replace('__c', '')
    
    # Replace periods with underscores
    col_name = col_name.replace('.', '_')
    
    # Convert camel case to snake case
    # Handle the case where a lowercase letter or digit is followed by an uppercase letter
    col_name = re.sub(r'([a-z0-9])([A-Z])', r'\1_\2', col_name)
    
    # Convert the entire string to lowercase
    col_name = col_name.lower()
    
    # Replace multiple underscores with a single underscore
    col_name = re.sub(r'_+', '_', col_name)
    
    return col_name

In [8]:
# Define a function to convert entries to datetime
def convert_to_datetime(entry):
    """
    Convert a given entry into a datetime object using pandas' to_datetime function.

    Args:
        entry (str): The entry to convert to datetime.

    Returns:
        pd.Timestamp: Returns a pandas Timestamp object if conversion is successful.
    """
    
    try:
        # Try to parse as a full date
        return pd.to_datetime(entry)
    except ValueError:
        # Check if it's a four-digit number (likely representing a year)
        if entry.isdigit() and len(entry) == 4:
            return pd.to_datetime(entry, format='%Y')
        else:
            # If it's neither a full date nor a valid year, return NaN or handle as needed
            return pd.NaT

In [9]:
def is_roman_numeral(value):
    """
    Check if the given value is a valid Roman numeral.

    Args:
        value (str): The value to check.

    Returns:
        bool: True if the value is a valid Roman numeral, False otherwise.
    """
    
    # Define a regular expression to match Roman numerals
    roman_pattern = r'^M{0,3}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})$'
    
    # Check if the value matches the Roman numeral pattern
    return bool(re.match(roman_pattern, value))

def has_digit(value):
    return bool(re.search(r'\d', str(value)))

def roman_or_numeral(value):
    if is_roman_numeral(value):
        return True
    elif has_digit(value):
        return True
    else:
        return False

In [10]:
def intersection_out(lst1, lst2):
    """
    Return a list containing elements from lst1 that are not present in lst2.
    """
    lst3 = [value for value in lst1 if value not in lst2]
    return lst3

def intersection_in(lst1, lst2):
    """
    Return a list containing elements from lst1 that are present in lst2 as well.
    """
    lst3 = [value for value in lst1 if value in lst2]
    return lst3

In [11]:
def generalized_contains(cell, included_and=None, included_or=None, excluded_and=None, excluded_or=None):
    """
    Check if a given string `cell` meets specific inclusion and exclusion criteria.

    Args:
        cell (str): The string to check.
        included_and (list, optional): List of keywords; cell must include all of these (case insensitive).
        included_or (list, optional): List of keywords; cell must include at least one of these (case insensitive).
        excluded_and (list, optional): List of keywords; cell must not include any of these (case insensitive).
        excluded_or (list, optional): List of keywords; cell must not include at least one of these (case insensitive).

    Returns:
        bool: True if `cell` meets all specified criteria, False otherwise.
    """
    
    if not isinstance(cell, str):
        return False
    
    cell_lower = cell.lower()
    
    # Check included_and condition
    if included_and:
        if not all(keyword in cell_lower for keyword in included_and):
            return False
    
    # Check included_or condition
    if included_or:
        if not any(keyword in cell_lower for keyword in included_or):
            return False
    
    # Check excluded_and condition
    if excluded_and:
        if not all(keyword not in cell_lower for keyword in excluded_and):
            return False
    
    # Check excluded_or condition
    if excluded_or:
        if any(keyword in cell_lower for keyword in excluded_or):
            return False
    
    return True

### Example usage
cell = "This board member is a past chairperson."
included_and = ['board', 'member']
included_or = ['past', 'former']
excluded_and = None
excluded_or = None
generalized_contains(cell, included_and, included_or, excluded_and, excluded_or)

cell = "This board member is a past chairperson."
included_and = ['board', 'member']
included_or = None
excluded_and = None
excluded_or = ['past', 'former']
generalized_contains(cell, included_and, included_or, excluded_and, excluded_or)

cell = "prospect_manager_2nd"
included_and = None
included_or = ['prospect_manager', 'solicitor']
excluded_and = None
excluded_or = ["2"]
generalized_contains(cell, included_and, included_or, excluded_and, excluded_or)

False

In [12]:
def pick_col(row, col1, col2):
    """
    Return a value from either col1 or col2 of a given row based on priority rules.

    Args:
        row (pd.Series): The row (Series) from which to pick values.
        col1 (str): The name of the first column to consider.
        col2 (str): The name of the second column to consider.

    Returns:
        object: The value from col1 or col2 based on the priority rules:
            - If both values are NaN, returns NaN.
            - If only one value is NaN, returns the non-NaN value.
            - If both values are equal and not NaN, returns that value.
            - Otherwise, returns NaN and prints the conflicting values (for debugging).
    """
    
    x, y = row[col1], row[col2]
    if pd.isna(x) and pd.isna(y):
        return np.nan
    elif pd.isna(x):
        return y
    elif pd.isna(y):
        return x
    elif x == y:
        return x
    else:
        print(x, y)
        return np.nan

In [13]:
def df_merge_source(merged_df, left_source, right_source):
    """
    Rename the '_merge' column in the merged dataframe to a source indicator column,
    and map its values to more descriptive labels based on merge sources.

    Args:
        merged_df (pd.DataFrame): The merged dataframe resulting from a merge operation.
        left_source (str): The label for the left dataframe source.
        right_source (str): The label for the right dataframe source.

    Returns:
        pd.DataFrame: The merged dataframe `merged_df` with the '_merge' column renamed to indicate source,
                     and its values mapped to descriptive labels based on merge sources.
    """
    # Rename the '_merge' column to 'source'
    new_col = "df_source"
    while new_col in merged_df.columns:
        i = 1
        new_col = "df_source_%d" %i
        i += 1
        
    merged_df.rename(columns={'_merge': new_col}, inplace=True)
    
    # Map the source column to more descriptive labels
    source_mapping = {
        'left_only': '%s' %left_source,
        'right_only': '%s' %right_source,
        'both': '%s/%s'%(left_source, right_source)
    }
    merged_df[new_col] = merged_df[new_col].replace(source_mapping)
    return merged_df

In [14]:
def remove_dollar_sign_and_comma(cell):
    """
    Remove dollar signs ('$') and commas (',') from a given string representation of a number.

    Args:
        cell (str or any): The string or value from which to remove dollar signs and commas.

    Returns:
        str or any: The modified string with dollar signs and commas removed, or the original value if not a string.
    """
    if isinstance(cell, str):
        cell = cell.replace("$", "").replace(",", "")
    return cell

In [15]:
def profile_report(df_cd, client_name, file_name_suffix="constituents"):
    profile = ProfileReport(
        df_cd, \
            title="Profiling Synthetic Data", \
            html={"style": {"full_width": True}}, 
    )
    profile.to_file("%s_%s_data_report.html" %(client_name, file_name_suffix))

In [16]:
def save_file(df, file_prefix, version):
    """
    Save a DataFrame to a CSV file with a specified file name format and path.

    Args:
        df (pd.DataFrame): The DataFrame to save.
        file_prefix (str, optional): The prefix for the CSV file name. Default is "constituents_cleaned".
        version (str, optional): The version identifier for the CSV file name. Default is "v1".

    Returns:
        None
    """
    if "constituent_id" in df.columns:
        df = df.dropna(subset="constituent_id")
    file_name = "%s_%s.csv" %(file_prefix, version)
    df.to_csv("%s/%s/ccs_data_preprocessing/%s" %(file_path, client_name, file_name), index=False)

# **Variables**

In [18]:
#file_path = "./"
client_name = "synthetic_data"

In [19]:
save_constit_before_mapping = "constituents_before_mapping"
save_constit_after_mapping = "constituents_preprocessed"

In [20]:
mapper = "data_mapping_synthetic.csv"

# **Constituents Data**

In [26]:
# file_path = "./"
# file_name = "synthetic_constituents_data.csv"

# file = "%s/%s" %(file_path, file_name)
# df_cd = pd.read_csv(file, encoding="ISO-8859-1", low_memory=False, index_col=False)
#df_cd = df_cd.drop(columns=["Unnamed: 0"])

df_cd = civis.io.read_civis("public.synthetic_constituents_data", "CCS", use_pandas=True)
df_cd = df_cd.drop(columns=["column_0"])

In [28]:
#df_cd.head()

In [29]:
df_cd.shape

(248443, 53)

In [134]:
save_columns = False

if save_columns:
    pd.DataFrame(df_cd.columns.to_list(), columns=["Column Names"]).to_csv("input_constituents_columns.csv", index=False)

In [135]:
#profile_report(df_cd, client_name)

# **Formatting**

##### **Remove dollar sign and comma**

In [30]:
df_cd = df_cd.applymap(remove_dollar_sign_and_comma)

giving_columns = [key for key in df_cd.columns if "giving" in key.lower()]
print(giving_columns)
for col in giving_columns:
    df_cd[col] = pd.to_numeric(df_cd[col], errors='ignore')

['lifetime_giving', 'five_year_giving']


In [31]:
df_cd.shape

(248443, 53)

# **Renaming**

# **key_indicator**

# **is_individual**

# **is_deceased**

In [32]:
df_cd["is_deceased"] = df_cd["is_deceased"].astype(bool)
df_cd.is_deceased.value_counts()

is_deceased
False    248443
Name: count, dtype: int64

# **head_of_household and household_id**

**Householding** logic comprises several operations

1. **Function Definition (`get_max_index`)**:
   - **Purpose**: Determines the index of the row within a group that has the maximum value in the "Total Lifetime Giving" column, or defaults to the first row if the column is absent or all values are zero.
   - **Usage**: Applied later within a grouped dataframe to identify the row with the highest lifetime giving.

2. **Data Preparation and Cleaning**:
   - **Filtering Rows (`df_cdi`)**: Creates `df_cdi` by excluding rows where donors are deceased (`df_cd["is_deceased"]`) and not individuals (`df_cd["is_individual"]`), and sets a new column `head_of_household` to `True`.
   - **Drop NaN Rows**: Removes rows where both "address_1" and "address_2" are NaN.
   - **Fill NaN Values**: Fills NaN values in selected columns ("address_1", "address_2", "home_city", "home_state", "zip") with a placeholder value ('missing').
   - **Filter Numeric Addresses**: Filters rows where at least one of "address_1" or "address_2" contains a numeric or Roman numeral value.

3. **Grouping and Sorting**:
   - **Group by Address Details (`grouped`)**: Groups `df_cdi` by "address_1", "address_2", "home_city", "home_state", and "zip".
   - **Identify Rows with Multiple Entries (`df_cdi_multiple`)**: Filters `df_cdi` to include only rows with duplicate combinations of "address_1", "address_2", "home_city", "home_state", and "zip", sorting them for further operations.

4. **Assignment and Merging**:
   - **Identify Maximum Lifetime Giving (`idx_max_lifetime_giving`)**: Applies `get_max_index` function to `grouped` to find the row index with the maximum "Total Lifetime Giving" within each subgroup.
   - **Set Attributes (`head_of_household`, `household_id`)**:
     - Sets `head_of_household` to `False` for all rows in `df_cdi_multiple`.
     - Sets `head_of_household` to `True` for rows identified in `idx_max_lifetime_giving`.
     - Assigns a unique `household_id` to each subgroup in `df_cdi_multiple` based on its group index.
   - **Merge Attributes Back (`df_cdi` into `df_cd`)**: Merges `head_of_household` and `household_id` back into the original `df_cd` based on "constituent_id".
   - 
This process effectively categorizes donors into households (**household_id**), designates a head of household (**head_of_household**), and ensures that these attributes are correctly assigned across the entire dataframe (df_cd).

In [33]:
# Function to determine index of row with max "Total Lifetime Giving" or default to first row
def get_max_index(group):
    if "Total Lifetime Giving" in group.columns:
        if group["Total Lifetime Giving"].max() > 0:
            return group["Total Lifetime Giving"].idxmax()
    # If "Total Lifetime Giving" column is absent or all values are zero, choose the first row
    return group.index[0]

In [34]:
run = True

if run:
    # Select only individual constituents who are not deceased
    df_cdi = df_cd[~(df_cd["is_deceased"]) & (df_cd["is_individual"])]
    df_cdi["head_of_household"] = True
    print(len(df_cd), len(df_cdi))

    # Drop rows where both Addr1 and Addr2 are NaN
    df_cdi = df_cdi.dropna(subset=["address_1", "address_2"], how='all')

    # Fill NaNs with a placeholder value (e.g., 'missing')
    df_cdi = df_cdi.fillna({'address_1': 'missing', 'address_2': 'missing', 'home_city': 'missing', 'home_state': 'missing', 'zip': 'missing'})

    # Filter rows to ensure there's something numeric in at least one of Addr1 or Addr2
    df_cdi = df_cdi[df_cdi.apply(lambda row: roman_or_numeral(row["address_1"]) or roman_or_numeral(row["address_2"]), axis=1)]

    df_cdi_multiple = df_cdi[df_cdi.groupby(["address_1", "address_2", "home_city", "home_state", "zip"])\
                                .transform('size')>1]\
                                .sort_values(by=["address_1", "address_2", "home_city", "home_state", "zip"])

    # Create Groupby Object
    grouped = df_cdi_multiple.groupby(["address_1", "address_2", "home_city", "home_state", "zip"])

    # Identify index of rows with max "Total Lifetime Giving" or default to first row
    idx_max_lifetime_giving = grouped.apply(get_max_index).values

    # Set "head_of_household" to False for all entries in subgroups with multiple entries
    df_cdi.loc[df_cdi_multiple.index, "head_of_household"] = False
    df_cdi_multiple["head_of_household"] = False # not necessary but

    # Set "head_of_household" to True for the rows identified with max "Total Lifetime Giving"
    df_cdi.loc[idx_max_lifetime_giving, "head_of_household"] = True
    df_cdi_multiple.loc[idx_max_lifetime_giving, "head_of_household"] = True # not necessary but

    # Assign unique household_id to each subgroup
    df_cdi_multiple['household_id'] = df_cdi_multiple.groupby(["address_1", "address_2", "home_city", "home_state", "zip"])\
                                                .ngroup() + 1

    # Merge the household_id back into the original df_cdi
    df_cdi = df_cdi.merge(df_cdi_multiple[["constituent_id", "household_id"]], on="constituent_id", how="left")

    # Fill NaN values in household_id with 0 for those not in multiple entries groups
    df_cdi["household_id"] = df_cdi["household_id"].fillna(0).astype(bool)

    # Merge the head_of_household and household_id into df_cd
    df_cd = df_cd.merge(df_cdi[["constituent_id", "head_of_household", "household_id"]], \
                    on="constituent_id", how="left")

248443 248443


# **is_assigned**

LOGIC:

is_assigned is 1 if

solicitor_type = Solicitor, Prospect Manager
<br>solicitor_name != President or Lawrence A. Selzer

assigned_manager is not_null for is_assigned==1

1. **Get column names for solicitor types and names**:
   - **columns_solicitor_type**: retrieve column names containing "solicitor type" in their lowercase versions.
   - **columns_solicitor_name**: retrieve column names containing "solicitor name" in their lowercase versions.

2. **Initialization and Loop**:
   - Initializes a new column `df_cd["is_assigned"]` with a default value of `False`.
   - Loops through pairs of `columns_solicitor_type` and `columns_solicitor_name` using `zip`.

3. **Conditions Application**:
   - **is_assigned_1**: Applies conditions to `solicitor_type_col` checking for inclusion of "solicitor" or "prospect manager" and exclusion of "2".
   - **is_assigned_2**: Applies conditions to `solicitor_name_col` checking for exclusion of "president" or "lawrence a. selzer".

4. **Final Update (Wrong)**:
   - Updates `df_cd["is_assigned"]` based on logical OR of `is_assigned_1` and `is_assigned_2`.
4. **Final Update (Right)**:
   - Updates `df_cd["is_assigned"]` by evaluating each row where `df_cd["is_assigned_1"]` and `df_cd["is_assigned_2"]` are both `True`, using a logical AND operation (`&`). The result is then combined with the existing values in `df_cd["is_assigned"]` using a logical OR operation (`|`).

5. **Cleanup**:
   - Drops intermediate columns `is_assigned_1` and `is_assigned_2` from `df_cd`, handling errors with `errors='ignore'`.



In [35]:
run = False

if run:
    # Get column names for solicitor types and names
    columns_solicitor_type = [key for key in df_cd.columns if "solicitor type" in key.lower()]
    columns_solicitor_name = [key for key in df_cd.columns if "solicitor name" in key.lower()]

    # Initialize a column to store the final result
    df_cd["is_assigned"] = False

    # Loop through each pair of columns using zip
    for solicitor_type_col, solicitor_name_col in zip(columns_solicitor_type, columns_solicitor_name):
        # Apply the conditions for is_assigned_1
        included_and = None
        included_or = ["solicitor", "prospect manager"]
        excluded_and = None
        excluded_or = ["2"]
        
        df_cd["is_assigned_1"] = df_cd[solicitor_type_col].apply(
            lambda cell: generalized_contains(cell, included_and, included_or, excluded_and, excluded_or)
        )

        # Apply the conditions for is_assigned_2
        included_and = None
        included_or = None
        excluded_and = ["president", "lawrence a. selzer"]
        excluded_or = None
        
        df_cd["is_assigned_2"] = df_cd[solicitor_name_col].apply(
            lambda cell: generalized_contains(cell, included_and, included_or, excluded_and, excluded_or)
        )

        # Update the is_assigned_new column
        df_cd["is_assigned"] = df_cd["is_assigned"] | (df_cd["is_assigned_1"] & df_cd["is_assigned_2"])

    # Drop intermediate columns
    df_cd.drop(columns=["is_assigned_1", "is_assigned_2"], inplace=True, errors='ignore')

In [36]:
df_cd["is_assigned"] = df_cd["assigned_manager"].notna().astype(bool)
df_cd["is_assigned"].value_counts()

is_assigned
False    248230
True        213
Name: count, dtype: int64

# **assigned_manager**

1. **Define conditions and choices**:
   - **conditions**: List of conditions evaluating `df_cd["is_assigned"]` as `True` and checking each `Solicitor Name_X` column for non-null values while excluding "president" or "lawrence a. selzer".
   - **choices**: Corresponding `Solicitor Name_X` columns used when conditions are met.

2. **Apply np.select**:
   - Utilizes `np.select` to populate `df_cd["assigned_manager"]` based on the first matching condition, defaulting to `np.nan` if no conditions are satisfied.



# **has_solicit_codes**

# **marital_status**

# **has_spouse**
This step creates a binary field (True of False) indicating whether or not the constituent has a living spouse. The assessment is made based on columns that contain any spouse related data.

In [37]:
### has_spouse
columns_spouse = [key for key in df_cd.columns if "spouse" in key.lower() and "deceased" not in key.lower()]
df_cd["has_spouse"] =  df_cd[columns_spouse].notna().any(axis=1).astype(bool)
#df_cd = df_cd.drop(columns=columns_spouse)

### Print for illustration
df_cd["has_spouse"].value_counts()

has_spouse
False    231740
True      16703
Name: count, dtype: int64

# **Prefix**
The following step converts a specified prefix column to a binary column that indicates whether any of the specified patterns (e.g., "dr", "prof") are present in the prefix values. The original prefix column is dropped after the binary column is created.

In [38]:
df_cd["prefix"] = df_cd["prefix"].astype(str)
df_cd["has_prefix_dr"] = df_cd["prefix"].str.contains(r"(dr|prof)", case=False).astype(bool)
df_cd = df_cd.drop(columns=["prefix"])

### Print for illustration
df_cd["has_prefix_dr"].value_counts()

has_prefix_dr
False    242721
True       5722
Name: count, dtype: int64

# **five_year_giving**

# **planned_gift_commitment**

# **Address**

In [39]:
#[key for key in df_cd.columns if "seasonal" in key.lower()]
df_cd["has_business_address"] = df_cd["business_address"].notna().astype(bool)
df_cd["has_seasonal_address"] = df_cd["seasonal_address"].notna().astype(bool)


In [40]:
df_cd["has_screenable_address"] = df_cd[["address_1", "home_city", "home_state", "zip"]].isna().any(axis=1).astype(bool)
df_cd["has_screenable_address"].value_counts()

has_screenable_address
False    174953
True      73490
Name: count, dtype: int64

# **Phone**

# **Email**

# **Age**

# **DOB**

# **number_of_events_attended**

# **total_notes**

# **Loyalty**

In [41]:
#df_cd["major_donor_model_score"]=random.randint(100, size=(len(df_cd)))
#df_cd["loyalty_model_score"]=random.randint(100, size=(len(df_cd)))

df_cd["loyalty"]    = 0
df_cd["major_donor_model_score"] = 0
df_cd["loyalty_model_score"]    = 0

df_cd["n_years_giving_of_ten"] = np.nan

# **Mapper**

In [42]:
save_columns = False

if save_columns:
    pd.DataFrame(df_cd.columns.to_list(), columns=["Column Names"]).to_csv("input_constituents_columns.csv", index=False)

In [43]:
mapping = pd.read_csv("%s" %mapper)
column_mapping = {row["file_columns"]: row["expected_columns"] \
                  for index, row in mapping.iterrows() if row["file_columns"]!="not_found"}
df_final = df_cd.rename(columns=column_mapping)
df_final = df_final[list(column_mapping.values())]

# Check duplicate columns
duplicate_columns = df_final.columns[df_final.columns.duplicated()]
if len(duplicate_columns)>0:
    print("Duplicate columns found: ", duplicate_columns)
    df_final = df_final.loc[:,~df_final.columns.duplicated()]

In [44]:
A= df_cd.columns.to_list()
B= df_final.columns.to_list()
intersection_out(A, B)

['constituent_type_1',
 'spouse_id',
 'business_address',
 'seasonal_address',
 'wealth_screen_data',
 'current_trustee',
 'history_of_volunteer',
 'number_of_special_events_attended']

In [45]:
#df_cd.number_of_special_events_attended.value_counts()

# **Save the final preprocessed-file**

In [46]:
save = True
if save:
    df_final.to_csv("./synthetic_constituents_preprocessed_v1.csv")

In [47]:
!ls

README.md			    preprocessing_template.Rmd
data_mapping_synthetic.csv	    requirements.txt
preprocessing_synthetic_data.html   synthetic_constituents_preprocessed_v1.csv
preprocessing_synthetic_data.ipynb  variations


In [48]:
df_dfg = civis.io.read_civis("public.synthetic_gifts_data", "CCS", use_pandas=True)

In [50]:
df_dfg.head()
df_dfg = df_dfg.drop(columns="column_0")

In [51]:
save = True
if save:
    df_dfg.to_csv("./synthetic_gifts_preprocessed_v1.csv")

In [52]:
!ls

README.md			    requirements.txt
data_mapping_synthetic.csv	    synthetic_constituents_preprocessed_v1.csv
preprocessing_synthetic_data.html   synthetic_gifts_preprocessed_v1.csv
preprocessing_synthetic_data.ipynb  variations
preprocessing_template.Rmd


In [56]:
civis_fut_dfg = civis.io.dataframe_to_civis(df_dfg, "CCS", \
                                            "public.synthetic_gifts_preprocessed_v1", \
                                            existing_table_rows="drop")

In [58]:
civis_fut_df_cd = civis.io.dataframe_to_civis(df_cd, "CCS", \
                                              "public.synthetic_constituents_preprocessed_v1",\
                                             existing_table_rows="drop")

In [57]:
civis_fut_dfg.result()

Response(id=660521099,
         state='succeeded',
         created_at='2024-08-06T17:53:37.000Z',
         started_at='2024-08-06T17:53:38.000Z',
         finished_at='2024-08-06T17:53:40.000Z',
         error=None)

In [59]:
civis_fut_df_cd.result()

Response(id=660521282,
         state='succeeded',
         created_at='2024-08-06T17:54:50.000Z',
         started_at='2024-08-06T17:54:50.000Z',
         finished_at='2024-08-06T17:55:31.000Z',
         error=None)

In [60]:
!ls

README.md			    requirements.txt
data_mapping_synthetic.csv	    synthetic_constituents_preprocessed_v1.csv
preprocessing_synthetic_data.html   synthetic_gifts_preprocessed_v1.csv
preprocessing_synthetic_data.ipynb  variations
preprocessing_template.Rmd
