In [None]:
import subprocess
import pandas as pd
import chardet
import numpy as np
import matplotlib.pyplot as plt
import os
import glob
import argparse
import hashlib
from datetime import datetime
from configparser import ConfigParser
import sys
import warnings
import re
warnings.filterwarnings('ignore')
from numpy import random

In [None]:
# Notebook-wide display settings: show every column, cap printed rows at 50.
pd.options.display.max_columns = None
pd.options.display.max_rows = 50

# **Functions**

In [None]:
def col_eg(df, col):
    """Return a handful of example values from ``col``.

    Rows where the column is missing or holds the literal string "nan" are
    dropped first; the slice ``[1:10]`` is then taken from what remains.
    """
    column_values = df[col]
    is_usable = column_values.notna() & (column_values != "nan")
    return df[is_usable][col][1:10]

In [None]:
def convert_to_snake_case(col_name):
    """Normalise a column name to lower snake_case.

    Steps, in order: remove every '__c' occurrence, turn periods into
    underscores, insert an underscore wherever a lowercase letter or digit is
    directly followed by an uppercase letter, lowercase everything, and
    collapse runs of underscores into a single one.
    """
    without_suffix = col_name.replace('__c', '').replace('.', '_')
    # camelCase boundary, e.g. "firstName" -> "first_Name"
    with_boundaries = re.sub(r'([a-z0-9])([A-Z])', r'\1_\2', without_suffix)
    return re.sub(r'_+', '_', with_boundaries.lower())

In [None]:
# Define a function to convert entries to datetime
def convert_to_datetime(entry):
    """
    Convert a given entry into a datetime using pandas' to_datetime function.

    Args:
        entry (str or any): The entry to convert to datetime.

    Returns:
        pd.Timestamp: The parsed value when pandas can interpret ``entry``.
        If full parsing fails and ``entry`` is a four-digit string, it is
        parsed as a year. Anything else that cannot be parsed yields
        ``pd.NaT`` instead of raising.
    """
    try:
        # Try to parse as a full date
        return pd.to_datetime(entry)
    except (ValueError, TypeError):
        pass

    # Only strings can be inspected for the "bare year" fallback; calling
    # .isdigit() on other types would raise AttributeError.
    if isinstance(entry, str):
        candidate = entry.strip()
        if len(candidate) == 4 and candidate.isdigit():
            try:
                return pd.to_datetime(candidate, format='%Y')
            except (ValueError, TypeError):
                # e.g. a year outside the range pandas can represent
                return pd.NaT

    # Neither a full date nor a valid year
    return pd.NaT

In [None]:
def is_roman_numeral(value):
    """
    Check if the given value is a valid (uppercase) Roman numeral.

    Args:
        value (str or any): The value to check.

    Returns:
        bool: True if the value is a non-empty string that is, in its
        entirety, a Roman numeral in the range I..MMMCMXCIX. False for the
        empty string and for any non-string input (e.g. NaN or a number).
    """
    # Every group in the pattern is optional, so the empty string would match;
    # reject it explicitly, along with anything that is not a string.
    if not isinstance(value, str) or not value:
        return False

    roman_pattern = r'M{0,3}(CM|CD|D?C{0,3})(XC|XL|L?X{0,3})(IX|IV|V?I{0,3})'

    # fullmatch (unlike match + '$') does not tolerate a trailing newline
    return re.fullmatch(roman_pattern, value) is not None

def has_digit(value):
    """Return True when the string form of ``value`` contains at least one digit."""
    as_text = str(value)
    return re.search(r'\d', as_text) is not None

def roman_or_numeral(value):
    """Return True when ``value`` is a Roman numeral or contains a digit, else False."""
    return bool(is_roman_numeral(value) or has_digit(value))

In [None]:
def intersection_out(lst1, lst2):
    """
    Return the elements of lst1 that do not occur in lst2, keeping lst1's order.
    """
    only_in_first = []
    for item in lst1:
        if item not in lst2:
            only_in_first.append(item)
    return only_in_first

def intersection_in(lst1, lst2):
    """
    Return the elements of lst1 that also occur in lst2, keeping lst1's order.
    """
    in_both = []
    for item in lst1:
        if item in lst2:
            in_both.append(item)
    return in_both

In [None]:
def generalized_contains(cell, included_and=None, included_or=None, excluded_and=None, excluded_or=None):
    """
    Check if a given string `cell` meets specific inclusion and exclusion criteria.

    Matching is by substring and is case insensitive: both `cell` and every
    keyword are lowercased before comparison.

    Args:
        cell (str): The string to check. Non-string values (e.g. NaN) never match.
        included_and (list, optional): Keywords; cell must include all of these.
        included_or (list, optional): Keywords; cell must include at least one of these.
        excluded_and (list, optional): Keywords; cell must not include any of these.
        excluded_or (list, optional): Keywords; cell must not include any of these.
            NOTE: this is evaluated exactly like `excluded_and` (the presence of
            any single keyword rejects the cell); both names are kept so
            existing callers keep working.

    Returns:
        bool: True if `cell` meets all specified criteria, False otherwise.
        Criteria that are None or empty are skipped.
    """
    if not isinstance(cell, str):
        return False

    cell_lower = cell.lower()

    def _lowered(keywords):
        # Keywords must be lowercased too, otherwise "Board" could never match
        # the already-lowercased cell.
        return [keyword.lower() for keyword in keywords] if keywords else []

    def _present(keyword):
        return keyword in cell_lower

    required_all = _lowered(included_and)
    if required_all and not all(_present(keyword) for keyword in required_all):
        return False

    required_any = _lowered(included_or)
    if required_any and not any(_present(keyword) for keyword in required_any):
        return False

    # Both exclusion lists reject the cell as soon as one keyword is found.
    forbidden = _lowered(excluded_and) + _lowered(excluded_or)
    if any(_present(keyword) for keyword in forbidden):
        return False

    return True

### Example usage
cell = "This board member is a past chairperson."
included_and = ['board', 'member']
included_or = ['past', 'former']
excluded_and = None
excluded_or = None
generalized_contains(cell, included_and, included_or, excluded_and, excluded_or)  # -> True: has "board", "member" and "past"

cell = "This board member is a past chairperson."
included_and = ['board', 'member']
included_or = None
excluded_and = None
excluded_or = ['past', 'former']
generalized_contains(cell, included_and, included_or, excluded_and, excluded_or)  # -> False: contains the excluded "past"

cell = "prospect_manager_2nd"
included_and = None
included_or = ['prospect_manager', 'solicitor']
excluded_and = None
excluded_or = ["2"]
generalized_contains(cell, included_and, included_or, excluded_and, excluded_or)  # -> False: contains the excluded "2"

In [None]:
def pick_col(row, col1, col2):
    """
    Reconcile two columns of one row into a single value.

    Args:
        row (pd.Series): The row to read from.
        col1 (str): Name of the first column.
        col2 (str): Name of the second column.

    Returns:
        object: NaN when both values are missing; the present value when only
        one is missing; the shared value when both are present and equal.
        When both are present but differ, the two values are printed (for
        debugging) and NaN is returned.
    """
    first, second = row[col1], row[col2]
    first_missing = pd.isna(first)
    second_missing = pd.isna(second)

    if first_missing:
        return np.nan if second_missing else second
    if second_missing or first == second:
        return first

    # Conflict: surface the disagreeing pair and give up on this row
    print(first, second)
    return np.nan

In [None]:
def df_merge_source(merged_df, left_source, right_source):
    """
    Rename the '_merge' indicator column of a merged dataframe to a source column
    and map its values to descriptive labels.

    The new column is named "df_source"; if that name is taken, the first free
    name among "df_source_1", "df_source_2", ... is used.

    Args:
        merged_df (pd.DataFrame): Result of a merge performed with indicator=True.
            It is modified in place.
        left_source (str): The label for the left dataframe source.
        right_source (str): The label for the right dataframe source.

    Returns:
        pd.DataFrame: The same `merged_df`, with '_merge' renamed and its values
        mapped: 'left_only' -> left_source, 'right_only' -> right_source,
        'both' -> "left_source/right_source".

    Raises:
        KeyError: If `merged_df` has no '_merge' column.
    """
    if '_merge' not in merged_df.columns:
        raise KeyError("'_merge' column not found; merge with indicator=True first")

    # Find the first unused source column name. The counter must live outside
    # the loop, otherwise it is reset on every pass and the loop never ends
    # once "df_source_1" is taken as well.
    new_col = "df_source"
    suffix = 1
    while new_col in merged_df.columns:
        new_col = "df_source_%d" % suffix
        suffix += 1

    merged_df.rename(columns={'_merge': new_col}, inplace=True)

    # Map the source column to more descriptive labels
    source_mapping = {
        'left_only': '%s' % left_source,
        'right_only': '%s' % right_source,
        'both': '%s/%s' % (left_source, right_source)
    }
    # The indicator is categorical; convert to plain objects first so the
    # relabelling does not depend on categorical replace semantics.
    merged_df[new_col] = merged_df[new_col].astype(object).replace(source_mapping)
    return merged_df

In [None]:
def remove_dollar_sign_and_comma(cell):
    """
    Strip every '$' and ',' character from a string value.

    Args:
        cell (str or any): The value to clean.

    Returns:
        str or any: The cleaned string, or `cell` itself untouched when it is
        not a string.
    """
    if not isinstance(cell, str):
        return cell
    # Translation table that deletes both characters in one pass
    return cell.translate(str.maketrans("", "", "$,"))

In [None]:
def save_file(df, file_prefix="constituents_cleaned", version="v1"):
    """
    Save a DataFrame to "<file_prefix>_<version>.csv" in the client's Preprocessing folder.

    The destination is built from the notebook-level variables `file_path` and
    `clients` as "<file_path>\\<clients>\\Preprocessing\\<file name>", so it
    reflects whatever those variables hold when the function is called.

    Args:
        df (pd.DataFrame): The DataFrame to save. If it has a "Unique Donor ID"
            column, rows with a missing ID are left out of the saved file
            (the caller's DataFrame is not modified).
        file_prefix (str, optional): The prefix for the CSV file name. Default is "constituents_cleaned".
        version (str, optional): The version identifier for the CSV file name. Default is "v1".

    Returns:
        None
    """
    if "Unique Donor ID" in df.columns:
        df = df.dropna(subset=["Unique Donor ID"])
    file_name = "%s_%s.csv" % (file_prefix, version)
    df.to_csv("%s\\%s\\Preprocessing\\%s" % (file_path, clients, file_name), index=False)

# **Variables**

In [None]:
# Analyst-specific root of the shared client drive, and the client being processed.
user = "Rmittal"
file_path = f"C:\\Users\\{user}\\CCS\\Internal - Analytics - Shared Drive\\1. Shared Drive\\Clients"
clients = "National Multiple Sclerosis Society"

In [None]:
# File-name prefixes handed to save_file() as its file_prefix argument.
save_constit_before_mapping = "constituents_before_mapping"
save_constit_after_mapping = "constituents_preprocessed"
save_gifts_after_mapping = "gifts_preprocessed"

In [None]:
# This cell sits above the cell that loads df_cd, so on a fresh kernel
# (Restart & Run All) df_cd does not exist yet and the statement would raise
# NameError. The same column is computed again in the events section further
# down, so the work is simply skipped here until df_cd is available.
if "df_cd" in globals():
    df_cd["number_of_special_events_attended"] = df_cd[["Events", "Events_1", "Events_2", "Events_3", \
                                                      "Events_4", "Events_5"]].notna().sum(axis=1)

# **Gifts Data**

In [None]:
# Locate and load the raw gifts export.
# Note: file_path and clients are reassigned here, replacing the values set in
# the Variables section.
file_path = "C:\\Users\\Rmittal\\CCS\\Internal - Analytics - Shared Drive\\1. Shared Drive\\Clients"
subdir = "Raw Client Data"
clients = "The Conservation Fund"
file_name = "gifts_file.csv"

file = "\\".join([file_path, clients, subdir, file_name])
dfg = pd.read_csv(file, encoding="ISO-8859-1")

##### **when the gift_type is "write_off", the amount needs to be made negative**

In [None]:
#dfg_unq.columns
# Pass the version explicitly: save_file is called with (df, file_prefix, version)
# elsewhere in this notebook, and omitting it raises TypeError when the
# parameter has no default.
# NOTE(review): dfg_unq is not created in any cell visible here — presumably the
# de-duplicated gifts frame derived from dfg; confirm the producing cell exists.
save_file(dfg_unq, save_gifts_after_mapping, "v1")

# **Constituents Data**

In [None]:
# Locate and load the raw constituents export.
# Note: file_path and clients are reassigned here, replacing the values set in
# the Variables section.
file_path = "C:\\Users\\Rmittal\\CCS\\Internal - Analytics - Shared Drive\\1. Shared Drive\\Clients"
subdir = "Raw Client Data"
clients = "The Conservation Fund"
file_name = "constituents_file.csv"

file = "\\".join([file_path, clients, subdir, file_name])
df_cd = pd.read_csv(file, encoding="ISO-8859-1")

# **Formatting**

##### **Remove dollar sign and comma**

In [None]:
# Strip "$" and "," from every string cell so currency-formatted text can be
# parsed as numbers. Column-wise Series.map gives the same element-wise result
# as DataFrame.applymap, which is deprecated in recent pandas releases.
df_cd = df_cd.apply(lambda column: column.map(remove_dollar_sign_and_comma))

# Convert every "giving" column to a numeric dtype where the whole column parses;
# a column that does not parse is left unchanged (the outcome errors='ignore'
# used to give, without relying on that deprecated option).
givings_columns = [key for key in df_cd.columns if "giving" in key.lower()]
for col in givings_columns:
    try:
        df_cd[col] = pd.to_numeric(df_cd[col])
    except (ValueError, TypeError):
        pass

# **Renaming**

In [None]:
# Give the two raw headers used by later sections their snake_case names.
renamed_headers = {
    "Key Indicator": "key_indicator",
    "Address Type": "preferred_address_type",
}
df_cd = df_cd.rename(columns=renamed_headers)

# **key_indicator**

# **is_individual**

In [None]:
# True where the key indicator is exactly "I"; anything else (including missing) is False.
df_cd["is_individual"] = df_cd["key_indicator"].eq("I")

# **is_deceased**

In [None]:
# True where the Deceased field is exactly "Yes"; anything else (including missing) is False.
df_cd["is_deceased"] = df_cd["Deceased"].eq("Yes")

In [None]:
# Select columns that contain "constituent" in their names
columns_constituent = [key for key in df_cd.columns if "constituent" in key.lower()]


def _any_cell_matches(frame, **criteria):
    """Return a boolean Series: True for rows where any cell satisfies generalized_contains(cell, **criteria)."""
    # Column-wise Series.map is the element-wise equivalent of the deprecated
    # DataFrame.applymap and is available in every pandas version.
    matches = frame.apply(
        lambda column: column.map(lambda cell: generalized_contains(cell, **criteria))
    )
    return matches.any(axis=1)


# Current trustee: some constituent field mentions "board" and "member" without "past"/"former"
df_cd["current_trustee_indicator"] = _any_cell_matches(
    df_cd[columns_constituent],
    included_and=['board', 'member'],
    excluded_and=['past', 'former'],
)

# Past trustee: some constituent field mentions "board" and "member" together with "past" or "former"
df_cd["past_trustee_indicator"] = _any_cell_matches(
    df_cd[columns_constituent],
    included_and=['board', 'member'],
    included_or=['past', 'former'],
)

# **head_of_household and household_id**

**Householding** logic comprises several operations

1. **Function Definition (`get_max_index`)**:
   - **Purpose**: Determines the index of the row within a group that has the maximum value in the "Total Lifetime Giving" column, or defaults to the first row if the column is absent or all values are zero.
   - **Usage**: Applied later within a grouped dataframe to identify the row with the highest lifetime giving.

2. **Data Preparation and Cleaning**:
   - **Filtering Rows (`df_cdi`)**: Creates `df_cdi` by keeping only living individuals — that is, excluding rows where the donor is deceased (`df_cd["is_deceased"]`) or is not an individual (`df_cd["is_individual"]` is `False`) — and sets a new column `head_of_household` to `True`.
   - **Drop NaN Rows**: Removes rows where both "Address 1" and "Address 2" are NaN.
   - **Fill NaN Values**: Fills NaN values in selected columns ("Address 1", "Address 2", "City", "State", "Zip") with a placeholder value ('missing').
   - **Filter Numeric Addresses**: Filters rows where at least one of "Address 1" or "Address 2" contains a numeric or Roman numeral value.

3. **Grouping and Sorting**:
   - **Group by Address Details (`grouped`)**: Groups `df_cdi` by "Address 1", "Address 2", "City", "State", and "Zip".
   - **Identify Rows with Multiple Entries (`df_cdi_multiple`)**: Filters `df_cdi` to include only rows with duplicate combinations of "Address 1", "Address 2", "City", "State", and "Zip", sorting them for further operations.

4. **Assignment and Merging**:
   - **Identify Maximum Lifetime Giving (`idx_max_lifetime_giving`)**: Applies `get_max_index` function to `grouped` to find the row index with the maximum "Total Lifetime Giving" within each subgroup.
   - **Set Attributes (`head_of_household`, `household_id`)**:
     - Sets `head_of_household` to `False` for all rows in `df_cdi_multiple`.
     - Sets `head_of_household` to `True` for rows identified in `idx_max_lifetime_giving`.
     - Assigns a unique `household_id` to each subgroup in `df_cdi_multiple` based on its group index.
   - **Merge Attributes Back (`df_cdi` into `df_cd`)**: Merges `head_of_household` and `household_id` back into the original `df_cd` based on "Unique Donor ID".
This process effectively categorizes donors into households (**household_id**), designates a head of household (**head_of_household**), and ensures that these attributes are correctly assigned across the entire dataframe (df_cd).

In [None]:
# Function to determine index of row with max "Total Lifetime Giving" or default to first row
def get_max_index(group):
    """Return the index label of the row with the largest "Total Lifetime Giving".

    Falls back to the group's first index label when the column is absent or
    its maximum is not greater than zero.
    """
    giving_column = "Total Lifetime Giving"
    if giving_column not in group.columns or not group[giving_column].max() > 0:
        return group.index[0]
    return group[giving_column].idxmax()

In [None]:
# Living individuals only. Take an explicit copy so that adding a column below
# writes to an independent frame rather than to a slice of df_cd.
df_cdi = df_cd[~(df_cd["is_deceased"]) & (df_cd["is_individual"])].copy()
# Everyone starts as head of their own household; shared addresses are resolved later.
df_cdi["head_of_household"] = True
len(df_cd), len(df_cdi)

In [None]:
# Householding: donors in df_cdi that share the same full address form one
# household; one row per household is flagged as head_of_household.

# Drop rows where both Addr1 and Addr2 are NaN
df_cdi = df_cdi.dropna(subset=["Address 1", "Address 2"], how='all')

# Fill NaNs with a placeholder value (e.g., 'missing')
df_cdi = df_cdi.fillna({'Address 1': 'missing', 'Address 2': 'missing', 'City': 'missing', 'State': 'missing', 'Zip': 'missing'})

# Filter rows to ensure there's something numeric in at least one of Addr1 or Addr2
# (roman_or_numeral accepts a Roman numeral or any value containing a digit)
df_cdi = df_cdi[df_cdi.apply(lambda row: roman_or_numeral(row["Address 1"]) or roman_or_numeral(row["Address 2"]), axis=1)]

# Keep only rows whose five-part address occurs more than once, sorted by address
df_cdi_multiple = df_cdi[df_cdi.groupby(["Address 1", "Address 2", "City", "State", "Zip"])\
                            .transform('size')>1]\
                            .sort_values(by=["Address 1", "Address 2", "City", "State", "Zip"])

# Create Groupby Object
grouped = df_cdi_multiple.groupby(["Address 1", "Address 2", "City", "State", "Zip"])

# Identify index of rows with max "Total Lifetime Giving" or default to first row
# (one index label per address group, as chosen by get_max_index)
idx_max_lifetime_giving = grouped.apply(get_max_index).values

# Set "head_of_household" to False for all entries in subgroups with multiple entries
df_cdi.loc[df_cdi_multiple.index, "head_of_household"] = False
df_cdi_multiple["head_of_household"] = False # not necessary but

# Set "head_of_household" to True for the rows identified with max "Total Lifetime Giving"
df_cdi.loc[idx_max_lifetime_giving, "head_of_household"] = True
df_cdi_multiple.loc[idx_max_lifetime_giving, "head_of_household"] = True # not necessary but

# Assign unique household_id to each subgroup
# (ngroup() numbers the groups from 0, hence the + 1 so that ids start at 1)
df_cdi_multiple['household_id'] = df_cdi_multiple.groupby(["Address 1", "Address 2", "City", "State", "Zip"])\
                                               .ngroup() + 1

# Merge the household_id back into the original df_cdi
# (must come after the .loc updates above, which address rows by df_cdi's current index labels)
df_cdi = df_cdi.merge(df_cdi_multiple[["Unique Donor ID", "household_id"]], on="Unique Donor ID", how="left")

# Fill NaN values in household_id with 0 for those not in multiple entries groups
# (so household_id == 0 means "no shared address found")
df_cdi["household_id"] = df_cdi["household_id"].fillna(0).astype(int)

# Merge the head_of_household and household_id into df_cd
# NOTE(review): this reassigns df_cd, so the cell is presumably not safe to run
# twice — a second run would start from a df_cd that already carries these two
# columns; confirm before re-executing.
df_cd = df_cd.merge(df_cdi[["Unique Donor ID", "head_of_household", "household_id"]], \
                  on="Unique Donor ID", how="left")

# **is_assigned**

LOGIC:

is_assigned is 1 if

solicitor_type = Solicitor, Prospect Manager
<br>solicitor_name != President or Lawrence A. Selzer

assigned_manager is not_null for is_assigned==1

1. **Get column names for solicitor types and names**:
   - **columns_solicitor_type**: retrieve column names containing "solicitor type" in their lowercase versions.
   - **columns_solicitor_name**: retrieve column names containing "solicitor name" in their lowercase versions.

2. **Initialization and Loop**:
   - Initializes a new column `df_cd["is_assigned"]` with a default value of `False`.
   - Loops through pairs of `columns_solicitor_type` and `columns_solicitor_name` using `zip`.

3. **Conditions Application**:
   - **is_assigned_1**: Applies conditions to `solicitor_type_col` checking for inclusion of "solicitor" or "prospect manager" and exclusion of "2".
   - **is_assigned_2**: Applies conditions to `solicitor_name_col` checking for exclusion of "president" or "lawrence a. selzer".

4. **Final Update (Wrong — not used)**:
   - Updating `df_cd["is_assigned"]` with a plain logical OR of `is_assigned_1` and `is_assigned_2` would flag a row as soon as only one of the two conditions holds.
4. **Final Update (Right — implemented)**:
   - Updates `df_cd["is_assigned"]` by evaluating each row where `df_cd["is_assigned_1"]` and `df_cd["is_assigned_2"]` are both `True`, using a logical AND operation (`&`). The result is then combined with the existing values in `df_cd["is_assigned"]` using a logical OR operation (`|`), so a row is assigned when at least one solicitor type/name pair satisfies both conditions.

5. **Cleanup**:
   - Drops intermediate columns `is_assigned_1` and `is_assigned_2` from `df_cd`, handling errors with `errors='ignore'`.



In [None]:
# Solicitor type / name columns, paired up by their position in the frame
columns_solicitor_type = [key for key in df_cd.columns if "solicitor type" in key.lower()]
columns_solicitor_name = [key for key in df_cd.columns if "solicitor name" in key.lower()]

# Keyword rules, defined once instead of on every loop pass
type_must_include_any = ["solicitor", "prospect manager"]
type_must_not_include = ["2"]
name_must_not_include = ["president", "lawrence a. selzer"]

# Start with nobody assigned, then OR in each (type, name) pair that qualifies
df_cd["is_assigned"] = False

for type_column, name_column in zip(columns_solicitor_type, columns_solicitor_name):
    # is_assigned_1: the solicitor type qualifies
    df_cd["is_assigned_1"] = df_cd[type_column].apply(
        lambda cell: generalized_contains(cell, None, type_must_include_any, None, type_must_not_include)
    )

    # is_assigned_2: the solicitor name is present and not an excluded name
    df_cd["is_assigned_2"] = df_cd[name_column].apply(
        lambda cell: generalized_contains(cell, None, None, name_must_not_include, None)
    )

    # A pair counts only when both conditions hold for the same pair
    pair_qualifies = df_cd["is_assigned_1"] & df_cd["is_assigned_2"]
    df_cd["is_assigned"] = df_cd["is_assigned"] | pair_qualifies

# Drop intermediate columns
df_cd = df_cd.drop(columns=["is_assigned_1", "is_assigned_2"], errors='ignore')

# **assigned_manager**

1. **Define conditions and choices**:
   - **conditions**: List of conditions evaluating `df_cd["is_assigned"]` as `True` and checking each `Solicitor Name_X` column for non-null values while excluding "president" or "lawrence a. selzer".
   - **choices**: Corresponding `Solicitor Name_X` columns used when conditions are met.

2. **Apply np.select**:
   - Utilizes `np.select` to populate `df_cd["assigned_manager"]` based on the first matching condition, defaulting to `np.nan` if no conditions are satisfied.



In [None]:
# Solicitor name columns, in priority order
solicitor_name_columns = [
    "Solicitor Name_1",
    "Solicitor Name_2",
    "Solicitor Name_3",
    "Solicitor Name_4",
    "Solicitor Name_5",
    "Solicitor Name_6",
]
excluded_name_pattern = "president|lawrence a. selzer"

# One condition per name column: the donor is assigned, the name is present,
# and the name does not match an excluded solicitor
conditions = [
    (df_cd["is_assigned"]==True)
    & df_cd[name_column].notna()
    & ~df_cd[name_column].str.contains(excluded_name_pattern, case=False, na=False)
    for name_column in solicitor_name_columns
]

# The value taken when the matching condition is the first one that holds
choices = [df_cd[name_column] for name_column in solicitor_name_columns]

# Use np.select to create the 'assigned_manager' column (NaN when no condition holds)
df_cd["assigned_manager"] = np.select(conditions, choices, default=np.nan)

In [None]:
#df_cd.groupby(["assigned_manager"]).size().sort_values()

# **solicit_codes**

In [None]:
# Remove a previously computed flag so it is not picked up as a source column below
df_cd = df_cd.drop(columns=["solicit_codes"], errors="ignore")

# Source columns: every column whose name mentions both "solicit" and "code"
columns_solicit_codes = [
    key for key in df_cd.columns
    if all(token in key.lower() for token in ("solicit", "code"))
]

# 1 when at least one of the source columns is filled in, else 0
has_any_solicit_code = df_cd[columns_solicit_codes].notna().any(axis=1)
df_cd["solicit_codes"] = has_any_solicit_code.astype(int)
df_cd["solicit_codes"].value_counts()

# **marital_status**

In [None]:
columns_spouse = [key for key in df_cd.columns if "spouse" in key.lower() and "deceased" not in key.lower()]
#columns_spouse

# "married" when any spouse column is filled in
has_spouse = df_cd[columns_spouse].notna().any(axis=1)

# Interpret the "Spouse deceased" flag explicitly. A plain truthiness test
# would also count NaN (and any non-empty text such as "No") as deceased.
# assumes affirmative values are stored as True / "Yes" / "Y" / 1 — TODO confirm
spouse_deceased = (
    df_cd["Spouse deceased"].astype(str).str.strip().str.lower()
    .isin(["yes", "y", "true", "1", "1.0"])
)

# Build the column as objects so that "unknown" stays a real NaN; mixing the
# string "married" with np.nan inside np.where would yield the text "nan".
marital_status = pd.Series(np.nan, index=df_cd.index, dtype=object)
marital_status.loc[has_spouse] = "married"
marital_status.loc[spouse_deceased] = "widow"  # takes precedence over "married"
df_cd["marital_status"] = marital_status

# **five_year_giving**

In [None]:
# Yearly totals that make up the figure. Note the list spans six columns
# (2019 through 2024), added in this order.
giving_total_columns = [
    "2024 Giving Total",
    "2023 Giving Total",
    "2022 Giving Total",
    "2021 Giving Total",
    "2020 Giving Total",
    "2019 Giving Total",
]

# Plain element-wise addition: a missing value in any year makes the sum missing.
running_total = df_cd[giving_total_columns[0]]
for year_column in giving_total_columns[1:]:
    running_total = running_total + df_cd[year_column]
df_cd["five_year_giving"] = running_total

In [None]:
# Placeholder values for columns the output schema expects but that are not
# derived in this notebook (random integers were tried earlier for the scores).
for score_column in ("major_donor_model_score", "loyalty_model_score"):
    df_cd[score_column] = 0

df_cd["n_years_giving_of_ten"] = np.nan

# **planned_gift_commitment**

In [None]:
# Source columns: every column whose name mentions "planned"
columns_planned = [key for key in df_cd.columns if "planned" in key.lower()]

# 1 when at least one planned-gift column is filled in, else 0
has_planned_gift = df_cd[columns_planned].notna().any(axis=1)
df_cd["planned_gift_commitment"] = np.where(has_planned_gift, 1, 0)

# **Address**

In [None]:
address_type = df_cd["preferred_address_type"]

# Business address: the type is exactly "Business"
df_cd["business_address"] = address_type.eq("Business")

# Seasonal address: the type mentions summer, winter or alternate (any case); missing counts as False
df_cd["seasonal_address"] = address_type.str.contains("summer|winter|alternate", case=False, na=False)

# **Phone**

In [None]:
# For each target column, take the primary number when its type matches and
# otherwise fall back to the second number/type pair.
for phone_column, phone_type in (("home_phone", "Home"), ("cell_phone", "Cell")):
    df_cd[phone_column] = np.where(df_cd["Phone Type"] == phone_type, df_cd["Phone Number"], np.nan)

    # Second pair is used only where the first pair gave nothing
    use_second_pair = (df_cd["Phone Type_1"] == phone_type) & df_cd[phone_column].isna()
    df_cd[phone_column] = np.where(use_second_pair, df_cd["Phone Number_1"], df_cd[phone_column])

# **Email**

In [None]:
# Email columns in priority order: the first one that is filled in wins
email_columns = ["Email", "Email_1", "Email_2", "Email_3"]

conditions = [df_cd[email_column].notna() for email_column in email_columns]
choices = [df_cd[email_column] for email_column in email_columns]

# Use np.select to create the 'personal_email' column (NaN when every email column is empty)
df_cd["personal_email"] = np.select(conditions, choices, default=np.nan)

# **number_of_special_events_attended**

In [None]:
# One column per recorded event; the count is the number of them that are filled in
event_columns = ["Events", "Events_1", "Events_2", "Events_3", "Events_4", "Events_5"]
df_cd["number_of_special_events_attended"] = df_cd[event_columns].notna().sum(axis=1)

# **total_notes**

In [None]:
# Both note-count columns: treat missing as 0 and store as integers
for notes_column in ("Total_Notes", "Total Notes"):
    df_cd[notes_column] = df_cd[notes_column].fillna(0).astype(int)

# Combined note count across the two source columns
df_cd["total_notes"] = df_cd["Total_Notes"] + df_cd["Total Notes"]

# **Loyalty**

In [None]:
# Ten yearly giving columns, most recent first: "2023 Total Giving" down to "2014 Total Giving"
columns_yearly_giving = ["%d Total Giving" % year for year in range(2023, 2013, -1)]

In [None]:
def _giving_year(column):
    """Return the calendar year that starts a yearly giving column name, e.g. "2023 Total Giving" -> 2023."""
    return int(str(column).split()[0])


# Determine the first year a gift was made
def first_gift_year(row):
    """
    Return the earliest year in which the row shows a positive gift.

    The year is read from the column name itself rather than computed from the
    run date and the column's position, so the result does not shift when the
    notebook is executed in a later calendar year.

    Args:
        row (pd.Series): A row holding the columns listed in `columns_yearly_giving`.

    Returns:
        int or None: The earliest year with giving > 0, or None when there is none.
    """
    years_with_gifts = [
        _giving_year(column) for column in columns_yearly_giving if row[column] > 0
    ]
    return min(years_with_gifts) if years_with_gifts else None


# Calculate the number of years gifts were made
def years_gifts_made(row):
    """Return how many of the `columns_yearly_giving` columns hold a positive amount."""
    return sum(row[column] > 0 for column in columns_yearly_giving)


# Calculate loyalty
def calculate_loyalty(row):
    """
    Return the share of years, from the first gift through the most recent
    yearly column, in which a gift was made.

    Args:
        row (pd.Series): A row holding "first_gift_year" and
            "number_of_years_gifts_were_made".

    Returns:
        float or int: number_of_years_gifts_were_made divided by the number of
        years from first_gift_year through the latest year in
        `columns_yearly_giving` (inclusive); 0 when there is no first gift year.
    """
    first_year = row["first_gift_year"]
    # A None produced by first_gift_year is stored as NaN once it sits in a
    # DataFrame column, so test for missing-ness rather than `is not None`.
    if pd.isna(first_year):
        return 0

    latest_year = max(_giving_year(column) for column in columns_yearly_giving)
    years_observed = latest_year - first_year + 1
    if years_observed <= 0:
        return 0
    return row["number_of_years_gifts_were_made"] / years_observed

In [None]:
# Get the current year
current_year = datetime.now().year

# Derived columns, computed in this order because loyalty reads the two columns before it
derived_columns = [
    ("first_gift_year", first_gift_year),
    ("number_of_years_gifts_were_made", years_gifts_made),
    ("loyalty", calculate_loyalty),
]
for derived_name, row_function in derived_columns:
    df_cd[derived_name] = df_cd.apply(row_function, axis=1)

# **Save file before applying mapper**

In [None]:
# Snapshot the enriched constituent table before its columns are renamed by the mapper.
save_file(df_cd, save_constit_before_mapping, "v1")

# **Mapper**

In [None]:
# The mapping file pairs each raw column ("file_columns") with the name the
# downstream schema expects ("expected_columns"); "not_found" marks schema
# columns that have no source in this file.
mapping = pd.read_csv("./constit_mapping_tcf.csv")

column_mapping = {}
for source_name, target_name in zip(mapping["file_columns"], mapping["expected_columns"]):
    if source_name != "not_found":
        column_mapping[source_name] = target_name

# Rename, then keep only the mapped columns, in mapping order
df_final = df_cd.rename(columns=column_mapping)
df_final = df_final[list(column_mapping.values())]

# **Datatypes**

### **Age**

In [None]:
# Apply the function to the column
# NOTE(review): this parses "age" as a date or year via convert_to_datetime —
# presumably the mapped column holds a birth date/year rather than a numeric age; confirm.
df_final["age"] = df_final["age"].apply(convert_to_datetime)

# **Save the final preprocessed-file**

In [None]:
# Safety switch: the final file is only written when `save` is set to True.
save = False
if save:
    save_file(df_final, save_constit_after_mapping, "v1")

### **Correlation Plots with Total Lifetime Giving**

In [None]:
def _scatter_vs_log_lifetime_giving(frame, y_column, y_dtype):
    """Scatter log(Total Lifetime Giving) against `y_column` for rows where both are present."""
    # Drop rows with NaNs in these columns
    subset = frame.dropna(subset=["Total Lifetime Giving", y_column])
    giving = subset["Total Lifetime Giving"].astype(float)

    # The logarithm is undefined for zero or negative totals; such points
    # cannot be placed on a log axis, so leave them out explicitly.
    positive = giving > 0

    fig, ax = plt.subplots(figsize=(8, 2))
    ax.scatter(np.log(giving[positive]), subset.loc[positive, y_column].astype(y_dtype))
    ax.set_xlabel("log(Total Lifetime Giving)")  # the x values are logged, so say so
    ax.set_ylabel(y_column)
    ax.set_title("Scatter Plot of Total Lifetime Giving vs. %s" % y_column)
    plt.show()


_scatter_vs_log_lifetime_giving(df_cdi_multiple, "Total Actions", int)
_scatter_vs_log_lifetime_giving(df_cdi_multiple, "Pledged Planned Gift", float)

# **Preprocessing after mapping**

In [None]:
# NOTE(review): `df` is not assigned in any cell shown above — presumably the
# mapped/preprocessed frame loaded elsewhere; confirm before running.
# Keep the piece that follows the first apostrophe (column 1 of the split).
df["class_year"] = df["class_year"].str.split("'",expand=True)[1]
# Cast to the nullable integer dtype so missing years stay missing.
# NOTE(review): fillna(np.nan) looks like a no-op here — confirm intent.
df["class_year"] = df["class_year"].fillna(np.nan).astype('Int64')

In [None]:
# Expand two-digit class years to four-digit strings: values above 25 are read
# as 19xx, everything else as 20xx.
mask = df["class_year"].notna()
short_years = df.loc[mask, "class_year"].astype(int)
century = pd.Series(np.where(short_years > 25, "19", "20"), index=short_years.index)

# Build the result as a separate object column instead of writing strings into
# the integer column in place. zfill(2) restores the leading zero the integer
# cast removed, so 5 becomes "2005" rather than "205".
expanded = pd.Series(pd.NA, index=df.index, dtype=object)
expanded.loc[mask] = century + short_years.astype(str).str.zfill(2)
df["class_year"] = expanded


In [None]:
# Reduce every datetime column to plain dates (drops the time-of-day part)
cols_datetime = df.select_dtypes(include=['datetime']).columns.tolist()
for col in cols_datetime:
    as_datetime = pd.to_datetime(df[col])
    df[col] = as_datetime.dt.date