In [43]:
import dataiku
import pandas as pd, numpy as np
from dataiku import pandasutils as pdu
import re # regex for string matching

# Important Notes (READ BEFORE RUNNING CODE):
# I flagged code areas that need user interaction with *UPDATE HERE* so ctrl-F *UPDATE HERE* whenever you do a new upload
# Ctrl-F *UPDATE GIFT TRACKING* after you run the code to find the data to update in the Gift Data Tracking workbook
# At any point, if you want to check the data in Excel, ctrl-F "to_csv" to find the CSV export you need.
# Uncomment it, update the path & run the code.

# Process gift data one year at a time

# *UPDATE HERE* - every CSV import below

# --- Dataset handles -------------------------------------------------------
# 1. UPDATE ACCORDING TO YEAR, T3010 Donees dataset - delete all unnecessary columns first
donees_csv = dataiku.Dataset("T3010_Donees_2019_CW_prepped")

# 2. UPDATE ACCORDING TO YEAR, Master Reference Table - Gift Data from Gift Data Tracking Sheet
#    If no gift data from this year has been added to GC yet, then just upload a CSV with the headers + empty columns
reference_csv = dataiku.Dataset("Gift_Data_Tracking_2019")

# 3. Masterlist report from Contenta containing all foundations we have in GC
masterlist_csv = dataiku.Dataset("Contenta_MasterList")

# 4. CRA Charities - All
charities_csv = dataiku.Dataset("Charities_All")

# --- Read every dataset into a pandas DataFrame ----------------------------
donees = donees_csv.get_dataframe()
reference = reference_csv.get_dataframe()
masterlist = masterlist_csv.get_dataframe()
charities = charities_csv.get_dataframe()


  if self.run_code(code, result):


In [44]:
# Step 4 - keep only the columns needed downstream and give them short names

# *UPDATE HERE* - if necessary
# Only needed when the unnecessary columns haven't already been deleted from the dataset
columns_to_keep = ['BN/Registration number', 'Donee Business number', 'Donee Name',
                   'City', 'Total amount gifts']
short_names = {
    "BN/Registration number": "BN",
    "Donee Business number": "DoneeBN",
    "Donee Name": "DoneeName",
    "Total amount gifts": "ReportedAmt",
}

donees = pd.DataFrame(donees, columns=columns_to_keep)
donees = donees.rename(columns=short_names)
print(donees)

                     BN          DoneeBN                                          DoneeName                    City     ReportedAmt
0       100048800RR0001  118812627RR0001                                         B'NAI ZION                 TORONTO         $450.00
1       100048800RR0001              NaN                                     ORTHODOX UNION                     USA       $1,000.00
2       100048800RR0001  119038628RR0001                                            MEOROTH                 TORONTO         $880.00
3       100071927RR0001  737775486RR0001             FONDATION AFS INTERCULTURE CANADA INC.                MONTRÉAL     $200,000.00
4       100072586RR0001   857802409RR001                        GLOBAL CENTRE FOR PLURALISM                  OTTAWA       $3,204.00
5       100072586RR0001  895647055RR0001            FOCUS HUMANITARIAN ASSISTANCE IN CANADA                 TORONTO       $9,926.00
6       100072586RR0001  821197514RR0001                                THE 

In [45]:
# Add the constant columns, in this order: Year, then Purpose
# *UPDATE HERE* - Update Year accordingly
constant_columns = [('Year', 2019),
                    ('Purpose', "")]
for column_name, fill_value in constant_columns:
    donees[column_name] = fill_value
# donees['Foundation Activity'] = [] - might not need to add now, will add in step 7

print(donees)

                     BN          DoneeBN                                          DoneeName                    City     ReportedAmt  Year Purpose
0       100048800RR0001  118812627RR0001                                         B'NAI ZION                 TORONTO         $450.00  2019        
1       100048800RR0001              NaN                                     ORTHODOX UNION                     USA       $1,000.00  2019        
2       100048800RR0001  119038628RR0001                                            MEOROTH                 TORONTO         $880.00  2019        
3       100071927RR0001  737775486RR0001             FONDATION AFS INTERCULTURE CANADA INC.                MONTRÉAL     $200,000.00  2019        
4       100072586RR0001   857802409RR001                        GLOBAL CENTRE FOR PLURALISM                  OTTAWA       $3,204.00  2019        
5       100072586RR0001  895647055RR0001            FOCUS HUMANITARIAN ASSISTANCE IN CANADA                 TORONTO       $9

In [46]:
# Step 5 - Identify and remove any funder not in GC

# The masterlist calls the funder BN "BusinessNumber"; rename it to "BN" so it lines up
# with the donees BN column
masterlist = masterlist.rename(columns={"BusinessNumber": "BN"})
mast_bn = masterlist['BN'].tolist()

# Keep only gifts whose funder BN appears in the masterlist
funder_in_gc = donees['BN'].isin(mast_bn)
step_five = donees[funder_in_gc]
print(step_five)

                     BN          DoneeBN                                          DoneeName                    City    ReportedAmt  Year Purpose
206     100504679RR0001  100504687RR0001                              IMPERIAL THEATRE INC.              SAINT JOHN    $335,968.00  2019        
228     101015469RR0001  893081075RR0001    KINCARDINE AND COMMUNITY HEALTH CARE FOUNDATION              KINCARDINE     $20,000.00  2019        
262     101469609RR0001  118963081RR0001    INSTITUT UNIVERSITAIRE EN SANTÉ MENTALE DOUGLAS                MONTRÉAL    $239,071.00  2019        
263     101469609RR0001  118846179RR0001           CENTRE DE RECHERCHE DE L'HÔPITAL DOUGLAS                MONTRÉAL  $1,762,099.00  2019        
266     101569697RR0001  119301034RR0001  Habitat for Humanity Manitoba (Winkler-Morden ...                Winnipeg      $5,000.00  2019        
267     101569697RR0001  870071370RR0001                                     Eden East Inc.                 Winkler     $79,868.00

In [47]:
# Step 6 - Identify and remove any gifts already uploaded into GC
# Only check against reference gifts from the same year as the donees data

ref_bn = reference['BN'].tolist()

# Flag funders that already appear in the reference table, then keep everything else
# (~ negates the flag, i.e. "BN is not in ref_bn")
already_uploaded = step_five['BN'].isin(ref_bn)
donees = step_five[~already_uploaded]

print(donees)


                     BN          DoneeBN                                          DoneeName                      City    ReportedAmt  Year Purpose
310     101694578RR0001  119147072RR0001                      SHAAREI SHOMAYIM CONGREGATION                   TORONTO        $500.00  2019        
311     101694578RR0001  842008278RR0001                                          THE HOUSE                   TORONTO      $3,000.00  2019        
312     101694578RR0001  129950291RR0001                         NETIVOT HATORAH DAY SCHOOL                 THORNHILL      $3,000.00  2019        
313     101694578RR0001  119043891RR0001                                 MIZRACHI OF CANADA                 THORNHILL        $750.00  2019        
314     101694578RR0001  863109757RR0001                                       JACS TORONTO                   TORONTO      $1,360.00  2019        
315     101694578RR0001  888242484RR0001                                     HILLEL ONTARIO                   TORONTO 

In [48]:
# Update the Master Reference Table - Gift Data sheet in the Gift Data Tracking workbook with new gifting funders this year
# Might want to create a pop-up notification here to remind user to update with this value

tracking_columns = ['nid', 'uuid', 'FoundationTitle', 'BN']

# One row per new gifting funder: match the remaining donee BNs against the masterlist
# and keep the first masterlist entry for each BN
new_donees = pd.DataFrame(donees, columns=['BN'])
ref_update = pd.DataFrame(masterlist, columns=tracking_columns)
ref_update = new_donees.merge(ref_update, on='BN', how='inner').drop_duplicates('BN')
print(ref_update)

# *UPDATE GIFT TRACKING*
step6_Update_Tracking = dataiku.Dataset("step6_Update_Tracking")
step6_Update_Tracking.write_with_schema(ref_update)

# Don't need to update the following in Dataiku:
# *UPDATE HERE* - Update path to your own local folder, and remember to use this CSV to update the Gift Data Tracking workbook
# For non-Dataiku CSV export:
#ref_update.to_csv (r'C:\Users\Catherine\Documents\Imagine Canada\Gift Import Cleaning\step6_Update_Tracking.csv', 
#                   encoding = 'ANSI', index = False, header=True)

                    BN     nid                                  uuid                                    FoundationTitle
0      101694578RR0001   79552  e751a107-04cc-48e4-84fb-0402ba6e869d    Eugene and Lilly Sandorfy Charitable Foundation
12     101835163RR0001   85253  d67b3bf3-e8d5-4a62-8582-20fdc1461e95      Fondation Pour La Santé du Nord de Lanaudière
13     101835221RR0001   79281  bb01d2b3-ff65-4c87-aa90-83527d2f2ca3                           Fondation Charles Cusson
24     101835445RR0001   85292  de0bf8a4-fab2-496d-8dd6-547f62a1dfc5                      Fondation Hôpital St-Eustache
25     101835486RR0001   79548  e79b0c20-c98d-4a8c-88aa-83475cfd5ad6                 Fondation Denise et Guy St-Germain
45     101835528RR0001   83064  122c8af6-03b5-40de-8834-e5228625ad85  Fondation des Amis de l'Ensemble Vocal Arts Qu...
46     101835643RR0001   88099  eee3742a-bfe1-4e96-bfbb-1ee3f84db3d9  Fondation des Paraplégiques du Québec/Quebec P...
47     101835718RR0001   87779  b77d4655

3326 rows successfully written (bSsSmJmjBY)


In [49]:
# Step 7 - Foundation Activity
# Create a subset of only foundations to work with.
# NOTE: the BN / FoundationActivity columns must be taken from the *filtered* frame.
# Previously they were re-selected from the full masterlist, which silently threw the
# "Foundations" filter away.
foundations_only = masterlist[masterlist['FoundationCategory'] == "Foundations"]
activity = pd.DataFrame(foundations_only, columns= ['BN', 'FoundationActivity'])

# Count by foundation activity
step_seven = pd.merge(donees, activity, on ='BN', how ='left')
# Want to filter so only activities that contain "Grantmaking" are left
# (na=False: gifts whose funder has no activity recorded are dropped, not propagated as NaN)
step_seven = step_seven[step_seven['FoundationActivity'].str.contains("Grantmaking", na=False)]

# *UPDATE GIFT TRACKING*
# Update the Master Reference Table - Gift Data sheet in the Gift Data Tracking workbook with number of grantmaking foundations
counts = step_seven['FoundationActivity'].value_counts()
# Total the counts on the Series itself: the column label produced by to_frame() depends
# on the pandas version (pandas 2.0 names it "count"), so looking it up by the
# "FoundationActivity" label is not portable.
grantmaking_total = int(counts.sum())
counts = counts.to_frame()
print(grantmaking_total)

47761


In [50]:
# Step 8 - keep only gifts whose funder BN is listed in the CRA "Charities - All" dataset

# The CRA column header carries a trailing colon; rename it to plain "BN"
charities = charities.rename(columns={"BN/Registration number:": "BN"})

cra_bn = charities['BN'].tolist()

# Filter to funders present in Charities - All, then renumber the rows from 0
funder_is_registered = donees['BN'].isin(cra_bn)
donees = donees[funder_is_registered].reset_index(drop=True)
print(donees)

                    BN          DoneeBN                                          DoneeName                      City    ReportedAmt  Year Purpose
0      101694578RR0001  119147072RR0001                      SHAAREI SHOMAYIM CONGREGATION                   TORONTO        $500.00  2019        
1      101694578RR0001  842008278RR0001                                          THE HOUSE                   TORONTO      $3,000.00  2019        
2      101694578RR0001  129950291RR0001                         NETIVOT HATORAH DAY SCHOOL                 THORNHILL      $3,000.00  2019        
3      101694578RR0001  119043891RR0001                                 MIZRACHI OF CANADA                 THORNHILL        $750.00  2019        
4      101694578RR0001  863109757RR0001                                       JACS TORONTO                   TORONTO      $1,360.00  2019        
5      101694578RR0001  888242484RR0001                                     HILLEL ONTARIO                   TORONTO        

In [51]:
# Count by foundation activity again (donees has been filtered further since the first count)
grantmaking_foundations = pd.merge(donees, activity, on ='BN', how ='left')
# Keep only rows whose activity mentions "Grantmaking"; na=False drops rows with no activity
grantmaking_foundations = grantmaking_foundations[grantmaking_foundations['FoundationActivity'].str.contains("Grantmaking", na=False)]
#print(grantmaking_foundations)

# *UPDATE GIFT TRACKING*
# Update the Master Reference Table - Gift Data sheet in the Gift Data Tracking workbook with number of grantmaking foundations
counts = grantmaking_foundations['FoundationActivity'].value_counts()
# Total the counts on the Series itself: the column label produced by to_frame() depends
# on the pandas version (pandas 2.0 names it "count"), so looking it up by the
# "FoundationActivity" label is not portable.
grantmaking_total = int(counts.sum())
counts = counts.to_frame()
print(grantmaking_total)

47761


In [52]:
# Step 9 - Check length of BNs, missing RR0001, etc.

# Character count of every DoneeBN (stays NaN where DoneeBN is missing)
donee_bn_lengths = donees['DoneeBN'].str.len()
donees['DoneeBN_len'] = donee_bn_lengths
print(donees)

# Just for QA'ing the code:
# bn_check = donees[donees['DoneeBN_len'] < 15]
# bn_check.to_csv (r'C:\Users\Catherine\Documents\Imagine Canada\Gift Import Cleaning\bn_check.csv', 
#                   encoding = 'ANSI', index = False, header=True)

                    BN          DoneeBN                                          DoneeName                      City    ReportedAmt  Year Purpose  DoneeBN_len
0      101694578RR0001  119147072RR0001                      SHAAREI SHOMAYIM CONGREGATION                   TORONTO        $500.00  2019                 15.0
1      101694578RR0001  842008278RR0001                                          THE HOUSE                   TORONTO      $3,000.00  2019                 15.0
2      101694578RR0001  129950291RR0001                         NETIVOT HATORAH DAY SCHOOL                 THORNHILL      $3,000.00  2019                 15.0
3      101694578RR0001  119043891RR0001                                 MIZRACHI OF CANADA                 THORNHILL        $750.00  2019                 15.0
4      101694578RR0001  863109757RR0001                                       JACS TORONTO                   TORONTO      $1,360.00  2019                 15.0
5      101694578RR0001  888242484RR0001       

In [53]:
# Check if DoneeBNs have letters in them
# contains_letters holds, per row, the list of every ASCII letter found in DoneeBN

#donees['contains_letters'] = donees['DoneeBN'].str.extract(pat ='([a-zA-Z])') - just for reference
letters_found = donees['DoneeBN'].str.findall("[a-zA-Z]")
donees['contains_letters'] = letters_found


In [54]:
# Deal with different types of BN problems
# Start with easy DoneeBNs first
# 1. 9 characters, all numeric digits -> Just add RR0001 to the end
# 2. 11 characters, ends in RR and contains_letters = ['R','R'] -> Add 0001 to the end

# Flag rows whose only letters are exactly "RR" (upper case) or exactly "rr" (lower case)
letters = donees['contains_letters']
donees['RR'] = letters.apply(lambda found: found == ['R', 'R'])
donees['rr'] = letters.apply(lambda found: found == ['r', 'r'])

# Turn every boolean cell into the string 'TRUE' / 'FALSE' so it can be compared in np.select below
mask = donees.applymap(type) != bool
d = {True: 'TRUE', False: 'FALSE'}
donees = donees.where(mask, donees.replace(d))

# Case 1: nine characters and no letters at all
nine_digits_only = (donees.DoneeBN_len == 9) & (donees.contains_letters.str.len() == 0)
# Case 2: eleven characters ending in "RR", where RR are the only letters
eleven_ending_rr = (donees.DoneeBN_len == 11) & donees.DoneeBN.str.endswith("RR") & (donees.RR == "TRUE")

cond_12 = [nine_digits_only, eleven_ending_rr]
choices_12 = [donees.DoneeBN + "RR0001", donees.DoneeBN + "0001"]

# Rows matching neither case keep their current DoneeBN
donees['DoneeBN'] = np.select(cond_12, choices_12, donees.DoneeBN)

print(donees[donees['DoneeBN_len'] == 9])

# Checkpoint - Check that RR0001 and 0001 were added correctly
# donees.to_csv (r'C:\Users\Catherine\Documents\Imagine Canada\Gift Import Cleaning\9digits.csv', 
#                   encoding = 'ANSI', index = False, header=True)

                    BN          DoneeBN                                          DoneeName                      City  ReportedAmt  Year Purpose  DoneeBN_len contains_letters     RR     rr
137    104268586RR0001  118918804RR0001                          FIRST PRESBYTERIAN CHURCH                   PORTAGE    $3,000.00  2019                  9.0               []  FALSE  FALSE
138    104268586RR0001  118808914RR0001                            BIG BROTHER BIG SISTERS                   PORTAGE    $3,000.00  2019                  9.0               []  FALSE  FALSE
141    104268586RR0001  119307577RR0001                                                YFC                   PORTAGE    $3,000.00  2019                  9.0               []  FALSE  FALSE
142    104268586RR0001  107690885RR0001                                       MCC MANITOBA                  WINNIPEG  $308,000.00  2019                  9.0               []  FALSE  FALSE
1776   118839182RR0001  899728133RR0001                     

In [55]:
# 3. Replace rr with RR in DoneeBN where contains_letters = ['r', 'r'] in 15-character BNs
# The lookbehind/lookahead require a digit on each side of "rr" without consuming it.
# The previous pattern r'\drr\d' matched those two digits as well and replaced all four
# characters with "RR", shrinking the BN to 13 characters so it was deleted later on.

donees['DoneeBN'] = np.where((donees['rr'] == 'TRUE') & (donees['DoneeBN_len'] == 15),
                       donees['DoneeBN'].str.replace(r'(?<=\d)rr(?=\d)', 'RR', regex = True),
                       donees['DoneeBN'])

# 4. Delete: 15 characters where contains_letters != ['R', 'R'] or ['r', 'r']
# (the RR / rr flag columns hold the strings 'TRUE' / 'FALSE', not booleans)

cond_4 = [(donees.DoneeBN_len == 15) & (donees.RR == "FALSE") & (donees.rr == "FALSE")]
choices_4 = [""]
donees['DoneeBN'] = np.select(cond_4, choices_4, donees.DoneeBN)

print(donees[donees['DoneeBN_len'] == 15])

                    BN          DoneeBN                                          DoneeName                      City    ReportedAmt  Year Purpose  DoneeBN_len contains_letters    RR     rr
0      101694578RR0001  119147072RR0001                      SHAAREI SHOMAYIM CONGREGATION                   TORONTO        $500.00  2019                 15.0           [R, R]  TRUE  FALSE
1      101694578RR0001  842008278RR0001                                          THE HOUSE                   TORONTO      $3,000.00  2019                 15.0           [R, R]  TRUE  FALSE
2      101694578RR0001  129950291RR0001                         NETIVOT HATORAH DAY SCHOOL                 THORNHILL      $3,000.00  2019                 15.0           [R, R]  TRUE  FALSE
3      101694578RR0001  119043891RR0001                                 MIZRACHI OF CANADA                 THORNHILL        $750.00  2019                 15.0           [R, R]  TRUE  FALSE
4      101694578RR0001  863109757RR0001                

In [56]:
# 5. 14 characters
# Find and replace RR001 with RR0001, RR000 with RR0001
# Find and replace 0R0001 with 0RR0001, 1R0001 with 1RR0001, etc.

# (bad suffix, corrected text) pairs, in priority order: np.select uses the first match
suffix_fixes = [("RR001", "RR0001"), ("RR000", "RR0001")]
for digit in "0123456789":
    suffix_fixes.append((digit + "R0001", digit + "RR0001"))

is_len_14 = donees.DoneeBN_len == 14
cond_5 = [is_len_14 & (donees.DoneeBN.str.endswith(bad_suffix))
          for bad_suffix, _ in suffix_fixes]
choices_5 = [donees.DoneeBN.str.replace(bad_suffix, corrected)
             for bad_suffix, corrected in suffix_fixes]

# Rows that match none of the suffixes keep their current DoneeBN
donees['DoneeBN'] = np.select(cond_5, choices_5, donees.DoneeBN)
print(donees[donees['DoneeBN_len'] == 14])

# Checkpoint
#donees.to_csv (r'C:\Users\Catherine\Documents\Imagine Canada\Gift Import Cleaning\14digits.csv', 
#                   encoding = 'ANSI', index = False, header=True)

                    BN          DoneeBN                                          DoneeName                  City    ReportedAmt  Year Purpose  DoneeBN_len contains_letters     RR     rr
337    106885312RR0001   11914017RR0001          LES AINÉS ET LES AINÉES DE JONQUIÈRE INC.             JONQUIÈRE        $197.00  2019                 14.0           [R, R]   TRUE  FALSE
421    106885312RR0001   10745382RR0001                       GROUPE AIDE-ACTION ST-HONORÉ          SAINT-HONORÉ     $10,000.00  2019                 14.0           [R, R]   TRUE  FALSE
642    107641953RR0001   14035108RR0001                     TETRA SOCIETY OF NORTH AMERICA             VANCOUVER        $850.00  2019                 14.0           [R, R]   TRUE  FALSE
708    107641953RR0001   11924065RR0001                        LONDON PUBLIC LIBRARY BOARD                LONDON      $8,175.00  2019                 14.0           [R, R]   TRUE  FALSE
726    107641953RR0001   88909719RR0001                           L'AR

In [57]:
# 6. Where contains_letters = ["R"] and DoneeBN_len = 14, replace R with RR

# Add a column that is True if the only letter in DoneeBN is one R, and False otherwise
donees['just_R'] = donees['contains_letters'].apply(lambda x: x==['R'])

# Convert boolean to string so we can use it in our np.where below
mask = donees.applymap(type) != bool
d = {True: 'TRUE', False: 'FALSE'}
donees = donees.where(mask, donees.replace(d))

# Replace the R with RR following Condition #6
# The lookbehind/lookahead require a digit on each side of the single R without consuming
# it. The previous pattern r'\dR\d' matched those digits as well and replaced all three
# characters with "RR", shrinking the BN to 13 characters so it was deleted later on.
# An R that is already doubled has no digit on both sides, so it is left alone.
donees['DoneeBN'] = np.where((donees['just_R'] == 'TRUE') & (donees['DoneeBN_len'] == 14),
                       donees['DoneeBN'].str.replace(r'(?<=\d)R(?=\d)', 'RR', regex = True),
                       donees['DoneeBN'])

# Checkpoint - Check that 14-character DoneeBNs with R now have RR instead 
#donees.to_csv (r'C:\Users\Catherine\Documents\Imagine Canada\Gift Import Cleaning\replaceR.csv', 
#                   encoding = 'ANSI', index = False, header=True)

In [58]:
# DoneeBN has been edited above, so refresh the two helper columns derived from it
current_bn = donees['DoneeBN']
donees['DoneeBN_len'] = current_bn.str.len()
donees['contains_letters'] = current_bn.str.findall("[a-zA-Z]")

# Checkpoint - check that the new columns have the right values
#donees.to_csv (r'C:\Users\Catherine\Documents\Imagine Canada\Gift Import Cleaning\bn_check2.csv', 
#                   encoding = 'ANSI', index = False, header=True)

In [59]:
# Refresh the DoneeBN-derived helper columns (length and letters found) once more
current_bn = donees['DoneeBN']
donees['DoneeBN_len'] = current_bn.str.len()
donees['contains_letters'] = current_bn.str.findall("[a-zA-Z]")

# Checkpoint - check that no 15-character DoneeBNs have any letters other than RR
#donees.to_csv (r'C:\Users\Catherine\Documents\Imagine Canada\Gift Import Cleaning\condition7.csv', 
#                   encoding = 'ANSI', index = False, header=True)

In [60]:
# 8. Delete DoneeBN where length < 15
# (rows with no length, i.e. a missing DoneeBN, do not satisfy the comparison and are left as they are)
too_short = donees['DoneeBN_len'].lt(15)
cond_8, choices_8 = [too_short], [""]
donees['DoneeBN'] = np.select(cond_8, choices_8, default=donees['DoneeBN'])

In [61]:
# Valid BN check for DoneeBN
# Remove BNs that aren't in "Charities - All"
# For the Mar/21 file, this leaves 33 less valid BNs than manual cleaning (presumably because it's not picking up some more nuanced cases that can be updated manually).
# I think this is fine? We might want to try it on a bigger dataset to double check.

valid_BN = charities['BN'].tolist()

# Blank out every DoneeBN that is not a registered BN
not_registered = ~donees['DoneeBN'].isin(valid_BN)
cond_9, choices_9 = [not_registered], [""]
donees['DoneeBN'] = np.select(cond_9, choices_9, default=donees['DoneeBN'])

# Refresh the DoneeBN-derived helper columns (last time), now that DoneeBN has been updated
current_bn = donees['DoneeBN']
donees['DoneeBN_len'] = current_bn.str.len()
donees['contains_letters'] = current_bn.str.findall("[a-zA-Z]")

# Checkpoint - only 15-character or empty DoneeBNs should remain
#donees.to_csv (r'C:\Users\Catherine\Documents\Imagine Canada\Gift Import Cleaning\final_15digit.csv', 
#                   encoding = 'ANSI', index = False, header=True)
print(donees[donees['DoneeBN_len'] == 15])


                    BN          DoneeBN                                          DoneeName                      City    ReportedAmt  Year Purpose  DoneeBN_len contains_letters    RR     rr just_R
0      101694578RR0001  119147072RR0001                      SHAAREI SHOMAYIM CONGREGATION                   TORONTO        $500.00  2019                   15           [R, R]  TRUE  FALSE  FALSE
1      101694578RR0001  842008278RR0001                                          THE HOUSE                   TORONTO      $3,000.00  2019                   15           [R, R]  TRUE  FALSE  FALSE
2      101694578RR0001  129950291RR0001                         NETIVOT HATORAH DAY SCHOOL                 THORNHILL      $3,000.00  2019                   15           [R, R]  TRUE  FALSE  FALSE
3      101694578RR0001  119043891RR0001                                 MIZRACHI OF CANADA                 THORNHILL        $750.00  2019                   15           [R, R]  TRUE  FALSE  FALSE
4      101694578RR00

In [62]:
# Step 10 - Remove gift records missing both DoneeName and DoneeBN
# The earlier BN clean-up steps blank out unusable DoneeBNs with "" rather than NaN, so an
# isnull() test alone never matches them and no record would be removed. A value therefore
# counts as missing when it is null OR an empty / whitespace-only string.

def _is_missing(column):
    """Return a boolean Series: True where `column` is null or a blank string."""
    return column.isnull() | column.astype(str).str.strip().eq("")

missing_name_and_bn = _is_missing(donees['DoneeName']) & _is_missing(donees['DoneeBN'])
donees.drop(donees[missing_name_and_bn].index, inplace = True)

# Check that the correct gift records were deleted
#donees.to_csv (r'C:\Users\Catherine\Documents\Imagine Canada\Gift Import Cleaning\nonameBN.csv', 
#                   encoding = 'ANSI', index = False, header=True)

In [63]:
# Step 12 before 11 since part of it can be automated
# Just create a column that marks all rows with "see attached, qualified donee, voir liste, etc."
# Then export to csv and manually delete them based on judgement

# Add column that is TRUE if DoneeName contains "see attached, qualified donee, voir liste, etc."
# case=False makes the match case-insensitive. The pattern used to end with an inline "(?i)";
# Python's re module deprecates global flags that are not at the start of the expression
# and rejects them from Python 3.11 on.
# na=False: a missing DoneeName is flagged FALSE instead of being left as NaN.
placeholder_names = "attached|attach|qualified donee|schedule|other|misc|list|various|voir la liste|voir liste"
donees['bad_Name'] = donees['DoneeName'].str.contains(placeholder_names, case=False, na=False)
# Add column that is TRUE if ReportedAmt contains any letters/words or is negative
# ("-" is matched literally; na=False flags a missing amount as FALSE instead of NaN)
reported_amt = donees['ReportedAmt']
donees['bad_Amt'] = (reported_amt.str.contains("[a-zA-Z]", na=False)
                     | reported_amt.str.contains("-", regex=False, na=False))

# Convert the two boolean flag columns to 'TRUE' / 'FALSE' strings so they can be
# filtered the same way as the other flag columns and printed here
bool_labels = {True: 'TRUE', False: 'FALSE'}
for flag_column in ['bad_Name', 'bad_Amt']:
    donees[flag_column] = donees[flag_column].map(bool_labels)

print(donees[donees['bad_Amt'] == "TRUE"])

# Delete unnecessary columns
donees.drop(['DoneeBN_len', 'contains_letters', 'RR', 'rr', 'just_R'], axis=1, inplace=True)

# Step 11 - Remove gift records with ReportedAmt < $0
# To be done manually since, in a few instances, a funder may report ALL of their gifts as a deductible/negative value.
# In that case, the gifts need to be converted to positive values.

# Upon export, remove gifts with bad_Name or bad_Amt = TRUE (if it makes sense to)
# Export to Dataiku-managed dataset
donees_cleaned = dataiku.Dataset("donees_cleaned")
donees_cleaned.write_with_schema(donees)

# Don't need to update the following in Dataiku:
# *UPDATE HERE* - Update path to your own local folder
#donees.to_csv (r'C:\Users\Catherine\Documents\Imagine Canada\Gift Import Cleaning\readyforStep11.csv', 
#                   encoding = 'ANSI', index = False, header=True)

  regex = re.compile(pat, flags=flags)


                    BN          DoneeBN                                          DoneeName       City     ReportedAmt  Year Purpose  DoneeBN_len contains_letters     RR     rr just_R bad_Name bad_Amt
9927   119240091RR0001  119278877RR0001                               UNIVERSITY OF OTTAWA        NaN      -$1,250.00  2019                   15           [R, R]   TRUE  FALSE  FALSE    FALSE    TRUE
9978   119240091RR0001  134023852RR0001  THE CANLEARN SOCIETY FOR PERSONS WITH LEARNING...        NaN      -$1,579.00  2019                   15           [R, R]   TRUE  FALSE  FALSE    FALSE    TRUE
10026  119240091RR0001                                    PARKDALE COMMUNITY HEALTH CENTRE        NaN      -$3,500.00  2019                    0               []  FALSE  FALSE  FALSE    FALSE    TRUE
10158  119240091RR0001  132806472RR0002  CIHR - CANADIAN INSTITUTES OF HEALTH RESEARCH(...        NaN     -$18,750.00  2019                   15           [R, R]   TRUE  FALSE  FALSE    FALSE    TRUE
