In [1]:
import pandas as pd
import numpy as np 
import warnings
import re
import seaborn as sns
import matplotlib.pyplot as plt


warnings.filterwarnings("ignore")

## Extract and Concat data files

In [2]:
def find_last_valid_record(df):
    '''Find the first row where all columns are null, and return the row previous to that.'''

    last_record_idx = df.isnull().apply(lambda x: all(x), axis=1).idxmax() - 1
    
    return last_record_idx


def process_year_donations(main_fname: str, year:str, colnames: list):
    '''Process data from each year and assign standard structure.'''

    df = pd.read_excel(main_fname, sheet_name = year)

    # we deduce if the column headers are in the first or second row. If necessary, we correct the read of the file
    c = 0
    for i in df.columns[:20]:
        if 'Unnamed:' in str(i):
            # if too many Unnamed columns, then we have to skip the first row
            c += 1
    if c > 8 :
        df = pd.read_excel(main_fname, sheet_name = year, header = 1)

    last_record_idx = find_last_valid_record(df) # keep only rows containing year data
    df = df.iloc[:last_record_idx]

    df = df[df.columns & colnames].copy()

    if 'Prize/Donation details' in df.columns:
        df['Donation details'] = df['Prize/Donation details']
    else:
        df['Donation details'] = df['Sponsorship level']
    
    df = df.drop(columns = ['Sponsorship level'	,'Prize/Donation details'], errors = 'ignore')
    df = df.rename(columns = {'Actual \n$ Amount':'$ Amount'})
    df = df.reset_index(drop=True)

    return df


In [4]:
fname = "../Data/corporate sponsors 2022.xlsx"
#df = pd.read_excel(fname, sheet_name = '2010')

colnames = [ 'Added/   Modified', 	'Company Name',	'Sponsorship level'	,
             '$ Amount', 'Actual \n$ Amount',	'Money/Prize Received',  'Prize/Donation details' ]

years = [str(i) for i in range(2010,2023,1)]

df_concat = process_year_donations(fname, years[0], colnames)[:0] # we just take the structure

for y in years:
    df_ = process_year_donations(fname, y, colnames)
    print(y , df_.shape )
    df_['Year'] = y 
    df_concat = pd.concat( [df_concat, df_])

df_concat = df_concat.reset_index(drop=True)

2010 (80, 5)
2011 (86, 5)
2012 (84, 5)
2013 (93, 5)
2014 (140, 5)
2015 (149, 5)
2016 (133, 5)
2017 (103, 5)
2018 (77, 5)
2019 (56, 5)
2020 (29, 5)
2021 (33, 5)
2022 (15, 5)


## Data Cleansing

### Cleaning Column Dates

In [5]:
cols = df_concat.columns.to_list()
new_cols=[]
for col in cols:
    
    col = re.sub(' +', ' ', col)
    col = col.replace(' ','_')
    col = col.replace('/','')
    col = col.replace('$','dollar')
    col = col.lower()
    new_cols.append(col)

df_concat.columns = new_cols

### Date

In [6]:
df_concat.added_modified = df_concat.added_modified.astype(str)

In [7]:
# we find the dates that have a standard format
date_format_flg = (df_concat.added_modified.str[4:5] == '-') & (df_concat.added_modified.str[7:8] == '-')

In [8]:
#2010 dates need reformatting as they don't include the year
df_concat['2010_day'] = df_concat[date_format_flg & (df_concat.year=='2010')].added_modified.str[2:4]
df_concat['2010_month'] = df_concat[date_format_flg & (df_concat.year=='2010')].added_modified.str[5:7]

df_concat.loc[date_format_flg & (df_concat.year=='2010'), 'added_modified'] = \
df_concat[date_format_flg & (df_concat.year=='2010')]['year'] + '-'\
+ df_concat[date_format_flg & (df_concat.year=='2010')]['2010_month'] + '-' \
+ df_concat[date_format_flg & (df_concat.year=='2010')]['2010_day'] 

df_concat = df_concat.drop(columns= [ '2010_day', '2010_month'])

df_concat['added_modified'] = pd.to_datetime(df_concat.added_modified, errors = 'coerce')


In [9]:
# clean some rows that are all empty
df_concat = df_concat [ ~df_concat[['company_name', 'dollar_amount','moneyprize_received','donation_details']].isna().all(1) ].copy()

### Dollar amount

In [10]:
df_concat['dollar_amount_float'] = pd.to_numeric(df_concat['dollar_amount'], errors='coerce')

In [11]:
n_coerced = df_concat.loc[df_concat.dollar_amount_float != df_concat.dollar_amount, ['dollar_amount_float','dollar_amount']].shape[0]
print(n_coerced, 'rows were coerced as Null for not having convertable values')

df_concat.dollar_amount_float = df_concat.dollar_amount_float.fillna(0)

55 rows were coerced as Null for not having convertable values


### Company name

In [12]:
df_companies = df_concat.company_name.value_counts().reset_index().rename(columns = {'index':'company', 'company_name':'q_rows'})

In [13]:
df_companies['company'] = df_companies['company'].astype(str)

In [15]:
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer,PorterStemmer
from nltk.corpus import stopwords
import unidecode

stopwords = stopwords.words('french') + stopwords.words('english') + ['/' ,'-', 'pour', 'ou', 'du', 'la', 'ou', 'de',
         'avec', 'par', 'depuis','bien','a', 'd\'un', '']

def preprocess(sentence):
    sentence=str(sentence)
    sentence = sentence.upper()
    sentence=sentence.replace('{html}',"") 
    cleanr = re.compile('<.*?>')
    cleantext = re.sub(cleanr, '', sentence)
    cleantext = unidecode.unidecode(cleantext)
    rem_url=re.sub(r'http\S+', '',cleantext)
    rem_num = re.sub('[0-9]+', '', rem_url)
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(rem_num)  
    filtered_words = [w for w in tokens if len(w) > 0 if not w in stopwords] #french stopwords
    return " ".join(filtered_words)
    

In [16]:
df_companies['company_v2']= df_companies['company'].apply(lambda x: preprocess(x))


In [17]:
df_companies = df_companies[df_companies.company_v2 != '']

In [19]:
# read CSV with company name pairs that should be unified

similar_companies = pd.read_csv('../Data/similar_companies.csv', encoding = 'utf-8')
similar_companies = similar_companies[ similar_companies.iloc[:,2] == 1 ].iloc[:,:2]
similar_companies['lens'] =  similar_companies.apply(lambda x:  [len(x[0]) , len(x[1]) ] , axis = 1)

similar_companies['id_shortest_name'] = similar_companies.lens.apply(lambda x: x.index(min(x))) # choose which name to keep as main

similar_companies['shortest_name'] = \
    similar_companies.apply(lambda x:  x[0] if x['id_shortest_name'] == 0 else x[1], axis=1)

similar_companies = similar_companies.iloc[:,[0,1,4]]
similar_companies.columns = ['name_a', 'name_b', 'shortest_name']

In [20]:
# propagate the main name back to the main df

df_companies = df_companies.merge(similar_companies[['name_a','shortest_name']],
         how='left',
         left_on = 'company_v2', 
         right_on='name_a')

df_companies = df_companies.merge(similar_companies[['name_b','shortest_name']],
         how='left',
         left_on = 'company_v2', 
         right_on='name_b')

In [21]:
df_companies['company_v3'] = df_companies['company_v2'].copy()

df_companies.loc[ df_companies.shortest_name_x.notna(), 'company_v3'] =  df_companies.shortest_name_x

df_companies.loc[ df_companies.shortest_name_y.notna(), 'company_v3'] =  df_companies.shortest_name_y

df_companies = df_companies[['company', 'company_v3']].drop_duplicates()

In [22]:
# add the company names back to the main df_concat

df_concat = df_concat.merge(df_companies, how = 'left', left_on='company_name', right_on = 'company')

In [23]:
df = df_concat[['company_v3', 'added_modified', 'dollar_amount_float','moneyprize_received', 'donation_details', 'year']]

df = df.rename(columns = {'company_v3': 'company'})

### Donation Details

In [24]:
df.donation_details.value_counts().head(20)

$200 1/4 page ad                                58
business card                                   53
$500 donation                                   44
$100 business card                              34
business card ad                                33
Corporate team                                  31
$1000 donation                                  22
$300 1/2 page ad                                21
1/4 page ad                                     19
Silver                                          18
K4K Friend                                      15
1/2 page ad                                     14
$100 Busines card                               11
Thank you page                                   9
$500 K4K friend                                  8
reduced accounting fees                          8
Roundtrip for two to any WestJet destination     8
Longtime sponsor                                 7
$500 Silver Sponsor                              7
thank you sponsor              

In [25]:
df['don_detail_amount'] = df['donation_details'].str.extract('(\$[0-9,.]+)')
df['don_detail_amount'] = pd.to_numeric(df['don_detail_amount'].str.replace('$', '').str.replace(',', ''))

# lets create a column that joins the amount information from donation or equivalents
df['dollar_equivalent_amount'] = df.dollar_amount_float
df.loc[ (df.dollar_amount_float == 0) & (df.don_detail_amount > 0)  ,['dollar_equivalent_amount'] ] = df.don_detail_amount

# df[['dollar_equivalent_amount', 'dollar_amount_float']].describe() we see the new column has more info


In [26]:
# clean donation detail of any amounts, just keep the description in a new col
df['don_detail_txt']=df['donation_details'].str.replace('(\$[0-9,.]+)','', n=1)
df['don_detail_txt'].replace(r'\s+',' ', regex=True, inplace=True)
df['don_detail_txt'] = df['don_detail_txt'].str.lower()
df['don_detail_txt'] = df['don_detail_txt'].str.strip()

### Save CSV

In [45]:
df.to_csv('past_donors_clean.csv', index=None)
df.to_pickle('past_donors_clean.pickle')