## Test the various functions of the acct flow process
### codebase: account_flows.ipynb & acctFlow_functions.py

In [2]:
import os
import sys

sys.path.append('/home/mofongo/Documents/ghfc/membershipReportsCIVI/greeneHill/acctFlows')

import acctFlow_functions

In [3]:
import pandas as pd
import numpy as np
import unittest
import re
import datetime
#edit to the path of the `container_credentials` module
sys.path.append(('/home/mofongo/Documents/ghfc/membershipReportsCIVI/greeneHill'))
from container_credentials import return_credentials
from sqlalchemy import create_engine, Table, text, MetaData # a CORE approach
from sqlalchemy.sql import select
from sqlalchemy.sql import func

In [4]:
twentythree = [acctFlow_functions.last_day_of_month(datetime.date(2023, month, 7)) for month in range(1, 13)]
twentyfour = [acctFlow_functions.last_day_of_month(datetime.date(2024, month, 7)) for month in range(1, 13)] 
twentyfive = [acctFlow_functions.last_day_of_month(datetime.date(2025, month, 7)) for month in range(1, 13)] 

month_end_df = pd.DataFrame({'twentyfour':twentyfour,'year':2024})

#make a replacement column for null values (expect it will only occur above the January record)
month_end_df['prev_yr']=month_end_df.apply(lambda x: str(x['year']-1)+'-12-31', axis = 1)

month_end_shifted = month_end_df.shift(periods=[0,1],axis = 0)

#BE SURE TO CHANGE THE FIELD NAMES TO REFLECT THE MEASURE YEAR
month_end_shifted['twentyfour_1'] = month_end_shifted['twentyfour_1'].fillna(month_end_shifted['prev_yr_0'])
month_end_shifted = month_end_shifted.iloc[:,:].astype(str)

# a list of tuples, ea of which provides the dates combo for the query
#BE SURE TO CHANGE THE FIELD NAMES TO REFLECT THE MEASURE YEAR
calendar_range_iter = [tuple(i.values()) for i in month_end_shifted[['twentyfour_1','twentyfour_0']].to_dict('records')]

In [9]:
# DEFINE THE DATABASE CREDENTIALS

cred_dict = return_credentials()

user = cred_dict['user'] 
password = cred_dict['pass'] 
host = cred_dict['host'] 
port = cred_dict['port'] 
database = cred_dict['database']


def get_connection():
	return create_engine(
		url="mysql+pymysql://{0}:{1}@{2}:{3}/{4}".format(
			user, password, host, port, database
		)
	)

if __name__ == '__main__':

	try:
	
		# GET THE CONNECTION OBJECT (ENGINE) FOR THE DATABASE
		# working w/engines: https://docs.sqlalchemy.org/en/20/core/engines_connections.html
		engine = get_connection() #engine should be created just once, and can manage several DBAPI connections
		print(
			f"Connection to the {host} for user {user} created successfully.")
	except Exception as ex:
		print("Connection could not be made due to the following error: \n", ex)

Connection to the 172.17.0.2 for user root created successfully.


In [11]:
#this query simply returns two snapshots: period 1 AND period 2 activity types, along with freq; ex. curr_activity_calc = 'general leave' prev_activity_calc = 'initial enrollment'
def fill_query(tup_dates:tuple):
    return "WITH curr AS (select mt_email curr_mt_email, mem_type curr_mem_type, activity_calc curr_activity_calc, activity curr_activity from stack_job2 WHERE date('"+tup_dates[1]+"') between start_dt AND lead_date ORDER BY mt_email), prev AS (select mt_email prev_mt_email, mem_type prev_mem_type, activity_calc prev_activity_calc, activity prev_activity from stack_job2 WHERE date('"+tup_dates[0]+"') between start_dt AND lead_date ORDER BY mt_email), final_tbl AS (SELECT date('"+tup_dates[1]+"') current_month, curr_activity_calc, prev_activity_calc, count(distinct curr_mt_email) unq_email FROM curr LEFT JOIN prev ON curr_mt_email = prev_mt_email GROUP BY 1,2,3) SELECT * FROM final_tbl"

In [15]:
# a dictionary of the SQL QUERY STRING (the PREPARED STATEMENT), with key = representative month
#dict: key = latest date of the month tuple; value = sql query text
query_cont = {}
for min_tup in calendar_range_iter:
    query_cont[min_tup[1]]=fill_query(min_tup)

In [18]:
#for the TEST VERSION: extract ONE result set, and not the entire calendar year as done in the prod version
#build a dictionary of the result sets, setting the key to the month (starting w/Jan)
#value = a dataframe of a freq table of the combos of curr & prev activity_calc field values; fields 'current_month', 'curr_activity_calc', 'prev_activity_calc', 'unq_email'
with engine.connect() as conn:
    #for k,query in query_cont.items():
    end_of_june = pd.read_sql(query_cont['2024-06-30'],conn)
    end_of_may = pd.read_sql(query_cont['2024-05-31'],conn)

In [20]:
#ONE SUCH RRESULTSET
end_of_may.head()

Unnamed: 0,current_month,curr_activity_calc,prev_activity_calc,unq_email
0,2024-05-31,cancelled,,1
1,2024-05-31,cancelled,cancelled,289
2,2024-05-31,cancelled,initial enrollment,2
3,2024-05-31,cancelled,technical activation,1
4,2024-05-31,care giving leave,care giving leave,1


In [22]:
acctFlow_functions.package_scores_vector(end_of_may)

combo
(cancelled, None)                                      1
(cancelled, cancelled)                               289
(cancelled, initial enrollment)                        2
(cancelled, technical activation)                      1
(care giving leave, care giving leave)                 1
(deactivate, deactivate)                            2007
(deactivated, deactivated)                           156
(deactivated, initial enrollment)                      5
(general leave, general leave)                       260
(general leave, initial enrollment)                    3
(general leave, technical reactivation)                1
(general leave, winback)                               3
(initial enrollment, None)                            11
(initial enrollment, initial enrollment)             245
(initial enrollment, winback)                          1
(medical leave, medical leave)                         9
(parental leave, parental leave)                       2
(suspended, suspended)   

In [None]:
#building the Test Class
'''
1) ensure that package_scores_vector function has length equal to the length of the month's result set from the db
'''

In [24]:
#best practice is to test one function at a time
class TestPackageScoresVector(unittest.TestCase):
    def setUp(self):
        
    def test_firstTest(self):
        #test something
        df_shape = end_of_may.shape
        vec_shape = acctFlow_functions.package_scores_vector(end_of_may).shape
        self.assertEqual(df_shape[0],vec_shape[0])

In [25]:
if __name__ == '__main__':
    unittest.main(argv=[''], verbosity=2, exit=False)

test_firstTest (__main__.TestPackageScoresVector.test_firstTest) ... ok

----------------------------------------------------------------------
Ran 1 test in 0.004s

OK


## Testing import_treat_cat_df
Ensure that matching between this categorical DF is effective against the scores series vector, esp in cases of NULL

In [5]:
cat_df = acctFlow_functions.import_treat_cat_df(("statusCombinationMatrix_acctFlows.ods"))
test_df = cat_df.apply(sum,axis=1)
test_df[test_df==0]

tuple_index
(cancelled, cancelled)          0.0
(cancelled, None)               0.0
(deactivate, deactivate)        0.0
(deactivated, deactivated)      0.0
(suspended, suspended)          0.0
(cancelled, deactivated)        0.0
(deactivated, medical leave)    0.0
(cancelled, medical leave)      0.0
(deactivated, suspended)        0.0
(deactivated, None)             0.0
dtype: float64

Ensure that there are no duplicates in the matrix

In [8]:
from collections import Counter

# Count occurrences of each tuple
counts = Counter(list(cat_df.index))

# Extract duplicates (those with a count > 1)
duplicates = [item for item, count in counts.items() if count > 1]

print(duplicates)

[]


## Testing sync_objects

In [151]:
cat_df = acctFlow_functions.import_treat_cat_df(("statusCombinationMatrix_acctFlows.ods"))
may = acctFlow_functions.package_scores_vector(end_of_may)
june = acctFlow_functions.package_scores_vector(end_of_june)
scores_series_vector_prod, cats_df2, outer = acctFlow_functions.sync_objects(june, cat_df)

In [152]:
print(f"the length of scores_series_vector_prod is {len(scores_series_vector_prod)}")
print(f'the length of june is {len(june)}')
print(f"The 'outer' variable is {outer}")

the length of scores_series_vector_prod is 25
the length of june is 25
The 'outer' variable is []


In [108]:
#list(may.index)[list(may.index) not in list(scores_series_vector_prod.index)]
list(june.index)[list(june.index) not in list(scores_series_vector_prod.index)]

('cancelled', 'initial enrollment')

In [139]:
from itertools import compress

bool_list = [item not in list(june.index) for item in list(cat_df.index)]
list(compress(list(june.index), bool_list))

#print(sum(bool_list))

[('cancelled', 'winback'),
 ('deactivate', 'deactivate'),
 ('initial enrollment', 'initial enrollment'),
 ('initial enrollment', 'winback'),
 ('medical leave', 'initial enrollment'),
 ('suspended', 'initial enrollment'),
 ('technical reactivation', 'suspended'),
 ('technical reactivation', 'technical reactivation'),
 ('technical reactivation', 'winback'),
 ('winback', 'initial enrollment'),
 ('winback', 'winback')]

In [131]:
len(scores_series_vector_prod.index)

27

In [132]:
('winback', 'suspended') in list(cat_df.index)

True

In [106]:
#these lengths NEED TO EQUAL
if len(june) != len(list(set(list(scores_series_vector_prod.index)) & set(list(june.index)))):
    print(len(june))
    print((len(scores_series_vector_prod.index)))

25
27


## Test apply_multiply

In [154]:
cat_df = acctFlow_functions.import_treat_cat_df(("statusCombinationMatrix_acctFlows.ods"))
may = acctFlow_functions.package_scores_vector(end_of_may)
june = acctFlow_functions.package_scores_vector(end_of_june)

In [156]:
#inherently this is April to May
scores_series_vector_prod, cats_df2, outer = acctFlow_functions.sync_objects(may, cat_df)

monthlies_dict=acctFlow_functions.apply_multiply(scores_series_vector_prod, cats_df2)

monthlies_dict

leave                   279.0
active                  310.0
winback                  41.0
new_cancel/suspended      9.0
trial_conversion          0.0
new_signup               12.0
new_leave                 7.0
new_winback               2.0
re-activation             0.0
dtype: float64

In [158]:
#inherently May to June
scores_series_vector_prod, cats_df2, outer = acctFlow_functions.sync_objects(june, cat_df)

monthlies_dict=acctFlow_functions.apply_multiply(scores_series_vector_prod, cats_df2)

monthlies_dict

leave                   287.0
active                  308.0
winback                  40.0
new_cancel/suspended     10.0
trial_conversion          0.0
new_signup               15.0
new_leave                 8.0
new_winback               1.0
re-activation             0.0
dtype: float64

In [93]:
class TestObjectImportTreatCatDF(unittest.TestCase):
    #pausing this test... although useful, I discover that some rows should indeed by equal to 0
    @unittest.skip('not necessary that the rows lengths of the two vectors be equal')
    def test_non_zero_rows(self):
        #test that ea row is non-zero
        cat_df = acctFlow_functions.import_treat_cat_df(("statusCombinationMatrix_acctFlows.ods"))
        cat_df.apply(sum,axis=1)
        self.assertEqual(df_shape[0],vec_shape[0])
        return None
    
    def test_proper_subsetting(self):
        #test the membership of the index from a sample month be contained in the index for scores_series_vector_prod
        cat_df = acctFlow_functions.import_treat_cat_df(("statusCombinationMatrix_acctFlows.ods"))
        may = acctFlow_functions.package_scores_vector(end_of_may)
        june = acctFlow_functions.package_scores_vector(end_of_june)
        scores_series_vector_prod, cats_df2, outer = acctFlow_functions.sync_objects(may, cat_df)
        self.assertIn(list(may.index), list(scores_series_vector_prod.index))
        #self.assertEqual(cats_df2,cat_df)

In [94]:
if __name__ == '__main__':
    unittest.main(argv=[''], verbosity=2, exit=False)

test_non_zero_rows (__main__.TestObjectImportTreatCatDF.test_non_zero_rows) ... skipped 'not necessary that the rows lengths of the two vectors be equal'
test_proper_subsetting (__main__.TestObjectImportTreatCatDF.test_proper_subsetting) ... FAIL
test_firstTest (__main__.TestPackageScoresVector.test_firstTest) ... ok

FAIL: test_proper_subsetting (__main__.TestObjectImportTreatCatDF.test_proper_subsetting)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "/tmp/ipykernel_38387/2478193819.py", line 17, in test_proper_subsetting
    self.assertIn(list(may.index), list(scores_series_vector_prod.index))
AssertionError: [('cancelled', 'None'), ('cancelled', 'cancelled'), ('cancelled', 'initial enrollment'), ('cancelled', 'technical activation'), ('care giving leave', 'care giving leave'), ('deactivate', 'deactivate'), ('deactivated', 'deactivated'), ('deactivated', 'initial enrollment'), ('general leave', 'general leave'), ('gen