In [2]:
import pandas as pd
# load the bank transaction dataset
df = pd.read_csv('D1.csv')
# info and the first 10 transactions
print(df.info())
print(df.head(10))


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1544 entries, 0 to 1543
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   patient_id  1544 non-null   object 
 1   global_num  975 non-null    float64
 2   date        1544 non-null   object 
 3   location    1544 non-null   object 
 4   latitude    1544 non-null   float64
 5   longitude   1544 non-null   float64
dtypes: float64(3), object(3)
memory usage: 72.5+ KB
None
    patient_id  global_num        date               location   latitude  \
0  P1000000501         2.0  22/04/2020  Chittorgarh_Rajasthan  24.879999   
1  P1000000501         2.0  24/04/2020  Ratnagiri_Maharashtra  16.994444   
2  P1000000502         5.0  26/04/2020     Pindwara_Rajasthan  24.794500   
3  P1000000502         5.0  27/04/2020    Raipur_Chhattisgarh  21.250000   
4  P1000000502         5.0  28/04/2020        Gokak_Karnataka  16.166700   
5  P1000000504         7.0  30/04/2020  Lucknow_Utta

In [3]:
# group by account, then list all services
transactions = df.groupby(['patient_id'])['location'].apply(list)
sequences = transactions.values.tolist()
#show the first 5 sequences
print(sequences[:5])


[['Chittorgarh_Rajasthan', 'Ratnagiri_Maharashtra'], ['Pindwara_Rajasthan', 'Raipur_Chhattisgarh', 'Gokak_Karnataka'], ['Lucknow_Uttar Pradesh'], ['Lucknow_Uttar Pradesh'], ['Delhi_Delhi']]


In [4]:
from collections import defaultdict
import subprocess
import re


def get_association_rules(sequences, min_sup, min_conf):
    # step 1: create required input for SPMF

    # prepare a dict to uniquely assign each item in the transactions to an int ID
    item_dict = defaultdict(int)
    output_dict = defaultdict(str)
    item_id = 1

    # write your sequences in SPMF format
    with open('seq_rule_input.txt', 'w+') as f:
        for sequence in sequences:
            z = []
            for itemset in sequence:
                # if there are multiple items in one itemset
                if isinstance(itemset, list):
                    for item in itemset:
                        if item not in item_dict:
                            item_dict[item] = item_id
                            item_id += 1

                        z.append(item_dict[item])
                else:
                    if itemset not in item_dict:
                        item_dict[itemset] = item_id
                        output_dict[str(item_id)] = itemset
                        item_id += 1
                    z.append(item_dict[itemset])

                # end of itemset
                z.append(-1)

            # end of a sequence
            z.append(-2)
            f.write(' '.join([str(x) for x in z]))
            f.write('\n')

    # run SPMF with supplied parameters
    supp_param = '{}%'.format(int(min_sup * 100))
    conf_param = '{}%'.format(int(min_conf * 100))
    subprocess.call(['java', '-jar', 'spmf.jar', 'run', 'RuleGrowth',
                     'seq_rule_input.txt', 'seq_rule_output.txt',
                     supp_param, conf_param], shell=True)

    # read back the output rules
    outputs = open('seq_rule_output.txt', 'r').read().strip().split('\n')
    output_rules = []
    for rule in outputs:
        left, right, sup, conf = re.search(
            pattern=r'([0-9\,]+) ==> ([0-9\,]+) #SUP: ([0-9]+) #CONF: ([0-9\.]+)', string=rule).groups()
        sup = int(sup) / len(sequences)
        conf = float(conf)
        output_rules.append([[output_dict[x] for x in left.split(',')], [
                            output_dict[x] for x in right.split(',')], sup, conf])

    # return pandas DataFrame
    return pd.DataFrame(output_rules, columns=['Left_rule', 'Right_rule', 'Support', 'Confidence'])


In [5]:
get_association_rules(sequences, 0.01, 0.1)

Usage: java [options] <mainclass> [args...]
           (to execute a class)
   or  java [options] -jar <jarfile> [args...]
           (to execute a jar file)
   or  java [options] -m <module>[/<mainclass>] [args...]
       java [options] --module <module>[/<mainclass>] [args...]
           (to execute the main class in a module)
   or  java [options] <sourcefile> [args]
           (to execute a single source-file program)

 Arguments following the main class, source file, -jar <jarfile>,
 -m or --module <module>/<mainclass> are passed as the arguments to
 main class.

 where options include:

    -cp <class search path of directories and zip/jar files>
    -classpath <class search path of directories and zip/jar files>
    --class-path <class search path of directories and zip/jar files>
                  A : separated list of directories, JAR archives,
                  and ZIP archives to search for class files.
    -p <module path>
    --module-path <module path>...
                

Unnamed: 0,Left_rule,Right_rule,Support,Confidence
0,[Chittorgarh_Rajasthan],[Ratnagiri_Maharashtra],4.751921,0.63151
1,[Chittorgarh_Rajasthan],"[Ratnagiri_Maharashtra, Pindwara_Rajasthan]",2.180022,0.289716
2,[Chittorgarh_Rajasthan],"[Ratnagiri_Maharashtra, Mumbai_Maharashtra]",1.250274,0.166156
3,[Chittorgarh_Rajasthan],"[Ratnagiri_Maharashtra, Jalpaiguri_West Bengal]",0.978046,0.129978
4,[Chittorgarh_Rajasthan],[Pindwara_Rajasthan],3.174533,0.421882
5,"[Chittorgarh_Rajasthan, Ratnagiri_Maharashtra]",[Pindwara_Rajasthan],2.180022,0.458766
6,[Chittorgarh_Rajasthan],[Raipur_Chhattisgarh],1.36663,0.181619
7,[Chittorgarh_Rajasthan],[Delhi_Delhi],0.991218,0.131729
8,[Chittorgarh_Rajasthan],[Mumbai_Maharashtra],1.840834,0.244639
9,"[Chittorgarh_Rajasthan, Ratnagiri_Maharashtra]",[Mumbai_Maharashtra],1.250274,0.263109
