In [1]:
# The code will take about 1-2 minutes to compute the association rules

In [2]:
# Import the necessary libraries

In [3]:
import pandas as pd 
import numpy as np
from itertools import combinations

In [4]:
#    Generator function of all combinations based on the last state of Apriori algorithm
#    Parameters
#    -----------
#    old_combinations: np.array
#        All combinations with enough support in the last step
#        Combinations are represented by a matrix.
#        Number of columns is equal to the combination size
#        of the previous step.
#        Each row represents one combination
#        and contains item type ids in the ascending order
#        
#    Returns
#    -----------
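#    Generator over the flattened item ids of all candidate combinations:
#    each combination from the last step extended by one item (seen in the
#    last step) whose id is greater than the combination's last item.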

In [5]:
def generate_new_combinations(old_combinations):
    """Yield the item ids of all next-size candidate combinations, flattened.

    Every row of `old_combinations` is extended by one additional item.
    The stream is flat: for each candidate, the ids of the old combination
    are yielded first, followed by the id of the appended item, so the
    caller has to reshape the result into rows of the new combination size.

    Parameters
    -----------
    old_combinations : np.array
        Matrix with one combination per row; rows are expected to hold
        item ids in ascending order.

    Returns
    -----------
    Generator of item ids (flattened candidate combinations).
    """
    known_items = np.unique(old_combinations.flatten())
    for combo in old_combinations:
        prefix = tuple(combo)
        # Only items greater than the row's last id are appended, which keeps
        # candidates ascending and avoids emitting the same set twice.
        larger_items = known_items[known_items > combo[-1]]
        for extra in larger_items:
            yield from prefix
            yield extra

In [6]:
#    Function to get frequent itemsets from a one-hot DataFrame
#    Parameters
#    -----------
#    df : pandas DataFrame
#       The DataFrame in a one-hot encoded format 
#       from which frequent itemsets will be mined
#      
#    min_support : float
#       A float between 0 and 1 for minimum support of the itemsets returned.
#       The support is computed as the fraction
#       `transactions_where_item(s)_occur / total_transactions`.
#
#    use_colnames : bool (default: False)
#       If `True`, uses the DataFrame's column names in the returned DataFrame
#       instead of column indices.
#
#    Returns
#    -----------
#    pandas DataFrame with columns ['support', 'itemsets'] of all itemsets
#      that are >= `min_support`
#      Each itemset in the 'itemsets' column is of type `frozenset`,
#      which is a Python built-in type that behaves similarly to
#      sets except that it is immutable

In [7]:
def apriori(df, min_support, use_colnames=False):
    """Mine frequent itemsets from a one-hot encoded DataFrame.

    Parameters
    -----------
    df : pandas DataFrame
        One-hot encoded transactions: one row per transaction, one column
        per item, truthy where the item occurs.
    min_support : float
        Minimum support of the returned itemsets, computed as
        `transactions_where_item(s)_occur / total_transactions`.
        Must be greater than 0.
    use_colnames : bool (default: False)
        If `True`, itemsets contain the column names of `df` instead of
        column indices.

    Returns
    -----------
    pandas DataFrame with columns ['support', 'itemsets'] holding every
    itemset whose support is >= `min_support`. Each entry of 'itemsets'
    is a `frozenset`.

    Raises
    -----------
    ValueError
        If `min_support` is not greater than 0.
    """
    # With a non-positive threshold every candidate passes, so the level-wise
    # search below would enumerate every possible itemset.
    if min_support <= 0.:
        raise ValueError('`min_support` must be a positive '
                         'number within the interval `(0, 1]`. '
                         'Got %s.' % min_support)

    def _support(_x, _n_rows):
        # Sum over axis 0 divided by the row count: one support value per
        # column of `_x`.
        out = (np.sum(_x, axis=0) / _n_rows)
        return np.array(out)

    X = df.values
    rows_count = float(X.shape[0])

    # Level 1: support of every single column.
    support = _support(X, rows_count)
    ary_col_idx = np.arange(X.shape[1])
    frequent_singletons = support >= min_support
    support_dict = {1: support[frequent_singletons]}
    itemset_dict = {1: ary_col_idx[frequent_singletons].reshape(-1, 1)}
    max_itemset = 1

    # Level-wise search: grow itemsets by one item until no candidate is left
    # or none of the candidates reaches `min_support`.
    while True:
        next_max_itemset = max_itemset + 1

        # The generator yields a flat stream of ids; reshape it into one
        # candidate per row.
        combin = generate_new_combinations(itemset_dict[max_itemset])
        combin = np.fromiter(combin, dtype=int)
        combin = combin.reshape(-1, next_max_itemset)

        if combin.size == 0:
            break

        # X[:, combin] has shape (rows, candidates, itemset size); a candidate
        # occurs in a transaction when all of its items are set.
        _bools = np.all(X[:, combin], axis=2)

        support = _support(np.array(_bools), rows_count)
        _mask = (support >= min_support).reshape(-1)
        if not _mask.any():
            # Exit condition: no candidate of this size is frequent
            break

        itemset_dict[next_max_itemset] = np.array(combin[_mask])
        support_dict[next_max_itemset] = np.array(support[_mask])
        max_itemset = next_max_itemset

    # Assemble one (support, itemsets) frame per itemset size.
    all_res = []
    for k in sorted(itemset_dict):
        support = pd.Series(support_dict[k])
        itemsets = pd.Series([frozenset(i) for i in itemset_dict[k]],
                             dtype='object')

        res = pd.concat((support, itemsets), axis=1)
        all_res.append(res)

    res_df = pd.concat(all_res)
    res_df.columns = ['support', 'itemsets']
    if use_colnames:
        # Translate column indices back to the column labels of `df`.
        mapping = {idx: item for idx, item in enumerate(df.columns)}
        res_df['itemsets'] = res_df['itemsets'].apply(lambda x: frozenset([
                                                      mapping[i] for i in x]))
    res_df = res_df.reset_index(drop=True)

    return res_df

In [8]:
#    Function to generate a DataFrame of association rules including the
#    metrics 'support', 'confidence', and 'lift'
#    Parameters
#    -----------
#    df : pandas DataFrame
#      pandas DataFrame of frequent itemsets
#      with columns ['support', 'itemsets']

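#    metric : string (default: 'confidence')
#      Metric used to evaluate whether a rule is of interest.
#      One of 'antecedent support', 'consequent support',
#      'support', 'confidence', 'lift'.
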
#    min_threshold : float (default: 0.8)
#      Minimal threshold for the evaluation metric,
#      via the `metric` parameter,
#      to decide whether a candidate rule is of interest.

#    Returns
#    ----------
#    pandas DataFrame with columns "antecedents" and "consequents"
#      that store itemsets, plus the scoring metric columns:
#      "antecedent support", "consequent support",
#      "support", "confidence", "lift",
#      of all rules for which
#      metric(rule) >= min_threshold.
#      Each entry in the "antecedents" and "consequents" columns is
#      of type `frozenset`, which is a Python built-in type that
#      behaves similarly to sets except that it is immutable

In [9]:
def association_rules(df, metric="confidence", min_threshold=0.8):
    """Generate association rules from a DataFrame of frequent itemsets.

    Parameters
    -----------
    df : pandas DataFrame
        Frequent itemsets with columns ['support', 'itemsets'].
    metric : string (default: 'confidence')
        Metric a candidate rule is scored with. One of
        'antecedent support', 'consequent support', 'support',
        'confidence', 'lift'.
    min_threshold : float (default: 0.8)
        A candidate rule is kept when metric(rule) >= min_threshold.

    Returns
    -----------
    pandas DataFrame with columns "antecedents" and "consequents"
    (both `frozenset`) plus "antecedent support", "consequent support",
    "support", "confidence" and "lift". The frame has these columns and no
    rows when no rule reaches the threshold, including when `df` is empty.

    Raises
    -----------
    KeyError
        If the support of an antecedent or consequent is missing from `df`,
        or if `metric` is not one of the supported names (raised when the
        first candidate rule is scored).
    """
    # Metrics for association rules; every entry takes the supports
    # (sAC, sA, sC) of the full itemset, the antecedent and the consequent.
    metric_dict = {
        "antecedent support": lambda _, sA, __: sA,
        "consequent support": lambda _, __, sC: sC,
        "support": lambda sAC, _, __: sAC,
        "confidence": lambda sAC, sA, _: sAC/sA,
        "lift": lambda sAC, sA, sC: metric_dict["confidence"](sAC, sA, sC)/sC,
        }

    columns_ordered = ["antecedent support", "consequent support",
                       "support",
                       "confidence", "lift",]

    # Dict of {frequent itemset} -> support. A plain comprehension is used
    # instead of np.vectorize, which raises on size-0 input when `otypes`
    # is not set and therefore failed on an empty `df`.
    frequent_items_dict = {
        frozenset(itemset): support
        for itemset, support in zip(df['itemsets'].values,
                                    df['support'].values)
    }

    # Buckets to collect the rules that pass the threshold
    rule_antecedents = []
    rule_consequents = []
    rule_supports = []

    # Iterate over all frequent itemsets
    for k in frequent_items_dict.keys():
        sAC = frequent_items_dict[k]
        # Every non-empty proper subset of k is tried as an antecedent,
        # largest subsets first; the remainder of k is the consequent.
        for idx in range(len(k)-1, 0, -1):
            for c in combinations(k, r=idx):
                antecedent = frozenset(c)
                consequent = k.difference(antecedent)

                try:
                    sA = frequent_items_dict[antecedent]
                    sC = frequent_items_dict[consequent]
                except KeyError as e:
                    s = (str(e) + 'You are likely getting this error'
                                  ' because the DataFrame is missing '
                                  ' antecedent and/or consequent '
                                  ' information.')
                    raise KeyError(s)

                # Check for the threshold
                score = metric_dict[metric](sAC, sA, sC)
                if score >= min_threshold:
                    rule_antecedents.append(antecedent)
                    rule_consequents.append(consequent)
                    rule_supports.append([sAC, sA, sC])

    # No rule generated: return an empty frame with the full column layout
    if not rule_supports:
        return pd.DataFrame(
            columns=["antecedents", "consequents"] + columns_ordered)

    # Transpose so that row 0/1/2 hold sAC/sA/sC of all rules, then compute
    # every metric column in one vectorised call.
    rule_supports = np.array(rule_supports).T.astype(float)
    df_res = pd.DataFrame(
        data=list(zip(rule_antecedents, rule_consequents)),
        columns=["antecedents", "consequents"])

    sAC = rule_supports[0]
    sA = rule_supports[1]
    sC = rule_supports[2]
    for m in columns_ordered:
        df_res[m] = metric_dict[m](sAC, sA, sC)

    return df_res


In [10]:
# Importing the dataset of all the 12 months and concatenating them into one DataFrame

In [11]:
# Month folders in chronological order (April 2018 - March 2019).
MONTHS = [
    'April 2018', 'May 2018', 'June 2018', 'July 2018',
    'August 2018', 'September 2018', 'October 2018', 'November 2018',
    'December 2018', 'January 2019', 'February 2019', 'March 2019',
]

# Each month is stored as
# ../../data/<Month Year>/Cleaned_Data_Set_<Month>_<Year>.csv
monthly_frames = [
    pd.read_csv('../../data/{0}/Cleaned_Data_Set_{1}.csv'.format(
        month, month.replace(' ', '_')))
    for month in MONTHS
]

# A single concat replaces the repeated DataFrame.append calls
# (DataFrame.append was removed in pandas 2.0) and avoids re-copying the
# accumulated frame once per month.
df = pd.concat(monthly_frames, ignore_index=True)

In [12]:
# Dropping the columns which provide no practical use to the process (Latitude, Longitude)
# and creating a list of all unique items in the remaining columns

In [13]:
# errors="ignore" keeps this cell re-runnable: on a second run the two
# columns are already gone and a plain drop would raise KeyError.
df = df.drop(columns=["Latitude", "Longitude"], errors="ignore")
df = df.rename(columns={"Network Type": "Network", "State Name": "State", "In Out Travelling": "In_Out_Travelling", "Call Drop Category": "Call_Drop_Category" })

# Columns whose distinct values become the items of the transactions.
ITEM_COLUMNS = ["Operator", "Network", "Rating", "State",
                "Call_Drop_Category", "In_Out_Travelling"]

# Distinct values of every item column, concatenated in column order.
items = np.concatenate([df[col].unique() for col in ITEM_COLUMNS])

In [14]:
# Converting the DataFrame into the One-Hot Encoded format

In [None]:
# Distinct items in first-seen order, so the one-hot column order is the same
# on every run instead of following set iteration order.
unique_items = list(dict.fromkeys(items))
# Built once: the item set does not change from row to row.
item_set = set(unique_items)

encoded_vals = []
for _, row in df.iterrows():
    # Items that occur among this row's values
    present = item_set.intersection(row)
    encoded_vals.append({item: int(item in present) for item in unique_items})

ohe_df = pd.DataFrame(encoded_vals)

In [None]:
# Finding the frequent itemsets with minimum support = 0.04

In [None]:
# Keep itemsets that occur in at least 4% of the transactions and report them
# by column name rather than by column index.
freq_items = apriori(
    df=ohe_df,
    min_support=0.04,
    use_colnames=True,
)

In [None]:
# Mining the association rules with minimum confidence = 0.8 and lift > 1.9

In [None]:
# Candidate rules: confidence of at least 0.8.
rules = association_rules(
    df=freq_items,
    metric="confidence",
    min_threshold=0.8,
)
# Final rules: additionally require a lift strictly above 1.9.
rules_final = rules.loc[rules["lift"].gt(1.9)]

In [None]:
# Writing the final results into a text file

In [None]:
# Write every final rule as a numbered entry:
#   <n>. [antecedents] -> [consequents]
#   Support : ... Confidence : ... Lift : ...
# The `with` block closes the file even if writing a rule fails.
with open("Association Rule Mining Result.txt", "w") as out_file:
    for rule_no, ind in enumerate(rules_final.index, start=1):
        antecedents = list(rules_final.loc[ind, 'antecedents'])
        consequents = list(rules_final.loc[ind, 'consequents'])

        # Render each itemset as "[a, b, c]"
        lhs = '[' + ', '.join(str(ele) for ele in antecedents) + ']'
        rhs = '[' + ', '.join(str(ele) for ele in consequents) + ']'

        supp = str(rules_final.loc[ind, 'support'])
        conf = str(rules_final.loc[ind, 'confidence'])
        lift = str(rules_final.loc[ind, 'lift'])

        entry = (str(rule_no) + '. ' + lhs + ' -> ' + rhs + '\n'
                 + 'Support : ' + supp + ' Confidence : ' + conf
                 + ' Lift : ' + lift + '\n')
        # The extra newline leaves a blank line between entries
        out_file.write('%s\n' % entry)
        print(antecedents, '->', consequents)