## MODELING

In [1]:
import os
import sys
import numpy
import pandas
import matplotlib.pyplot as plt
sys.path.append(os.path.abspath("./util"))
from pymining import itemmining, assocrules
from fp_growth import find_frequent_itemsets as get_freq_itemset

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
plt.style.use('ggplot')
# Raise the pandas display limits (10000 rows, 100 columns) so wide summaries are not truncated.
pandas.set_option('display.max_rows', 10000, "display.max_columns", 100)

In [2]:
# Load the tab-separated file, using its first column as the index.
df = pandas.read_csv("./dataset/crimes_census_5poi_sampled100.csv", index_col=0, sep="\t")
# Displayed as (rows, columns).
df.shape

(2954, 21)

In [3]:
# Remove the columns that should not become transaction items.
# NOTE: this rebinds `df`, so re-running the cell without reloading the CSV
# raises a KeyError (the listed columns are already gone).
df = df.drop(["BLOCKID10", "Street_Nam", "ARHeteInx", "ARPerM1724", "ARPerHOwn", "ARPerSF", "ARPerRMI5L", "ARPer3MU"], axis=1)
# Optional: also drop the five point-of-interest columns.
# df = df.drop(["First_POI", "Second_POI", "Third_POI", "Fourth_POI", "Fifth_POI"], axis=1)
df.describe()

Unnamed: 0,First_POI,Second_POI,Third_POI,Fourth_POI,Fifth_POI,ARPCIncome,ARPerHEdu,ARPerWork,ARPopDen,ARPerAA,NIBRSclass,Place2,Report_Dat
count,2954,2954,2953,2954,2953,2954,2954,2954,2954,2954,2954,2954,2954
unique,1386,1424,1441,1513,1522,3,3,3,3,3,52,60,36
top,WEST CHARLOTTE HIGH SCHOOL,UNIVERSITY PARK BAPTIST CHURCH,LEREODELOS VALLES PENECOSTAL,CHARLOTTE FIRE DEPT,FOOD LION,inc-1/3,edu-2/3,empl-1/3,popden-3/3,afro-3/3,Drug/Narcotic Violations,Private Residence,09-2x
freq,51,51,50,60,52,1068,1163,1084,1085,1071,100,481,122


In [4]:
# One transaction per data-frame row: the list of that row's cell values.
transactions = [record.tolist() for _, record in df.iterrows()]


def to_tuple(trans_list):
    """Return the transactions as a tuple of tuples."""
    return tuple(tuple(trans) for trans in trans_list)


def percentage(x, tot=len(transactions)):
    """Return x as a fraction of tot, using float division.

    tot defaults to the number of transactions at the time this cell ran.
    """
    return (x * 1.0) / tot

def only_rules_with(columns, rules, only_cons=False, data=None):
    """Keep the rules that mention at least one value from every given column.

    Parameters
    ----------
    columns : iterable of str
        Column names of ``data``. A rule is kept only if its itemset shares
        at least one item with the values found in each of these columns.
    rules : iterable of tuple
        Rules whose element 0 is the antecedent set and element 1 the
        consequent set; any further elements are passed through untouched.
    only_cons : bool, optional
        When True only the consequent is inspected; otherwise the union of
        antecedent and consequent is inspected.
    data : pandas.DataFrame, optional
        Frame the column values are read from. Defaults to the notebook-level
        ``df``; pass another frame (e.g. the full dataset) when the rules
        were mined from different data.

    Returns
    -------
    list
        The matching rules, in their original order.
    """
    if data is None:
        data = df
    column_value_sets = [set(data[column].values) for column in columns]

    result = []
    for rule in rules:
        itemset = rule[1] if only_cons else rule[0] | rule[1]
        # isdisjoint() stops at the first shared item and all() stops at the
        # first column without a match, so no intermediate sets are built.
        if all(not itemset.isdisjoint(values) for values in column_value_sets):
            result.append(rule)
    return result

def print_that(filepath, rules):
    """Write every rule to ``filepath`` as text produced by rule_to_string.

    The file is opened in 'w+' mode, so any existing content is replaced.
    """
    with open(filepath, 'w+') as out:
        out.writelines(rule_to_string(rule) for rule in rules)
        
def rule_to_string(rule):
    """Format a rule as ``[antecedent]  ->  [consequent], supp: S, conf: C``.

    ``rule`` is indexed positionally: 0 = antecedent items, 1 = consequent
    items, 2 = support, 3 = confidence. The returned string ends with a
    newline.
    """
    antecedent = list(rule[0])
    consequent = list(rule[1])
    return "%s  ->  %s, supp: %s, conf: %s\n" % (antecedent, consequent, rule[2], rule[3])


## Frequent Itemset generation

In [5]:
# Minimum support handed to the itemset and rule miners below.
# Presumably an absolute transaction count (the rule supports shown later are
# integers such as 20 and 23) - confirm against the pymining documentation.
min_supp = 10

#### Relim
- [Paper 1](https://pdfs.semanticscholar.org/cb3e/76d1773d08545f21daf28cc87b051604aa95.pdf)
- [Paper 2](http://www.borgelt.net/papers/relim.pdf)

In [6]:
%%time
# Convert the transactions to a tuple of tuples and build Relim's input structure from them.
relim_input = itemmining.get_relim_input(to_tuple(transactions))
# Mine the frequent itemsets, using min_supp as the minimum support.
relim_itemsets = itemmining.relim(relim_input, min_support=min_supp)
print "number of frequent itemsets", len(relim_itemsets), "\n"

number of frequent itemsets 57055 

CPU times: user 1.3 s, sys: 18.9 ms, total: 1.32 s
Wall time: 1.34 s


#### FP-Growth

In [7]:
# %%time
# freq_item_generator = get_freq_itemset(transactions, min_supp, include_support=True)
# 
# fp_itemsets = {frozenset(itemset[0]): itemset[1] for itemset in freq_item_generator}
# print "number of frequent itemsets", len(fp_itemsets), "\n"

In [8]:
# pruned_itemsets = only_itemset_with("NIBRSclass", relim_itemsets)
# len(pruned_itemsets)

## Rule generation

In [9]:
# Minimum confidence passed to mine_assoc_rules as min_confidence.
min_conf = 0.60

In [10]:
%%time
# Derive association rules from the Relim itemsets with the thresholds set above.
# The recorded run produced about 3.4 million rules and took close to a minute.
rules = assocrules.mine_assoc_rules(relim_itemsets, min_support=min_supp, min_confidence=min_conf)
print len(rules)

3402981
CPU times: user 50.4 s, sys: 4.01 s, total: 54.4 s
Wall time: 56.9 s


#### Rules exploration

In [11]:
# leaving only rules that have crimes
rules_with_crimes = only_rules_with(["NIBRSclass"], rules)
rules_with_crimes_in_cons = only_rules_with(["NIBRSclass"], rules, only_cons=True)
print len(rules_with_crimes)
print len(rules_with_crimes_in_cons)

1291064
358267


In [12]:
# Dump both filtered rule lists to text files (existing files are overwritten).
print_that('./dataset/result_file.txt', rules_with_crimes)
print_that('./dataset/result_file_only_cons.txt', rules_with_crimes_in_cons)

In [13]:
# Peek at the first rule; the tuple layout is (antecedent, consequent, support, confidence).
rules_with_crimes[:1]

[(frozenset({'Affray',
             'CHARLOTTE FIRE DEPT',
             'FOOD LION',
             'LEREODELOS VALLES PENECOSTAL',
             'School - Primary or Secondary',
             'UNIVERSITY PARK BAPTIST CHURCH',
             'afro-3/3',
             'edu-2/3',
             'empl-1/3',
             'inc-2/3',
             'popden-1/3'}),
  frozenset({'WEST CHARLOTTE HIGH SCHOOL'}),
  20,
  1.0)]

In [14]:
# Show only a small preview: the full list holds hundreds of thousands of
# rules, and rendering all of them bloats the notebook and hides the narrative.
rules_with_crimes_in_cons[:3]

[(frozenset({'FDY  INC.'}),
  frozenset({'AUDIO VISUAL REPAIRS',
             'CHARLOTTE METAL FINISHING CO',
             'Disorderly Conduct',
             'IDEAL TOOL & DIE CO INC',
             'School - Primary or Secondary',
             'VOCA CORPORATION OF N.C.',
             'afro-3/3',
             'edu-3/3',
             'empl-1/3',
             'inc-1/3',
             'popden-1/3'}),
  23,
  0.696969696969697),
 (frozenset({'FDY  INC.', 'VOCA CORPORATION OF N.C.'}),
  frozenset({'AUDIO VISUAL REPAIRS',
             'CHARLOTTE METAL FINISHING CO',
             'Disorderly Conduct',
             'IDEAL TOOL & DIE CO INC',
             'School - Primary or Secondary',
             'afro-3/3',
             'edu-3/3',
             'empl-1/3',
             'inc-1/3',
             'popden-1/3'}),
  23,
  0.696969696969697),
 (frozenset({'VOCA CORPORATION OF N.C.'}),
  frozenset({'AUDIO VISUAL REPAIRS',
             'CHARLOTTE METAL FINISHING CO',
             'Disorderly Conduct',

---

### Entire dataset

In [15]:
# Load the complete (unsampled) tab-separated file, using its first column as the index.
df_entire = pandas.read_csv("./dataset/crimes_census_5poi.csv", index_col=0, sep="\t")
# Unlike the sampled frame, ARPerM1724 and ARPerSF are kept here.
df_entire = df_entire.drop(["BLOCKID10", "Street_Nam", "ARHeteInx", "ARPerHOwn", 
                            "ARPerRMI5L", "ARPer3MU"], axis=1)
# The five point-of-interest columns are excluded for the full dataset.
df_entire = df_entire.drop(["First_POI", "Second_POI", "Third_POI", "Fourth_POI", "Fifth_POI"], axis=1)

# One transaction per row: the list of that row's cell values.
transactions_entire = [row.tolist() for i, row in df_entire.iterrows()]
df_entire.shape

(19106, 10)

In [16]:
%time
min_supp = 70
relim_input_entire = itemmining.get_relim_input(to_tuple(transactions_entire))
relim_itemsets_entire = itemmining.relim(relim_input_entire, min_support=min_supp)
print "number of frequent itemsets", len(relim_itemsets_entire), "\n"

CPU times: user 3 µs, sys: 1 µs, total: 4 µs
Wall time: 5.01 µs
number of frequent itemsets 24470 



In [17]:
# Same confidence threshold as for the sampled data, restated for this section.
min_conf = 0.60
# Derive association rules from the full-dataset itemsets.
rules_entire = assocrules.mine_assoc_rules(relim_itemsets_entire, min_support=min_supp, min_confidence=min_conf)
print len(rules_entire)

71162


In [18]:
# Keep only the full-dataset rules that contain a crime class, anywhere or in the consequent.
# NOTE(review): only_rules_with takes its NIBRSclass values from the sampled
# frame `df`, not from `df_entire`; a crime class missing from the sample
# would be filtered out here - confirm this is intended.
rules_with_crimes_entire = only_rules_with(["NIBRSclass"], rules_entire)
rules_with_crimes_in_cons_entire = only_rules_with(["NIBRSclass"], rules_entire, only_cons=True)
print len(rules_with_crimes_entire)
print len(rules_with_crimes_in_cons_entire)

22921
1468


In [19]:
# Peek at the first full-dataset rule whose consequent contains a crime class.
rules_with_crimes_in_cons_entire[:1]

[(frozenset({'Department Store',
             'edu-2/3',
             'empl-3/3',
             'inc-3/3',
             'popden-1/3',
             'sinpar-1/3',
             'youngm-3/3'}),
  frozenset({'Shoplifting', 'afro-1/3'}),
  78,
  0.8387096774193549)]

---
### Evaluation

In [20]:
def lift(rule_supp, ant_supp, cons_supp):
    """Return rule_supp divided by the product ant_supp * cons_supp.

    This is the lift of a rule when all three arguments are relative
    supports (fractions of the transactions); with raw counts the result is
    scaled down by the number of transactions.
    """
    expected_joint = ant_supp * cons_supp
    return float(rule_supp) / expected_joint

def conviction(cons_supp, rule_conf):
    """Return the conviction of a rule: (1 - cons_supp) / (1 - rule_conf).

    Parameters
    ----------
    cons_supp : float
        Relative support of the consequent (a fraction, not a count).
    rule_conf : float
        Confidence of the rule.

    Returns
    -------
    float
        The conviction; ``float("inf")`` when the confidence is exactly 1,
        because a rule that is never violated has a zero denominator.
    """
    violation_rate = 1.0 - rule_conf
    if violation_rate == 0:
        # Rules with confidence 1.0 do occur in the mined output; report them
        # as infinitely convincing instead of raising ZeroDivisionError.
        return float("inf")
    return (1.0 - cons_supp) / violation_rate

In [None]:
# lift() needs relative supports. relim_itemsets appears to hold absolute
# counts (min_supp is 10 and rule supports are integers), and feeding counts
# in directly divides the result by the number of transactions, so the
# `> 1` test would almost never pass. Convert with percentage() first.
for rule in rules:
    antecedent, consequent = rule[0], rule[1]
    rule_lift = lift(percentage(relim_itemsets[antecedent | consequent]),
                     percentage(relim_itemsets[antecedent]),
                     percentage(relim_itemsets[consequent]))
    if rule_lift > 1:
        print(rule_lift)
    # c = conviction(percentage(relim_itemsets[consequent]), rule[3])