In [1]:
import psycopg2
import json
import itertools
import numpy as np

# The connection string is read from a local file named `connectstr` rather
# than being written into the notebook. Each non-blank line of that file is a
# JSON-encoded string; if there are several, the last one wins.
# Example content (output of `more connectstr`):
# "dbname='sqlbook' user='postgres' host='localhost' password='xxxx'"

connstr = None

# `with` closes the file even when a line fails to parse as JSON.
with open('connectstr', 'r') as inputfile:
    for line in inputfile:
        line = line.strip()
        if not line:
            # Tolerate blank or trailing empty lines instead of failing in json.loads('').
            continue
        connstr = json.loads(line)

if connstr is None:
    # Fail here with a clear message rather than inside psycopg2.connect(None).
    raise ValueError("connectstr contains no connection string")

conn = psycopg2.connect(connstr)
# Autocommit: every statement is committed as soon as it is executed, so the
# CREATE VIEW statements below need no explicit conn.commit().
conn.set_session(autocommit = True)
cur = conn.cursor()

### 1 Create view for the customers and their associated products

In [2]:
# Define (or redefine) the view `cust_prod`, which exposes (customerid, productid).
# It inner-joins customers -> orders -> orderlines -> products using implicit
# (comma + WHERE) joins, so it yields one row per matching order line.
# NOTE(review): there is no DISTINCT, so a customer who has the same product on
# several order lines appears several times -- confirm that this is intended,
# because downstream counts are computed from these rows.
sql = """CREATE OR REPLACE VIEW cust_prod as
(SELECT c.customerid, pr.productid
FROM customers c, orders o, orderlines ol, products pr
WHERE o.customerid = c.customerid
and ol.orderid = o.orderid
and pr.productid = ol.productid)"""
cur.execute(sql)

### 2 Create view for the co-occurrence matrix

In [3]:
# Define (or redefine) the view `product_pair_count`.
# cust_prod is self-joined on customerid, so every pair of rows belonging to
# the same customer produces one (firstproduct, secondproduct) combination;
# the combinations are then grouped by product pair and counted.
# Nothing is filtered here, so the view contains the diagonal
# (firstproduct = secondproduct) and both orientations (A, B) and (B, A).
# NOTE(review): paircount counts joined rows, not distinct customers. If
# cust_prod holds repeated (customerid, productid) rows, repeat purchases
# inflate the count -- confirm whether count(DISTINCT ...) was intended.
sql2 = """CREATE OR REPLACE VIEW product_pair_count as 
(SELECT cp1.productid as firstproduct, 
cp2.productid as secondproduct, count(cp1.customerid) as paircount
FROM cust_prod cp1, cust_prod cp2
WHERE cp1.customerid = cp2.customerid
GROUP BY cp1.productid, cp2.productid
ORDER BY COUNT(cp1.customerid) DESC)"""
cur.execute(sql2)

### 3 Get each product pair once, excluding the diagonal

In [4]:
# Query the pair-count view, keeping only rows with firstproduct < secondproduct.
# The strict inequality drops the diagonal (a product paired with itself) and
# keeps exactly one orientation of each unordered pair.
# Rows come back sorted by paircount, largest first.
sql3 = """SELECT ppc.firstproduct, ppc.secondproduct, ppc.paircount
FROM product_pair_count ppc
WHERE ppc.firstproduct <  ppc.secondproduct
ORDER BY ppc.paircount DESC"""
cur.execute(sql3)

In [5]:
# Materialise every row of the query executed in the previous cell.
results = cur.fetchall()

# ps1 / ps2 collect column 0 and column 1 of each row, in row order, so
# position k in either list corresponds to results[k].
ps1, ps2 = [], []
for row in results:
    ps1.append(row[0])
    ps2.append(row[1])

In [6]:
# Show only the first row (the query sorts by paircount, largest first).
# Single-argument print(...) emits the same text on Python 2 and, unlike the
# print statement, also parses on Python 3. Nothing is printed when there are
# no rows.
if results:
    print(results[0])

(12820, 13190, 2582L)


### 4 Create co-occurrence matrix

In [7]:
def _first_positions(ids):
    """Return a dict mapping each id to the index of its first occurrence in `ids`.

    This is the same index that ``ids.index(id)`` returns, computed once for
    all ids in a single pass instead of one linear scan per lookup.
    """
    positions = {}
    for pos, item in enumerate(ids):
        positions.setdefault(item, pos)
    return positions


# Dense matrix of zeros with one row per entry of ps1 and one column per entry
# of ps2.
# NOTE(review): ps1/ps2 have one entry per result row (per product pair), not
# per distinct product. At the recorded shape (56462, 56462) a float64 array
# is roughly 25 GB. Sizing by distinct product ids would be far smaller, but it
# changes what ps1/ps2 positions mean for later cells, so it is left as is.
gen_matrix = np.zeros((len(ps1),len(ps2)))

# Single-argument print(...) is valid on both Python 2 and Python 3.
print(gen_matrix.shape)

# Precompute the first-occurrence index of every product id so that filling
# the matrix is linear in the number of rows rather than quadratic.
row_of = _first_positions(ps1)
col_of = _first_positions(ps2)

for r in results:
    # r is (firstproduct, secondproduct, paircount).
    gen_matrix[row_of[r[0]], col_of[r[1]]] = r[2]

(56462, 56462)


In [8]:
# Sanity check: look up the matrix entry for the pair (12820, 13190); it
# should equal the paircount printed for that pair in the first result row.
gen_matrix[ps1.index(12820), ps2.index(13190)]

2582.0