In [219]:
import sqlite3

In [220]:
# %%bash

# rm "test.db"

In [221]:
# %%bash

# # if [ ! -e "test.db" ]; then
#     # Run this command TWICE to create and prepopulate the table.
#     sqlite3 test.db < data/fc_project_tags.sql
# # else
# #     echo 'file exist'
# # fi

In [222]:
conn = sqlite3.connect('test.db')
c = conn.cursor()

In [223]:
c.execute('''
    CREATE TABLE IF NOT EXISTS fc_project_tag_pairs (
        tag1 varchar(255) NOT NULL,
        tag2 varchar(255) NOT NULL,
        num_projs int(11) NOT NULL
    )
''').fetchone()

c.execute('''
    CREATE UNIQUE INDEX IF NOT EXISTS idx_fc_project_tag_pairs_tags ON fc_project_tag_pairs(tag1, tag2)
''').fetchone()

In [224]:
c.execute('''
    CREATE TABLE IF NOT EXISTS fc_project_tag_triples (
        tag1 varchar(255) NOT NULL,
        tag2 varchar(255) NOT NULL,
        tag3 varchar(255) NOT NULL,
        num_projs int(11) NOT NULL
    )
''').fetchone()


c.execute('''
    CREATE UNIQUE INDEX IF NOT EXISTS idx_fc_project_tag_triples_tags ON fc_project_tag_triples(tag1, tag2, tag3)
''').fetchone()

In [225]:
c.execute('SELECT COUNT(*) FROM fc_project_tags').fetchone()

(353400,)

In [226]:
c.execute('SELECT COUNT(DISTINCT(tag_name)) FROM fc_project_tags').fetchone()

(11006,)

In [227]:
c.execute('SELECT COUNT(DISTINCT(project_id)) FROM fc_project_tags').fetchone()

(46510,)

In [228]:
# The table has two columns - project_id and tag_name;
stmt = '''
    SELECT tag_name, COUNT(*) AS total 
    FROM fc_project_tags 
    GROUP BY tag_name 
    ORDER BY total desc 
    LIMIT 10
'''
res = c.execute(stmt).fetchall()
for tag, count in res:
    print(f'{tag:30}:{count:>5}')

GPL                           :21182
POSIX                         :16875
Linux                         :16288
C                             :10292
OS Independent                :10180
Software Development          : 9619
Internet                      : 8100
Windows                       : 7572
Java                          : 6394
Web                           : 6267


In [229]:
stmt = '''
    SELECT project_id, COUNT(*) AS total, GROUP_CONCAT(tag_name)
    FROM fc_project_tags 
    GROUP BY project_id 
    ORDER BY total DESC 
    LIMIT 10
'''
res = conn.execute(stmt).fetchall()
for project_id, count, tags in res:
    print(project_id, count, tags.split(',')[:3])

79820 144 ['Abkhazian', 'Afar', 'Afrikaans']
78747 143 ['AJAX', 'Abkhazian', 'Afar']
81769 134 ['Afrikaans', 'Akan', 'Albanian']
37224 77 ['Afrikaans', 'Albanian', 'Amharic']
55294 71 ['Afrikaans', 'Albanian', 'Arabic']
55599 68 ['Arabic', 'Armenian', 'Asturian']
39957 66 ['Address Book', 'Albanian', 'Arabic']
39 64 ['Apache HTTP Server', 'Arabic', 'Bahasa Indonesia']
27303 63 ['AGPL v3', 'AIX', 'Arab (Saudi Arabia)']
80680 63 ['AS/400', 'Android', 'BlackBerry']


In [230]:
# Set percentage threshold for min support.
MIN_SUPPORT_PCT = 5

In [231]:
basket_count_stmt = '''
    SELECT COUNT(DISTINCT(project_id))
    FROM fc_project_tags
'''
basket_count = c.execute(basket_count_stmt).fetchone()[0]
basket_count

46510

In [232]:
min_support = MIN_SUPPORT_PCT / 100 * basket_count
f'Min support is {min_support}'

'Min support is 2325.5'

In [233]:
# Get the tags that meets the minimum support.
get_tags_stmt = '''
    SELECT DISTINCT tag_name
    FROM fc_project_tags
    GROUP BY tag_name
    HAVING COUNT (project_id) >= ?
    ORDER BY tag_name
'''
res = c.execute(get_tags_stmt, (min_support,)).fetchall()
len(res)

29

In [234]:
singletons = [row[0] for row in res]
f'Found {len(singletons)} singletons: {singletons}'

"Found 29 singletons: ['BSD', 'C', 'C++', 'Communications', 'Desktop Environment', 'Dynamic Content', 'English', 'GPL', 'GPLv3', 'Games/Entertainment', 'Internet', 'Java', 'LGPL', 'Libraries', 'Linux', 'Mac OS X', 'Networking', 'OS Independent', 'PHP', 'POSIX', 'Perl', 'Python', 'Scientific/Engineering', 'Software Development', 'Unix', 'Utilities', 'Web', 'Windows', 'multimedia']"

In [235]:
a = [1,2,3,4,5]

def combination(a, k = 2):
    result = []

    def iterate(a, b):
        for i, a_i in enumerate(a):
            c = b + [a_i]
            if len(c) == k:
                result.append(c)
            if len(c) < k:
                iterate(a[i+1:], b + [a_i])
    iterate(a, [])

    return result

# This produces the same results as itertools.combinations
# import itertools
# list(itertools.combinations(a, 3))
combination(a, 3)

[[1, 2, 3],
 [1, 2, 4],
 [1, 2, 5],
 [1, 3, 4],
 [1, 3, 5],
 [1, 4, 5],
 [2, 3, 4],
 [2, 3, 5],
 [2, 4, 5],
 [3, 4, 5]]

In [236]:
doubletons = set()
doubletons_set = set()

def find_doubletons():
    # Use the list of singletons to generate the doubletons candidate.
    doubleton_candidates = combination(singletons, 2)
    
    c = conn.cursor()
    for index, candidate in enumerate(doubleton_candidates):
        # Check if the doubleton candidate is frequent.
        tag1, tag2 = candidate
        
        res = c.execute('''
            SELECT COUNT(fpt1.project_id)
            FROM fc_project_tags fpt1
            INNER JOIN fc_project_tags fpt2 
            ON (fpt1.project_id = fpt2.project_id)
            WHERE fpt1.tag_name = ?
            AND fpt2.tag_name = ?
        ''', (tag1, tag2)).fetchone()
        
        count = res[0]
        
        # Add frequet doubleton to database.
        if count > min_support:
            print(f'{tag1}, {tag2}, [{count}]')
            
            try:
                c.execute('''
                    INSERT INTO fc_project_tag_pairs(tag1, tag2, num_projs)
                    VALUES (?, ?, ?)
                ''', (tag1, tag2, count))
            except sqlite3.IntegrityError as e:
                pass
            
            # Save the frequent doubleton to our final list.
            doubletons_set.add((tag1, tag2))
            
            # Add terms to a set of all doubleton terms (no duplicates).
            doubletons.add(tag1)
            doubletons.add(tag2)
            
    # Persist to database.
    conn.commit()

In [237]:
if len(doubletons) == 0:
    find_doubletons()

C, GPL, [5543]
C, Linux, [5653]
C, POSIX, [6956]
C++, GPL, [2914]
C++, Linux, [3428]
C++, POSIX, [3502]
Communications, GPL, [2578]
Dynamic Content, Internet, [3173]
Dynamic Content, Web, [3171]
English, Linux, [2662]
GPL, Internet, [4038]
GPL, Linux, [8038]
GPL, OS Independent, [4405]
GPL, PHP, [2376]
GPL, POSIX, [10069]
GPL, Software Development, [3319]
GPL, Web, [2901]
GPL, Windows, [2605]
GPL, multimedia, [2883]
Internet, OS Independent, [3007]
Internet, POSIX, [2832]
Internet, Web, [5978]
Java, OS Independent, [3436]
Java, Software Development, [2360]
Libraries, Software Development, [5638]
Linux, Mac OS X, [2974]
Linux, POSIX, [11903]
Linux, Software Development, [2336]
Linux, Unix, [2494]
Linux, Windows, [5281]
Mac OS X, Windows, [3131]
OS Independent, Software Development, [3566]
OS Independent, Web, [2605]
POSIX, Software Development, [3503]
POSIX, Unix, [2326]
POSIX, Windows, [4467]
POSIX, multimedia, [2539]


In [238]:
def find_tripletons():
    # Use the list of doubletons to make the tripletons candidates.
    tripleton_candidates = combination(list(doubletons), 3)

    # Sort each candidate tuple and add these to a new sorted candidate list.
    tripleton_candidates_sorted = []
    for tc in tripleton_candidates:
        tripleton_candidates_sorted.append(sorted(tc))
    
    # Prepare cursor.
    c = conn.cursor()
    
    # Figure out if this candidate is frequent.
    for index, candidate in enumerate(tripleton_candidates_sorted):
        # All doubletons insude this tripleton candidate MUST also be frequent.
        doubletons_inside_tripletons = combination(candidate, 2)
        
        tripleton_candidate_rejected = 0
        for index, doubleton in enumerate(doubletons_inside_tripletons):
            doubleton_tuple = (doubleton[0], doubleton[1])
            if doubleton_tuple not in doubletons_set:
                tripleton_candidate_rejected = 1
                break
        
        if tripleton_candidate_rejected > 0:
            continue
        
        # Add frequent tripleton to the database.
        res = c.execute('''
            SELECT count(fpt1.project_id)
            FROM fc_project_tags fpt1
            INNER JOIN fc_project_tags fpt2
            ON (fpt1.project_id = fpt2.project_id)
            INNER JOIN fc_project_tags fpt3
            ON (fpt2.project_id = fpt3.project_id)
            WHERE fpt1.tag_name = ? AND
                  fpt2.tag_name = ? AND
                  fpt3.tag_name = ?
        ''', (candidate[0], 
              candidate[1], 
              candidate[2])).fetchone()
        
        count = res[0]
        
        if count > min_support:
            print(f'{candidate} [{count}]')
            
            try:
                c.execute('''
                    INSERT INTO fc_project_tag_triples 
                        (tag1, tag2, tag3, num_projs)
                    VALUES (?, ?, ?, ?)
                ''', (candidate[0],
                      candidate[1],
                      candidate[2],
                      count))
            except sqlite3.IntegrityError as e:
                pass
    # Persist in database.
    conn.commit()

In [239]:
find_tripletons()

['C', 'Linux', 'POSIX'] [4629]
['C', 'GPL', 'Linux'] [3299]
['C', 'GPL', 'POSIX'] [4364]
['Dynamic Content', 'Internet', 'Web'] [3166]
['Internet', 'OS Independent', 'Web'] [2519]
['GPL', 'Internet', 'Web'] [2878]
['C++', 'Linux', 'POSIX'] [2622]
['Linux', 'POSIX', 'Windows'] [3315]
['GPL', 'Linux', 'POSIX'] [7384]


In [240]:
def calculate_support_confidence_added_value(tag1, tag2, tag3, rule_support):
    # Support.
    rule_support_pct = round((rule_support / basket_count), 2)
    
    # Confidence.
    query1 = '''
        SELECT num_projs
        FROM fc_project_tag_pairs
        WHERE tag1 = ? and tag2 = ?
        OR tag2 = ? and tag1 = ?
    '''
    c = conn.cursor()
    res = c.execute(query1, (tag1, tag2, tag2, tag1)).fetchone()
    
    pair_support = res[0]
    confidence = round((rule_support / pair_support), 2)
    
    # Added value.
    query2 = '''
        SELECT COUNT(*)
        FROM fc_project_tags
        WHERE tag_name = ?
    '''
    
    res = c.execute(query2, (tag3,)).fetchone()
    support_tag3 = res[0]
    support_tag3_pct = support_tag3 / basket_count
    added_value = round((confidence - support_tag3_pct), 2)
    
    # Result.
    print(f'{tag1}, {tag2} -> {tag3} [S={rule_support_pct}, C={confidence}, AV={added_value}]')

In [241]:
def generate_rules():
    # Pull final list of tripletons to make the rules.
    c = conn.cursor()
    triples = c.execute('''
        SELECT tag1, tag2, tag3, num_projs
        FROM fc_project_tag_triples
    ''').fetchall()
    
    for (tag1, tag2, tag3, rule_support) in triples:
        calculate_support_confidence_added_value(tag1, tag2, tag3, rule_support)
        calculate_support_confidence_added_value(tag1, tag3, tag2, rule_support)
        calculate_support_confidence_added_value(tag2, tag3, tag1, rule_support)
        print('*')

In [242]:
generate_rules()

C, Linux -> POSIX [S=0.1, C=0.82, AV=0.46]
C, POSIX -> Linux [S=0.1, C=0.67, AV=0.32]
Linux, POSIX -> C [S=0.1, C=0.39, AV=0.17]
*
C, GPL -> Linux [S=0.07, C=0.6, AV=0.25]
C, Linux -> GPL [S=0.07, C=0.58, AV=0.12]
GPL, Linux -> C [S=0.07, C=0.41, AV=0.19]
*
C, GPL -> POSIX [S=0.09, C=0.79, AV=0.43]
C, POSIX -> GPL [S=0.09, C=0.63, AV=0.17]
GPL, POSIX -> C [S=0.09, C=0.43, AV=0.21]
*
Dynamic Content, Internet -> Web [S=0.07, C=1.0, AV=0.87]
Dynamic Content, Web -> Internet [S=0.07, C=1.0, AV=0.83]
Internet, Web -> Dynamic Content [S=0.07, C=0.53, AV=0.46]
*
Internet, OS Independent -> Web [S=0.05, C=0.84, AV=0.71]
Internet, Web -> OS Independent [S=0.05, C=0.42, AV=0.2]
OS Independent, Web -> Internet [S=0.05, C=0.97, AV=0.8]
*
GPL, Internet -> Web [S=0.06, C=0.71, AV=0.58]
GPL, Web -> Internet [S=0.06, C=0.99, AV=0.82]
Internet, Web -> GPL [S=0.06, C=0.48, AV=0.02]
*
C++, Linux -> POSIX [S=0.06, C=0.76, AV=0.4]
C++, POSIX -> Linux [S=0.06, C=0.75, AV=0.4]
Linux, POSIX -> C++ [S=0.06, C

In [243]:
X = 'Internet'
y = 'Web'

c = conn.cursor()

num_baskets_query = '''
    SELECT count(DISTINCT project_id)
    FROM fc_project_tags
'''

num_baskets = c.execute(num_baskets_query).fetchone()[0]

In [244]:
support_for_X_query = '''
    SELECT count(*)
    FROM fc_project_tags
    WHERE tag_name = ?
'''

support_for_X = c.execute(support_for_X_query, (X,)).fetchone()[0]

In [245]:
support_for_y_query = '''
    SELECT count(*)
    FROM fc_project_tags
    WHERE tag_name = ?
'''

support_for_y = c.execute(support_for_y_query, (y,)).fetchone()[0]

In [246]:
pair_support_query = '''
    SELECT num_projs
    FROM fc_project_tag_pairs
    WHERE tag1 = ? AND tag2 = ?
'''

pair_support = c.execute(pair_support_query, (X, y)).fetchone()[0]

In [261]:
# Calculate support: support of pair, divided by number of baskets.

pair_support_as_pct = pair_support / num_baskets

# Calculate confidence of X -> y.
support_for_X_as_pct = support_for_X / num_baskets
confidence_X_y = pair_support_as_pct / support_for_X_as_pct

# Calculate confidence of y -> X.
support_for_y_as_pct = support_for_y / num_baskets
confidence_y_X = pair_support_as_pct / support_for_y_as_pct

# Calculate added value X -> y.
av_X_y = confidence_X_y - support_for_y_as_pct
av_y_X = confidence_y_X - support_for_X_as_pct

print(f'Support for {X} U {y}: {pair_support_as_pct:.2f}')
print(f'Confidence {X} -> {y}: {confidence_X_y:.2f}')
print(f'Confidence {y} -> {X}: {confidence_y_X:.2f}')
print(f'Added Value {X} -> {y}: {av_X_y:.2f}')
print(f'Added Value {y} -> {X}: {av_y_X:.2f}')

Support for Internet U Web: 0.13
Confidence Internet -> Web: 0.74
Confidence Web -> Internet: 0.95
Added Value Internet -> Web: 0.60
Added Value Web -> Internet: 0.78


In [None]:
conn.close()