# Movie Recommendation System
## Web Information Management: Project II #
In this project, I will develop different algorithms to make recommendations for movies.
<hr>

### Data import and export functions:

In [2]:
import numpy as np
import pandas as pd

UPDATE_INT = 10

def fetch_train():
    """Load the training table from ``data/train.txt``.

    The file is read as tab-separated integers with no header row, so the
    resulting columns are labelled 0, 1, 2, ... by pandas.

    Returns:
        pandas.DataFrame: the parsed training data.
    """
    return pd.read_csv('data/train.txt', sep='\t', header=None, dtype=int)

def fetch_test(fn):
    """Load a test file from the ``data/`` directory.

    The file is read as space-separated integers with no header row; the
    three columns are named ``U``, ``M`` and ``R``.

    Args:
        fn: file name relative to ``data/``.

    Returns:
        pandas.DataFrame: one row per (U, M, R) triple.
    """
    path = 'data/' + fn
    column_names = ['U', 'M', 'R']
    return pd.read_csv(path, sep=' ', header=None, names=column_names, dtype=int)

def write_test(data, fn):
    """Write ``data`` to ``result/<fn>`` as space-separated text.

    Neither the header row nor the index is written.

    Args:
        data: DataFrame to export.
        fn: file name relative to ``result/``.
    """
    target = 'result/' + fn
    data.to_csv(target, sep=' ', header=False, index=False)

### Helper functions:

In [3]:
def remove_zeros(a, b):
    """Keep only the positions where both ``a`` and ``b`` are non-zero.

    Args:
        a: 1-D sequence of numbers.
        b: 1-D sequence of numbers with the same length as ``a``.

    Returns:
        tuple(numpy.ndarray, numpy.ndarray): two float arrays of equal
        length holding the surviving entries of ``a`` and ``b``, in their
        original order.

    Raises:
        AssertionError: if ``a`` and ``b`` differ in length.
    """
    assert len(a)==len(b)
    # A single boolean mask replaces repeated np.append calls, which copied
    # the whole result array on every kept element (quadratic in the length).
    va = np.asarray(a, dtype=float)
    vb = np.asarray(b, dtype=float)
    keep = (va != 0) & (vb != 0)
    return va[keep], vb[keep]

def cos_sim(a, b):
    """Cosine similarity of ``a`` and ``b`` over their jointly non-zero entries.

    Positions where either vector is zero are discarded before the
    similarity is computed.

    Args:
        a: array-like exposing ``.shape``.
        b: array-like with the same shape as ``a``.

    Returns:
        0 when ``b`` sums to zero or fewer than two positions survive the
        zero filter; otherwise the cosine of the angle between the filtered
        vectors.

    Raises:
        AssertionError: if the shapes differ.
    """
    assert a.shape == b.shape, "{} != {}".format(a.shape, b.shape)

    # cheap exit before any filtering
    if np.sum(b)==0:
        return 0

    # keep only the positions where both vectors are non-zero
    u, v = remove_zeros(a, b)
    if min(len(u), len(v)) < 2:
        return 0

    return u.dot(v) / (np.linalg.norm(u)*np.linalg.norm(v))

def pea_cor(a, b):
    """Pearson correlation of ``a`` and ``b`` over their jointly non-zero entries.

    Positions where either vector is zero are discarded, each remaining
    vector is centred on its own mean, and the cosine of the centred
    vectors is returned.

    Args:
        a: array-like exposing ``.shape``.
        b: array-like with the same shape as ``a``.

    Returns:
        0 when fewer than two positions survive the zero filter or when
        either centred vector has zero norm; otherwise the correlation.

    Raises:
        AssertionError: if the shapes differ.
    """
    assert a.shape == b.shape, "{} != {}".format(a.shape, b.shape)

    # keep only the positions where both vectors are non-zero
    u, v = remove_zeros(a, b)

    # a single shared position carries no correlation information
    if min(len(u), len(v)) < 2:
        return 0

    # centre each vector on its own mean
    u = u - np.mean(u)
    v = v - np.mean(v)

    scale = np.linalg.norm(u)*np.linalg.norm(v)
    if not scale:
        return 0
    return u.dot(v) / scale

def weighted_avg(w, r, absval=False):
    """Weighted average of ``r`` using weights ``w``.

    Args:
        w: numpy array of weights.
        r: numpy array of values, same shape as ``w``.
        absval: when True, normalise by ``sum(|w|)`` instead of ``sum(w)``
            (needed when weights may be negative).

    Returns:
        ``sum(w*r) / sum(w)`` (or ``/ sum(|w|)`` when ``absval`` is set), or
        0 when that normaliser is zero.

    Raises:
        AssertionError: if the shapes differ.
    """
    assert w.shape == r.shape, "{} != {}".format(w.shape, r.shape)
    # Guard on the denominator that is actually used: with absval=True,
    # signed weights can cancel (sum(w) == 0) while sum(|w|) is non-zero,
    # and such inputs must not be short-circuited to 0.
    den = np.sum(np.absolute(w)) if absval else np.sum(w)
    if den == 0:
        return 0
    return np.sum(w*r)/den

def count_col(arr, target, column):
    """Print how many rows of ``arr`` hold each target value in ``column``.

    Args:
        arr: DataFrame to inspect.
        target: iterable of values to look for.
        column: label of the column to search.

    Returns:
        Always 0; the counts are only reported on stdout.
    """
    template = "! {} appears {} times in column {}"
    for value in target:
        # count the non-null entries of `column` among the matching rows
        hits = arr.loc[arr[column] == value, column].count()
        print(template.format(value, hits, column))
    return 0

def constrain_array(arr, lower=1, upper=5):
    """Clamp the ``R`` column of ``arr`` to ``[lower, upper]`` in place.

    Only column ``R`` is touched. Masking the whole frame would also clamp
    the ``U`` and ``M`` id columns into the rating range and corrupt them.

    Args:
        arr: DataFrame with an ``R`` column; modified in place.
        lower: smallest allowed value for ``R``.
        upper: largest allowed value for ``R``.

    Returns:
        The same DataFrame object, for convenience.
    """
    # Compute both masks before assigning so the second one is not affected
    # by the first write.
    too_low = arr['R'] < lower
    too_high = arr['R'] > upper
    arr.loc[too_low, 'R'] = lower
    arr.loc[too_high, 'R'] = upper
    # Sanity report: values just outside the range should no longer occur.
    count_col(arr, [lower-1, upper+1], 'R')
    return arr
    

### Problem 2 Functions:
> Here, I implement several user-based collaborative filtering algorithms: a cosine-similarity baseline, Pearson correlation, and two modifications of Pearson correlation (inverse user frequency and case modification).
- Cosine Similarity
- Pearson Correlation
- Pearson Correlation w/ Inverse User Frequency
- Pearson Correlation w/ Case Modification

In [4]:
## Cosine Similarity
def problem2_cs(testfile, outfile=None):
    """User-based collaborative filtering with cosine similarity.

    For every user in the test file, the known ratings (``R != 0``) are
    compared against every training row, and each unknown rating
    (``R == 0``) is predicted as the similarity-weighted average of the
    ratings that positively-similar training rows gave to that movie.
    When no such rating exists the prediction defaults to 3.

    Args:
        testfile: test file name, resolved by ``fetch_test``.
        outfile: optional result file name, written via ``write_test``.

    Returns:
        pandas.DataFrame: the formerly-unknown rows (columns U, M, R) with
        ``R`` filled in, ordered by user.
    """
    # fetch training/testing data
    train = fetch_train()
    test = fetch_test(testfile)

    predicted = []

    # loop through each user present in the test file (ascending user id)
    for user, ratings in test.groupby('U'):

        # separate known and unknown ratings; copy so that writing the
        # predictions never touches (or warns about) the parent frame
        known = ratings[ratings.R!=0]
        unknown = ratings[ratings.R==0].copy()

        # Movie ids are 1-based while the training columns are 0-based
        # (the prediction step below indexes with M-1), so shift here too.
        known_cols = known.M.values - 1
        known_vals = known.R.values

        # similarity between this user and every training row
        user_sim = train.apply(
            lambda row: cos_sim(known_vals, row.values[known_cols]), axis=1)

        # keep only the positively-similar training rows
        rel_vals = user_sim[user_sim>0]
        rel_users = rel_vals.index.values

        # rating prediction; collected in a list and assigned in one step,
        # because writing into the rows yielded by iterrows() is not
        # guaranteed to reach the frame
        preds = []
        for movie in unknown.M:
            rel_ratings = train.iloc[rel_users, movie-1]
            rv, rr = remove_zeros(rel_vals.values, rel_ratings.values)
            if len(rv) and len(rr):
                preds.append(int(round(weighted_avg(rv, rr))))
            else:
                preds.append(3)
        unknown['R'] = preds
        predicted.append(unknown)

        # print update
        if (user%UPDATE_INT == 0):
            print("User {} ({} predictions)...".format(user, len(unknown.M)))

    # one concat at the end instead of growing the frame inside the loop
    if predicted:
        results = pd.concat(predicted, ignore_index=True)
    else:
        results = pd.DataFrame(columns=test.columns)

    constrain_array(results)

    if outfile:
        write_test(results, outfile)
        print('> Results written to {}\n'.format(outfile))

    return results

# Pearson Correlation
def problem2_pc(testfile, outfile=None):
    """User-based collaborative filtering with Pearson correlation.

    Each unknown rating is predicted as the user's mean known rating plus
    the correlation-weighted average of the neighbours' deviations from
    their own mean rating, normalised by the sum of absolute weights.
    Only neighbours that actually rated the movie contribute.

    Args:
        testfile: test file name, resolved by ``fetch_test``.
        outfile: optional result file name, written via ``write_test``.

    Returns:
        pandas.DataFrame: the formerly-unknown rows (columns U, M, R) with
        ``R`` filled in, ordered by user.
    """
    train = fetch_train()
    test = fetch_test(testfile)

    # Mean rating of every training row over the movies it actually rated
    # (0 marks "unrated" and must not drag the mean down).
    train_means = train.where(train != 0).mean(axis=1).values

    predicted = []

    for user, ratings in test.groupby('U'):
        known = ratings[ratings.R!=0]
        unknown = ratings[ratings.R==0].copy()

        # Movie ids are 1-based, training columns are 0-based.
        known_cols = known.M.values - 1
        known_vals = known.R.values

        similarity = train.apply(
            lambda row: pea_cor(known_vals, row.values[known_cols]), axis=1)

        rel_vals = similarity[similarity!=0]
        rel_users = rel_vals.index.values
        weights = rel_vals.values
        rel_means = train_means[rel_users]

        # Fall back to the neutral rating 3 when the user has no known
        # ratings, instead of propagating NaN into round().
        avg_rating = known_vals.mean() if len(known_vals) else 3.0

        # rating prediction; collected and assigned in one step because
        # writes to rows yielded by iterrows() may not reach the frame
        preds = []
        for movie in unknown.M:
            rel_ratings = train.iloc[rel_users, movie-1].values
            # Unrated neighbours (0) carry no opinion; leaving them in would
            # turn them into large negative deviations after centring.
            rated = rel_ratings != 0
            if rated.any():
                deviation = rel_ratings[rated] - rel_means[rated]
                offset = weighted_avg(weights[rated], deviation, True)
                preds.append(int(round(avg_rating + offset)))
            else:
                preds.append(int(round(avg_rating)))
        unknown['R'] = preds
        predicted.append(unknown)

        # print update
        if (user%UPDATE_INT == 0):
            print("User {} ({} predictions)...".format(user, len(unknown.M)))

    # one concat at the end instead of growing the frame inside the loop
    if predicted:
        results = pd.concat(predicted, ignore_index=True)
    else:
        results = pd.DataFrame(columns=test.columns)

    constrain_array(results)

    if outfile:
        write_test(results, outfile)
        print('> Results written to {}\n'.format(outfile))

    return results

# Pearson Correlation w/ Inverse User Frequency
def problem2_pciuf(testfile, outfile=None):
    """Pearson-correlation collaborative filtering with inverse user frequency.

    Every training column is scaled by ``log(m / m_j)``, where ``m`` is the
    number of training rows and ``m_j`` the number of rows with a non-zero
    entry in column ``j`` (0 when nobody rated it). Similarities are
    computed on the scaled matrix; predictions use the unscaled ratings.

    Args:
        testfile: test file name, resolved by ``fetch_test``.
        outfile: optional result file name, written via ``write_test``.

    Returns:
        pandas.DataFrame: the formerly-unknown rows (columns U, M, R) with
        ``R`` filled in, ordered by user.
    """
    train = fetch_train()
    test = fetch_test(testfile)

    # Inverse user frequency per movie, computed column-wise in one pass
    # (DataFrame.iteritems() no longer exists in pandas 2.x).
    m = len(train)
    rated_per_movie = (train != 0).sum(axis=0).values
    iuf = np.zeros(train.shape[1])
    seen = rated_per_movie > 0
    iuf[seen] = np.log(m / rated_per_movie[seen])
    train_iuf = train*iuf
    # NOTE(review): only the training side is scaled; the test user's known
    # ratings are compared unscaled, as in the original code - confirm that
    # this is the intended IUF variant.

    # Mean rating of every training row over the movies it actually rated.
    train_means = train.where(train != 0).mean(axis=1).values

    predicted = []

    for user, ratings in test.groupby('U'):
        known = ratings[ratings.R!=0]
        unknown = ratings[ratings.R==0].copy()

        # Movie ids are 1-based, training columns are 0-based.
        known_cols = known.M.values - 1
        known_vals = known.R.values

        similarity = train_iuf.apply(
            lambda row: pea_cor(known_vals, row.values[known_cols]), axis=1)

        rel_vals = similarity[similarity!=0]
        rel_users = rel_vals.index.values
        weights = rel_vals.values
        rel_means = train_means[rel_users]

        # Neutral fallback when the user has no known ratings.
        avg_rating = known_vals.mean() if len(known_vals) else 3.0

        # rating prediction; collected and assigned in one step because
        # writes to rows yielded by iterrows() may not reach the frame
        preds = []
        for movie in unknown.M:
            rel_ratings = train.iloc[rel_users, movie-1].values
            # Only neighbours that rated the movie contribute, each as a
            # deviation from its own mean rating.
            rated = rel_ratings != 0
            if rated.any():
                deviation = rel_ratings[rated] - rel_means[rated]
                offset = weighted_avg(weights[rated], deviation, True)
                preds.append(int(round(avg_rating + offset)))
            else:
                preds.append(int(round(avg_rating)))
        unknown['R'] = preds
        predicted.append(unknown)

        # print update
        if (user%UPDATE_INT == 0):
            print("User {} ({} predictions)...".format(user, len(unknown.M)))

    # one concat at the end instead of growing the frame inside the loop
    if predicted:
        results = pd.concat(predicted, ignore_index=True)
    else:
        results = pd.DataFrame(columns=test.columns)

    constrain_array(results)

    if outfile:
        write_test(results, outfile)
        print('> Results written to {}\n'.format(outfile))

    return results

# Pearson Correlation w/ Case Modification
def problem2_pccm(testfile, outfile=None, rho=2.5):
    """Pearson-correlation collaborative filtering with case modification.

    Works like the plain Pearson predictor, but every neighbour weight
    ``w`` is amplified to ``w * |w|**(rho - 1)`` before prediction, which
    emphasises strong correlations and damps weak ones. With ``rho=1`` the
    weights are left unchanged.

    Args:
        testfile: test file name, resolved by ``fetch_test``.
        outfile: optional result file name, written via ``write_test``.
        rho: amplification exponent (2.5 is the value commonly used for
            case amplification in the literature).

    Returns:
        pandas.DataFrame: the formerly-unknown rows (columns U, M, R) with
        ``R`` filled in, ordered by user.
    """
    train = fetch_train()
    test = fetch_test(testfile)

    # Mean rating of every training row over the movies it actually rated.
    train_means = train.where(train != 0).mean(axis=1).values

    predicted = []

    for user, ratings in test.groupby('U'):
        known = ratings[ratings.R!=0]
        unknown = ratings[ratings.R==0].copy()

        # Movie ids are 1-based, training columns are 0-based.
        known_cols = known.M.values - 1
        known_vals = known.R.values

        similarity = train.apply(
            lambda row: pea_cor(known_vals, row.values[known_cols]), axis=1)

        rel_vals = similarity[similarity!=0]
        rel_users = rel_vals.index.values
        rel_means = train_means[rel_users]

        # Case modification: amplify the weights while keeping their sign.
        weights = rel_vals.values
        weights = weights * np.absolute(weights) ** (rho - 1)

        # Neutral fallback when the user has no known ratings.
        avg_rating = known_vals.mean() if len(known_vals) else 3.0

        # rating prediction; collected and assigned in one step because
        # writes to rows yielded by iterrows() may not reach the frame
        preds = []
        for movie in unknown.M:
            rel_ratings = train.iloc[rel_users, movie-1].values
            # Only neighbours that rated the movie contribute, each as a
            # deviation from its own mean rating.
            rated = rel_ratings != 0
            if rated.any():
                deviation = rel_ratings[rated] - rel_means[rated]
                offset = weighted_avg(weights[rated], deviation, True)
                preds.append(int(round(avg_rating + offset)))
            else:
                preds.append(int(round(avg_rating)))
        unknown['R'] = preds
        predicted.append(unknown)

        # print update
        if (user%UPDATE_INT == 0):
            print("User {} ({} predictions)...".format(user, len(unknown.M)))

    # one concat at the end instead of growing the frame inside the loop
    if predicted:
        results = pd.concat(predicted, ignore_index=True)
    else:
        results = pd.DataFrame(columns=test.columns)

    constrain_array(results)

    if outfile:
        write_test(results, outfile)
        print('> Results written to {}\n'.format(outfile))

    return results

# TODO: should k be the same value every time?
# TODO: should I remove cases with only 1 rating?

### Problem 3 Function:
> Here I implemented a basic item-based collaborative filtering algorithm.

In [19]:
def problem3(testfile, outfile=None):
    """Item-based collaborative filtering with cosine similarity.

    For every unknown rating (``R == 0``) the target movie's training
    column is compared against the training columns of the movies the same
    user has rated, and the prediction is the similarity-weighted average
    of those known ratings. When no similarity is available the prediction
    defaults to 3, the same fallback ``problem2_cs`` uses.

    Args:
        testfile: test file name, resolved by ``fetch_test``.
        outfile: optional result file name, written via ``write_test``.

    Returns:
        pandas.DataFrame: the formerly-unknown rows (columns U, M, R) with
        ``R`` filled in, in the order they appear in the test file.
    """
    # fetch training/testing data
    train = fetch_train()
    test = fetch_test(testfile)

    predicted = []
    ct = 0

    # handle one user at a time so the known ratings are extracted only once
    for _, ratings in test.groupby('U'):
        known = ratings[ratings.R!=0]
        unknown = ratings[ratings.R==0].copy()

        # Movie ids are 1-based while training columns are 0-based, so both
        # the known items and the target item are looked up at M-1.
        known_vals = known.R.values
        known_items = [train.iloc[:, m-1].values for m in known.M]

        preds = []
        for movie in unknown.M:
            target = train.iloc[:, movie-1].values

            # ITEM similarity between the target movie and each known movie
            item_sim = np.array(
                [cos_sim(target, other) for other in known_items], dtype=float)

            # compute rating; a zero weight sum would otherwise yield the
            # invalid rating 0
            if item_sim.sum() > 0:
                preds.append(int(round(weighted_avg(item_sim, known_vals))))
            else:
                preds.append(3)

            # progress update
            ct = ct + 1
            if ct%100 == 0:
                print("Completed {} iterations... (known {})".format(ct, known.shape))

        # assign in one step: writes to iterrows() rows may not reach the frame
        unknown['R'] = preds
        predicted.append(unknown)

    # Each unknown row is collected exactly once and concatenated once at the
    # end; sorting on the original index restores the test-file order.
    if predicted:
        results = pd.concat(predicted).sort_index().reset_index(drop=True)
    else:
        results = pd.DataFrame(columns=test.columns)

    count_col(results, [0, 6], 'R')

    if outfile:
        write_test(results, outfile)
        print('> Results written to {}\n'.format(outfile))

    return results

### Problem 4 Function:
> This is a personal algorithm I created to try to achieve a lower MAE (mean absolute error) than the previous methods.

### Testing and exporting:

In [335]:
# Cosine-similarity run on test5.txt; the test10/test20 runs are disabled.
test01 = problem2_cs('test5.txt', 'result5_cs.txt')
# test02 = problem2_cs('test10.txt', 'result10_cs.txt')
# test03 = problem2_cs('test20.txt', 'result20_cs.txt')

User 210 (19 predictions)...
User 220 (19 predictions)...
User 230 (43 predictions)...
User 240 (28 predictions)...
User 250 (364 predictions)...
User 260 (39 predictions)...
User 270 (37 predictions)...
User 280 (37 predictions)...
User 290 (36 predictions)...
User 300 (141 predictions)...
! 0 appears 0 times in column R
! 6 appears 0 times in column R
> Results written to result5_cs.txt



In [350]:
# Pearson-correlation runs on all three test files.
test04 = problem2_pc('test5.txt', 'result5_pc.txt')
test05 = problem2_pc('test10.txt', 'result10_pc.txt')
test06 = problem2_pc('test20.txt', 'result20_pc.txt')

User 210 (19 predictions)...
User 220 (19 predictions)...
User 230 (43 predictions)...
User 240 (28 predictions)...
User 250 (364 predictions)...
User 260 (39 predictions)...
User 270 (37 predictions)...
User 280 (37 predictions)...
User 290 (36 predictions)...
User 300 (141 predictions)...
! 0 appears 0 times in column R
! 6 appears 0 times in column R
> Results written to result5_pc.txt

User 310 (7 predictions)...
User 320 (11 predictions)...
User 330 (11 predictions)...
User 340 (134 predictions)...
User 350 (28 predictions)...
User 360 (9 predictions)...
User 370 (42 predictions)...
User 380 (28 predictions)...
User 390 (78 predictions)...
User 400 (41 predictions)...
! 0 appears 0 times in column R
! 6 appears 0 times in column R
> Results written to result10_pc.txt

User 410 (5 predictions)...
User 420 (58 predictions)...
User 430 (85 predictions)...
User 440 (60 predictions)...
User 450 (130 predictions)...
User 460 (257 predictions)...
User 470 (291 predictions)...
User 480 (7

In [349]:
# Pearson + inverse-user-frequency runs on all three test files.
test07 = problem2_pciuf('test5.txt', 'result5_pciuf.txt')
test08 = problem2_pciuf('test10.txt', 'result10_pciuf.txt')
test09 = problem2_pciuf('test20.txt', 'result20_pciuf.txt')

User 210 (19 predictions)...
User 220 (19 predictions)...
User 230 (43 predictions)...
User 240 (28 predictions)...
User 250 (364 predictions)...
User 260 (39 predictions)...
User 270 (37 predictions)...
User 280 (37 predictions)...
User 290 (36 predictions)...
User 300 (141 predictions)...
! 0 appears 0 times in column R
! 6 appears 0 times in column R
> Results written to result5_pciuf.txt

User 310 (7 predictions)...
User 320 (11 predictions)...
User 330 (11 predictions)...
User 340 (134 predictions)...
User 350 (28 predictions)...
User 360 (9 predictions)...
User 370 (42 predictions)...
User 380 (28 predictions)...
User 390 (78 predictions)...
User 400 (41 predictions)...
! 0 appears 0 times in column R
! 6 appears 0 times in column R
> Results written to result10_pciuf.txt

User 410 (5 predictions)...
User 420 (58 predictions)...
User 430 (85 predictions)...
User 440 (60 predictions)...
User 450 (130 predictions)...
User 460 (257 predictions)...
User 470 (291 predictions)...
User 

In [351]:
# Pearson + case-modification runs on all three test files.
test10 = problem2_pccm('test5.txt', 'result5_pccm.txt')
test11 = problem2_pccm('test10.txt', 'result10_pccm.txt')
test12 = problem2_pccm('test20.txt', 'result20_pccm.txt')

User 210 (19 predictions)...
User 220 (19 predictions)...
User 230 (43 predictions)...
User 240 (28 predictions)...
User 250 (364 predictions)...
User 260 (39 predictions)...
User 270 (37 predictions)...
User 280 (37 predictions)...
User 290 (36 predictions)...
User 300 (141 predictions)...
! 0 appears 0 times in column R
! 6 appears 0 times in column R
> Results written to result5_pccm.txt

User 310 (7 predictions)...
User 320 (11 predictions)...
User 330 (11 predictions)...
User 340 (134 predictions)...
User 350 (28 predictions)...
User 360 (9 predictions)...
User 370 (42 predictions)...
User 380 (28 predictions)...
User 390 (78 predictions)...
User 400 (41 predictions)...
! 0 appears 0 times in column R
! 6 appears 0 times in column R
> Results written to result10_pccm.txt

User 410 (5 predictions)...
User 420 (58 predictions)...
User 430 (85 predictions)...
User 440 (60 predictions)...
User 450 (130 predictions)...
User 460 (257 predictions)...
User 470 (291 predictions)...
User 48

In [20]:
# Item-based run on test5.txt; the test10/test20 runs are disabled.
test13 = problem3('test5.txt', 'result5_ibcf.txt')
# test14 = problem3('test10.txt', 'result10_ibcf.txt')
# test15 = problem3('test20.txt', 'result20_ibcf.txt')

Completed 100 iterations... (known (5, 3))
Completed 200 iterations... (known (5, 3))
Completed 300 iterations... (known (5, 3))
Completed 400 iterations... (known (5, 3))
Completed 500 iterations... (known (5, 3))
Completed 600 iterations... (known (5, 3))
Completed 700 iterations... (known (5, 3))
Completed 800 iterations... (known (5, 3))
Completed 900 iterations... (known (5, 3))
Completed 1000 iterations... (known (5, 3))
Completed 1100 iterations... (known (5, 3))
Completed 1200 iterations... (known (5, 3))
Completed 1300 iterations... (known (5, 3))
Completed 1400 iterations... (known (5, 3))
Completed 1500 iterations... (known (5, 3))
Completed 1600 iterations... (known (5, 3))
Completed 1700 iterations... (known (5, 3))
Completed 1800 iterations... (known (5, 3))
Completed 1900 iterations... (known (5, 3))
Completed 2000 iterations... (known (5, 3))
Completed 2100 iterations... (known (5, 3))
Completed 2200 iterations... (known (5, 3))
Completed 2300 iterations... (known (5, 3

KeyboardInterrupt: 

PIN: 9423572820721098