In [95]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Column names handed to read_csv for each file (passed via `names=`).
rating_names = ['user_id', 'movie_id', 'rating', 'timestamp']
movie_names = ['movie_id', 'title', 'release_date']

# Ratings file: tab-separated.
ratings = pd.read_csv(
    'data/ml-100k/u.data',
    sep='\t',
    names=rating_names,
    encoding='latin-1',
)

# Item file: pipe-separated; only its first three columns are read.
movies = pd.read_csv(
    'data/ml-100k/u.item',
    sep='|',
    names=movie_names,
    usecols=range(3),
    encoding='latin-1',
)

# No explicit key is given, so the join uses the column(s) the two frames
# have in common.
data = movies.merge(ratings)

# Quick look: column names, first three rows, and the 25 most-rated titles.
for summary in (data.columns, data.head(3), data.title.value_counts()[:25]):
    print(summary)

Index(['movie_id', 'title', 'release_date', 'user_id', 'rating', 'timestamp'], dtype='object')
   movie_id             title release_date  user_id  rating  timestamp
0         1  Toy Story (1995)  01-Jan-1995      308       4  887736532
1         1  Toy Story (1995)  01-Jan-1995      287       5  875334088
2         1  Toy Story (1995)  01-Jan-1995      148       4  877019411
Star Wars (1977)                             583
Contact (1997)                               509
Fargo (1996)                                 508
Return of the Jedi (1983)                    507
Liar Liar (1997)                             485
English Patient, The (1996)                  481
Scream (1996)                                478
Toy Story (1995)                             452
Air Force One (1997)                         431
Independence Day (ID4) (1996)                429
Raiders of the Lost Ark (1981)               420
Godfather, The (1972)                        413
Pulp Fiction (1994)              

In [101]:
def joint_pmf(movie_1, movie_2, verbose=False):
    """Estimate the joint pmf of the ratings two movies got from the same users.

    Reads the module-level ``data`` frame; only its ``title``, ``user_id`` and
    ``rating`` columns are used. Every rating row of ``movie_1`` is paired with
    the rating its user gave ``movie_2``. Rows whose user has no rating, or more
    than one rating, for ``movie_2`` are skipped.

    Parameters
    ----------
    movie_1, movie_2 : str
        Titles compared for equality against ``data["title"]``.
    verbose : bool, optional
        When true, print the first two rows of each movie's ratings, one
        diagnostic line per ``movie_1`` rating row, and the final tables.

    Returns
    -------
    counts : numpy.ndarray, shape (5, 5)
        ``counts[i, j]`` is the number of counted rows with rating ``i + 1``
        for ``movie_1`` and rating ``j + 1`` for ``movie_2``.
    pmf : numpy.ndarray, shape (5, 5)
        ``counts`` divided by the number of counted rows. If nothing was
        counted the pmf is undefined and every entry is NaN.

    Notes
    -----
    Assumes ratings are integers in 1..5 (the table is indexed with
    ``rating - 1``); this is not validated here.
    """
    ratings_1 = data[data["title"] == movie_1]
    ratings_2 = data[data["title"] == movie_2]
    if verbose:
        print(ratings_1[:2])
        print(ratings_2[:2])

    # Index movie_2's ratings by user once, instead of re-filtering the whole
    # frame for every movie_1 row.
    ratings_2_by_user = {}
    for user_id, rating in zip(ratings_2["user_id"], ratings_2["rating"]):
        ratings_2_by_user.setdefault(user_id, []).append(rating)

    counts = np.zeros((5, 5))

    for user_id, rating_1 in zip(ratings_1["user_id"], ratings_1["rating"]):
        matches = ratings_2_by_user.get(user_id, ())
        if len(matches) == 0:
            if verbose:
                print("no match")
        elif len(matches) > 1:
            # Ambiguous pairing: the user rated movie_2 several times.
            if verbose:
                print("more than one match")
        else:
            rating_2 = matches[0]
            if verbose:
                print("rating 1 : " + str(rating_1) + " rating 2: " + str(rating_2))
            counts[rating_1 - 1, rating_2 - 1] += 1

    total_counts = counts.sum()
    if total_counts > 0:
        pmf = counts / total_counts
    else:
        # No counted pairs: make the undefined result explicit rather than
        # relying on a 0/0 division and its RuntimeWarning.
        pmf = np.full(counts.shape, np.nan)

    if verbose:
        print(counts)
        print(pmf)

    return counts, pmf

verbose = False
movie_1 = 'Mission: Impossible (1996)'
movie_2 = 'Independence Day (ID4) (1996)'

# Store the returned array under its own name (`pmf`) so that `joint_pmf`
# keeps referring to the function and can be called again for other pairs.
counts, pmf = joint_pmf(movie_1, movie_2, verbose)
print(counts)
np.set_printoptions(precision=4)
print("Joint pmf of " + movie_1 + ' and ' + movie_2)
print(pmf)

# Rows of the table index movie_1's rating, so summing along axis 1 (over
# movie_2's rating) leaves the marginal of movie_1.
marginal_pmf_1 = np.sum(pmf, axis=1)
print("Marginal pmf of " + movie_1)
print(marginal_pmf_1)

# Columns index movie_2's rating; summing along axis 0 gives its marginal.
marginal_pmf_2 = np.sum(pmf, axis=0)
print("Marginal pmf of " + movie_2)
print(marginal_pmf_2)

# Conditional pmfs: renormalise one row (or column) of the joint table.
# A row/column with zero total mass divides 0 by 0 and prints NaNs.
for rat in range(5):
    print("Conditional pmf of " + movie_2 + " given rating for " + movie_1 + " = " + str(rat+1))
    print(pmf[rat, :] / np.sum(pmf[rat, :]))
for rat in range(5):
    print("Conditional pmf of " + movie_1 + " given rating for " + movie_2 + " = " + str(rat+1))
    print(pmf[:, rat] / np.sum(pmf[:, rat]))

[[ 2.  3.  5.  1.  0.]
 [ 3. 12. 18. 11.  5.]
 [ 5. 14. 37. 41. 17.]
 [ 6. 15. 20. 47. 19.]
 [ 0.  0.  4. 12. 17.]]
Joint pmf of Mission: Impossible (1996) and Independence Day (ID4) (1996)
[[0.0064 0.0096 0.0159 0.0032 0.    ]
 [0.0096 0.0382 0.0573 0.035  0.0159]
 [0.0159 0.0446 0.1178 0.1306 0.0541]
 [0.0191 0.0478 0.0637 0.1497 0.0605]
 [0.     0.     0.0127 0.0382 0.0541]]
Marginal pmf of Mission: Impossible (1996)
[0.035  0.1561 0.3631 0.3408 0.1051]
Marginal pmf of Independence Day (ID4) (1996)
[0.051  0.1401 0.2675 0.3567 0.1847]
Conditional pmf of Independence Day (ID4) (1996) given rating for Mission: Impossible (1996) = 1
[0.1818 0.2727 0.4545 0.0909 0.    ]
Conditional pmf of Independence Day (ID4) (1996) given rating for Mission: Impossible (1996) = 2
[0.0612 0.2449 0.3673 0.2245 0.102 ]
Conditional pmf of Independence Day (ID4) (1996) given rating for Mission: Impossible (1996) = 3
[0.0439 0.1228 0.3246 0.3596 0.1491]
Conditional pmf of Independence Day (ID4) (1996) given