In [1]:
import numpy as np
import pandas as pd

<h3>Get list of users, movies and ratings</h3>

In [2]:
# Get list of users, movies and ratings
def _read_nonblank_lines(path):
    """Return the lines of `path` without their trailing newline, skipping blank lines.

    Blank lines (typically a stray empty line at the end of the file) would
    otherwise turn into an empty user id / movie title or a ragged ratings row.
    """
    with open(path, 'r') as f:
        return [line.rstrip('\n') for line in f if line.strip()]


# One user id per line; row t of `ratings` belongs to users[t].
users = _read_nonblank_lines('ids.txt')

# One movie title per line; column j of `ratings` belongs to movies[j].
movies = _read_nonblank_lines('movies.txt')

# One row of tokens per user. split() with no argument splits on any run of
# whitespace, so doubled, leading or trailing spaces cannot create empty
# tokens that would shift the movie columns.
ratings = np.array([line.split() for line in _read_nonblank_lines('ratings.txt')])

<h3>Sort movies on mean popularity rating</h3>

In [3]:
# Sort movies on mean popularity rating
# Popularity of a movie = (# of '1' ratings) / (# of ratings that are not '?').
# Pairing each score with its title lets the tuple sort order by score first
# and by title on ties.
meanPop = sorted(
    (
        np.sum((ratings[:, col] == '1').astype(int))
        / np.sum((ratings[:, col] != '?').astype(int)),
        title,
    )
    for col, title in enumerate(movies)
)

# Titles only, least popular first.
print([title for _, title in meanPop])

['Chappaquidick', 'The_Last_Airbender', 'I_Feel_Pretty', 'Fifty_Shades_of_Grey', 'Fast_&_Furious:_Hobbs_&_Shaw', 'Hustlers', 'Magic_Mike', 'Bridemaids', 'World_War_Z', 'The_Shape_of_Water', 'Good_Boys', 'Prometheus', 'Pokemon_Detective_Pikachu', 'American_Hustle', 'Terminator:_Dark_Fate', 'The_Farewell', 'Man_of_Steel', 'Fast_Five', 'The_Hateful_Eight', 'Star_Wars:_The_Force_Awakens', 'Rocketman', 'The_Help', 'Drive', 'The_Girls_with_the_Dragon_Tattoo', 'Thor', 'Avengers:_Age_of_Ultron', 'Phantom_Thread', 'Us', 'The_Revenant', 'X-Men:_First_Class', 'Pitch_Perfect', 'Dunkirk', 'Ready_Player_One', 'Room', 'Jurassic_World', 'Mad_Max:_Fury_Road', 'Once_Upon_a_Time_in_Hollywood', 'Manchester_by_the_Sea', 'The_Perks_of_Being_a_Wallflower', 'Spiderman:_Far_From_Home', 'Her', 'Captain_America:_The_First_Avenger', 'Frozen', 'Hidden_Figures', 'The_Hunger_Games', 'Iron_Man_2', 'Les_Miserables', 'Toy_Story_3', 'Three_Billboards_Outside_Ebbing', 'Darkest_Hour', 'Ex_Machina', 'Gone_Girl', 'Black_Swa

<h3>Expectation Maximization</h3>

In [4]:
# Part E

# Function to compute log-likelihood
def LogLikelihood(T, K, N, ratings, pRGivenZ, pZ):
    """Average log-likelihood of the observed ratings under the mixture model.

    Parameters
    ----------
    T, K, N : int
        Number of users (rows of `ratings`), of mixture components and of
        movies (columns of `ratings`) to use.
    ratings : 2-D array of str
        ratings[t, j] is '1' or '0' for a rated movie; any other token
        contributes no factor to the product.
    pRGivenZ : 2-D array
        pRGivenZ[i, j] is the probability of a '1' for movie j under component i.
    pZ : sequence
        pZ[i] is the prior weight of component i.

    Returns
    -------
    float
        (1 / T) * sum over users of log( sum_i pZ[i] * prod_j P(rating | i) ).
    """

    def mixture_probability(user):
        # Marginal probability of this user's observed ratings.
        mixture = 0.0
        for kind in range(K):
            joint = 1.0
            for movie in range(N):
                vote = ratings[user, movie]
                if vote == '1':
                    joint = joint * pRGivenZ[kind, movie]
                elif vote == '0':
                    joint = joint * (1 - pRGivenZ[kind, movie])
            mixture = mixture + joint * pZ[kind]
        return mixture

    total = 0.0
    for user in range(T):
        total = total + np.log(mixture_probability(user))

    return total / T
    
# Function to perform EMUpdate
def EMUpdate(K, T, N, ratings, pRGivenZ, pZ):
    """Perform one EM iteration and return the updated (pZ, pRGivenZ).

    The inputs are left untouched: the updated conditional table is written
    to a fresh array instead of overwriting the caller's `pRGivenZ`.

    Parameters
    ----------
    K, T, N : int
        Number of mixture components, of users (rows of `ratings`) and of
        movies (columns of `ratings`) to use.
    ratings : 2-D array of str
        ratings[t, j] is '1', '0' or '?'. In the E-step any token other than
        '1'/'0' contributes no factor; in the M-step only '1' and '?' entries
        contribute to the numerator.
    pRGivenZ : 2-D array
        pRGivenZ[i, j] is the current probability of a '1' for movie j under
        component i.
    pZ : sequence
        pZ[i] is the current prior weight of component i.

    Returns
    -------
    (pZ, pRGivenZ) : tuple of ndarray
        New prior weights of shape (K,) and a new float array shaped like the
        input `pRGivenZ` whose [:K, :N] entries hold the updated probabilities.

    Raises
    ------
    ValueError
        If `ratings` does not provide at least T rows and N columns.
    """
    votes = np.asarray(ratings)[:T, :N]
    if votes.shape != (T, N):
        raise ValueError(
            "ratings must have at least T rows and N columns; got shape %r"
            % (np.shape(ratings),)
        )

    # Boolean masks replace the per-entry string comparisons.
    liked = votes == '1'
    disliked = votes == '0'
    unseen = votes == '?'

    # Float copies: `old` is read throughout the step, `updated` is returned,
    # and the array passed in by the caller is never written to.
    old = np.array(pRGivenZ, dtype=float)
    updated = old.copy()

    # E-step: rho[i, t] is the posterior probability of component i given
    # user t's observed ratings.
    rho = np.zeros((K, T))
    for i in range(K):
        p = old[i, :N]
        # Factor is p for a '1', (1 - p) for a '0' and 1 for anything else.
        factors = np.where(liked, p, np.where(disliked, 1.0 - p, 1.0))
        rho[i] = factors.prod(axis=1) * pZ[i]
    rho /= np.sum(rho, axis=0)

    # M-step. The total responsibility of each component is shared by the
    # prior update and by the denominator of every conditional probability.
    weight = np.sum(rho, axis=1)
    pZ = weight / T

    # A '1' counts fully; a '?' counts with its current probability.
    numer = rho @ liked.astype(float) + old[:K, :N] * (rho @ unseen.astype(float))
    updated[:K, :N] = numer / weight[:, None]

    return pZ, updated

In [5]:
# Initialize P(Z) and P(R|Z)
with open('hw8_probZ_init.txt', 'r') as f:
    # One prior value per line; blank lines are skipped so they cannot reach float().
    pZ = np.array([float(line) for line in f if line.strip()])

with open('hw8_probR_init.txt', 'r') as f:
    # split() with no argument splits on any whitespace run, so neither repeated
    # spaces nor a trailing space before the newline produce an unparsable token.
    pRGivenZ = [[float(val) for val in line.split()] for line in f if line.strip()]

# Transpose the rows read from the file so that pRGivenZ[i, j] is indexed by
# (type of movie goer i, movie j), the layout LogLikelihood and EMUpdate use.
pRGivenZ = np.transpose(np.array(pRGivenZ))

# Initialize number of users, movies, iterations and types of movie goers
iterations = 256
T = len(users)
K = len(pZ)
N = len(pRGivenZ[0])

# Print log likelihood and perform EM Update
for itr in range(iterations + 1):
    # Report at iteration 0 and at every power of two (bit trick: a power of
    # two has a single set bit, so itr & (itr - 1) is zero).
    if itr == 0 or (itr & (itr - 1)) == 0:
        ll = LogLikelihood(T, K, N, ratings, pRGivenZ, pZ)
        print("Iteration ", itr, " Log-likelhood = ", ll)
    # Stop after exactly `iterations` updates so the parameters kept for later
    # cells are the ones whose log-likelihood was reported last.
    if itr < iterations:
        pZ, pRGivenZ = EMUpdate(K, T, N, ratings, pRGivenZ, pZ)

Iteration  0  Log-likelhood =  -28.627324487337628
Iteration  1  Log-likelhood =  -19.350314946503318
Iteration  2  Log-likelhood =  -17.909564818017916
Iteration  4  Log-likelhood =  -17.081155562337013
Iteration  8  Log-likelhood =  -16.629824767528117
Iteration  16  Log-likelhood =  -16.28782872191562
Iteration  32  Log-likelhood =  -15.801537953970273
Iteration  64  Log-likelhood =  -15.749887678844292
Iteration  128  Log-likelhood =  -15.735940712575662
Iteration  256  Log-likelhood =  -15.728520329683299


<h3>Personal recommendation</h3>

In [6]:
# Part F
myId = users.index('A59018415')
myRatings = ratings[myId]

# Posterior over the types of movie goer given this user's observed ratings:
# prior times the likelihood of every '1'/'0' rating, then normalised.
postProb = []
for kind in range(K):
    joint = 1.0
    for movie in range(N):
        vote = myRatings[movie]
        if vote == '1':
            joint = joint * pRGivenZ[kind, movie]
        elif vote == '0':
            joint = joint * (1 - pRGivenZ[kind, movie])
    postProb.append(pZ[kind] * joint)

postProb = np.asarray(postProb) / np.sum(postProb)

# Expected rating of every movie marked '?', highest first
# (ties fall back to reverse title order through the tuple comparison).
unseenMovieScores = sorted(
    (
        (
            sum((pRGivenZ[kind, movie] * postProb[kind] for kind in range(K)), 0.0),
            movies[movie],
        )
        for movie in range(N)
        if myRatings[movie] == '?'
    ),
    reverse=True,
)

df = pd.DataFrame({
    'name': [title for _, title in unseenMovieScores],
    'score': [score for score, _ in unseenMovieScores],
})

print(df)

                                name     score
0                           Parasite  0.868957
1                         La_La_Land  0.793830
2                             Frozen  0.760267
3                   Ready_Player_One  0.757193
4                   Django_Unchained  0.752938
5                     21_Jump_Street  0.751170
6                  Midnight_in_Paris  0.725048
7                   The_Great_Gatsby  0.715400
8                         Black_Swan  0.692148
9                               Room  0.684863
10                    Hidden_Figures  0.676849
11                      Darkest_Hour  0.673471
12                  12_Years_a_Slave  0.668835
13         Pokemon_Detective_Pikachu  0.644331
14                    Les_Miserables  0.625264
15                    Phantom_Thread  0.603146
16                     Pitch_Perfect  0.598900
17   Three_Billboards_Outside_Ebbing  0.590423
18                         Rocketman  0.579052
19                                Us  0.572354
20           