Keep users with ≥20 ratings
Keep top 10k most-rated movies
Convert ratings ≥4 → interaction = 1

In [61]:
import pandas as pd
import math

In [None]:
df_movies = pd.read_csv("ml-25m/movies.csv", index_col="movieId")
df_ratings = pd.read_csv("ml-25m/ratings.csv")
print(df_movies.shape)
print(df_ratings.shape)

(62423, 2)
(25000095, 4)


In [5]:
df_ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510


In [6]:
counts = {}
for row in df_ratings.itertuples():
    id = row.movieId
    if id in counts:
        counts[id]+=1
    else:
        counts[id] = 1

In [7]:
freq = []
for id in counts:
    freq.append((id,counts[id]))

In [8]:
sorted_data = sorted(freq, key=lambda x: x[1], reverse=True)
print(f"Total no of rated movies: {len(sorted_data)}")

Total no of rated movies: 59047


In [9]:
# For now, we're only going to be working with the top 5000 movies, reducing our database
movies = sorted_data[:5000]
print(movies)

[(356, 81491), (318, 81482), (296, 79672), (593, 74127), (2571, 72674), (260, 68717), (480, 64144), (527, 60411), (110, 59184), (2959, 58773), (589, 57379), (1196, 57361), (1, 57309), (4993, 55736), (50, 55366), (1210, 54917), (1198, 54675), (2858, 53689), (858, 52498), (5952, 51138), (7153, 50797), (47, 50596), (457, 49638), (1270, 49595), (780, 48902), (150, 48377), (608, 47823), (32, 47054), (2028, 46783), (2762, 46713), (3578, 44656), (592, 44110), (588, 43373), (364, 42745), (4306, 42303), (380, 41673), (590, 41615), (58559, 41519), (377, 41302), (4226, 41195), (1580, 40308), (1704, 38947), (79132, 38895), (1240, 38612), (1291, 37908), (1197, 37863), (1136, 37723), (1721, 37712), (1265, 37616), (344, 37453), (6539, 37227), (648, 37035), (1036, 36716), (541, 36702), (1214, 36357), (1193, 36058), (595, 35723), (6377, 34712), (1089, 34634), (367, 34621), (1097, 34602), (4886, 34572), (4973, 34320), (597, 34234), (500, 34194), (1221, 34188), (6874, 33827), (1682, 33731), (165, 33731),

In [10]:
i = 0
for movie in movies:
    if i > 10:
        break
    else:
        print(df_movies.loc[movie[0], "title"])
    i+=1
 

Forrest Gump (1994)
Shawshank Redemption, The (1994)
Pulp Fiction (1994)
Silence of the Lambs, The (1991)
Matrix, The (1999)
Star Wars: Episode IV - A New Hope (1977)
Jurassic Park (1993)
Schindler's List (1993)
Braveheart (1995)
Fight Club (1999)
Terminator 2: Judgment Day (1991)


let's make a new sampled down csv which only contains entries from our top 5k movies

In [22]:
top_k = df_ratings[df_ratings["movieId"].isin([x[0] for x in movies])]

top_k.to_csv("top5kratings.csv", index=False)

In [23]:
# now top_k contains all the ratings related to the first top5k movies
top_k.shape

(23062613, 4)

### Converting rationgs into interactions 

| rating | interaction |
| ------ | ----------- |
| ≥ 4    | 1           |
| < 4   | ignore      |


In [24]:
top_k = top_k[top_k["rating"] >= 4].copy()
top_k["rating"] = 1

Removing all users who have rated less than 8 films

In [25]:
MIN_LIKES = 8 
user_like_counts = top_k.groupby("userId").size()

valid_users = user_like_counts[user_like_counts >= MIN_LIKES].index
top_k = top_k[top_k["userId"].isin(valid_users)]

top_k.to_csv("top5kratings.csv", index=False)

print("Remaining users:", top_k["userId"].nunique())

Remaining users: 157188


for each user, we need to make pairs of all the movies he likes.
If there are n users, and each users likes m movies. Then we are left with $ n * (m) * (m-1) * 0.5$ pairs 

We can compress this pairs by building a matrix of movies in which 

$ mat[i][j] = count $

shows how many times that pair occurs.

In [None]:
cooccur = {}
# if i < j 
# we don't care about the users at this point, we just want co-occurence of movies together.

# for a particular user, make list of all likes pairs

# for index, row in top_k.iterrows():
    


In [42]:
rows, cols = top_k.shape

top_k = top_k.reset_index(drop=True)
top_k.loc[2]

userId                1
movieId             665
rating                1
timestamp    1147878820
Name: 2, dtype: int64

In [None]:
# This creates a Series where the index is userId and the value is a list of movieIds
user_movie_lists = top_k.groupby("userId")["movieId"].apply(list)

pandas.core.series.Series

In [49]:
cooccur = {}

for user_id, movies in user_movie_lists.items():
    movies = list(set(movies))
    for i in range(len(movies)):
        for j in range(i+1, len(movies)):

            mini = min(movies[i], movies[j])
            maxi = max(movies[i], movies[j])

            if mini not in cooccur:
                cooccur[mini] = {}

            if maxi not in cooccur[mini]:
                cooccur[mini][maxi] = 0

            cooccur[mini][maxi] += 1


In [60]:
sum = 0
for i in cooccur:
    sum += len(cooccur[i])

print(sum)

11897138


Using Pointwise Mutual Information to remove popularity bias

In [78]:
movie_likes = {}
for user, movies in user_movie_lists.items():
    for m in set(movies):          # deduplicated per user
        movie_likes[m] = movie_likes.get(m, 0) + 1



In [76]:
TOTAL_USERS = len(user_movie_lists)
TOTAL_USERS

157188

In [79]:
pmi_graph = {}

for i in cooccur:
    pmi_graph[i] = {}
    for j, cij in cooccur[i].items():
        val = math.log((cij * TOTAL_USERS) /
                       (movie_likes[i] * movie_likes[j]))

        if val > 0:                     # keep only meaningful associations
            pmi_graph[i][j] = val
