In [10]:
import pandas as pd
import os
import numpy as np
import scipy
import math
import random
import sklearn
import nltk
import warnings
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from scipy.sparse import csr_matrix
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse.linalg import svds
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import roc_curve, auc

In [11]:
# Lift pandas' display truncation so rendered frames are never elided.
pd.options.display.max_columns = None   # no cap on the number of columns shown
pd.options.display.max_rows = None      # no cap on the number of rows shown
pd.options.display.max_colwidth = None  # never shorten long cell contents

In [2]:
# Directory the notebook is executed from; the data folder is resolved relative to it.
cwd = os.getcwd()
# Path of the "input" data folder, built with os.path.join so the separator
# matches the host OS. The empty final component keeps a trailing separator,
# which lets code that concatenates onto this string keep working.
# NOTE: the name shadows the builtin `input`; it is kept as-is because other
# cells refer to it.
input = os.path.join(cwd, "input", "")

In [3]:
# Load the three input tables. Paths are assembled with os.path.join so the
# directory separator follows the host OS instead of being hard-coded to "\\".
# keywords.csv and movies_metadata.csv are read from the "cleaned" subfolder;
# ratings_small.csv is read from the input folder itself.
df_keywords = pd.read_csv(os.path.join(input, "cleaned", "keywords.csv"))
df_movies = pd.read_csv(os.path.join(input, "cleaned", "movies_metadata.csv"))
df_ratings = pd.read_csv(os.path.join(input, "ratings_small.csv"))

In [9]:
# Preview the first five rows of the keywords table.
df_keywords.head(n=5)

Unnamed: 0,id,keywords
0,862,"jealousy, toy, boy, friendship, friends, rivalry, boy next door, new toy, toy comes to life"
1,8844,"board game, disappearance, based on children's book, new home, recluse, giant insect"
2,15602,"fishing, best friend, duringcreditsstinger, old men"
3,31357,"based on novel, interracial relationship, single mother, divorce, chick flick"
4,11862,"baby, midlife crisis, confidence, aging, daughter, mother daughter relationship, pregnancy, contraception, gynecologist"


In [8]:
# Preview the first five rows of the movies metadata table.
df_movies.head(n=5)

Unnamed: 0,id,title,release_date,adult,belongs_to_collection,genres,runtime,original_language,overview,production_companies,production_countries,spoken_languages,revenue,budget,vote_average,vote_count,popularity
0,862,Toy Story,1995-10-30,False,Toy Story Collection,"Animation, Comedy, Family",81.0,en,"Led by Woody, Andy's toys live happily in his room until Andy's birthday brings Buzz Lightyear onto the scene. Afraid of losing his place in Andy's heart, Woody plots against Buzz. But when circumstances separate Buzz and Woody from their owner, the duo eventually learns to put aside their differences.",Pixar Animation Studios,United States of America,English,373554033.0,30000000,7.7,5415.0,21.946943
1,8844,Jumanji,1995-12-15,False,,"Adventure, Fantasy, Family",104.0,en,"When siblings Judy and Peter discover an enchanted board game that opens the door to a magical world, they unwittingly invite Alan -- an adult who's been trapped inside the game for 26 years -- into their living room. Alan's only hope for freedom is to finish the game, which proves risky as all three find themselves running from giant rhinoceroses, evil monkeys and other terrifying creatures.","TriStar Pictures, Teitler Film, Interscope Communications",United States of America,"English, Français",262797249.0,65000000,6.9,2413.0,17.015539
2,15602,Grumpier Old Men,1995-12-22,False,Grumpy Old Men Collection,"Romance, Comedy",101.0,en,"A family wedding reignites the ancient feud between next-door neighbors and fishing buddies John and Max. Meanwhile, a sultry Italian divorcée opens a restaurant at the local bait shop, alarming the locals who worry she'll scare the fish away. But she's less interested in seafood than she is in cooking up a hot time with Max.","Warner Bros., Lancaster Gate",United States of America,English,0.0,0,6.5,92.0,11.7129
3,31357,Waiting to Exhale,1995-12-22,False,,"Comedy, Drama, Romance",127.0,en,"Cheated on, mistreated and stepped on, the women are holding their breath, waiting for the elusive ""good man"" to break a string of less-than-stellar lovers. Friends and confidants Vannah, Bernie, Glo and Robin talk it all out, determined to find a better way to breathe.",Twentieth Century Fox Film Corporation,United States of America,English,81452156.0,16000000,6.1,34.0,3.859495
4,11862,Father of the Bride Part II,1995-02-10,False,Father of the Bride Collection,Comedy,106.0,en,"Just when George Banks has recovered from his daughter's wedding, he receives the news that she's pregnant ... and that George's wife, Nina, is expecting too. He was planning on selling their home, but that's a plan that -- like George -- will have to change with the arrival of both a grandchild and a kid of his own.","Sandollar Productions, Touchstone Pictures",United States of America,English,76578911.0,0,5.7,173.0,8.387519


In [12]:
# Preview the first five rows of the ratings table.
df_ratings.head(n=5)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [13]:
# Thresholds for the two user groups. They are named once so that the filters
# and the printed labels below cannot drift apart.
MIN_INTERACTIONS = 5  # "enough": at least this many distinct rated movies
FEW_INTERACTIONS = 3  # "few": strictly fewer than this many distinct rated movies

# Distinct movies rated per user. The first groupby collapses repeated
# (userId, movieId) pairs into one entry each; the second counts the remaining
# entries per userId. Despite the `_df` suffix, the result is a Series indexed
# by userId.
users_interactions_count_df = (
    df_ratings.groupby(['userId', 'movieId']).size()
              .groupby('userId').size()
)
print('# users: %d' % len(users_interactions_count_df))

# One-column frame (userId) of the users at or above MIN_INTERACTIONS.
users_with_enough_interactions_df = (
    users_interactions_count_df[users_interactions_count_df >= MIN_INTERACTIONS]
    .reset_index()[['userId']]
)

# One-column frame (userId) of the users below FEW_INTERACTIONS.
# NOTE(review): users whose count lies between FEW_INTERACTIONS and
# MIN_INTERACTIONS - 1 belong to neither group -- confirm this gap is intended.
users_with_few_interactions_df = (
    users_interactions_count_df[users_interactions_count_df < FEW_INTERACTIONS]
    .reset_index()[['userId']]
)

print('# users with at least %d interactions: %d'
      % (MIN_INTERACTIONS, len(users_with_enough_interactions_df)))
print('# users with less than %d interactions: %d'
      % (FEW_INTERACTIONS, len(users_with_few_interactions_df)))

# users: 671
# users with at least 5 interactions: 671
# users with less than 3 interactions: 0


In [17]:
# Restrict the ratings to each user group. The user frames are the right-hand
# side of a right join on userId, so each result is driven by the users listed
# in that frame.
interactions_from_selected_users_df = pd.merge(
    df_ratings,
    users_with_enough_interactions_df,
    on='userId',
    how='right',
)

interactions_from_few_selected_users_df = pd.merge(
    df_ratings,
    users_with_few_interactions_df,
    on='userId',
    how='right',
)

# Report how many rating rows each subset holds next to the full table.
print('# of interactions from all users: %d' % len(df_ratings))
print('# of interactions from users with at least 5 interactions: %d'
      % len(interactions_from_selected_users_df))
print('# of interactions from users with less than 3 interactions: %d'
      % len(interactions_from_few_selected_users_df))

# of interactions from all users: 100004
# of interactions from users with at least 5 interactions: 100004
# of interactions from users with less than 3 interactions: 0
