In [2]:
# import necessary libraries
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline



import pyspark
from pyspark.sql.types import *
from pyspark.ml.tuning import TrainValidationSplit
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator


# Build our Spark Session and Context
spark = pyspark.sql.SparkSession.builder.getOrCreate()
sc = spark.sparkContext
spark, sc


from pyspark.sql.functions import lit
from pyspark.sql.types import StructType, StructField, IntegerType
from pyspark.sql.functions import countDistinct, col


In [8]:
def get_ratings_data():
    '''
    Returns:
        ratings_contents (dataframe): has columns "user", "movie", "rating"
        ratings_as_mat (sparse matrix): rows correspond to users and columns correspond
        to movies. Each element is the user's rating for that movie.
    '''
    ratings_contents = pd.read_table("data/movies.dat",
                                     names=["user", "movie", "rating", 'timestamp'])
    ratings_as_mat = sparse.csr_matrix((ratings_contents.rating, 
        ((ratings_contents.user), (ratings_contents.movie))))
    return ratings_contents, ratings_as_mat

In [21]:
# ratings_contents, ratings_as_mat = get_ratings_data()

In [22]:
# schema = StructType([
#     StructField("user", IntegerType(), True),
#     StructField("movie", IntegerType(), True),
#     StructField("rating", IntegerType(), True),
#     StructField("timestamp", IntegerType(), True)])



# df_ratings = spark.read.csv('data/movies.dat',delimiter='::', header=False, schema=schema)

In [23]:
#df_ratings.show()

In [26]:
movies = pd.read_csv('data/movies.dat', sep='::', names=['movie', 'name', 'genre'])

  """Entry point for launching an IPython kernel.


In [27]:
movies.movies == 

Unnamed: 0,movie,name,genre
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy
5,6,Heat (1995),Action|Crime|Thriller
6,7,Sabrina (1995),Comedy|Romance
7,8,Tom and Huck (1995),Adventure|Children's
8,9,Sudden Death (1995),Action
9,10,GoldenEye (1995),Action|Adventure|Thriller


In [43]:
movies['genre'].unique()

array(["Animation|Children's|Comedy", "Adventure|Children's|Fantasy",
       'Comedy|Romance', 'Comedy|Drama', 'Comedy',
       'Action|Crime|Thriller', "Adventure|Children's", 'Action',
       'Action|Adventure|Thriller', 'Comedy|Drama|Romance',
       'Comedy|Horror', "Animation|Children's", 'Drama',
       'Action|Adventure|Romance', 'Drama|Thriller', 'Drama|Romance',
       'Thriller', 'Action|Comedy|Drama', 'Crime|Drama|Thriller',
       'Drama|Sci-Fi', 'Romance', 'Adventure|Sci-Fi', 'Adventure|Romance',
       "Children's|Comedy|Drama", 'Documentary', 'Drama|War',
       'Action|Crime|Drama', 'Action|Adventure', 'Crime|Thriller',
       "Animation|Children's|Musical|Romance", 'Action|Drama|Thriller',
       "Children's|Comedy", 'Drama|Mystery', 'Sci-Fi|Thriller',
       'Action|Comedy|Crime|Horror|Thriller', 'Drama|Musical',
       'Crime|Drama|Romance', 'Adventure|Drama', 'Action|Thriller',
       "Adventure|Children's|Comedy|Musical", 'Action|Drama|War',
       'Action|Adventur

In [28]:
df = pd.read_csv('data/training.csv')

In [29]:
df.head()

Unnamed: 0,user,movie,rating,timestamp
0,6040,858,4,956703932
1,6040,593,5,956703954
2,6040,2384,4,956703954
3,6040,1961,4,956703977
4,6040,2019,5,956703977


In [30]:
df.describe()

Unnamed: 0,user,movie,rating,timestamp
count,800000.0,800000.0,800000.0,800000.0
mean,3403.097837,1849.257256,3.590479,968392100.0
std,1546.589028,1086.852485,1.120376,5820931.0
min,636.0,1.0,1.0,956703900.0
25%,2035.0,1028.0,3.0,964152400.0
50%,3507.0,1788.0,4.0,967587800.0
75%,4695.0,2750.0,4.0,974687800.0
max,6040.0,3952.0,5.0,975767300.0


In [32]:
pivot_df = pd.pivot_table(df, values='rating', index='user', columns='movie' )

In [33]:
pivot_df

movie,1,2,3,4,5,6,7,8,9,10,...,3943,3944,3945,3946,3947,3948,3949,3950,3951,3952
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
636,,,,,,,,,,,...,,,,,,,,,,
637,5.0,,,,,,,,,,...,,,,,,,,,,
638,,,,,,,,,,,...,,,,,,,,,,
639,,,,,,,,,,,...,,,,,,,,,,
640,,,,,,4.0,,,,,...,,,,,,,,,,
641,,4.0,,,,,,,,,...,,,,,,,,,,
642,,,,,,,,,,,...,,,,,,,,,,
643,,,,,,,,,,,...,,,,,,,,,,
644,,,,,,,,,,,...,,,,,,4.0,,,,
645,5.0,,,,,,,,,,...,,,,,,,,,,


In [37]:
users = pd.read_csv('data/users.dat', sep='::', names=['user', 'gender', 'age', 'occupation', 'zipcode'])

  """Entry point for launching an IPython kernel.


In [42]:
users

Unnamed: 0,user,gender,age,occupation,zipcode
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,02460
4,5,M,25,20,55455
5,6,F,50,9,55117
6,7,M,35,1,06810
7,8,M,25,12,11413
8,9,M,25,17,61614
9,10,F,35,1,95370


In [44]:
from sklearn.preprocessing import MultiLabelBinarizer

In [45]:
keywords = pd.read_csv('data/keywords.csv')

In [54]:
keys = pd.read_json('data/keywords.csv')

ValueError: Expected object or value

In [49]:
keywords.head()

Unnamed: 0,id,keywords
0,862,"[{'id': 931, 'name': 'jealousy'}, {'id': 4290,..."
1,8844,"[{'id': 10090, 'name': 'board game'}, {'id': 1..."
2,15602,"[{'id': 1495, 'name': 'fishing'}, {'id': 12392..."
3,31357,"[{'id': 818, 'name': 'based on novel'}, {'id':..."
4,11862,"[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n..."


In [53]:
movies[movies.movie == 862]

Unnamed: 0,movie,name,genre
851,862,Manny & Lo (1996),Drama


In [50]:
keywords.describe()

Unnamed: 0,id
count,46419.0
mean,109769.951873
std,113045.780256
min,2.0
25%,26810.5
50%,61198.0
75%,159908.5
max,469172.0


In [57]:
keywords['keywords'][0]

"[{'id': 931, 'name': 'jealousy'}, {'id': 4290, 'name': 'toy'}, {'id': 5202, 'name': 'boy'}, {'id': 6054, 'name': 'friendship'}, {'id': 9713, 'name': 'friends'}, {'id': 9823, 'name': 'rivalry'}, {'id': 165503, 'name': 'boy next door'}, {'id': 170722, 'name': 'new toy'}, {'id': 187065, 'name': 'toy comes to life'}]"

In [62]:
movies.head()

Unnamed: 0,movie,name,genre
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [70]:
import json
import ast 
from functools import reduce

In [None]:
reduce()

In [75]:
def word(w):
    return w

In [76]:
x = reduce(word, map(lambda x: x.get('name'), ast.literal_eval(keywords['keywords'][0])))

TypeError: word() takes 1 positional argument but 2 were given

In [82]:
def plot_parser(row, key='name'):
    return list(map(lambda x: x.get(key), ast.literal_eval(row)))
    
    

In [88]:
movies.head(11)

Unnamed: 0,movie,name,genre
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy
5,6,Heat (1995),Action|Crime|Thriller
6,7,Sabrina (1995),Comedy|Romance
7,8,Tom and Huck (1995),Adventure|Children's
8,9,Sudden Death (1995),Action
9,10,GoldenEye (1995),Action|Adventure|Thriller


In [85]:
keywords = keywords.keywords.apply(plot_parser)

In [91]:
keys = pd.DataFrame(keywords)

In [93]:
keys.to_csv('data/keys.csv')

In [80]:
list(map(lambda x: x.get('name'), ast.literal_eval(keywords['keywords'][0])))

['jealousy',
 'toy',
 'boy',
 'friendship',
 'friends',
 'rivalry',
 'boy next door',
 'new toy',
 'toy comes to life']

In [94]:
keys

Unnamed: 0,keywords
0,"[jealousy, toy, boy, friendship, friends, riva..."
1,"[board game, disappearance, based on children'..."
2,"[fishing, best friend, duringcreditsstinger, o..."
3,"[based on novel, interracial relationship, sin..."
4,"[baby, midlife crisis, confidence, aging, daug..."
5,"[robbery, detective, bank, obsession, chase, s..."
6,"[paris, brother brother relationship, chauffeu..."
7,[]
8,"[terrorist, hostage, explosive, vice president]"
9,"[cuba, falsely accused, secret identity, compu..."


In [None]:
keywords.keywords.

In [55]:
pd.DataFrame(keywords.keywords.values)

Unnamed: 0,0
0,"[{'id': 931, 'name': 'jealousy'}, {'id': 4290,..."
1,"[{'id': 10090, 'name': 'board game'}, {'id': 1..."
2,"[{'id': 1495, 'name': 'fishing'}, {'id': 12392..."
3,"[{'id': 818, 'name': 'based on novel'}, {'id':..."
4,"[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n..."
5,"[{'id': 642, 'name': 'robbery'}, {'id': 703, '..."
6,"[{'id': 90, 'name': 'paris'}, {'id': 380, 'nam..."
7,[]
8,"[{'id': 949, 'name': 'terrorist'}, {'id': 1562..."
9,"[{'id': 701, 'name': 'cuba'}, {'id': 769, 'nam..."
