# Recommender systems datasets 

### OLD MovieLens DATASETS

    curl http://files.grouplens.org/datasets/movielens/ml-100k.zip -o ~/recsys/old/ml-100k.zip
    curl http://files.grouplens.org/datasets/movielens/ml-1m.zip   -o ~/recsys/old/ml-1m.zip
    curl http://files.grouplens.org/datasets/movielens/ml-10m.zip  -o ~/recsys/old/ml-10m.zip


### NEW MovieLens DATASETS

    curl http://files.grouplens.org/datasets/movielens/ml-latest-small.zip -o ~/recsys/latest/ml-latest-small.zip
    curl http://files.grouplens.org/datasets/movielens/ml-latest.zip       -o ~/recsys/latest/ml-latest.zip
    curl http://files.grouplens.org/datasets/movielens/ml-20m.zip          -o ~/recsys/latest/ml-20m.zip


# Common imports

In [1]:
import numpy as np
import pandas as pd
import scipy

# Database interactions

In [2]:
from sqlalchemy import create_engine
from sqlalchemy.exc import ResourceClosedError
from sqlalchemy.types import VARCHAR
from functools import partial

def DatabaseConnect(username, password, host, schema):
    conn_str = "mysql+pymysql://{username}:{password}@{host}/{schema}?charset=utf8&use_unicode=1"\
                             .format(username=username, password=password, host=host, schema=schema)
    engine = create_engine(conn_str, pool_recycle=1800)
    return engine

#RecSysConnect = partial(DatabaseConnect, 'recsys', 'RecommenderSystems', 'localhost', 'recsys')
RecSysConnect = partial(DatabaseConnect, 'root', 'mysql-password','localhost','recsys')
e = RecSysConnect()
e

Engine(mysql+pymysql://root:***@localhost/recsys?charset=utf8&use_unicode=1)

### Alternative: the use embedded SQLite database (in Python standard library)

In [17]:
#import sqlite3
#def RecSysConnect(db='e:/recsys/recsys.sqlite'):
#    conn = sqlite3.connect(db)
#    return conn
#e = RecSysConnect()

# Reading data from files

### Old MovieLens 100K

In [3]:
from zipfile import ZipFile
zip_file = ZipFile('./old/ml-100k.zip')

In [4]:
# Ratings
ratings = pd.read_csv(zip_file.open('ml-100k/u.data'), sep='\t', encoding='latin-1', header=None)
ratings.columns = ['userId', 'movieId', 'rating', 'timestamp']
ratings.set_index(['userId','movieId'], inplace=True)
ratings.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,rating,timestamp
userId,movieId,Unnamed: 2_level_1,Unnamed: 3_level_1
196,242,3,881250949
186,302,3,891717742
22,377,1,878887116
244,51,2,880606923
166,346,1,886397596


In [5]:
ratings.to_sql("ml100k_ratings", con=e, if_exists='replace', index=True)

In [6]:
# Genre
genre = pd.read_csv(zip_file.open('ml-100k/u.genre'), sep='|', encoding='latin-1', header=None)
genre.columns = ['genre', 'genreId']
genre.set_index(['genreId'], inplace=True)
genre.head()

Unnamed: 0_level_0,genre
genreId,Unnamed: 1_level_1
0,unknown
1,Action
2,Adventure
3,Animation
4,Children's


In [7]:
genre.to_sql("ml100k_genre", con=e, if_exists='replace', index=True)

In [8]:
# Item
item = pd.read_csv(zip_file.open('ml-100k/u.item'), sep='|', encoding='latin-1', header=None)
item.columns = [ 'movieId', 'movieTitle', 'releaseDate', 'videoReleaseDate', 'IMDbURL', \
                'unknown', 'Action',  'Adventure', 'Animation', \
                "Children's",  'Comedy', 'Crime', 'Documentary', 'Drama',  'Fantasy', \
                'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', \
                'Thriller', 'War', 'Western' ]
item.set_index(['movieId'], inplace=True)
item.head()

Unnamed: 0_level_0,movieTitle,releaseDate,videoReleaseDate,IMDbURL,unknown,Action,Adventure,Animation,Children's,Comedy,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,1,...,0,0,0,0,0,0,0,0,0,0
2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [9]:
item.to_sql("ml100k_item", con=e, if_exists='replace', index=True)

In [10]:
# User
user = pd.read_csv(zip_file.open('ml-100k/u.user'), sep='|', encoding='latin-1', header=None)
user.columns = [ 'userId',  'age', 'gender', 'occupation', 'zipCode' ]
user.set_index(['userId'], inplace=True)
user.head()

Unnamed: 0_level_0,age,gender,occupation,zipCode
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,24,M,technician,85711
2,53,F,other,94043
3,23,M,writer,32067
4,24,M,technician,43537
5,33,F,other,15213


In [11]:
user.to_sql("ml100k_user", con=e, if_exists='replace', index=True)

In [12]:
# Occupation
occ = pd.read_csv(zip_file.open('ml-100k/u.occupation'), sep='|', encoding='latin-1', header=None)
occ.columns = [ 'occupation']
occ.set_index(['occupation'], inplace=True)
occ.head()

administrator
artist
doctor
educator
engineer


In [13]:
occ.to_sql("ml100k_occupation", con=e, if_exists='replace', index=True, dtype={'occupation': VARCHAR(100)})


### Old MovieLens 1M

In [14]:
import io
from zipfile import ZipFile
zip_file = ZipFile('./old/ml-1m.zip')

In [15]:
# Users
data = io.TextIOWrapper(zip_file.open('ml-1m/users.dat','r'))
users = pd.read_csv(data, sep='::', header=None, engine='python') 
users.columns = [ 'userId',  'gender', 'age', 'occupation', 'zipCode' ]
users.set_index(['userId'], inplace=True)
users.to_sql("ml1m_users", con=e, if_exists='replace', index=True)
users.head()

Unnamed: 0_level_0,gender,age,occupation,zipCode
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,F,1,10,48067
2,M,56,16,70072
3,M,25,15,55117
4,M,45,7,2460
5,M,25,20,55455


In [30]:
# Movies
data = io.TextIOWrapper(zip_file.open('ml-1m/movies.dat','r'),encoding='latin-1')
movies = pd.read_csv(data, sep='::', header=None, engine='python') 
movies.columns = [ 'movieId', 'title', 'genres' ]
movies.set_index(['movieId'], inplace=True)
#movies['genres'] = movies['genres'].apply(lambda x: x.split("|"))
movies.to_sql("ml1m_movies", con=e, if_exists='replace', index=True)
movies.head()

Unnamed: 0_level_0,title,genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Toy Story (1995),Animation|Children's|Comedy
2,Jumanji (1995),Adventure|Children's|Fantasy
3,Grumpier Old Men (1995),Comedy|Romance
4,Waiting to Exhale (1995),Comedy|Drama
5,Father of the Bride Part II (1995),Comedy


In [31]:
# Ratings
data = io.TextIOWrapper(zip_file.open('ml-1m/ratings.dat','r'))
ratings = pd.read_csv(data, sep='::', header=None, engine='python') 
ratings.columns = [ 'userId', 'movieId', 'rating', 'timestamp' ]
ratings.set_index([ 'userId', 'movieId' ], inplace=True)
#movies['genres'] = movies['genres'].apply(lambda x: x.split("|"))
ratings.to_sql("ml1m_ratings", con=e, if_exists='replace', index=True)

### Old MovieLens 10M

In [33]:
from zipfile import ZipFile
zip_file = ZipFile('./old/ml-10m.zip')

In [34]:
# Movies
data = io.TextIOWrapper(zip_file.open('ml-10M100K/movies.dat','r'), encoding='utf-8')
movies = pd.read_csv(data, sep='::', header=None, engine='python') 
movies.columns = [ 'movieId', 'title', 'genres' ]
movies.set_index(['movieId'], inplace=True)
#movies['genres'] = movies['genres'].apply(lambda x: x.split("|"))
movies.to_sql("ml10m_movies", con=e, if_exists='replace', index=True)

In [35]:
# Tags
data = io.TextIOWrapper(zip_file.open('ml-10M100K/tags.dat','r'), encoding='utf-8')
tags = pd.read_csv(data, sep='::', header=None, engine='python') 
tags.columns = [ 'userId', 'movieId', 'tag', 'timestamp' ]
tags.set_index([ 'userId', 'movieId' ], inplace=True)
tags.to_sql("ml10m_tags", con=e, if_exists='replace', index=True)

In [36]:
# Ratings: beware - takes several minutes
data = io.TextIOWrapper(zip_file.open('ml-10M100K/ratings.dat','r'), encoding='utf-8')
ratings = pd.read_csv(data, sep='::', header=None, engine='python') 
ratings.columns = [ 'userId', 'movieId', 'rating', 'timestamp' ]
ratings.set_index([ 'userId', 'movieId' ], inplace=True)
ratings.to_sql("ml10m_ratings", con=e, if_exists='replace', index=True)