In [9]:
import pandas as pd
from datetime import datetime
import numpy as np

In [10]:
path_ratings = "/home/codebind/movieLens/data/ml-100k/u.data"
path_movies = "/home/codebind/movieLens/data/ml-100k/u.item"
path_users = "/home/codebind/movieLens/data/ml-100k/u.user"

ratings_data = pd.read_csv(path_ratings, sep = '\t', header = None)
movies_data = pd.read_csv(path_movies, sep = '|', header = None, engine = 'python', encoding ="iso-8859-1")
users_data = pd.read_csv(path_users, sep = '|', header = None)

In [11]:
movies_header= ["Movie Id", "Movie Title", "Release Date", "Video Release Date",
                  "IMBd Url", "unknown", "Action", "Adventure", "Animation",
                   "Childen's", "Comedy", "Crime", "Documentary", "Drama", "Fantasy",
                   "Film-Noir", "Horror", "Musical", "Mystery", "Romance", "Sci-fi",
                   "Thriller", "War", "Western"]
ratings_header= ["User Id", "Movie Id", "Rating", "Timestamp"]
users_header= ["User Id", "Age", "Gender", "Occupation", "Zip Code"]

In [12]:
ratings_data.columns = ratings_header
movies_data.columns = movies_header
users_data.columns = users_header

# Print top 10 movies  that recieved most ratings sort by number of ratings

In [16]:
movies = movies_data[["Movie Id", "Movie Title"]]
group = ratings_data[["Movie Id", "Rating"]].groupby("Movie Id")
mostRatings= group.count().nlargest(10,"Rating")
pd.merge(mostRatings, movies, on="Movie Id")

Unnamed: 0,Movie Id,Rating,Movie Title
0,50,583,Star Wars (1977)
1,258,509,Contact (1997)
2,100,508,Fargo (1996)
3,181,507,Return of the Jedi (1983)
4,294,485,Liar Liar (1997)
5,286,481,"English Patient, The (1996)"
6,288,478,Scream (1996)
7,1,452,Toy Story (1995)
8,300,431,Air Force One (1997)
9,121,429,Independence Day (ID4) (1996)


# Print a  list of the number of ratings by each genre

In [21]:
merged = pd.merge(ratings_data, movies_data, left_on="Movie Id", right_on="Movie Id").reindex()
merged[merged.columns[8:]].sum()

unknown           10
Action         25589
Adventure      13753
Animation       3605
Childen's       7182
Comedy         29832
Crime           8055
Documentary      758
Drama          39895
Fantasy         1352
Film-Noir       1733
Horror          5317
Musical         4954
Mystery         5245
Romance        19461
Sci-fi         12730
Thriller       21872
War             9398
Western         1854
dtype: int64

# Print the oldest movie with a "5" rating

In [28]:
ratings = ratings_data[["Movie Id", "Rating"]]
merged = pd.merge(ratings, movies_data, left_on="Movie Id", right_on="Movie Id").reindex()
merged= merged[["Movie Id","Movie Title","Rating", "Release Date"]]
merged=merged.loc[merged["Rating"]==5]
merged["Release Date"]=pd.to_datetime(merged["Release Date"])
merged= merged.sort_values(by="Release Date")
print(merged["Movie Title"].iloc[1])

Nosferatu (Nosferatu, eine Symphonie des Grauens) (1922)


# Print a list of  the genres of the top 10 most rated movies

In [39]:
merged=pd.merge(mostRatings, movies_data,left_on="Movie Id", right_on="Movie Id")
genres= merged.loc[:,"unknown":"Western"]
merged["Genres"]= 1

movies= ratings_data["Movie Id"].value_counts().index[:10]
for i in movies:
    title= movies_data[movies_data["Movie Id"]== i]["Movie Title"].values[0]
    genres= movies_data[movies_data["Movie Id"]==i].iloc[:,5:]
    list= genres.columns[(genres ==1).values[0]]
    print("{:30} genres: {}" .format(title, ",".join(list)))

Star Wars (1977)               genres: Action,Adventure,Romance,Sci-fi,War
Contact (1997)                 genres: Drama,Sci-fi
Fargo (1996)                   genres: Crime,Drama,Thriller
Return of the Jedi (1983)      genres: Action,Adventure,Romance,Sci-fi,War
Liar Liar (1997)               genres: Comedy
English Patient, The (1996)    genres: Drama,Romance,War
Scream (1996)                  genres: Horror,Thriller
Toy Story (1995)               genres: Animation,Childen's,Comedy
Air Force One (1997)           genres: Action,Thriller
Independence Day (ID4) (1996)  genres: Action,Sci-fi,War


# Print the title of the movie that was rated the most by students

In [44]:
students= users_data.loc[users_data["Occupation"]=="student"]["User Id"]
ratings=ratings_data[ratings_data["User Id"].isin(students)]
movie=int(ratings_data["Movie Id"].value_counts().head(1))
movies_data.loc[movies_data["Movie Id"]== movie]["Movie Title"]

582    Romeo Is Bleeding (1993)
Name: Movie Title, dtype: object

# Print the list of movies that recieved the highest numbber of "5" rating

In [45]:
highest5= ratings_data.loc[ratings_data["Rating"]==5]["Movie Id"].value_counts().index
for i in highest5[:10]:
    print(movies_data[movies_data["Movie Id"]== i]["Movie Title"].values[0])

Star Wars (1977)
Fargo (1996)
Godfather, The (1972)
Raiders of the Lost Ark (1981)
Pulp Fiction (1994)
Schindler's List (1993)
Silence of the Lambs, The (1991)
Titanic (1997)
Empire Strikes Back, The (1980)
Return of the Jedi (1983)


# Print list of zip codes correspond to the highest number of users that rated movies

In [53]:
user= set(ratings_data["User Id"])
rated_user= users_data[users_data["User Id"].isin(user)]
group= rated_user.groupby ("Zip Code")["User Id"].count()
zip= group.nlargest(10)
print(zip)

Zip Code
55414    9
55105    6
10003    5
20009    5
55337    5
27514    4
55408    4
55454    4
02215    3
10021    3
Name: User Id, dtype: int64


# Find the most rated movie by users in the age group 20 to 25

In [55]:
users = users_data[users_data["Age"].between(20,25)]
user_id= users["User Id"]
ratings= ratings_data[ratings_data["User Id"].isin(user_id)]
movie_id= ratings["Movie Id"].value_counts().index[0]
movie= movies_data[movies_data["Movie Id"]==movie_id]["Movie Title"]
print(movie)

287    Scream (1996)
Name: Movie Title, dtype: object


# Print the list of movies that were rated after year 1960

In [61]:
ratings_data["Date"]= ratings_data["Timestamp"].apply(lambda x: datetime.fromtimestamp(x))
ratings= ratings_data[ratings_data["Date"] . datetime(1960,12,31)]
movie_Ids=ratings["Movie Id"]
movies=movies_data[movies_data["Movie Id"].isin(movie_Ids)]
print(movies["Movie Title"][:10])

AttributeError: 'Series' object has no attribute 'datetime'