In [1]:
import pandas as pd
from datetime import datetime
import numpy as np
pd.options.display.max_columns = None

In [82]:
ratings_path = "/home/giedrius/Documents/ml-100k/u.data"
movies_path = "/home/giedrius/Documents/ml-100k/u.item"
user_path = "/home/giedrius/Documents/ml-100k/u.user"

ratings_df = pd.read_csv(ratings_path, sep = '\t', header = None)
movies_df = pd.read_csv(movies_path, sep = "|", header = None, encoding ="iso-8859-1")
user_df = pd.read_csv(user_path, sep = '|', header = None)

ratings_df.columns = ["user id", "item id", "rating", "timestamp"]
user_df.columns = ["user id", "age","gender", "occupation", "zip code"]
movies_df.columns =  ['movie id', 'movie title', 'release date', 'video release date', 'IMDb URL', 'unknown',
 'Action', 'Adventure', 'Animation', "Children's", 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir',
 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']


movies_df['release date'] = movies_df['release date'].apply(lambda x: datetime.strptime(x,'%d-%b-%Y') if isinstance(x, str) else x)
ratings_df["date"]= ratings_df["timestamp"].apply(lambda x: datetime.fromtimestamp(x))



### 1) and 2)Print a list of the 10 movies that received the most number of ratings, sorted by the number of ratings. 

In [3]:
movies_ids = ratings_df["item id"].value_counts().index[:10]
df_1 = movies_df[movies_df['movie id'].isin(movies_ids)].reset_index()
list(df_1["movie title"])

['Toy Story (1995)',
 'Star Wars (1977)',
 'Fargo (1996)',
 'Independence Day (ID4) (1996)',
 'Return of the Jedi (1983)',
 'Contact (1997)',
 'English Patient, The (1996)',
 'Scream (1996)',
 'Liar Liar (1997)',
 'Air Force One (1997)']

### 3) Print a list of the number of ratings received by each genre. 

In [4]:
df = pd.merge(ratings_df, movies_df, left_on="item id", right_on="movie id").reindex()
df[df.columns[9:]].sum()

unknown           10
Action         25589
Adventure      13753
Animation       3605
Children's      7182
Comedy         29832
Crime           8055
Documentary      758
Drama          39895
Fantasy         1352
Film-Noir       1733
Horror          5317
Musical         4954
Mystery         5245
Romance        19461
Sci-Fi         12730
Thriller       21872
War             9398
Western         1854
dtype: int64

### 4) Print the oldest movie with a “5” rating. 

In [78]:
movies_ids_5 = set(ratings_df[ratings_df["rating"] == 5]["item id"].values)
five_star_movies = movies_df[movies_df['movie id'].isin(movies_ids_5)]
five_star_movies.sort_values("release date",ascending=True)["movie title"].values[0]

'Nosferatu (Nosferatu, eine Symphonie des Grauens) (1922)'

### 5) Print a list of the genre of the top 10 most rated movies. 

In [7]:
for i in df_1.index:
    genres_row = df_1.iloc[i,5:]
    genres_list = list(genres_row[genres_row == 1].index)
    print("{:30} genres : {}".format(df_1["movie title"][i],", ".join(genres_list)))

Toy Story (1995)               genres : Animation, Children's, Comedy
Star Wars (1977)               genres : Action, Adventure, Romance, Sci-Fi, War
Fargo (1996)                   genres : Crime, Drama, Thriller
Independence Day (ID4) (1996)  genres : Action, Sci-Fi, War
Return of the Jedi (1983)      genres : Action, Adventure, Romance, Sci-Fi, War
Contact (1997)                 genres : Drama, Sci-Fi
English Patient, The (1996)    genres : Drama, Romance, War
Scream (1996)                  genres : Horror, Thriller
Liar Liar (1997)               genres : Comedy
Air Force One (1997)           genres : Action, Thriller


### 6) Print the title of the movie that was rated the most by students 

In [8]:
student_user_ids = user_df[user_df["occupation"] == "student"]["user id"]
student_ratings = ratings_df[ratings_df["user id"].isin(student_user_ids)]
stud_movie = int(ratings_df["item id"].value_counts().head(1))
movies_df[movies_df['movie id'] == stud_movie]["movie title"].values[0]

'Romeo Is Bleeding (1993)'

### 7) Print the list of movies that received the highest number of “5” rating 

In [17]:
movie_ids = df[df["rating"] == 5]["item id"].value_counts()
print(movies_df[movies_df['movie id'].isin(movie_ids)]["movie title"][:10])

0                                     Toy Story (1995)
1                                     GoldenEye (1995)
2                                    Four Rooms (1995)
3                                    Get Shorty (1995)
4                                       Copycat (1995)
5    Shanghai Triad (Yao a yao yao dao waipo qiao) ...
6                                Twelve Monkeys (1995)
7                                          Babe (1995)
8                              Dead Man Walking (1995)
9                                   Richard III (1995)
Name: movie title, dtype: object


### 8) Print the list of zip codes corresponding to the highest number of users that rated movies. 

In [45]:
user_ids = set(ratings_df["user id"])
rated_users = user_df[user_df['user id'].isin(user_ids)] #all users have rated at least one movie
grouped = rated_users.groupby("zip code")["user id"].count()
grouped.nlargest(10)

zip code
55414    9
55105    6
10003    5
20009    5
55337    5
27514    4
55408    4
55454    4
02215    3
10021    3
Name: user id, dtype: int64

### 9) Find the most rated movie by users in the age group 20 to 25. 

In [62]:
user_ids = user_df[user_df["age"].between(20, 25)]["user id"]
movie_id = ratings_df[ratings_df["user id"].isin(user_ids)]["item id"].value_counts().index[0]
movies_df[movies_df["movie id"] == movie_id]["movie title"].values[0]

'Scream (1996)'

### 10) Print the list of movies that were rate after year 1960. 

In [100]:
movie_ids = ratings_df[ratings_df["date"] > datetime(1960,12,31)]["item id"] #all movies rated in 1997-1978
movies_df[movies_df["movie id"].isin(movie_ids)]["movie title"][:10]

0                                     Toy Story (1995)
1                                     GoldenEye (1995)
2                                    Four Rooms (1995)
3                                    Get Shorty (1995)
4                                       Copycat (1995)
5    Shanghai Triad (Yao a yao yao dao waipo qiao) ...
6                                Twelve Monkeys (1995)
7                                          Babe (1995)
8                              Dead Man Walking (1995)
9                                   Richard III (1995)
Name: movie title, dtype: object