# Week 5

# Getting Started with Pandas (Part 2)
# Data Analysis with Data Frames

This week, we will look into a data set about movies and use data frames to perform data analysis.

In [None]:
import pandas as pd

## 1. Download and extract MovieLens data

- [MovieLens](https://grouplens.org/datasets/movielens/) is a web-based recommender system and virtual community that recommends movies for its users to watch, based on their film preferences using collaborative filtering of members' movie ratings and movie reviews.
- We will look at its `ml-latest-small` dataset.

In [None]:
# Download ml-latest-small.zip from https://grouplens.org/datasets/movielens/
import os 
import urllib.request
url = "https://files.grouplens.org/datasets/movielens/ml-latest-small.zip"

file_path = "Data/" # This is a relative path from the default folder
zip_file = "ml-latest-small.zip"

# Create the Data folder if it doesn't already exist.
# makedirs(exist_ok=True) does nothing when the folder is already there, so
# this cell can be re-run safely. (os.path.isfile() is always False for a
# directory, so it cannot be used to detect an existing folder.)
os.makedirs(file_path, exist_ok=True)

# Download file
urllib.request.urlretrieve(url, file_path + zip_file)

In [None]:
# Decompress the zip file
import zipfile
# The with-block closes the archive automatically, even if extraction fails.
with zipfile.ZipFile(file_path + zip_file, "r") as f:
    f.printdir()             # list the archive's contents
    f.extractall(file_path)  # unpack everything into the Data folder

In [None]:
# Check that the files have been unzipped
print(os.listdir(file_path + "ml-latest-small"))

In [None]:
# Delete the zip file (its contents were extracted into file_path earlier)
os.remove(file_path + zip_file)

In [None]:
# Verify that the zip file is gone: it should no longer appear in this listing.
os.listdir(file_path)

In [None]:
# Write a function to download and decompress file
import os
import urllib.request
import zipfile

def get_movielens(file_path, file_name, delete_zip_file=False):
    """Download a MovieLens zip archive (if not already present) and extract it.

    Parameters
    ----------
    file_path : str
        Folder that receives the archive and the extracted files. It is
        created when missing. A trailing path separator is optional.
    file_name : str
        Name of the archive on files.grouplens.org,
        e.g. "ml-latest-small.zip".
    delete_zip_file : bool, default False
        If True, remove the archive once its contents have been extracted.

    Returns
    -------
    None
        Progress and the archive listing are printed to standard output.
    """
    url = "https://files.grouplens.org/datasets/movielens/" + file_name
    # os.path.join works whether or not file_path ends with a separator.
    zip_path = os.path.join(file_path, file_name)

    # An empty file_path means "current folder": nothing to create.
    if file_path:
        os.makedirs(file_path, exist_ok=True)

    # Skip the download when the archive is already on disk.
    if not os.path.exists(zip_path):
        urllib.request.urlretrieve(url, zip_path)
        print("File", file_name, "downloaded.")

    with zipfile.ZipFile(zip_path, "r") as f:
        f.extractall(file_path)
        print("Files extracted:")
        # printdir() prints the listing itself and returns None, so it must
        # not be wrapped in print().
        f.printdir()

    if delete_zip_file:
        os.remove(zip_path)

In [None]:
# Fetch and extract the dataset using the helper function defined above.
get_movielens(file_path, zip_file)
# Remember file_path = "Data/" and zip_file = "ml-latest-small.zip"

## 2. Load ratings data as a pandas DataFrame
- Load csv file as a DataFrame
- head()
- shape, columns, dtypes
- indexing with loc[]
- Handle date and time data
- Data transformation with apply()

In [None]:
import pandas as pd

# read_csv() parses the CSV file into a DataFrame
ratings_df = pd.read_csv("Data/ml-latest-small/ratings.csv")
ratings_df.head()  # Show the first several rows of the dataset

In [None]:
# Ex: How many records are there?



In [None]:
# Ex: Display the types of each feature



In [None]:
# Convert time stamp to a readable format
from datetime import datetime

# Example: convert the first time stamp
timestamp = ratings_df.loc[0, "timestamp"]  # row label 0, column "timestamp"
print(timestamp)

# fromtimestamp() without a tz argument returns a naive datetime expressed in
# the machine's local time zone, so the printed value can differ between computers.
dt = datetime.fromtimestamp(timestamp)
print(dt)

print(dt.year)
print(dt.month)

# Assigning to a column that does not exist yet creates it. Only row 0 gets a
# value here; every other row of "Year" is left as NaN.
ratings_df.loc[0, "Year"] = dt.year
# The other components could be stored the same way:
# # ratings_df.loc[0, "Month"] = dt.month
# # ratings_df.loc[0, "Day"] = dt.day
# # ratings_df.loc[0, "Hour"] = dt.hour
# # ratings_df.loc[0, "Minute"] = dt.minute
# # ratings_df.loc[0, "Second"] = dt.second
ratings_df.head()

In [None]:
# Use apply() to apply a function to all rows
# (the function is passed uncalled; apply() calls it once per value in the column)
ratings_df["DT"] = ratings_df["timestamp"].apply(datetime.fromtimestamp)
ratings_df.head()

We can also create functions to fill the year, month, ... columns.

In [None]:
# Ex: Fill the year column



Alternatively, we can use `pd.to_datetime()` for the conversion. [Link](https://pandas.pydata.org/docs/reference/api/pandas.to_datetime.html)

In [None]:
# Ex: Use pd.to_datetime() to perform the conversion.



## 3. Save the processed ratings data

Data can be saved in many different formats. Today we will save the processed rating data as a new CSV file.

In [None]:
# Save the processed data as a .csv file
csv_file = "Data/ml-latest-small/ratings_processed.csv"
# index=False leaves the row index out of the file, so reloading it with
# read_csv() does not produce an extra "Unnamed: 0" column.
ratings_df.to_csv(csv_file, index=False)

In [None]:
# Ex: Load the saved file and display its first 5 rows.



## 4. Analyzing MovieLens Data
- Distribution of movie ratings
- Popular movies

In [None]:
# Use value_counts() to find the rating frequencies
# (the result is ordered by count, most frequent rating first)
rating_frequencies = ratings_df['rating'].value_counts()
print(rating_frequencies)

In [None]:
# Find the rating frequencies sorted by rating value, highest rating first
rating_frequencies = ratings_df['rating'].value_counts().sort_index(ascending=False)
print(rating_frequencies)

In [None]:
# Visualize rating frequencies as a histogram
ax = ratings_df['rating'].hist()
# Label the figure so it can be understood on its own; the trailing
# semicolon hides the text representation of the return value.
ax.set(title="Distribution of ratings", xlabel="Rating", ylabel="Number of ratings");

In [None]:
# When were these ratings created?
import numpy as np

# NOTE: these summaries are only meaningful once the "Year" column has been
# filled for every row (see the "Fill the year column" exercise above).
print("Earliest year of rating:", ratings_df['Year'].min())

print("Latest year of rating:", np.max(ratings_df['Year'])) # .max() also works

# value_counts() counts the ratings per year; sort_index() orders the result by year
print("The entire set of years:", ratings_df['Year'].value_counts().sort_index())

In [None]:
# How are rating frequencies distributed in 2018?

# Create a filter in two steps
# Step 1: build a boolean Series that is True where Year equals 2018
filter_2018 = (ratings_df['Year'] == 2018)
# print(filter_2018)
# Step 2: keep only the rows where the mask is True
ratings_2018 = ratings_df[filter_2018]
ratings_2018.head()

In [None]:
# Create a filter in one step: the boolean mask is built inline inside the brackets
ratings_2018 = ratings_df[ratings_df['Year'] == 2018]
ratings_2018.head()

In [None]:
# Histogram of the ratings given in 2018
ax = ratings_2018['rating'].hist()
# Label the figure so it can be understood on its own; the trailing
# semicolon hides the text representation of the return value.
ax.set(title="Distribution of ratings in 2018", xlabel="Rating", ylabel="Number of ratings");

In [None]:
# Ex: How are rating frequencies distributed in 2003?



Next, let's identify some popular movies.

In [None]:
# Recall which columns the ratings data contains
ratings_df.head()

In [None]:
# Ex: Which movie received the highest number of ratings?



In [None]:
# Ex: Which movies received the largest number of 5.0s?




In [None]:
# What is the name of the movie? (The answer requires another data file.)

# Load movies.csv
# (presumably links each movieId to its title; confirm with the head() output)
movies_df = pd.read_csv('Data/ml-latest-small/movies.csv')
movies_df.head()

In [None]:
# Ex: Find out the title corresponding to the movies with high number of ratings.



In [None]:
# Which movie receives the most 5.0s?



In [None]:
# Find the name of this movie



In [None]:
# Which movie has the highest average rating?



In [None]:
# Find the top ten highly-rated movies.

