# Chapter 5
# Getting Started with Pandas (Part 1)

Pandas is a major tool of data scientists. It contains data structures and data manipulation tools designed to make data cleaning and analysis fast and easy.

In [None]:
import numpy as np
import pandas as pd

## 1. Download and extract MovieLens data

- [GroupLens Research](https://grouplens.org) is a human-computer interaction research lab at the University of Minnesota, Twin Cities.
- [MovieLens](https://grouplens.org/datasets/movielens/) is a web-based recommender system and virtual community that recommends movies for its users to watch, based on their film preferences, using collaborative filtering of members' movie ratings and movie reviews.

In [None]:
# Download ml-latest-small.zip from https://grouplens.org/datasets/movielens/
import os
import urllib.request

url = "http://files.grouplens.org/datasets/movielens/ml-latest-small.zip"
file_path = "Data/"
zip_file = "ml-latest-small.zip"

# urlretrieve() does not create missing folders, so make sure the target
# directory exists before downloading into it.
os.makedirs(file_path, exist_ok=True)
urllib.request.urlretrieve(url, file_path + zip_file)

In [None]:
# Decompress the zip file
import zipfile

# The context manager closes the archive even if extraction fails part-way.
with zipfile.ZipFile(file_path + zip_file, "r") as f:
    f.printdir()
    f.extractall(file_path)

In [None]:
# Check that the files have been unzipped
import os

extracted_dir = file_path + "ml-latest-small"
print(os.listdir(extracted_dir))

In [None]:
# Delete the zip file. The existence check lets this cell be re-run without
# raising FileNotFoundError once the archive is already gone.
zip_path = file_path + zip_file
if os.path.exists(zip_path):
    os.remove(zip_path)
os.listdir(file_path)

In [None]:
# Write a function to download and decompress file
import os
import urllib.request
import zipfile

def get_movielens(file_path, file_name, delete_zip_file=False):
    """Download a MovieLens archive (unless already present) and extract it.

    Parameters
    ----------
    file_path : str
        Directory in which the archive is stored and into which it is
        extracted.
    file_name : str
        Name of the zip archive on the GroupLens file server,
        e.g. "ml-latest-small.zip".
    delete_zip_file : bool, default False
        If True, remove the archive after extraction.
    """
    url = "http://files.grouplens.org/datasets/movielens/" + file_name
    zip_path = os.path.join(file_path, file_name)
    if not os.path.exists(zip_path):
        # urlretrieve() cannot create missing folders. An empty file_path
        # means the current directory, which needs no creation.
        if file_path:
            os.makedirs(file_path, exist_ok=True)
        urllib.request.urlretrieve(url, zip_path)
        print("File", file_name, "downloaded.")
    with zipfile.ZipFile(zip_path, "r") as f:
        f.extractall(file_path)
        print("Files extracted:")
        # printdir() writes the listing itself and returns None, so wrapping
        # it in print() would emit a stray "None" line.
        f.printdir()
    if delete_zip_file:
        os.remove(zip_path)

In [None]:
# Fetch (if needed) and unpack the small MovieLens dataset
get_movielens(file_path=file_path, file_name=zip_file)

In [None]:
# A larger dataset is ml-20m
# get_movielens(file_path, "ml-20m.zip")

## 2. Load ratings data as a pandas DataFrame
- Load csv file as a DataFrame
- head()
- shape, columns, dtypes
- indexing with loc[]
- Handle date and time data
- Data transformation with apply()

In [None]:
import pandas as pd

# Read the comma-separated ratings table into a DataFrame
ratings_csv = "Data/ml-latest-small/ratings.csv"
ratings_df = pd.read_csv(ratings_csv, sep=",")
ratings_df.head()  # Show the first several rows of the dataset

In [None]:
# Join the column labels into one comma-separated line
column_names = ", ".join(ratings_df.columns)
print("Columns:", column_names)

In [None]:
# shape is a (number of rows, number of columns) tuple
n_rows_cols = ratings_df.shape
print("Shape:", n_rows_cols)

In [None]:
# dtypes lists the data type of every column
column_dtypes = ratings_df.dtypes
print("Data types:\n", column_dtypes)

In [None]:
# Convert time stamp to a readable format
from datetime import datetime

# Example: convert the first time stamp
timestamp = ratings_df.loc[0, "timestamp"]
print(timestamp)

# NOTE: fromtimestamp() without a tz argument returns local time, so the
# printed value depends on the time zone of the machine running the notebook.
dt = datetime.fromtimestamp(timestamp)
print(dt)

# Store each component of the first row's datetime in its own column.
first_row_parts = {
    "Year": dt.year,
    "Month": dt.month,
    "Day": dt.day,
    "Hour": dt.hour,
    "Minute": dt.minute,
    "Second": dt.second,
}
for column_name, part in first_row_parts.items():
    ratings_df.loc[0, column_name] = part
ratings_df.head()

In [None]:
# Use apply() to call a function on every element of a column
timestamps = ratings_df["timestamp"]
ratings_df["DT"] = timestamps.apply(datetime.fromtimestamp)
ratings_df.head()

In [None]:
# apply() can be used to apply user-defined functions

# Exercise: define a function that converts a month to its quarter (Q1 - Q4), and
# use this function to create a "Quarter" column

In [None]:
# Fill the year column for every row using the .dt accessor
rating_years = ratings_df['DT'].dt.year
ratings_df['Year'] = rating_years

## 3. Save the processed ratings data

Data can be saved in many different formats. Today we will study two commonly used formats: csv and feather.
- csv: High readability and stable format
- feather: High read and write performance

In [None]:
# Save the processed data as a .csv file
csv_file = "Data/ml-latest-small/ratings_processed.csv"
# index=False: the default RangeIndex carries no information, and writing it
# would add an unnamed extra column when the file is read back.
ratings_df.to_csv(csv_file, index=False)

In [None]:
# Use Python package installer pip to install feather
!pip install --upgrade pip
!pip install feather-format

In [None]:
# Save the processed data as a .feather file
# pandas' own to_feather() is used (as in the timing cell), so the separate
# `feather` wrapper package does not need to be imported.
feather_file = "Data/ml-latest-small/ratings_processed.feather"
ratings_df.to_feather(feather_file)

In [None]:
# Load the .feather file back into a DataFrame with pandas' reader
ratings_df2 = pd.read_feather(path=feather_file)
ratings_df2.head()

In [None]:
# Time the writing speed
import time

# perf_counter() is a monotonic, high-resolution clock intended for measuring
# intervals; time.time() can jump if the system clock is adjusted.
start = time.perf_counter()
# index=False: the default RangeIndex carries no information, so it is left
# out of the csv.
ratings_df.to_csv(csv_file, index=False)
end = time.perf_counter()
print("Writing to a csv file costs:", (end - start))
print("File size:", os.path.getsize(csv_file))

start = time.perf_counter()
ratings_df.to_feather(feather_file)
end = time.perf_counter()
print("Writing to a feather file costs:", (end - start))
print("File size:", os.path.getsize(feather_file))

In [None]:
# Exercise: Compare the loading time of csv and feather format



## 4. pd.Series and pd.DataFrame

### Series

In [None]:
# Two important pandas data structures: Series (1D) and DataFrame (2D)
# Selecting a single column of a DataFrame yields a Series.
userId = ratings_df.loc[:, 'userId']
# Tip: in IPython, `?userId` or `?ratings_df` shows the object's documentation

In [None]:
# .values exposes the underlying array; show its first 20 entries
first_user_ids = userId.values[:20]
print(first_user_ids)

In [None]:
# Exercise: How many unique user IDs are there?


In [None]:
# Index of userId (first 20 labels)
first_index_labels = userId.index[:20]
print(first_index_labels)

In [None]:
# Define a Series from scratch: values plus a matching list of index labels
values = [4, 7, -5, 3]
labels = ['d', 'b', 'a', 'c']
series = pd.Series(values, index=labels)
series

In [None]:
# Slicing a Series: by a single label, by a list of labels, by a boolean mask
selections = [
    ("series['a']:", series['a']),
    ("\nseries[['a', 'b', 'c']]:", series[['a', 'b', 'c']]),
    ("\nseries[series > 0]:", series[series > 0]),
]
for caption, selected in selections:
    print(caption)
    print(selected)

In [None]:
# Arithmetic operations are applied element-wise
doubled = series * 2
print("series * 2:")
print(doubled)

In [None]:
# Define a Series from a dictionary: keys become the index, values the data
dictionary = dict(Ohio=35000, Texas=71000, Oregon=16000, Utah=5000)
series2 = pd.Series(data=dictionary)
series2

In [None]:
# Add values: assigning to a label that does not exist yet appends a new entry
series2.loc['New York'] = 12345
series2

In [None]:
# Remove values: drop the entry by its index label, modifying series2 in place
series2.drop(labels='New York', inplace=True)
series2

### DataFrame

In [None]:
# Define a DataFrame from scratch
# A seeded generator makes the random values identical on every run.
rng = np.random.default_rng(0)
df1 = pd.DataFrame(rng.random((5, 3)),
                   columns=['Feature1', 'Feature2', 'Feature3'])
df1.head()

In [None]:
# Add values: setting a cell in a column that does not exist yet creates the
# column
row_label, new_column = 1, 'Feature4'
df1.loc[row_label, new_column] = 1.234
df1

In [None]:
# Identify null values (isna() is the same method as isnull())
df1.isna()

In [None]:
# Count the number of missing values in each column
df1.isnull().sum(axis=0)

In [None]:
# Remove rows by their index labels, modifying df1 in place
df1.drop(index=[0, 1], inplace=True)
df1

In [None]:
# Reset index: with drop=False (the default) the old index is kept as a
# regular column
df1.reset_index(drop=False, inplace=True)
df1

In [None]:
# Remove columns by name, modifying df1 in place
df1.drop(columns=['Feature4'], inplace=True)
df1

In [None]:
# Exercise: Add one row to ratings_df and remove it.



## 5. Analyzing MovieLens Data

In [None]:
# Use value_counts() to find how often each rating value occurs
rating_column = ratings_df['rating']
rating_frequencies = rating_column.value_counts()
print(rating_frequencies)

In [None]:
# Sorted array of the distinct years in which ratings were submitted
np.unique(ratings_df['Year'].to_numpy())

In [None]:
# What are the rating frequencies in 2018?
index_2018 = (ratings_df['Year'] == 2018)
# Select the 2018 ratings with a single .loc call, then count and order them
# by rating value.
ratings_2018 = ratings_df.loc[index_2018, 'rating']
rating_freq_2018 = ratings_2018.value_counts().sort_index()
print(rating_freq_2018)

In [None]:
# What are the rating frequencies in 2003?
index_2003 = (ratings_df['Year'] == 2003)
# Select the 2003 ratings with a single .loc call, then count and order them
# by rating value.
ratings_2003 = ratings_df.loc[index_2003, 'rating']
rating_freq_2003 = ratings_2003.value_counts().sort_index()
print(rating_freq_2003)

In [None]:
# Do movies ratings inflate over time?
import matplotlib.pyplot as plt
%matplotlib inline
xs = np.arange(0.5, 5.1, 0.5)
plt.plot(xs,
         rating_freq_2003 / np.sum(rating_freq_2003),
         'b.',
         label="2003")
plt.plot(xs,
         rating_freq_2018 / np.sum(rating_freq_2018),
         'g^',
         label='2018')
plt.legend()

In [None]:
# How many movies were rated in 2003? in 2018?


In [None]:
# How many ratings were submitted per year?
years = np.unique(ratings_df['Year'])
# For each year, count the rows whose 'Year' matches it.
n_ratings = [len(ratings_df[ratings_df['Year'] == year]) for year in years]
plt.plot(years, n_ratings, 'b--')

In [None]:
# How many users gave ratings per year?
# Compute the years here so this cell does not rely on a variable that
# happens to be left over from another cell.
years = np.unique(ratings_df['Year'])
n_users = []

for year in years:
    index_year = (ratings_df['Year'] == year)
    # nunique() gives a scalar count of distinct users (not a shape tuple)
    n_users.append(ratings_df.loc[index_year, 'userId'].nunique())

plt.plot(years, n_users, 'b-')


In [None]:
# How many movies were rated on MovieLens in 2003? in 2018?

In [None]:
# How consistent are the average ratings over time?


In [None]:
# Exercise: for each movie, find:
# 1. how many ratings were given to this movie
# 2. What is the highest and lowest rating
# 3. The frequency of each rating
# 4. When was the first rating given?
# 5. When was the last rating given?
# 6. What is the average rating?