# Chapter 5
# Getting Started with Pandas (Part 1)

Pandas is a major tool of data scientists. It contains data structures and data manipulation tools designed to make data cleaning and analysis fast and easy.

In [None]:
import pandas as pd

## 1. Download and extract MovieLens data

- [GroupLens Research](https://grouplens.org) is a human-computer interaction research lab at University of Minnesota, Twin Cities. 
- [MovieLens](https://grouplens.org/datasets/movielens/) is a web-based recommender system and virtual community that recommends movies for its users to watch, based on their file preferences using collaborative filtering of members' movie ratings and movie reviews. 

In [None]:
# Download ml-latest-small.zip from https://grouplens.org/datasets/movielens/
import urllib.request
url = "http://files.grouplens.org/datasets/movielens/ml-latest-small.zip"
file_path = "Data/"
zip_file = "ml-latest-small.zip"
urllib.request.urlretrieve(url, file_path + zip_file)

In [None]:
# Decompress the zip file
import zipfile
f = zipfile.ZipFile(file_path + zip_file, "r")
f.printdir()
f.extractall(file_path)
f.close()

In [None]:
# Since we only need the zip file in this step, 
# we can use the following strcuture:
with zipfile.ZipFile(file_path + zip_file, "r") as f:
    f.printdir()
    f.extractall(file_path)
# This cell does exactly the same thing as the above cell.

In [None]:
# Check that files has been unzipped
import os
print(os.listdir(file_path + "ml-latest-small"))

In [None]:
# Delete the zip file
os.remove(file_path + zip_file)
os.listdir(file_path)

In [1]:
# Write a function to download and decompress file
import os
import urllib.request
import zipfile

def get_movielens(file_path, file_name, delete_zip_file=False):
    url = "http://files.grouplens.org/datasets/movielens/" + file_name
    if not os.path.exists(file_path + file_name):
        urllib.request.urlretrieve(url, file_path + file_name)
        print("File", file_name, "downloaded.")
    with zipfile.ZipFile(file_path + file_name, "r") as f:
        f.extractall(file_path)
        print("Files extracted:")
        print(f.printdir())
    if delete_zip_file:
        os.remove(file_path + file_name)

In [7]:
file_path = "Data/"
zip_file = "ml-latest-small.zip"
get_movielens(file_path, zip_file)

File ml-latest-small.zip downloaded.
Files extracted:
File Name                                             Modified             Size
ml-latest-small/                               2018-09-26 15:50:12            0
ml-latest-small/links.csv                      2018-09-26 15:50:10       197979
ml-latest-small/tags.csv                       2018-09-26 15:49:40       118660
ml-latest-small/ratings.csv                    2018-09-26 15:49:38      2483723
ml-latest-small/README.txt                     2018-09-26 15:50:12         8342
ml-latest-small/movies.csv                     2018-09-26 15:49:56       494431
None


In [8]:
os.listdir(file_path)

['cereals', 'creditcardfraud', 'ml-latest-small', 'ml-latest-small.zip']

In [None]:
# A larger dataset is ml-20m
# get_movielens(file_path, "ml-20m.zip")

## 2. Load ratings data as a pandas DataFrame
- Load csv file as a DataFrame
- head()
- shape, columns, dtypes
- indexing with loc[]
- Handle date and time data
- Data transformation with apply()

In [14]:
import pandas as pd
ratings_df = pd.read_csv("Data/ml-latest-small/ratings.csv",
                         delimiter=",")
ratings_df.head()  # Show the first several rows of the dataset

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [15]:
ratings_df.tail()

Unnamed: 0,userId,movieId,rating,timestamp
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352
100835,610,170875,3.0,1493846415


In [16]:
print("Columns:", ", ".join(ratings_df.columns))

Columns: userId, movieId, rating, timestamp


In [17]:
print("Shape:", ratings_df.shape)

Shape: (100836, 4)


In [18]:
print("Data types:\n", ratings_df.dtypes)

Data types:
 userId         int64
movieId        int64
rating       float64
timestamp      int64
dtype: object


In [22]:
# Convert time stamp to a readable format

# Example: convert the first time stamp
timestamp = ratings_df.loc[0, "timestamp"]
print(timestamp)

from datetime import datetime
dt = datetime.fromtimestamp(timestamp)
print(dt)

ratings_df.loc[0, "Year"] = dt.year
ratings_df.loc[0, "Month"] = dt.month
ratings_df.loc[0, "Day"] = dt.day
ratings_df.loc[0, "Hour"] = dt.hour
ratings_df.loc[0, "Minute"] = dt.minute
ratings_df.loc[0, "Second"] = dt.second
ratings_df.head()

964982703
2000-07-30 14:45:03


Unnamed: 0,userId,movieId,rating,timestamp,Year,Month,Day,Hour,Minute,Second
0,1,1,4.0,964982703,2000.0,7.0,30.0,14.0,45.0,3.0
1,1,3,4.0,964981247,,,,,,
2,1,6,4.0,964982224,,,,,,
3,1,47,5.0,964983815,,,,,,
4,1,50,5.0,964982931,,,,,,


In [26]:
# change the format of datetime
print(dt)
dt2 = dt.strftime('%Y - %b - %d')
print(dt2)

2000-07-30 14:45:03
2000 - Jul - 30


In [27]:
# Use apply() to apply a function to all rows
ratings_df["DT"] = ratings_df["timestamp"].apply(datetime.fromtimestamp)
ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp,Year,Month,Day,Hour,Minute,Second,DT
0,1,1,4.0,964982703,2000.0,7.0,30.0,14.0,45.0,3.0,2000-07-30 14:45:03
1,1,3,4.0,964981247,,,,,,,2000-07-30 14:20:47
2,1,6,4.0,964982224,,,,,,,2000-07-30 14:37:04
3,1,47,5.0,964983815,,,,,,,2000-07-30 15:03:35
4,1,50,5.0,964982931,,,,,,,2000-07-30 14:48:51


In [29]:
# Fill the year column
ratings_df['Year'] = ratings_df['DT'].dt.year
ratings_df['Month'] = ratings_df['DT'].dt.month
ratings_df['Day'] = ratings_df['DT'].dt.day

ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp,Year,Month,Day,Hour,Minute,Second,DT
0,1,1,4.0,964982703,2000,7,30,14.0,45.0,3.0,2000-07-30 14:45:03
1,1,3,4.0,964981247,2000,7,30,,,,2000-07-30 14:20:47
2,1,6,4.0,964982224,2000,7,30,,,,2000-07-30 14:37:04
3,1,47,5.0,964983815,2000,7,30,,,,2000-07-30 15:03:35
4,1,50,5.0,964982931,2000,7,30,,,,2000-07-30 14:48:51


In [33]:
# apply() can be used to apply user-defined functions

# Exercise: define a function that convert month to Q1 - Q4, and 
# use this function to create a "Quarter" column:

def convertMonToQuarter(month):
    if(month < 4):
        quarter = 1
    if (month>= 4 and month<7):
        quarter = 2
    if (month >= 7 and month < 9):
        quarter = 3
    if(month >= 9):
        quarter = 4
    return quarter    
convertMonToQuarter(8)

3

In [35]:
ratings_df["Quarter"] = ratings_df['Month'].apply(convertMonToQuarter)

In [37]:
ratings_df.tail()

Unnamed: 0,userId,movieId,rating,timestamp,Year,Month,Day,Hour,Minute,Second,DT,Quarter
100831,610,166534,4.0,1493848402,2017,5,3,,,,2017-05-03 17:53:22,2
100832,610,168248,5.0,1493850091,2017,5,3,,,,2017-05-03 18:21:31,2
100833,610,168250,5.0,1494273047,2017,5,8,,,,2017-05-08 15:50:47,2
100834,610,168252,5.0,1493846352,2017,5,3,,,,2017-05-03 17:19:12,2
100835,610,170875,3.0,1493846415,2017,5,3,,,,2017-05-03 17:20:15,2


In [38]:
df = pd.DataFrame({"Test1": [70, 80, 90],
                   "Test2": [75, 85, 95],
                   "Test3": [80, 90, 100]})
df.head()

Unnamed: 0,Test1,Test2,Test3
0,70,75,80
1,80,85,90
2,90,95,100


In [46]:
import numpy as np
# Exercise: apply np.sqrt() to calculate the square root of test 1 scores
df['Test1sq'] = df['Test1'].apply(np.sqrt)
df.head()

# Exercise: apply np.mean() to create an "Average" column
df['Mean'] = df.apply(np.mean, axis=1)
df.head()

# Exercise: create a "Max" column that stores the maximum value from each row
df['Max'] = df.apply(np.max, axis=1)
df.head()

# Without axis=1, the function is applied to each column of the data frame.
print(df.apply(np.max))

Test1       90.000000
Test2       95.000000
Test3      100.000000
Test1sq      9.486833
Mean        78.018090
Max        100.000000
dtype: float64


# 3. Save the processed ratings data

Data can be saved in many different formats. Today we will study two commonly used formats: csv and feather.
- csv: High readability and stable format
- feather: High read and write performance

In [47]:
# Save the processed data as a .csv file
csv_file = "Data/ml-latest-small/ratings_processed.csv"
ratings_df.to_csv(csv_file)

In [49]:
# Use Python package installer pip to install feather
!pip install --upgrade pip
!pip install feather-format

Requirement already up-to-date: pip in c:\users\ch002\anaconda3\lib\site-packages (19.2.3)
Collecting feather-format
  Using cached https://files.pythonhosted.org/packages/08/55/940b97cc6f19a19f5dab9efef2f68a0ce43a7632f858b272391f0b851a7e/feather-format-0.4.0.tar.gz
Collecting pyarrow>=0.4.0 (from feather-format)
  Downloading https://files.pythonhosted.org/packages/62/19/06853e9285a668bf515a68e3de23a7a8aa3f90a71b15c507744fd85dd65c/pyarrow-0.14.0-cp36-cp36m-win_amd64.whl (17.4MB)
Building wheels for collected packages: feather-format
  Building wheel for feather-format (setup.py): started
  Building wheel for feather-format (setup.py): finished with status 'done'
  Created wheel for feather-format: filename=feather_format-0.4.0-cp36-none-any.whl size=4301 sha256=f62ff3cfe1eeb71c16dd09cc4de30acf5c70a322f9c076665321c6da169d8cc5
  Stored in directory: C:\Users\ch002\AppData\Local\pip\Cache\wheels\85\7d\12\2dfa5c0195f921ac935f5e8f27deada74972edc0ae9988a9c1
Successfully built feather-format

In [51]:
import feather
# Save the processed data as a .feather file
feather_file = "Data/ml-latest-small/ratings_processed.feather"
# feather.write_dataframe(ratings_df, feather_file)
ratings_df.to_feather(feather_file)

In [52]:
# Load .feather file as DataFrame
# ratings_df2 = feather.read_dataframe(feather_file)
ratings_df2 = pd.read_feather(feather_file)
ratings_df2.head()

Unnamed: 0,userId,movieId,rating,timestamp,Year,Month,Day,Hour,Minute,Second,DT,Quarter
0,1,1,4.0,964982703,2000,7,30,14.0,45.0,3.0,2000-07-30 14:45:03,3
1,1,3,4.0,964981247,2000,7,30,,,,2000-07-30 14:20:47,3
2,1,6,4.0,964982224,2000,7,30,,,,2000-07-30 14:37:04,3
3,1,47,5.0,964983815,2000,7,30,,,,2000-07-30 15:03:35,3
4,1,50,5.0,964982931,2000,7,30,,,,2000-07-30 14:48:51,3


In [54]:
import time
time.time()

1568665170.9880412

In [55]:
# Time the writing speed
import time
start = time.time()
ratings_df.to_csv(csv_file)
end = time.time()
print("Writing to a csv file costs:", (end - start))
print("File size:", os.path.getsize(csv_file))

start = time.time()
ratings_df.to_feather(feather_file)
end = time.time()
print("Writing to a feather file costs:", (end - start))
print("File size:", os.path.getsize(feather_file))

Writing to a csv file costs: 2.667367935180664
File size: 6601637
Writing to a feather file costs: 0.020022869110107422
File size: 9719192


In [60]:
# Exercise: Compare the loading time of csv and feather format

start = time.time()
ratings_df = pd.read_csv(csv_file,
                        delimiter=",")
end = time.time()
print(end-start)

start  = time.time()
ratings_df = pd.read_feather(feather_file)
end = time.time()
print(end - start)


0.3191647529602051
0.015650272369384766


## 4. pd.Series and pd.DataFrame

### Series

In [62]:
# Two important pandas data structure: Series (1D) and DataFrame (2D)
userId = ratings_df['userId']
# ?userId
# ?ratings_df

In [64]:
print(userId.values[:20])
print(userId.values[-20:])

[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
[610 610 610 610 610 610 610 610 610 610 610 610 610 610 610 610 610 610
 610 610]


In [66]:
# Exercise: How many unique user IDs are there?
userId_set = set(userId)
print(len(userId_set))

610


In [67]:
# Index of userId
print(userId.index[:20])

RangeIndex(start=0, stop=20, step=1)


In [68]:
# Define a Series from scratch
series = pd.Series([4, 7, -5, 3],
                     index=['d', 'b', 'a', 'c'])
series

d    4
b    7
a   -5
c    3
dtype: int64

In [69]:
# slicing a Series
print("series['a']:")
print(series['a'])
print("\nseries[['a', 'b', 'c']]:")
print(series[['a', 'b', 'c']])
print("\nseries[series > 0]:")
print(series[series > 0])

series['a']:
-5

series[['a', 'b', 'c']]:
a   -5
b    7
c    3
dtype: int64

series[series > 0]:
d    4
b    7
c    3
dtype: int64


In [70]:
# Arithmetic operations
print("series * 2:")
print(series * 2)

series * 2:
d     8
b    14
a   -10
c     6
dtype: int64


In [71]:
# Define a Series from a dictionary
dictionary = {'Ohio': 35000,
              'Texas': 71000,
              'Oregon': 16000,
              'Utah': 5000}
series2 = pd.Series(dictionary)
series2

Ohio      35000
Oregon    16000
Texas     71000
Utah       5000
dtype: int64

In [72]:
# Add values
series2['New York'] = 12345
series2

Ohio        35000
Oregon      16000
Texas       71000
Utah         5000
New York    12345
dtype: int64

In [73]:
# Remove values
series2.drop('New York', inplace=True)
series2

Ohio      35000
Oregon    16000
Texas     71000
Utah       5000
dtype: int64

In [76]:
# without inplace=True
series3 = series2.drop("Ohio")
series3

Oregon    16000
Texas     71000
Utah       5000
dtype: int64

### DataFrame

In [None]:
# Define a DataFrame from scratch
df1 = pd.DataFrame(np.random.rand(5, 3),
                   columns=['Feature1', 'Feature2', 'Feature3'])
df1.head()

In [None]:
# Add values
df1.loc[1, 'Feature4'] = 1.234
df1

In [None]:
# Identify null values
df1.isnull()

In [None]:
# Cound number of missing values for each column
np.sum(df1.isnull(), axis=0)

In [None]:
# Remove rows
df1.drop([0, 1], axis=0, inplace=True)
df1

In [None]:
# Reset index
df1.reset_index(inplace=True)
df1

In [None]:
# Remove columns
df1.drop(['Feature4'], axis=1, inplace=True)
df1

In [None]:
# Exercise: Add one row to ratings_df and remove it.



## 5. Analyzing MovieLens Data

In [None]:
# Use value_counts() to find the rating frequencies
rating_frequencies = ratings_df['rating'].value_counts()
print(rating_frequencies)

In [None]:
# What are the rating frequencies in 2018?



In [None]:
# What are the rating frequencies in 2003?



In [None]:
# Do movies ratings inflate over time?



In [None]:
# How many movies were rated in 2003? in 2018?


In [None]:
# Do number of movies increase over time?



In [None]:
# How many ratings were submitted per year?



In [None]:
# How many users gave ratings per year?



In [None]:
# How consistent are the average ratings over time?



In [None]:
# Exercise: for each movie, find:
# 1. how many ratings were given to this movie
# 2. What is the highest and lowest rating
# 3. The frequency of each rating
# 4. When was the first rating given?
# 5. When was the last rating given?
# 6. What is the average rating?