# Day 5: Visualization

In [3]:
from pathlib import Path

import pandas as pd

In [24]:
path = Path('data/movielens/')

# Load both tables keyed by movieId, then keep only the ids present in each.
movies, links = (
    pd.read_csv(path / filename, index_col='movieId')
    for filename in ('movies.csv', 'links.csv')
)
movies = pd.merge(movies, links, how='inner', left_index=True, right_index=True)

# The release year is the parenthesised run of digits that ends the title;
# a single trailing whitespace character after the ')' is tolerated.
# Titles that do not match get a missing value.
pattern = r'\((?P<year>\d+)\)\s?$'
movies['year'] = movies['title'].str.extract(pattern, expand=False)

# Compact dtypes in one pass. Capitalised 'Int32' is pandas' nullable integer
# type, used for the columns that contain missing values in this dataset.
movies = movies.astype({
    'title': 'string',
    'genres': 'string',
    'imdbId': 'int32',
    'tmdbId': 'Int32',
    'year': 'Int32',
})
movies.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9742 entries, 1 to 193609
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   title   9742 non-null   string
 1   genres  9742 non-null   string
 2   imdbId  9742 non-null   int32 
 3   tmdbId  9734 non-null   Int32 
 4   year    9729 non-null   Int32 
dtypes: Int32(2), int32(1), string(2)
memory usage: 619.6 KB


## Descriptive Statistics

In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric
# columns of `movies`.
# NOTE(review): the saved output lists only imdbId and tmdbId; it looks like it
# was produced before the `year` column was added, so a fresh top-to-bottom run
# will presumably show `year` as well - confirm by re-running.
movies.describe()

Unnamed: 0,imdbId,tmdbId
count,9742.0,9734.0
mean,677183.9,55162.123793
std,1107228.0,93653.481487
min,417.0,2.0
25%,95180.75,9665.5
50%,167260.5,16529.0
75%,805568.5,44205.75
max,8391976.0,525662.0


## Time Series

In [6]:
# Load the user ratings from the same dataset directory as movies/links.
# `path` is the data directory defined in the movies-loading cell above, so the
# location is configured in one place; `Path` is already imported in the first
# cell, which is why the import is not repeated here.
ratings = pd.read_csv(path / 'ratings.csv')
# NOTE(review): the original line carried the bare comment "2022-04-28";
# its meaning is not evident from the notebook - confirm or drop.
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100836 non-null  int64  
 1   movieId    100836 non-null  int64  
 2   rating     100836 non-null  float64
 3   timestamp  100836 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 3.1 MB


In [16]:
# `timestamp` is interpreted as seconds since the Unix epoch (unit='s') and
# stored as a proper datetime column next to the raw integer value.
epoch_seconds = ratings['timestamp']
ratings['parsed_time'] = pd.to_datetime(epoch_seconds, unit='s')
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 5 columns):
 #   Column       Non-Null Count   Dtype         
---  ------       --------------   -----         
 0   userId       100836 non-null  int64         
 1   movieId      100836 non-null  int64         
 2   rating       100836 non-null  float64       
 3   timestamp    100836 non-null  int64         
 4   parsed_time  100836 non-null  datetime64[ns]
dtypes: datetime64[ns](1), float64(1), int64(3)
memory usage: 3.8 MB


### Filters

In [15]:
# Two example boolean masks over the parsed timestamps. Each is bound to a name
# so it can be reused (e.g. ratings[rated_in_2017]); a bare comparison that is
# not the last expression of a cell is evaluated and then silently discarded.

# Ratings made after the start of 2015 (the string is parsed as a date).
rated_after_2015 = ratings['parsed_time'] > '2015-01-01'

# Ratings made during calendar year 2017, via the .dt accessor.
rated_in_2017 = ratings['parsed_time'].dt.year == 2017

# Only the last expression of a cell is displayed.
rated_in_2017


0         False
1         False
2         False
3         False
4         False
          ...  
100831     True
100832     True
100833     True
100834     True
100835     True
Name: parsed_time, Length: 100836, dtype: bool

In [18]:
# Sort by the parsed date and time, oldest rating first (ascending is the
# default). This returns a new frame for display; `ratings` itself is unchanged.
ratings.sort_values('parsed_time')

Unnamed: 0,userId,movieId,rating,timestamp,parsed_time
66719,429,595,5.0,828124615,1996-03-29 18:36:55
66716,429,588,5.0,828124615,1996-03-29 18:36:55
66717,429,590,5.0,828124615,1996-03-29 18:36:55
66718,429,592,5.0,828124615,1996-03-29 18:36:55
66712,429,432,3.0,828124615,1996-03-29 18:36:55
...,...,...,...,...,...
81475,514,187031,2.5,1537674927,2018-09-23 03:55:27
81477,514,187595,3.0,1537674946,2018-09-23 03:55:46
81336,514,5247,2.5,1537757040,2018-09-24 02:44:00
81335,514,5246,1.5,1537757059,2018-09-24 02:44:19


### Task/Lab

1. Zistite, aké bolo priemerné hodnotenie filmu *Titanic* v roku uvedenia tohto filmu.

Unnamed: 0,movieId,title,genres,imdbId,tmdbId,year,userId,rating,timestamp,parsed_time
