# Day 5: Visualization

In [None]:
from pathlib import Path

import pandas as pd

In [72]:
path = Path('data/movielens/')

# reading: both CSV files are indexed by movieId, so they are joined index-to-index
movies_raw = pd.read_csv(path / 'movies.csv', index_col='movieId')
links = pd.read_csv(path / 'links.csv', index_col='movieId')
movies = pd.merge(movies_raw, links, how='inner', left_index=True, right_index=True)

# extracting year: a trailing "(digits)" group in the title,
# optionally followed by a single whitespace character
pattern = r'\((?P<year>\d+)\)\s?$'
movies['year'] = movies['title'].str.extract(pattern, expand=False)

# casting: one pass over all columns; the capitalised 'Int32' is the nullable
# integer dtype, used for the columns that can hold missing values
movies = movies.astype({
    'title': 'string',
    'genres': 'string',
    'imdbId': 'int32',
    'tmdbId': 'Int32',
    'year': 'Int32',
})
movies.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9742 entries, 1 to 193609
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   title   9742 non-null   string
 1   genres  9742 non-null   string
 2   imdbId  9742 non-null   int32 
 3   tmdbId  9734 non-null   Int32 
 4   year    9729 non-null   Int32 
dtypes: Int32(2), int32(1), string(2)
memory usage: 619.6 KB


## Descriptive Statistics

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
# NOTE(review): imdbId and tmdbId are identifiers, so only the `year` statistics
# are likely to be meaningful here -- confirm whether the ID columns should be excluded.
movies.describe()

## Time Series

In [None]:
# Load the ratings table from the same data directory as movies/links.
# `Path` is already imported in the notebook's import cell and `path` is defined
# in the loading cell above, so neither is repeated here.
ratings = pd.read_csv(path / 'ratings.csv')
# NOTE(review): the original cell carried the marker "2022-04-28" on this line --
# presumably the date of the run; confirm.
ratings.info()

In [None]:
# Convert the raw `timestamp` column into datetimes, reading the values as seconds (unit='s').
ratings['parsed_time'] = ratings['timestamp'].pipe(pd.to_datetime, unit='s')
ratings.info()

### Filters

In [None]:
# Each comparison produces a boolean Series (a mask) aligned with `ratings`;
# naming the masks lets them be reused, e.g. `ratings.loc[rated_in_2017]`.
rated_after_2015 = ratings['parsed_time'] > '2015-01-01'
rated_in_2017 = ratings['parsed_time'].dt.year == 2017

# A cell renders only its last expression, so a bare first comparison would be
# computed and silently thrown away. Show both masks side by side instead.
pd.DataFrame({'after 2015-01-01': rated_after_2015, 'in 2017': rated_in_2017})


In [None]:
# sort by the parsed date and time, oldest rating first (ascending is the default)
ratings.sort_values('parsed_time')

### Task/Lab

1. Zistite, aké bolo priemerné hodnotenie filmu *Titanic* od *Jamesa Camerona* o rok neskôr po uvedení tohto filmu do kín.

In [None]:
# Cameron's Titanic: the title begins with "Titanic" and the extracted year is 1997
title_matches = movies['title'].str.startswith('Titanic')
released_1997 = movies['year'] == 1997
is_titanic = title_matches & released_1997

# attach every rating of the selected movie(s) via the shared movieId
df = movies.loc[is_titanic].merge(ratings, how='inner', on='movieId')

# mean of the ratings whose parsed timestamp falls in 1998
rated_in_1998 = df['parsed_time'].dt.year == 1998
df.loc[rated_in_1998, 'rating'].mean()

In [None]:
# filter: rows of `movies` whose title starts with "Titanic" and whose year is 1997
cameroons_titanic = movies['title'].str.startswith('Titanic') & movies['year'].eq(1997)

# query: inner-join the selected movie(s) with their ratings on movieId,
# then average the ratings that were given during 1998
titanic_ratings = pd.merge(movies.loc[cameroons_titanic], ratings, how='inner', on='movieId')
given_in_1998 = titanic_ratings['parsed_time'].dt.year.eq(1998)
titanic_ratings.loc[given_in_1998, 'rating'].mean()

## Visualization

In [None]:
# creating a DataFrame
import numpy as np

# Fixed seed: without it the demo data -- and every plot drawn from it below --
# would differ on each "Restart Kernel -> Run All".
SEED = 42
rng = np.random.default_rng(SEED)

# 10 rows x 4 columns of uniform random floats in [0, 1)
df = pd.DataFrame(rng.random((10, 4)),
                  columns=('col_1', 'col_2', 'col_3', 'col_4'))
df

### Line Graph

In [None]:
# one line per column, drawn against the row index
df.plot.line()

In [None]:
# col_2 drawn against col_1 instead of against the row index
df.plot.line(x='col_1', y='col_2')

In [None]:
# each column in its own subplot; the trailing `;` hides the returned axes repr
df.plot.line(subplots=True, figsize=(8, 8));

### Bar Graph

In [None]:
# grouped vertical bars: one group per row, one bar per column
df.plot.bar()

In [None]:
# Vertical bars with stacked=True: the column values are piled on top of each
# other instead of being drawn side by side. The trailing `;` hides the axes repr.
df.plot.bar(stacked=True);

In [None]:
# Same stacked bars, drawn horizontally (barh). The trailing `;` hides the axes repr.
df.plot.barh(stacked=True);

### Box Plot

In [None]:
# One box per column, summarising the distribution of that column's values.
df.plot.box()

### Area Plot

In [None]:
# Area plot of all columns (pandas stacks the areas by default).
df.plot.area()

## Indiana Jones

|            | Indy 1 | Indy 2 | Indy 3 | Indy 4 |
|------------|--------|--------|--------|--------|
| 2000-01-01 |   5    |   4    |   3    |   1    |
| 2000-01-02 |   4    |   3    |   2    |   2    |