In [23]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


# Show floats with four decimal places whenever pandas renders a frame/series.
pd.set_option('display.float_format', lambda value: '%.4f' % value)

# Shared plotting look-and-feel used by the figures in this notebook.
sns.set_theme(style='whitegrid')
grey = '#aca7a7'
# NOTE(review): seaborn documents the hue anchors as lying in [0, 359];
# h_pos=500 is outside that range — confirm the intended hue.
cmap = sns.diverging_palette(h_neg=10, h_pos=500, as_cmap=True)

In [2]:
# Load the watch-list frame; compression is inferred from the `.xz` suffix.
# NOTE(review): unpickling can execute arbitrary code — only read this file if
# it was produced by a trusted step of this project.
df = pd.read_pickle(
    filepath_or_buffer='../data/watch_list_clean.pkl.xz',
    compression='infer',
)

In [3]:
# Title-level table: keep the listed columns, collapse repeated
# (title, url) pairs to one row, and order by average rating (best first).
anime = (
    df[[
        'title', 'num_eps', 'is_ongoing', 'duration', 'studio',
        'start_year', 'end_year', 'season', 'avg_rating', 'num_votes',
        'synopsis', 'tags', 'content_warnings', 'url',
    ]]
    .drop_duplicates(subset=['title', 'url'], ignore_index=True)
    .sort_values(by='avg_rating', ascending=False, ignore_index=True)
    .copy(deep=True)
)

In [7]:
# Display the column labels of the loaded watch-list frame.
df.columns

Index(['title', 'url', 'username', 'status', 'times_watched', 'user_rating',
       'num_eps', 'is_ongoing', 'duration', 'studio', 'start_year', 'end_year',
       'season', 'avg_rating', 'num_votes', 'synopsis', 'tags',
       'content_warnings'],
      dtype='object')

### `username` ~ `user_rating`

In [101]:
# Per-user summary of `user_rating`: one row per username, one column per
# aggregate.
pivot = (
    df
    .pivot_table(
        values='user_rating',
        index='username',
        aggfunc=['count', 'median', 'mean', 'std', 'max', 'min'],
    )
    # Columns come back as (aggregate, value column); drop the second level so
    # only the aggregate names remain.
    .droplevel(level=1, axis='columns')
)

In [102]:
# Users with more than 10 rows counted, ordered by how many they have
# (top ten shown).
(
    pivot
    .query('count > 10')
    .sort_values(by='count', ascending=False)
    .head(n=10)
)

Unnamed: 0_level_0,count,median,mean,std,max,min
username,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Harrt,4046,3.0,3.0602,1.1144,5.0,0.5
SionBlade,3997,3.0,3.0878,0.6757,5.0,1.0
Harry5676,3707,3.0,2.7996,1.3683,5.0,0.5
TRUEGOD,3676,5.0,5.0,0.0,5.0,5.0
MateusCarvalho590,3575,5.0,4.9773,0.2361,5.0,1.0
MrPaulG2,3206,3.0,3.1338,0.6418,5.0,0.5
HeavenSloth,3039,3.0,3.0905,0.9323,5.0,0.5
Twistpop,2747,3.5,3.3981,0.6835,5.0,0.5
tennis101101,2630,2.0,2.0036,1.3029,5.0,0.5
Katherine609,2626,3.5,3.5133,1.0447,5.0,0.5


In [103]:
# Same >10 filter, ordered by mean first and count second (both descending);
# top ten shown.
(
    pivot
    .query('count > 10')
    .sort_values(by=['mean', 'count'], ascending=False)
    .head(n=10)
)

Unnamed: 0_level_0,count,median,mean,std,max,min
username,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
TRUEGOD,3676,5.0,5.0,0.0,5.0,5.0
negi5845826,816,5.0,5.0,0.0,5.0,5.0
JANITOR04,791,5.0,5.0,0.0,5.0,5.0
tony,781,5.0,5.0,0.0,5.0,5.0
Abdullah5122,753,5.0,5.0,0.0,5.0,5.0
Jimushi,707,5.0,5.0,0.0,5.0,5.0
soulbrazier,668,5.0,5.0,0.0,5.0,5.0
CutManRules,624,5.0,5.0,0.0,5.0,5.0
Redemptiion,555,5.0,5.0,0.0,5.0,5.0
HeartSeed,536,5.0,5.0,0.0,5.0,5.0


The user *TRUEGOD* should be removed: they appear to have given every title they rated (3,676 of them) a 5/5.

### `username` ~ `times_watched`

In [122]:
# Per-user summary of `times_watched`: one row per username, one column per
# aggregate.
pivot = (
    df
    .pivot_table(
        values='times_watched',
        index='username',
        aggfunc=['count', 'median', 'mean', 'std', 'max', 'min'],
    )
    # Columns come back as (aggregate, value column); drop the second level so
    # only the aggregate names remain.
    .droplevel(level=1, axis='columns')
)

In [124]:
# Order users by their largest `times_watched` value, highest first.
pivot.sort_values(by='max', ascending=False)

Unnamed: 0_level_0,count,median,mean,std,max,min
username,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
MarikuIshtaru,1393,1.0000,1.0797,4.8324,181,0
Pejton,489,1.0000,1.1554,5.0319,112,0
LordDagz69,34,99.0000,78.6765,40.5154,99,0
Fairouz2001,11,1.0000,45.4545,51.0752,99,1
abdu6969,31,50.0000,52.7742,32.2973,99,0
...,...,...,...,...,...,...
Thiaron,1,0.0000,0.0000,,0,0
Sythe06,1,0.0000,0.0000,,0,0
Mizqu,1,0.0000,0.0000,,0,0
somewhatsurreal,2,0.0000,0.0000,0.0000,0,0


### `times_watched` ~ `user_rating`

In [121]:
# Pearson correlation between rewatch count and the rating given.
df['times_watched'].corr(other=df['user_rating'], method='pearson')

0.07280553071580671

There is only a very weak positive correlation (r ≈ 0.07) between `times_watched` and `user_rating`.