In [92]:
import math
import pandas as pd
import numpy as np
from scipy import stats
from matplotlib.pylab import plt
import seaborn as sns

In [93]:
movies_df = pd.read_csv("movies_with_franchises.csv", index_col = 0)
games_df = pd.read_csv("clean_games_with_franchises.csv", index_col = 0)

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


Naming convention to standardise the column names

In [94]:
games_df = games_df.drop("titleType", axis = 1)
games_df = games_df.rename(columns = {"rating": "age_rating", "startYear": "year", "primaryTitle": "title", "averageRating": "rating", "numVotes": "votes", "isAdult" : "is_adult"})
movies_df = movies_df.rename(columns={"startYear": "year", "primaryTitle": "title", "isAdult" : "is_adult", "runtimeMinutes": "runtime", "averageRating": "rating", "numVotes": "votes"})
movies_df = movies_df[["franchise_name", "franchise_id", "tconst", "title", "year", "is_adult", "runtime", "genres", "rating", "votes"]]

In [95]:
games_df.sample()

Unnamed: 0,franchise_name,franchise_id,tconst,title,year,is_adult,rating,votes,game_id,best_fit_ratio,genres,developers,platforms,publishers,age_rating
1099,Blade,3025-2228,tt0280488,Blade,2000,0,6.1,89.0,2328.0,100.0,"['Adventure', 'Action', 'Shooter', 'Crime']",['Zero Gravity Entertainment'],"['PlayStation', 'Dreamcast', 'PC']",['Marvel Studios'],


In [96]:
movies_df.sample()

Unnamed: 0,franchise_name,franchise_id,tconst,title,year,is_adult,runtime,genres,rating,votes
232577,,,tt3214248,all relative,2014,0,85,"Comedy,Drama,Romance",5.4,1631.0


- Normalize movies and games 
- Calculate z-scores
- Add the following headings\
    &emsp;"Time until next movie"\
    &emsp;"Difference in rating"
    
    &emsp;"Number in Franchise"

Get number of movies/games in franchise

In [97]:
movies_df['franchise_count'] = movies_df['franchise_id'].map(movies_df['franchise_id'].value_counts())
games_df['franchise_count'] = games_df['franchise_id'].map(games_df['franchise_id'].value_counts())
movies_df.sample()

Unnamed: 0,franchise_name,franchise_id,tconst,title,year,is_adult,runtime,genres,rating,votes,franchise_count
53094,,,tt0088082,conquest,1982,0,142,"Crime,Drama,Thriller",6.0,41.0,


Get an overview of how many franchises have x amount of movies / games

In [98]:
movies_df["franchise_id"].value_counts().value_counts().sort_index()

1      122
2      135
3      276
4      158
5       93
6       69
7       54
8       47
9       34
10      17
11      21
12      18
13      13
14       9
15       8
16       8
17       6
18       3
19       4
20       4
21       7
22       6
23       4
24       4
25       3
26       2
27       3
28       4
29       5
30       1
32       4
33       2
34       2
35       2
37       2
38       1
40       1
42       3
44       1
45       1
49       1
57       1
59       1
61       1
62       2
68       1
80       1
82       1
95       1
127      1
148      1
Name: franchise_id, dtype: int64

In [99]:
games_df["franchise_id"].value_counts().value_counts().sort_index()

1      489
2      427
3      328
4      199
5      129
6      123
7       63
8       47
9       35
10      26
11      23
12      22
13      11
14      18
15      14
16      11
17       6
18       7
19       9
20      10
21       8
22       5
23       7
24       1
25       2
26       1
27       3
28       3
29       2
30       1
31       3
33       1
34       3
35       5
36       2
37       4
38       1
40       1
41       1
45       1
46       1
48       1
49       2
50       1
56       2
58       1
60       1
61       1
79       2
80       1
82       1
84       1
88       1
106      1
121      1
172      1
195      1
216      1
Name: franchise_id, dtype: int64

Get total number of franchises:

In [100]:
movies_df.franchise_id.nunique()

1169

In [101]:
games_df.franchise_id.nunique()

2073

Remove any movies or games where the total franchise count after matching the series is less than 3

In [102]:
movies_df[["franchise_id", "franchise_name", "franchise_count"]] = movies_df[["franchise_id", "franchise_name", "franchise_count"]].apply(lambda x: np.nan if x["franchise_count"] < 3 else x, axis=1).sort_values("franchise_count")
games_df[["franchise_id", "franchise_name", "franchise_count"]] = games_df[["franchise_id", "franchise_name", "franchise_count"]].apply(lambda x: np.nan if x["franchise_count"] < 3 else x, axis=1).sort_values("franchise_count")

Get total number of franchises with more than 3:

In [103]:
movies_df.franchise_id.nunique()

912

In [104]:
games_df.franchise_id.nunique()

1157

Remove non-numerical years from data and convert to int

In [105]:
movies_df = movies_df[~(movies_df["year"] == "\\N")]
games_df = games_df[~(games_df["year"] == "\\N")]
movies_df["year"] = movies_df["year"].apply(lambda x: int(x))
games_df["year"] = games_df["year"].apply(lambda x: int(x))

Create additional columns for the datasets containing average rating for the franchises

In [107]:
av_vote_year = movies_df.groupby(['franchise_id', 'year'])['rating'].mean().reset_index(drop=False)
movies_df = movies_df.merge(av_vote_year[['franchise_id', 'year', 'rating']], on=['franchise_id', 'year'], how = "outer", suffixes=(None, '_franchise_average'))
av_vote_year = games_df.groupby(['franchise_id', 'year'])['rating'].mean().reset_index(drop=False)
games_df = games_df.merge(av_vote_year[['franchise_id', 'year', 'rating']], on=['franchise_id', 'year'], how = "outer", suffixes=(None, '_franchise_average'))


## Creating a modified dataset, containing merged years

In [109]:
grouped_movies_df = movies_df.copy(deep=True)

In [117]:
grouped_movies_df['franchise_position_by_year'] = grouped_movies_df.groupby(["franchise_id", "year"])["year"].rank()

In [118]:
grouped_movies_df.sort_values(["franchise_id", "year"])

Unnamed: 0,franchise_name,franchise_id,tconst,title,year,is_adult,runtime,genres,rating,votes,franchise_count,rating_franchise_average,franchise_position_by_year
259208,The Aldrich Family,f0,tt0032123,what a life,1939,0,75,"Comedy,Drama",6.9,93.0,11.0,6.9,1.0
259209,The Aldrich Family,f0,tt0033834,life with henry,1940,0,80,"Comedy,Family,Music",6.0,49.0,11.0,6.0,1.0
259210,The Aldrich Family,f0,tt0033708,henry aldrich for president,1941,0,75,"Comedy,Family",6.6,146.0,11.0,6.6,1.0
259211,The Aldrich Family,f0,tt0034842,"henry aldrich, editor",1942,0,72,"Comedy,Drama,Family",6.4,150.0,11.0,6.8,1.5
259212,The Aldrich Family,f0,tt0034844,henry and dizzy,1942,0,71,"Comedy,Family",7.2,58.0,11.0,6.8,1.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...
50319,,,tt9604202,mandala: the ufo incident,2023,0,\N,"Drama,Sci-Fi",8.1,9.0,,,282298.5
50320,,,tt9753314,the shepherd's supper: to deipno tou voskou,2023,0,\N,Drama,6.6,558.0,,,282298.5
50321,,,tt9755806,big shark,2023,0,\N,Horror,8.2,88.0,,,282298.5
50322,,,tt9861230,the quiet migration,2023,0,102,Drama,7.0,35.0,,,282298.5


## More code

In [36]:
movies_df = movies_df.sort_values(['franchise_id', 'year'])
movies_df['Year Gap'] = movies_df.groupby('franchise_id')['year'].diff()
movies_df['Rating_Difference'] = movies_df.groupby('franchise_id')['rating'].diff()
movies_df['Votes_Difference'] = movies_df.groupby('franchise_id')['votes'].diff()

In [41]:
#movies_df['AAAAHH'] = movies_df.groupby(['franchise_id','year'])['year'].diff()
movies_df['year_diff'] = movies_df.groupby(['franchise_id', 'year'])['year'].diff().reset_index(drop=True)

In [43]:
movies_df['year_difference'] = movies_df.groupby(['franchise_id', 'year'])['year'].diff()

# exclude same year differences
movies_df.loc[movies_df['year_difference'] == 0, 'year_difference'] = pd.NA

In [None]:
grouped_movies = 

In [44]:
movies_df.head(20)

Unnamed: 0,franchise_name,franchise_id,tconst,title,year,is_adult,runtime,genres,rating,votes,franchise_count,rating_franchise_average,Year Gap,Rating_Difference,Votes_Difference,AAAAHH,year_diff,year_difference
4822,The Aldrich Family,f0,tt0032123,what a life,1939,0,75,"Comedy,Drama",6.9,93.0,11.0,6.9,,,,,,
4823,The Aldrich Family,f0,tt0033834,life with henry,1940,0,80,"Comedy,Family,Music",6.0,49.0,11.0,6.0,1.0,-0.9,-44.0,,,
4824,The Aldrich Family,f0,tt0033708,henry aldrich for president,1941,0,75,"Comedy,Family",6.6,146.0,11.0,6.6,1.0,0.6,97.0,,,
4825,The Aldrich Family,f0,tt0034842,"henry aldrich, editor",1942,0,72,"Comedy,Drama,Family",6.4,150.0,11.0,6.8,1.0,-0.2,4.0,,,
4826,The Aldrich Family,f0,tt0034844,henry and dizzy,1942,0,71,"Comedy,Family",7.2,58.0,11.0,6.8,0.0,0.8,-92.0,0.0,,
4827,The Aldrich Family,f0,tt0035985,henry aldrich swings it,1943,0,64,"Comedy,Family",8.3,110.0,11.0,7.0,1.0,1.1,52.0,,,
4828,The Aldrich Family,f0,tt0035983,henry aldrich gets glamour,1943,0,72,"Adventure,Comedy,Family",6.4,130.0,11.0,7.0,0.0,-1.9,20.0,0.0,,
4829,The Aldrich Family,f0,tt0035984,henry aldrich haunts a house,1943,0,73,"Adventure,Comedy,Family",6.3,151.0,11.0,7.0,0.0,-0.1,21.0,0.0,,
4830,The Aldrich Family,f0,tt0036909,"henry aldrich, boy scout",1944,0,66,"Comedy,Family",6.5,129.0,11.0,7.7,1.0,0.2,-22.0,,,
4831,The Aldrich Family,f0,tt0036907,henry aldrich plays cupid,1944,0,65,"Comedy,Family",8.2,100.0,11.0,7.7,0.0,1.7,-29.0,0.0,,


In [None]:
movies_df = movies_df.sort_values(['franchise_id', 'startYear'])
movies_df['Rating_Difference'] = movies_df.groupby('franchise_id')['averageRating'].diff()

In [None]:
movies_df = movies_df.sort_values(['franchise_id', 'startYear'])
movies_df['Votes_Difference'] = movies_df.groupby('franchise_id')['numVotes'].diff()

In [None]:
movies_df['franchise_position'] = movies_df.groupby("franchise_id")["startYear"].rank(method="first")

In [None]:
movies_df3

In [None]:
movies_df.groupby(['franchise_id', 'startYear']).size()

In [None]:
movies_df

In [None]:
movies_df.head(50)

In [None]:
movies_df["Year Gap"].value_counts().plot()

# Notes
Using value_counts we discover 25% of our movies for the franchises are released in the same year. So we have to either get an average per year (in this case, we are looking at fatique over years as opposed to number of movies)

In [None]:
movies_df[movies_df["franchise_id"] == "f1"].sort_values("startYear")

In [None]:
cols = ["tconst", "averageRating", "numVotes", "startYear"]
#new_cols = ["tconst", "rating", "votes", "year"]

games_scores = games_df[cols]

movies_scores = movies_df[cols]

In [None]:
%matplotlib inline

sns.set_context("talk", font_scale=.9)   
sns.set_context('talk')

fig_width, fig_height = 4, 3

In [None]:
fig, ax = plt.subplots(figsize=(fig_width*3, fig_height*2))

interval = 0.01
((movies_scores['averageRating']/10)).hist(
    bins=25, ax=ax, grid=False, color="red", density=True, alpha=.4)
((games_scores['averageRating']/10)).hist(
    bins=25, ax=ax, grid=False, color="blue", density=True, alpha=.4)


ax.set_ylim(0, 5)
ax.set_xlim(0, 1)

ax.legend(['Movies', 'Games'], ncol=2, loc='upper left')

mean_movie_rating = (movies_scores['averageRating']/10).mean()
mean_game_rating = (games_scores['averageRating']/10).mean()

ax.axvline(mean_movie_rating, color="red")
ax.axvline(mean_game_rating, color="blue")

ax.text(mean_movie_rating, 6, '[1]', ha='center',color="red", backgroundcolor='w', fontsize=14)

ax.text(mean_game_rating, 6, '[2]', ha='center',color="blue", backgroundcolor='w', fontsize=14)


ax.text(
    0, 2,
    '''
    [1] Mean Movie Rating (All Movies, |)
    
    [2] Mean Game Rating (All Games, |)
    ''',
    fontsize=14
)

ax.set_xlabel('Movie/Game Rating (Normed)')
ax.set_ylabel('Relative Number of Movies/Games')

ax.set_title('{:,} Movies, {:,} Games'.format(len(movies_scores), len(games_scores)))

fig.tight_layout()

#fig.savefig('../../graphs/2400_fig_1_normed_rating_distributions.png', format='png', dpi=300)

In [None]:
movies_df['averageRating'].agg(['mean', 'std'])

In [None]:
movies_df['averageRating']

In [None]:
(movies_df['averageRating']-movies_df['averageRating'].mean())/movies_df['averageRating'].std()

In [None]:
stats.zscore(movies_df['averageRating'])

In [None]:
stats.zscore(movies_df['averageRating']).mean()

In [None]:
movies_df['z_rating_for_movie'] = stats.zscore(movies_df['averageRating'])

games_df['z_rating_for_game'] = stats.zscore(games_df['averageRating'])

fig, ax = plt.subplots(figsize=(fig_width*3, fig_height*2))


((movies_df['z_rating_for_movie'])).hist(
    bins=50, ax=ax, grid=False, color="red", density=True, alpha=.4)
((games_df['z_rating_for_game'])).hist(
    bins=50, ax=ax, grid=False, color="blue", density=True, alpha=.4)


ax.set_xlim(-5, 5)

ax.legend(['Movies', 'Games'], ncol=2, loc='upper left')

ax.axvline((movies_df['z_rating_for_movie']).mean(), lw=2, c="red")
ax.axvline((games_df['z_rating_for_game']).mean(), lw=2, c="blue")


ax.set_xlabel('Movie/Games Rating (Z-Scores)')
ax.set_ylabel('Relative Number of Movies/Games')
ax.set_title('{:,} Movies, {:,} Games'.format(len(movies_df), len(games_df)))

fig.tight_layout()

#fig.savefig('../../graphs/2400_fig_2_z_rating_distributions.png', format='png', dpi=300)

In [52]:
movies_df.count()

franchise_name              7481
franchise_id                7481
tconst                      7481
title                       7481
year                        7481
is_adult                    7481
runtime                     7481
genres                      7481
rating                      7481
votes                       7481
franchise_count             7481
rating_franchise_average    7481
Year Gap                    6569
Rating_Difference           6569
Votes_Difference            6569
AAAAHH                      1630
year_diff                   1630
year_difference                0
dtype: int64

In [89]:
sample_df = movies_df.sample(10000)
grouped_df = sample_df.groupby('franchise_id').apply(sample_func)

In [91]:
grouped_df

franchise_id           
f1            tt0071479    1
              tt0077517    2
f102          tt0040432    1
f103          tt0121864    1
f1041         tt0077737    1
                          ..
f97           tt0235005    1
f970          tt9732250    1
f977          tt2386490    1
f98           tt0039210    1
f987          tt0369060    1
Length: 279, dtype: int64

In [None]:
franchise_df = grouped_df[grouped_df['franchise_id'] == 'f1334']

In [88]:
def sample_func(sub_df):
    
    movies_in_order = sub_df.sort_values('year')['tconst']
    return pd.Series({movie: k+1 for k, movie in enumerate(movies_in_order)})

In [None]:
sample_func(franchise_df)

In [None]:
l = ['a','b','c']
for k,i in enumerate(l):
    print(k,i)
    print(i,  '-->',  k+1)

In [None]:
franchise_df.sort_values('startYear')['startYear']

In [None]:
franchise_df.set_index('tconst').sort_values('startYear')['startYear'].map(int).diff()