In [6]:
import pandas as pd 
import math
import scipy.stats as st
from sklearn.preprocessing import MinMaxScaler

# Display settings: show every column, do not wrap wide frames across lines,
# and render floats with five decimal places.
pd.set_option("display.max_columns", None)
pd.set_option("display.expand_frame_repr", False)
pd.set_option("display.float_format", lambda value: "%.5f" % value)


# Read the movie metadata and keep only the three columns used below.
df = pd.read_csv("movies_metadata.csv", low_memory=False)[
    ["title", "vote_average", "vote_count"]
]

In [7]:
df.shape

(45466, 3)

In [8]:
df.head()

Unnamed: 0,title,vote_average,vote_count
0,Toy Story,7.7,5415.0
1,Jumanji,6.9,2413.0
2,Grumpier Old Men,6.5,92.0
3,Waiting to Exhale,6.1,34.0
4,Father of the Bride Part II,5.7,173.0


In [9]:
## Sort by vote_average (highest first) and show the first 20 rows
df.sort_values("vote_average", ascending = False).head(20)


Unnamed: 0,title,vote_average,vote_count
21642,Ice Age Columbus: Who Were the First Americans?,10.0,1.0
15710,If God Is Willing and da Creek Don't Rise,10.0,1.0
22396,Meat the Truth,10.0,1.0
22395,Marvin Hamlisch: What He Did For Love,10.0,1.0
35343,Elaine Stritch: At Liberty,10.0,1.0
186,Reckless,10.0,1.0
45047,The Human Surge,10.0,1.0
22377,The Guide,10.0,1.0
22346,هیچ کجا هیچ کس,10.0,1.0
1634,Other Voices Other Rooms,10.0,1.0


In [10]:
# Sorting this way ignores vote_count, so the resulting ranking is wrong and biased.
# The first step is to bring vote_count and vote_average onto the same range.


In [11]:
# Rescale vote_count onto the 1-10 range and store it as vote_count_score.
count_scaler = MinMaxScaler(feature_range=(1, 10))
df["vote_count_score"] = count_scaler.fit_transform(df[["vote_count"]])

In [12]:
df.head()

Unnamed: 0,title,vote_average,vote_count,vote_count_score
0,Toy Story,7.7,5415.0,4.46252
1,Jumanji,6.9,2413.0,2.54295
2,Grumpier Old Men,6.5,92.0,1.05883
3,Waiting to Exhale,6.1,34.0,1.02174
4,Father of the Bride Part II,5.7,173.0,1.11062


In [13]:
# Combine the two signals: multiply the average rating by the scaled vote count.
df["average_count_score"] = df["vote_average"].mul(df["vote_count_score"])


In [14]:
df.sort_values("average_count_score", ascending = False).head(20)

Unnamed: 0,title,vote_average,vote_count,vote_count_score,average_count_score
15480,Inception,8.1,14075.0,10.0,81.0
12481,The Dark Knight,8.3,12269.0,8.84519,73.41505
22879,Interstellar,8.1,11187.0,8.15332,66.0419
17818,The Avengers,7.4,12000.0,8.67318,64.18153
14551,Avatar,7.2,12114.0,8.74607,62.97174
26564,Deadpool,7.4,11444.0,8.31766,61.55065
2843,Fight Club,8.3,9678.0,7.18842,59.66388
20051,Django Unchained,7.8,10297.0,7.58423,59.15697
23753,Guardians of the Galaxy,7.9,10014.0,7.40327,58.48582
292,Pulp Fiction,8.3,8670.0,6.54387,54.31414


In [16]:
# IMDb Weighted Rating

# weighted_rating = (v / (v + M)) * r + (M / (v + M)) * C

# r = average rating of the movie
# v = number of votes for the movie
# M = minimum votes required to be listed in the Top 250
# C = the mean vote across the whole report (currently 7.0)


In [17]:
# M: vote-count threshold passed to weighted_rating as its M argument.
M = 2500
# C: mean of the vote_average column over every row currently in df.
C = df['vote_average'].mean()

def weighted_rating(r,v,M,C):
    """Blend an item's own rating with a global mean, weighted by its vote count.

    Evaluates ``v / (v + M) * r + M / (v + M) * C``.

    Args:
        r: Rating of the item.
        v: Number of votes the item received.
        M: Vote-count threshold; the larger it is, the more the result is
            pulled towards ``C``.
        C: Global mean rating used as the fallback value.

    Returns:
        The weighted rating. Only arithmetic operators are used, so any
        operands supporting ``+``, ``*`` and ``/`` are accepted; the notebook
        calls it with plain numbers and with pandas Series.
    """
    total_weight = v + M
    own_share = v / total_weight
    prior_share = M / total_weight
    return own_share * r + prior_share * C

In [18]:
df.sort_values("average_count_score", ascending = False).head(20)

Unnamed: 0,title,vote_average,vote_count,vote_count_score,average_count_score
15480,Inception,8.1,14075.0,10.0,81.0
12481,The Dark Knight,8.3,12269.0,8.84519,73.41505
22879,Interstellar,8.1,11187.0,8.15332,66.0419
17818,The Avengers,7.4,12000.0,8.67318,64.18153
14551,Avatar,7.2,12114.0,8.74607,62.97174
26564,Deadpool,7.4,11444.0,8.31766,61.55065
2843,Fight Club,8.3,9678.0,7.18842,59.66388
20051,Django Unchained,7.8,10297.0,7.58423,59.15697
23753,Guardians of the Galaxy,7.9,10014.0,7.40327,58.48582
292,Pulp Fiction,8.3,8670.0,6.54387,54.31414


In [20]:
weighted_rating(7.40000, 11444.00000,M,C) #Deadpool filmi 

7.080544896574546

In [22]:
weighted_rating(8.10000,14075.00000,M,C) #Inception

7.725672279809078

In [23]:
weighted_rating(8.50000,8358.00000,M,C) #Esaretin Bedeli, The Shawshank Redemption

7.83648167598411

In [24]:
# Score every row at once: the Series arguments are combined elementwise.
df["weighted_rating"] = weighted_rating(
    r=df["vote_average"],
    v=df["vote_count"],
    M=M,
    C=C,
)

In [25]:
df.head()

Unnamed: 0,title,vote_average,vote_count,vote_count_score,average_count_score,weighted_rating
0,Toy Story,7.7,5415.0,4.46252,34.36142,7.04245
1,Jumanji,6.9,2413.0,2.54295,17.54634,6.24775
2,Grumpier Old Men,6.5,92.0,1.05883,6.88238,5.64951
3,Waiting to Exhale,6.1,34.0,1.02174,6.23262,5.62467
4,Father of the Bride Part II,5.7,173.0,1.11062,6.33054,5.6235


In [26]:
df.sort_values("weighted_rating",ascending = False).head(20)

Unnamed: 0,title,vote_average,vote_count,vote_count_score,average_count_score,weighted_rating
12481,The Dark Knight,8.3,12269.0,8.84519,73.41505,7.84604
314,The Shawshank Redemption,8.5,8358.0,6.34437,53.92714,7.83648
2843,Fight Club,8.3,9678.0,7.18842,59.66388,7.74946
15480,Inception,8.1,14075.0,10.0,81.0,7.72567
292,Pulp Fiction,8.3,8670.0,6.54387,54.31414,7.69978
834,The Godfather,8.5,6024.0,4.85194,41.24146,7.6548
22879,Interstellar,8.1,11187.0,8.15332,66.0419,7.64669
351,Forrest Gump,8.2,8147.0,6.20945,50.91748,7.59377
7000,The Lord of the Rings: The Return of the King,8.1,8226.0,6.25996,50.70571,7.52155
4863,The Lord of the Rings: The Fellowship of the Ring,8.0,8892.0,6.68583,53.48661,7.47731


In [27]:
#Bayesian Average Rating Score#
def bayesian_average_rating(n,confidence = 0.95):
    """Return a conservative (lower-bound) estimate of the mean star rating.

    Each star level gets one pseudo-vote, so the smoothed share of level
    ``k`` is ``(n_k + 1) / (N + K)``. The score is the smoothed mean minus
    ``z`` times the square root of the smoothed variance divided by
    ``N + K + 1``, where ``z`` is the two-sided normal quantile for
    ``confidence``.

    Args:
        n: Vote counts per star level, ordered from the lowest level
            (1 star) to the highest. Any sized iterable of numbers works,
            including a pandas Series with non-integer labels.
        confidence: Two-sided confidence level used to pick ``z``.

    Returns:
        The score as a float, or ``0`` when there are no votes at all.
    """
    N = sum(n)
    if N == 0:
        return 0
    K = len(n)
    z = st.norm.ppf(1-(1-confidence) / 2)
    first_part = 0.0   # smoothed mean of the star value
    second_part = 0.0  # smoothed mean of the squared star value
    # Use the iterated value n_k rather than n[k]: indexing a label-indexed
    # pandas Series with an integer depends on deprecated positional fallback.
    for k, n_k in enumerate(n):
        star = k + 1
        first_part += star * (n_k + 1) / (N + K)
        second_part += star * star * (n_k + 1) / (N + K)
    score = first_part - z * math.sqrt((second_part - first_part * first_part) / (N + K + 1))
    return score


In [29]:
# Compute the score for The Shawshank Redemption.
# The argument is the list of vote counts per star rating, ordered from 1 star to 10 stars.
bayesian_average_rating([34733,4355,4704,6561,13515,26183,87368,273082,600260,1295351])

9.14538444560111

In [31]:
# Load the per-star vote counts and drop the first CSV column by position.
# NOTE: this rebinds `df`, replacing the movie-metadata frame used in earlier cells.
df = pd.read_csv("imdb_ratings.csv").iloc[:, 1:]

In [32]:
df.head()

Unnamed: 0,id,movieName,rating,ten,nine,eight,seven,six,five,four,three,two,one
0,111161,1. The Shawshank Redemption (1994),9.2,1295382,600284,273091,87368,26184,13515,6561,4704,4355,34733
1,68646,2. The Godfather (1972),9.1,837932,402527,199440,78541,30016,16603,8419,6268,5879,37128
2,71562,3. The Godfather: Part II (1974),9.0,486356,324905,175507,70847,26349,12657,6210,4347,3892,20469
3,468569,4. The Dark Knight (2008),9.0,1034863,649123,354610,137748,49483,23237,11429,8082,7173,30345
4,50083,5. 12 Angry Men (1957),8.9,246765,225437,133998,48341,15773,6278,2866,1723,1478,8318


In [37]:
# Star-count columns ordered from 1 star to 10 stars, the order in which
# bayesian_average_rating weights its input.
star_columns = ["one", "two", "three", "four", "five", "six", "seven", "eight", "nine", "ten"]
df["bar_score"] = df.apply(
    lambda row: bayesian_average_rating(row[star_columns]),
    axis=1,
)

In [38]:
df.sort_values("bar_score", ascending = False).head(20)

Unnamed: 0,id,movieName,rating,ten,nine,eight,seven,six,five,four,three,two,one,bar_score
0,111161,1. The Shawshank Redemption (1994),9.2,1295382,600284,273091,87368,26184,13515,6561,4704,4355,34733,9.14539
1,68646,2. The Godfather (1972),9.1,837932,402527,199440,78541,30016,16603,8419,6268,5879,37128,8.94002
3,468569,4. The Dark Knight (2008),9.0,1034863,649123,354610,137748,49483,23237,11429,8082,7173,30345,8.89596
2,71562,3. The Godfather: Part II (1974),9.0,486356,324905,175507,70847,26349,12657,6210,4347,3892,20469,8.8125
4,50083,5. 12 Angry Men (1957),8.9,246765,225437,133998,48341,15773,6278,2866,1723,1478,8318,8.76793
6,167260,7. The Lord of the Rings: The Return of ...,8.9,703093,433087,270113,117411,44760,21818,10873,7987,6554,28990,8.75204
5,108052,6. Schindler's List (1993),8.9,453906,383584,220586,82367,27219,12922,6234,4572,4289,19328,8.74361
11,109830,12. Forrest Gump (1994),8.8,622104,553654,373644,151284,51140,22720,11692,7647,5941,12110,8.69915
12,1375666,13. Inception (2010),8.7,724798,627987,408686,174229,60668,26910,13436,8703,6932,17621,8.69315
10,137523,11. Fight Club (1999),8.8,637087,572654,371752,152295,53059,24755,12648,8606,6948,17435,8.67448


In [39]:
# Weighted Average Ratings
# IMDb publishes weighted vote averages rather than raw data averages.
# The simplest way to explain it is that although we accept and consider all votes received by users,
# not all votes have the same impact (or ‘weight’) on the final rating.

# When unusual voting activity is detected,
# an alternate weighting calculation may be applied in order to preserve the reliability of our system.
# To ensure that our rating mechanism remains effective,
# we do not disclose the exact method used to generate the rating.
#
# See also the complete FAQ for IMDb ratings.

In [40]:
weighted_rating(7.40, 12000, M, C)

7.092794347436929

In [41]:
weighted_rating(8.10, 14075, M, C)

7.725672279809078

In [43]:
weighted_rating(8.10, 14075, M, C)

7.725672279809078

In [44]:
weighted_rating(8.50, 8358, M, C)

7.83648167598411