In [18]:
import numpy as np
import pandas as pd
from scipy.stats import mannwhitneyu, bootstrap, ttest_ind

In [81]:
data = pd.read_csv('kinopoisk_rating.csv', sep=';')

In [3]:
data.dtypes

num             int64
name_rus       object
rating_new    float64
origin         object
genre          object
rating_old    float64
qty_views       int64
dtype: object

In [4]:
len(data)

250

In [5]:
data.head()

Unnamed: 0,num,name_rus,rating_new,origin,genre,rating_old,qty_views
0,1,Зеленая миля,9.1,США,фэнтези/ драма,8.9,692418
1,2,Побег из Шоушенка,9.1,США,драма,8.9,784326
2,3,Властелин колец: Возвращение короля,8.6,Новая Зеландия/ США,фэнтези/ приключения,8.8,481829
3,4,Властелин колец: Две крепости,8.6,Новая Зеландия/ США,фэнтези/ приключения,8.8,467607
4,5,Властелин колец: Братство Кольца,8.6,Новая Зеландия/ США,фэнтези/ приключения,8.8,516856


In [6]:
data.isna().sum()

num           0
name_rus      0
rating_new    0
origin        0
genre         0
rating_old    0
qty_views     0
dtype: int64

In [7]:
r_new = data.rating_new.values
r_old = data.rating_old.values

### Mann-Whitney U-test (All)

In [57]:
alpha = 0.05

In [8]:
U1, p = mannwhitneyu(
    x=r_old,
    y=r_new,
    alternative='two-sided',
    method='exact'
)

In [15]:
1-p < alpha

True

In [16]:
1-p

0.036259213631644194

=> the p-value is low enough to reject the null hypothesis

### Bootstrap (All)

In [61]:
print('Old: ')
print(f'Mean: {np.mean(r_old)}')

Old: 
Mean: 8.179599999999999


In [62]:
print('New: ')
print(f'Mean: {np.mean(r_new)}')

New: 
Mean: 8.1844


In [47]:
r_all = np.concatenate((r_new, r_old))

In [58]:
res_mean = bootstrap(
    data=(r_old,),
    statistic=np.mean,
    n_resamples=1000,
    confidence_level=1-alpha
)

In [59]:
res_mean.confidence_interval

BootstrapResult(confidence_interval=ConfidenceInterval(low=8.158, high=8.204088524785561), standard_error=0.012010796817871847)

In [63]:
low = res_mean.confidence_interval.low
high = res_mean.confidence_interval.high

(low > np.mean(r_new)) or (np.mean(r_new) > high)

False

=> the mean of new rating lies within confidence interval, therefore we can not reject the null hypothesis

### Genres

In [82]:
len(data.genre.unique())

76

In [83]:
all_genres = list(data.genre.unique())

processed_genres = []

for x in all_genres:
    genres = x.split('/ ')
    processed_genres += genres
    
processed_genres = list(set(processed_genres))

In [84]:
len(processed_genres)

21

### Mann-Whitney U-test (Genre)

In [85]:
for genre in processed_genres:
    print(f'Genre: {genre}')
    subsample = data[data.genre.str.contains(genre)].copy()
    
    r_new_genre = subsample.rating_new.values
    r_old_genre = subsample.rating_old.values
    
    _, p_genre = mannwhitneyu(
        x=r_old_genre,
        y=r_new_genre,
        alternative='two-sided',
        method='auto' # too long to wait for exact sometimes
    )
    
    if 1-p_genre < alpha:
        print('Result: reject the null hypothesis')
    else:
        print('Result: cannot reject the null hypothesis')
    print(f'p-value is: {round(1-p_genre, 4)}')
    print('-----------------------------------------------')

Genre: детектив
Result: cannot reject the null hypothesis
p-value is: 0.7588
-----------------------------------------------
Genre: спорт
Result: cannot reject the null hypothesis
p-value is: 0.3577
-----------------------------------------------
Genre: комедия
Result: cannot reject the null hypothesis
p-value is: 0.7895
-----------------------------------------------
Genre: фэнтези
Result: cannot reject the null hypothesis
p-value is: 0.877
-----------------------------------------------
Genre: вестерн
Result: reject the null hypothesis
p-value is: 0.0
-----------------------------------------------
Genre: криминал
Result: cannot reject the null hypothesis
p-value is: 0.7212
-----------------------------------------------
Genre: биография
Result: cannot reject the null hypothesis
p-value is: 0.248
-----------------------------------------------
Genre: мюзикл
Result: cannot reject the null hypothesis
p-value is: 0.2511
-----------------------------------------------
Genre: фантастика
R

### Bootstrap (Genre)

In [86]:
for genre in processed_genres:
    print(f'Genre: {genre}')
    subsample = data[data.genre.str.contains(genre)].copy()
    
    r_new_genre = subsample.rating_new.values
    r_old_genre = subsample.rating_old.values
    
    try:
        res = bootstrap(
            data=(r_old_genre,),
            statistic=np.mean,
            n_resamples=1000,
            confidence_level=1-alpha
        )

        low = res.confidence_interval.low
        high = res.confidence_interval.high
        n_mean = np.mean(r_new_genre)

        result = (low > n_mean) or (n_mean > high)

        if result:
            print('Result: reject the null hypothesis. Mean is outside of the confidence interval')
        else:
            print('Result: cannot reject the null hypothesis. Mean is within the confidence interval')
        print(f'Confidence interval is: [{round(low, 4)}; {round(high, 4)}]')
        print(f'Mean of new ratings in {genre} is {round(n_mean, 4)}')
    
    except ValueError:
        print(f'Not enough observations!')
        
    print('-----------------------------------------------')

Genre: детектив
Result: cannot reject the null hypothesis. Mean is within the confidence interval
Confidence interval is: [8.1571; 8.4143]
Mean of new ratings in детектив is 8.4143
-----------------------------------------------
Genre: спорт
Result: cannot reject the null hypothesis. Mean is within the confidence interval
Confidence interval is: [8.0; 8.3]
Mean of new ratings in спорт is 8.15
-----------------------------------------------
Genre: комедия
Result: reject the null hypothesis. Mean is outside of the confidence interval
Confidence interval is: [8.16; 8.2775]
Mean of new ratings in комедия is 8.2925
-----------------------------------------------
Genre: фэнтези
Result: reject the null hypothesis. Mean is outside of the confidence interval
Confidence interval is: [8.1553; 8.3114]
Mean of new ratings in фэнтези is 8.1421
-----------------------------------------------
Genre: вестерн
Result: cannot reject the null hypothesis. Mean is within the confidence interval
Confidence in

### Origin

In [89]:
len(data.origin.unique())

45

In [96]:
all_origins = list(data.origin.unique())

processed_origins = []

for x in all_origins:
    origin = x.split('/ ')
    processed_origins += origin
    
processed_origins = list(set(processed_origins))

In [97]:
len(processed_origins)

27

### Mann-Whitney U-test (Origin)

In [98]:
for origin in processed_origins:
    print(f'Origin: {origin}')
    subsample = data[data.origin.str.contains(origin)].copy()
    
    r_new_origin = subsample.rating_new.values
    r_old_origin = subsample.rating_old.values
    
    try:
        _, p_origin = mannwhitneyu(
            x=r_old_origin,
            y=r_new_origin,
            alternative='two-sided',
            method='auto' # too long to wait for exact sometimes
        )

        if 1-p_origin < alpha:
            print('Result: reject the null hypothesis')
        else:
            print('Result: cannot reject the null hypothesis')
        print(f'p-value is: {round(1-p_origin, 4)}')
        
    except ValueError:
        print('Not enough observations!')
        
    print('-----------------------------------------------')

Origin: Люксембург
Result: reject the null hypothesis
p-value is: 0.0
-----------------------------------------------
Origin: Австралия
Result: cannot reject the null hypothesis
p-value is: 0.4934
-----------------------------------------------
Origin: Беларусь
Result: reject the null hypothesis
p-value is: 0.0
-----------------------------------------------
Origin: Мексика
Result: reject the null hypothesis
p-value is: 0.0
-----------------------------------------------
Origin: Италия
Result: cannot reject the null hypothesis
p-value is: 0.8903
-----------------------------------------------
Origin: Франция
Result: cannot reject the null hypothesis
p-value is: 0.147
-----------------------------------------------
Origin: Япония
Result: cannot reject the null hypothesis
p-value is: 0.3615
-----------------------------------------------
Origin: Китай
Result: reject the null hypothesis
p-value is: 0.0
-----------------------------------------------
Origin: Бельгия
Result: reject the null

### Bootstrap (Genre)

In [99]:
for origin in processed_origins:
    print(f'Origin: {origin}')
    subsample = data[data.origin.str.contains(origin)].copy()
    
    r_new_origin = subsample.rating_new.values
    r_old_origin = subsample.rating_old.values
    
    try:
        res = bootstrap(
            data=(r_old_origin,),
            statistic=np.mean,
            n_resamples=1000,
            confidence_level=1-alpha
        )

        low = res.confidence_interval.low
        high = res.confidence_interval.high
        n_mean = np.mean(r_new_origin)

        result = (low > n_mean) or (n_mean > high)

        if result:
            print('Result: reject the null hypothesis. Mean is outside of the confidence interval')
        else:
            print('Result: cannot reject the null hypothesis. Mean is within the confidence interval')
        print(f'Confidence interval is: [{round(low, 4)}; {round(high, 4)}]')
        print(f'Mean of new ratings in {origin} is {round(n_mean, 4)}')
    
    except ValueError:
        print(f'Not enough observations!')
        
    print('-----------------------------------------------')

Origin: Люксембург
Not enough observations!
-----------------------------------------------
Origin: Австралия
Result: cannot reject the null hypothesis. Mean is within the confidence interval
Confidence interval is: [8.0; 8.4667]
Mean of new ratings in Австралия is 8.1333
-----------------------------------------------
Origin: Беларусь
Not enough observations!
-----------------------------------------------
Origin: Мексика
Result: reject the null hypothesis. Mean is outside of the confidence interval
Confidence interval is: [8.0; 8.15]
Mean of new ratings in Мексика is 8.2
-----------------------------------------------
Origin: Италия
Result: reject the null hypothesis. Mean is outside of the confidence interval
Confidence interval is: [8.0; 8.1714]
Mean of new ratings in Италия is 8.2857
-----------------------------------------------
Origin: Франция
Result: cannot reject the null hypothesis. Mean is within the confidence interval
Confidence interval is: [8.0692; 8.2231]
Mean of new r