In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the two input tables from CSV files given by relative path
# (resolved against the notebook's working directory).
ratings = pd.read_csv('ratings.csv')
movies = pd.read_csv('movies.csv')

In [3]:
# Preview the first rows of the movies table.
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
# Number of movies whose genre string contains 'Fantasy' (plain substring match, no regex).
movies['genres'].str.contains('Fantasy', regex=False).sum()

654

In [5]:
# Attach title/genres to every rating. how='left' keeps ratings whose movieId is absent
# from `movies`; validate='many_to_one' makes pandas raise MergeError if `movies` contains
# duplicate movieId values, which would otherwise silently multiply rating rows.
joined = ratings.merge(movies, on='movieId', how='left', validate='many_to_one')

In [6]:
# Preview the first rows of the ratings table enriched with movie title and genres.
joined.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,31,2.5,1260759144,Dangerous Minds (1995),Drama
1,1,1029,3.0,1260759179,Dumbo (1941),Animation|Children|Drama|Musical
2,1,1061,3.0,1260759182,Sleepers (1996),Thriller
3,1,1129,2.0,1260759185,Escape from New York (1981),Action|Adventure|Sci-Fi|Thriller
4,1,1172,4.0,1260759205,Cinema Paradiso (Nuovo cinema Paradiso) (1989),Drama


In [7]:
# Sanity check: the left join must leave the number of rating rows unchanged.
ratings.shape[0] == joined.shape[0]

True

In [8]:
# Load the tab-separated advertising statistics and preview the first rows.
direct_stats = pd.read_csv('direct_stats.tsv', sep='\t')
direct_stats.head()

Unnamed: 0,date,campaign,views,clicks,cost
0,2018-01-01,landings_promo,38120423,49557,1139801
1,2018-01-01,homepage_partner_1,5729483,12605,189073
2,2018-01-01,homepage_partner_2,4412029,9265,176040
3,2018-01-01,socdem_w_25-34_vip_test,913823,2559,89555
4,2018-01-02,landings_promo,40873806,61311,1471457


In [9]:
# Load the tab-separated CRM order statistics and preview the first rows.
crm_stats = pd.read_csv('crm_stats.tsv', sep='\t')
crm_stats.head()

Unnamed: 0,date,campaign,orders
0,2018-01-01,landings_promo,1487
1,2018-01-01,homepage_partner_1,386
2,2018-01-01,homepage_partner_2,315
3,2018-01-01,socdem_w_25-34_vip_test,85
4,2018-01-02,landings_promo,1605


In [10]:
# Combine ad statistics with CRM orders on the same date and campaign. Only (date, campaign)
# pairs present in both tables are kept (inner join, which is also pandas' default).
joined_data = pd.merge(direct_stats, crm_stats, how='inner', on=['date', 'campaign'])

In [11]:
# Preview the first rows of the merged ad-statistics / orders table.
joined_data.head()

Unnamed: 0,date,campaign,views,clicks,cost,orders
0,2018-01-01,landings_promo,38120423,49557,1139801,1487
1,2018-01-01,homepage_partner_1,5729483,12605,189073,386
2,2018-01-01,homepage_partner_2,4412029,9265,176040,315
3,2018-01-01,socdem_w_25-34_vip_test,913823,2559,89555,85
4,2018-01-02,landings_promo,40873806,61311,1471457,1605


In [12]:
# NOTE: despite the column name, this value is cost divided by the number of orders in the
# same row, i.e. spend per order for that date/campaign.
joined_data['cost_per_day'] = joined_data['cost'].div(joined_data['orders'])

In [13]:
# Preview the table again, now including the derived cost_per_day column.
joined_data.head()

Unnamed: 0,date,campaign,views,clicks,cost,orders,cost_per_day
0,2018-01-01,landings_promo,38120423,49557,1139801,1487,766.510424
1,2018-01-01,homepage_partner_1,5729483,12605,189073,386,489.826425
2,2018-01-01,homepage_partner_2,4412029,9265,176040,315,558.857143
3,2018-01-01,socdem_w_25-34_vip_test,913823,2559,89555,85,1053.588235
4,2018-01-02,landings_promo,40873806,61311,1471457,1605,916.795639


In [14]:
# Rounded cost_per_day value for the 'landings_promo' campaign on 2018-01-01.
joined_data.loc[
    (joined_data['campaign'] == 'landings_promo') & (joined_data['date'] == '2018-01-01'),
    'cost_per_day',
].round()

0    767.0
Name: cost_per_day, dtype: float64

In [21]:
# Genres tracked by the analysis; the order defines the order of the values
# returned by genres_ratings().
genres = ['Drama', 'Action', 'Thriller', 'Comedy', 'Romance', 'War', 'Mystery', 'Crime']

def genres_ratings(row):
    """Spread one rating across the tracked genre slots.

    Parameters
    ----------
    row : mapping or pd.Series
        Must provide ``row['rating']`` and ``row['genres']``; the genre value is a
        ``'|'``-separated string such as ``'Comedy|Drama|Romance'``.

    Returns
    -------
    pd.Series
        One value per entry of ``genres`` (same order, default integer index):
        the row's rating where that genre is listed for the row, NaN otherwise.
    """
    listed = row['genres']
    # A missing genre value (e.g. NaN left behind by a left join) is not text;
    # treat it as "no genres" instead of raising TypeError on the membership test.
    tags = set(listed.split('|')) if isinstance(listed, str) else set()
    # Compare whole genre tokens rather than substrings, so a short name such as 'War'
    # cannot match inside a longer tag.
    # np.nan is used because the np.NaN alias was removed in NumPy 2.0.
    return pd.Series([row['rating'] if genre in tags else np.nan for genre in genres])

# Add one column per tracked genre: the row's rating when the movie has that genre,
# NaN otherwise.
joined[genres] = joined.apply(genres_ratings, axis=1)

# Determine how many ratings each genre has, to see which genre has the fewest ratings.
# Series.count() returns the number of non-missing values in the column.
for genre in genres:
    print(genre, joined[genre].count())

Drama 44752
Action 27056
Thriller 25240
Comedy 38026
Romance 19336
War 5025
Mystery 7625
Crime 16266


In [24]:
# The per-genre columns were already added to `joined` by the earlier
# `joined[genres] = joined.apply(genres_ratings, axis=1)` cell. The identical row-wise apply
# that used to be repeated here only recomputed the same values, so this cell just
# previews the result.
joined.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres,Drama,Action,Thriller,Comedy,Romance,War,Mystery,Crime
0,1,31,2.5,1260759144,Dangerous Minds (1995),Drama,2.5,,,,,,,
1,1,1029,3.0,1260759179,Dumbo (1941),Animation|Children|Drama|Musical,3.0,,,,,,,
2,1,1061,3.0,1260759182,Sleepers (1996),Thriller,,,3.0,,,,,
3,1,1129,2.0,1260759185,Escape from New York (1981),Action|Adventure|Sci-Fi|Thriller,,2.0,2.0,,,,,
4,1,1172,4.0,1260759205,Cinema Paradiso (Nuovo cinema Paradiso) (1989),Drama,4.0,,,,,,,


In [25]:
# Report the average rating per tracked genre; Series.mean() skips the NaN entries that
# mark rows not belonging to the genre.
template = '{} mean rating {:.2f}'
for genre in genres:
    genre_mean = joined[genre].mean()
    print(template.format(genre, genre_mean))

Drama mean rating 3.68
Action mean rating 3.45
Thriller mean rating 3.52
Comedy mean rating 3.45
Romance mean rating 3.56
War mean rating 3.82
Mystery mean rating 3.68
Crime mean rating 3.68
