In [5]:
import numpy as np
import pandas as pd
#import modin.pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from scipy import sparse
from tqdm import tqdm

%matplotlib inline

In [6]:
to_list = lambda rubrics: [int(rubric) for rubric in str(rubrics).split(' ')]
def apply_to_columns(df, columns, func=to_list):
    for column in columns:
        df.loc[~df[column].isnull(), column] = df.loc[~df[column].isnull(), column].apply(func)

In [7]:
users = pd.read_csv('users.csv') # Информация о городе проживания пользователя:
print('Users: ', users['user_id'].nunique())
users.head(3)

Users:  1252801


Unnamed: 0,user_id,city
0,523295021912509756,msk
1,11952159487361099606,msk
2,16879036589969590999,msk


In [8]:
aspects = pd.read_csv('aspects.csv')
aspects.sample(5)

Unnamed: 0,aspect_id,aspect_name
127,361,Чебуречная
7,8,Кальян
120,352,Ночной Клуб
112,329,Необычный Интерьер
131,366,Атмосферно


In [9]:
orgs = pd.read_csv('organisations.csv')
print('orgs', len(orgs))
columns = ['rubrics_id', 'features_id'] # create lists
apply_to_columns(orgs, columns)

orgs.head(5)

orgs 66405


Unnamed: 0,org_id,city,average_bill,rating,rubrics_id,features_id
0,16848414477362211020,spb,1000.0,4.479702,"[30776, 31375]","[1018, 1509, 11177, 11617, 11629, 11704, 11867..."
1,1430604733320164116,spb,1000.0,4.514509,"[30776, 30770]","[246, 1018, 11617, 11629, 11704, 11867, 20422,..."
2,9880309324224147401,spb,1000.0,3.884615,"[30770, 30774]","[1018, 11177, 11617, 11629, 11704, 11867, 2042..."
3,5617879987171966456,spb,1000.0,,"[30774, 30775]","[1018, 1509, 10596, 11177, 11629, 11634, 11704..."
4,5241461680470612149,spb,1000.0,4.532468,[30776],"[1018, 11177, 11617, 11629, 11704, 11867, 2042..."


In [10]:
def count_feature(x):
    try: return len(x)
    except: return 0

orgs['feature_count'] = orgs['features_id'].apply(count_feature)
orgs.rename(columns = {'feature_count':'org_feature_count',
                      'features_id':'org_features_id'}, inplace=True)

orgs.sample(3)

Unnamed: 0,org_id,city,average_bill,rating,rubrics_id,org_features_id,org_feature_count
46063,7645213612172245484,msk,500.0,4.647059,[31495],"[1018, 11177, 11629, 11704, 11867, 20422, 2734...",7
8908,2367540157416265328,msk,500.0,3.815603,"[30777, 31286, 30774]","[1018, 11617, 20422]",3
34538,9759171774087584206,msk,2000.0,4.212219,"[30775, 30776]","[246, 1018, 1509, 11177, 11629, 11704, 11867, ...",9


In [11]:
reviews = pd.read_csv('reviews.csv', low_memory=False)

print('Reviews: ', len(reviews))
print('Users review: ', reviews.user_id.nunique())
print('Ogs review: ', reviews.org_id.nunique())

# encode users ids as numeric
print('Приклеим информацию по юзерам')
reviews = reviews.merge(users, on='user_id', how='left')
reviews = reviews.rename({'city' : 'user_city'}, axis=1)

print('Приклеим информацию по организациям')
orgs.rename({'rating': 'org_rating'}, axis=1, inplace=True)
reviews = reviews.merge(orgs[['org_id', 'city', 'org_rating', 'org_feature_count','average_bill',"rubrics_id"]], on='org_id', how='left') # encode orgs ids as numeric
reviews = reviews.rename({'city': 'org_city'}, axis=1)

columns = ['aspects'] # create lists
apply_to_columns(reviews, columns)

print('Удалим дубли')
len_before_drop_duplics = len(reviews)
reviews.drop_duplicates(['user_id', 'org_id', 'rating', 'ts','user_city', 'org_city'], inplace=True) # Удаляем дубли
len_after_drop_duplics = len(reviews)
print('Выброшено дублей: ', len_before_drop_duplics - len_after_drop_duplics)

reviews.sample(7)

Reviews:  3640835
Users review:  1252801
Ogs review:  66405
Приклеим информацию по юзерам
Приклеим информацию по организациям
Удалим дубли
Выброшено дублей:  58038


Unnamed: 0,user_id,org_id,rating,ts,aspects,user_city,org_city,org_rating,org_feature_count,average_bill,rubrics_id
949787,7008352811675445222,11203554873522367270,4.0,758,,msk,msk,3.940115,9,,[30771]
1596144,11589269367304202968,17856941558984703627,5.0,1195,,msk,msk,3.87,9,1000.0,"[30774, 30776]"
91709,5823891498847881476,4778021444224481854,5.0,779,,msk,msk,4.562173,10,,[30774]
3401104,10034399934379136157,15917924804407781285,5.0,706,,spb,spb,4.0,4,,"[30774, 31401]"
891603,13000979092339535420,15852093650986928757,3.0,1053,,msk,msk,4.549801,7,,[30770]
2862898,6338836121661467223,12046097390037935713,5.0,587,,spb,spb,4.51892,7,,[30776]
895449,10434247613307835722,15937407791431111086,5.0,1025,,msk,spb,3.911017,9,500.0,"[30774, 30771]"


In [113]:
#msk_orgs = train_reviews[(train_reviews['rating'] >= 4) & (train_reviews['org_city'] == 'msk')]['org_id']

reviews_new = reviews[(reviews['org_rating']>4.9) & (reviews['rating']>0)& (reviews['user_city'] != reviews['org_city'])][['org_id','aspects','org_city','user_city','rating']]

reviews_new = reviews_new.explode('aspects')
reviews_new['aspects'].unique()

reviews_new = reviews_new.dropna()
reviews_new = reviews_new[['aspects','rating']]
reviews_new

Unnamed: 0,aspects,rating
4720,10,5.0
10713,8,5.0
10713,267,5.0
10713,312,5.0
43330,4,5.0
...,...,...
3508328,11,5.0
3554147,10,5.0
3576427,7,5.0
3576427,366,5.0


In [87]:
#reviews_city = reviews_new[(reviews_new['org_city']=='spb')]
reviews_city =reviews_new['aspects'].value_counts()
reviews_city

10     27
8      13
6       9
4       8
38      8
267     7
11      6
307     6
9       5
7       4
5       4
17      4
19      4
14      3
312     3
249     2
34      2
302     2
2       2
326     2
3       2
23      1
247     1
306     1
253     1
16      1
1       1
301     1
366     1
282     1
278     1
22      1
12      1
254     1
Name: aspects, dtype: int64

In [123]:
reviews_new_netourist = reviews[(reviews['org_rating']>4.9) & (reviews['rating']>0)& (reviews['user_city'] == reviews['org_city'])][['org_id','aspects','org_city','user_city','rating']]

reviews_new_netourist = reviews_new_netourist.explode('aspects')
reviews_new_netourist['aspects'].unique()

reviews_new_netourist = reviews_new_netourist.dropna()

reviews_new_netourist= reviews_new_netourist[['aspects','rating']]
reviews_new_netourist
sns.barplot(reviews_new_netourist,x='aspects',y='rating')



ValueError: Could not interpret input 'rating'

In [108]:
reviews_new_netourist=reviews_new_netourist['aspects'].value_counts()
reviews_new_netourist

10     756
6      294
38     271
8      180
307    166
      ... 
325      1
324      1
311      1
319      1
270      1
Name: aspects, Length: 100, dtype: int64

In [105]:
#pd.DataFrame(reviews_city.to_dict())
tourist = reviews_city
netourist = reviews_new_netourist
D = pd.DataFrame()
D['count'] = tourist
D['ascp'] = tourist.index
D[~D['ascp'].isin(netourist.index)]
D

Unnamed: 0,count,ascp
10,27,10
8,13,8
6,9,6
4,8,4
38,8,38
267,7,267
11,6,11
307,6,307
9,5,9
7,4,7


In [104]:
# P = pd.DataFrame()
# P['count_2'] = netourist
# P['ascpects_2'] = netourist.index
P

Unnamed: 0,count_2,ascpects_2
10,737,10
6,288,6
38,270,38
8,180,8
307,161,307
...,...,...
325,1,325
324,1,324
20,1,20
319,1,319


In [12]:
#
orgs_rubrics = pd.read_csv('organisations.csv', low_memory=False)

# encode users ids as numeric
#reviews = reviews.merge(users, on='user_id')
#reviews = reviews.rename({'city': 'user_city'}, axis=1)
orgs_rubrics.drop(columns=["average_bill","features_id"]) 
tourist_reviews = reviews.merge(orgs_rubrics, on='org_id')
#reviews = reviews.merge(users, on='user_id')
#reviews = reviews.rename({'city': 'user_city'}, axis=1)
#count = tourist_reviews['rubrics_id'].value_counts()
tourist_reviews.head()

rubs = tourist_reviews[['user_id','org_id']]
rubs = rubs.explode('org_id')

#rubs.groupby('rubrics_id_x', as_index=False).agg('count')
rubs_new = rubs.groupby('org_id', as_index=False).agg(count_user=('user_id','count'))
rubs_new

Unnamed: 0,org_id,count_user
0,631353538013709,2
1,1339741300939226,1
2,1626686180038669,345
3,2431874799980829,35
4,2763990915087178,8
...,...,...
66400,18445240262487615744,2
66401,18445312739512196846,61
66402,18446174625303492010,3
66403,18446224790061093435,27
