# Loading the Unsplash Research dataset in Pandas dataframes

This notebooks is an example of how to load the Unsplash Research dataset in Pandas dataframes for analysis.


## Loading libraries

In [1]:
import numpy as np
import pandas as pd
import glob

## Loading the datasets in Pandas

Make sure that you correctly point to the correct path.

In [2]:
path = './unsplash/'
documents = ['photos', 'keywords', 'collections', 'conversions', 'colors']
documents = ['conversions', 'photos']
datasets = {}

for doc in documents:
  files = glob.glob(path + doc + ".tsv*")

  subsets = []
  for filename in files:
    df = pd.read_csv(filename, sep='\t', header=0)
    subsets.append(df)

  datasets[doc] = pd.concat(subsets, axis=0, ignore_index=True)

In [None]:
datasets.keys()

In [None]:
len(datasets['photos']), len(datasets['keywords']), len(datasets['conversions'])

## Exploring the datasets

Here are the first couple of rows from each dataset, as an example.

Enjoy the exploration!

In [4]:
datasets['photos'].columns

Index(['photo_id', 'photo_url', 'photo_image_url', 'photo_submitted_at',
       'photo_featured', 'photo_width', 'photo_height', 'photo_aspect_ratio',
       'photo_description', 'photographer_username', 'photographer_first_name',
       'photographer_last_name', 'exif_camera_make', 'exif_camera_model',
       'exif_iso', 'exif_aperture_value', 'exif_focal_length',
       'exif_exposure_time', 'photo_location_name', 'photo_location_latitude',
       'photo_location_longitude', 'photo_location_country',
       'photo_location_city', 'stats_views', 'stats_downloads',
       'ai_description', 'ai_primary_landmark_name',
       'ai_primary_landmark_latitude', 'ai_primary_landmark_longitude',
       'ai_primary_landmark_confidence', 'blur_hash'],
      dtype='object')

In [12]:
datasets['photos'].head()

Unnamed: 0,photo_id,photo_url,photo_image_url,photo_submitted_at,photo_featured,photo_width,photo_height,photo_aspect_ratio,photo_description,photographer_username,...,photo_location_country,photo_location_city,stats_views,stats_downloads,ai_description,ai_primary_landmark_name,ai_primary_landmark_latitude,ai_primary_landmark_longitude,ai_primary_landmark_confidence,blur_hash
0,bygTaBey1Xk,https://unsplash.com/photos/bygTaBey1Xk,https://images.unsplash.com/uploads/1413387620...,2014-10-15 15:40:40.111061,t,4635,3070,1.51,,jaspervandermeij,...,,,1708356,19085,sea and rock cliff with grasses under cloudy sky,Neist Point,57.428387,-6.783028,30.348906,LcE{wnIVRixt~WR+NGjbxukCWBWB
1,gXSFnk2a9V4,https://unsplash.com/photos/gXSFnk2a9V4,https://images.unsplash.com/reserve/jEs6K0y1Sb...,2014-07-10 18:36:06,t,2448,3264,0.75,Coastline view,kimberlyrichards,...,United States,Tillamook,9895033,74702,aerial photography of seashore,,,,,LXE4G#IARjj]GdWFxaWBDOxaofj[
2,grg6-DNJuaU,https://unsplash.com/photos/grg6-DNJuaU,https://images.unsplash.com/uploads/1412192004...,2014-10-01 19:33:56.393181,t,5184,3456,1.5,,marcusdallcol,...,,,8967968,38338,man surfboarding on ocean wave during daytime,,,,,LcHx?5R%Rjof01bHWBof4ooMoeax
3,sO42hhChB1c,https://unsplash.com/photos/sO42hhChB1c,https://images.unsplash.com/reserve/ijl3tATFRp...,2014-08-19 21:15:40,t,4896,3264,1.5,Hazy Ocean Waters,arturpokusin,...,,,2071752,10860,body of water,,,,,LyOzVsj[aefQ_4j[ayj[IUayj[ay
4,tkk8_HakQ98,https://unsplash.com/photos/tkk8_HakQ98,https://images.unsplash.com/reserve/6vaWXsQuSW...,2014-05-05 18:31:06,t,2000,1333,1.5,Silhouettes In Desert,carlov,...,,,2720281,9081,car on desert during sunset,,,,,"LYEV]I%19ZR+-=s,RkWW00WB%2j["


In [7]:
datasets['photos'].iloc[0][['photo_id', 'photo_url']].values

array(['bygTaBey1Xk', 'https://unsplash.com/photos/bygTaBey1Xk'],
      dtype=object)

In [None]:
datasets['photos'][datasets['photos']['photo_id'] == 'oMpAz-DN-9I']

In [None]:
datasets['photos'][datasets['photos']['photo_description'].notna()]

In [None]:
datasets['photos'][['photo_description', 'ai_description']].head(50)

In [None]:
urls = datasets['photos']['photo_image_url'][:10]

import requests
from io import BytesIO
from PIL import Image

for url in urls:
  response = requests.get(url)
  img = Image.open(BytesIO(response.content))
  display(img)


In [None]:
datasets['keywords'].groupby('photo_id').count()['keyword'].describe()

In [None]:
print(datasets['keywords'][datasets['keywords']['photo_id'] == '2EDjes2hlZo'].keyword.tolist())

In [None]:
datasets['keywords'][datasets['keywords']['photo_id'] == '2EDjes2hlZo'].sort_values(by=['ai_service_1_confidence'], ascending=False)

In [None]:
datasets['conversions'][datasets['conversions']['photo_id'] == '2EDjes2hlZo'].keyword.value_counts()

In [None]:
datasets['keywords']['keyword'].value_counts()['espuma del mar']

In [None]:
keywords = datasets['keywords']['keyword'].value_counts()
kwords = set(keywords[keywords > 10].index.tolist())

In [10]:
df_kw = datasets['keywords']
df_kw[df_kw['ai_service_1_confidence'].isnull()]

Unnamed: 0,photo_id,keyword,ai_service_1_confidence,ai_service_2_confidence,suggested_by_user
6,zzwTUqvzIFg,backgrounds,,,t
42,zzwTUqvzIFg,twig,,0.842625,f
45,zzwTUqvzIFg,branch,,0.877443,f
51,zzwTUqvzIFg,world,,0.907868,f
62,zzwTUqvzIFg,atmospheric phenomenon,,0.825251,f
...,...,...,...,...,...
2706909,--2IBUMom1I,travel,,,t
2706923,--2IBUMom1I,evening,,0.809363,f
2706938,--2IBUMom1I,view,,,t
2706947,--2IBUMom1I,spot,,,t


In [19]:
from collections import defaultdict
import tqdm

keywords_dict = defaultdict(list)
keywords_id_dict = defaultdict(defaultdict(float).copy)
for i, row in tqdm.tqdm(df_kw.iterrows()):
  val = 1. if row['suggested_by_user']=='t' else np.nanmean([row['ai_service_1_confidence']/100, row['ai_service_2_confidence']])
  keywords_dict[row['keyword']].append((row['photo_id'], val))
  keywords_id_dict[row['keyword']][row['photo_id']] = val
  # if i > 100:
  #   break
for k in keywords_dict:
  keywords_dict[k] = sorted(keywords_dict[k], key=lambda x: x[1], reverse=True)

keywords_dict['dream'][:10]

2706955it [05:19, 8471.24it/s]


[('zlGFO4rS4Yo', 1.0),
 ('zSGFGTed0zw', 1.0),
 ('xJqXTBt95tA', 1.0),
 ('we1tBosANpU', 1.0),
 ('vqcXGiIli4s', 1.0),
 ('vngzm4P2BTs', 1.0),
 ('tdUyC8pgjqo', 1.0),
 ('sohFyuTFFYc', 1.0),
 ('sHb5UyXumWA', 1.0),
 ('qs7JjH1yLJ0', 1.0)]

In [20]:
import pickle
with open('keywords_dict.pkl', 'wb') as f:
    pickle.dump(keywords_dict, f)
with open('keywords_id_dict.pkl', 'wb') as f:
    pickle.dump(keywords_id_dict, f)

In [33]:
all_images = datasets['photos']['photo_id'].unique().tolist()
len(all_images)

25000

In [34]:
import random

class SearchModel():
    def __init__(self, keyword_dict, keyword_id_dict, all_images):
        self.model = keyword_dict
        self.keyword_id_dict = keyword_id_dict
        self.all_images = all_images

    def search(self, keyword, k=10):
        if keyword in self.model:
            output = [image for image, _ in self.model[keyword][:k]]
            if len(output) < k:
                output += random.sample(self.all_images, k-len(output))
            return output
        else:
            return random.sample(self.all_images, k)

    def predict(self, keywords, images):
        if isinstance(keywords, list):
            return [self.keyword_id_dict[keyword][image] for keyword, image in zip(keywords, images)]
        return [self.keyword_id_dict[keywords][image] for image in images]
    
search_model = SearchModel(keywords_dict, keywords_id_dict, all_images)

In [8]:
df_conv = datasets['conversions']

In [38]:
from sklearn.preprocessing import KBinsDiscretizer, LabelEncoder
discretizer = KBinsDiscretizer(n_bins=12, encode='ordinal')
discretizer.fit_transform(np.array(datasets['photos'].stats_views).reshape(-1,1))
discretizer2 = KBinsDiscretizer(n_bins=12, encode='ordinal')
discretizer2.fit_transform(np.array(datasets['photos'].stats_downloads).reshape(-1,1))
photographer_label_encodder = LabelEncoder()
photographer_label_encodder.fit(datasets['photos'].photographer_username)
country_encoder = LabelEncoder()
country_encoder.fit(df_conv.conversion_country)

In [62]:
df_photos = datasets['photos'].copy()
df_photos['stats_views'] = discretizer.transform(np.array(datasets['photos'].stats_views).reshape(-1,1)).astype(int)
df_photos['stats_downloads'] = discretizer2.transform(np.array(datasets['photos'].stats_downloads).reshape(-1,1)).astype(int)
df_photos['photographer_username'] = photographer_label_encodder.transform(datasets['photos'].photographer_username)
df_photos = df_photos[['photo_id', 'photographer_username', 'stats_views', 'stats_downloads']]
df_photos

Unnamed: 0,photo_id,photographer_username,stats_views,stats_downloads
0,bygTaBey1Xk,3509,8,10
1,gXSFnk2a9V4,4243,11,11
2,grg6-DNJuaU,4959,11,11
3,sO42hhChB1c,713,9,8
4,tkk8_HakQ98,1244,9,8
...,...,...,...,...
24995,rZQq3bdOsJU,4430,2,5
24996,swQ3JS8e-Fs,2592,1,5
24997,pZroun8cH1w,8171,0,1
24998,JqOvq9ypB4w,4723,3,7


In [63]:
df_photos.to_csv('df_photos.csv', index=False)

In [64]:
dff = pd.read_csv('df_photos.csv')
dff

Unnamed: 0,photo_id,photographer_username,stats_views,stats_downloads
0,bygTaBey1Xk,3509,8,10
1,gXSFnk2a9V4,4243,11,11
2,grg6-DNJuaU,4959,11,11
3,sO42hhChB1c,713,9,8
4,tkk8_HakQ98,1244,9,8
...,...,...,...,...
24995,rZQq3bdOsJU,4430,2,5
24996,swQ3JS8e-Fs,2592,1,5
24997,pZroun8cH1w,8171,0,1
24998,JqOvq9ypB4w,4723,3,7


In [41]:
df_conv = df_conv.merge(df_photos, on='photo_id')
df_conv['conversion_country'] = country_encoder.transform(df_conv.conversion_country)
df_conv

Unnamed: 0,converted_at,conversion_type,keyword,photo_id,anonymous_user_id,conversion_country,photographer_username,stats_views,stats_downloads
0,2023-05-09 11:03:40.445,download,Mond,jlV2k_Fx0fc,4589085a-75df-417b-93de-22adf2fc627d,52,5630,11,11
1,2023-05-09 11:12:05.109,download,16.9 camel desert,yNGQ830uFB4,e05af0fe-4930-421d-b20d-f904f316e2c3,44,7616,11,11
2,2023-05-09 11:17:33.417,download,bird,BFsm5vldl2I,64fd6739-db67-46e0-99f2-022efb498447,178,7468,11,11
3,2023-05-09 11:32:03.943,download,night sky,-cKXtsJWU-I,2f9c6ac4-02c8-4d0f-82b3-0482a82ab0bf,97,3479,11,11
4,2023-05-09 11:36:56.557,download,zoom background office,CEeoDFpVxxw,a7abbff5-4a50-4c65-b463-18139e2978e9,97,5659,11,11
...,...,...,...,...,...,...,...,...,...
8827277,2023-12-16 22:19:11.714,download,sunny background,yJJpSoCB5YE,850453d7-3606-4f5a-88fc-3b2a982a9b33,231,6073,10,11
8827278,2023-12-16 23:45:55.757,download,horses,P6JRr7-FxLw,5ee21fcf-4dd7-494a-a5ae-eecfb1d08352,71,2837,10,11
8827279,2023-12-16 23:46:56.796,download,Frozen bubble,RgTI2KaQ5N4,eca8c283-0900-4885-9a06-65b034d438f0,216,97,11,11
8827280,2023-12-16 23:49:48.494,download,night portrait,JXUfF7HYfMo,344b0b91-cd1d-4475-b30a-2c351bd24091,97,3266,10,11


In [67]:
df_conv = df_conv.dropna(subset=['anonymous_user_id', 'keyword', 'photo_id'])
df_train, df_test = df_conv[:int(len(df_conv)*0.8)], df_conv[int(len(df_conv)*0.8):]
df_train.to_csv('df_train.csv', index=False)
df_test.to_csv('df_test.csv', index=False)

In [61]:
df_photos.loc[['grg6-DNJuaU']].to_numpy()

array([[4959,   11,   11]])

In [56]:
df_photos=df_photos.reset_index()
df_photos

Unnamed: 0,photo_id,photographer_username,stats_views,stats_downloads
0,bygTaBey1Xk,3509,8,10
1,gXSFnk2a9V4,4243,11,11
2,grg6-DNJuaU,4959,11,11
3,sO42hhChB1c,713,9,8
4,tkk8_HakQ98,1244,9,8
...,...,...,...,...
24995,rZQq3bdOsJU,4430,2,5
24996,swQ3JS8e-Fs,2592,1,5
24997,pZroun8cH1w,8171,0,1
24998,JqOvq9ypB4w,4723,3,7


In [51]:
additional_features = ['conversion_country',  'photographer_username', 'stats_views', 'stats_downloads']
features_sizes = df_conv[additional_features].max().to_numpy()+1
features_sizes

array([ 235, 8411,   12,   12])

In [50]:
[2, 3, *features_sizes]

[2, 3, 235, 8411, 12, 12]

In [33]:
import pickle
with open('image_lookup.pkl', 'rb') as f:
   all_images = pickle.load(f)
all_images[:5]

['bygTaBey1Xk', 'gXSFnk2a9V4', 'grg6-DNJuaU', 'sO42hhChB1c', 'tkk8_HakQ98']

In [26]:
np.array(datasets['photos'].stats_views).reshape(-1,1)

array([[1708356],
       [9895033],
       [8967968],
       ...,
       [ 135139],
       [ 419529],
       [  85967]])

In [17]:
# datasets['photos']['photographer_username'].value_counts()
datasets['photos']['photo_location_country'].value_counts()

photo_location_country
United States     2371
Canada             440
United Kingdom     414
Italy              369
Australia          340
                  ... 
Китай                1
Kroatien             1
Sqitzerland          1
Казахстан            1
U.K                  1
Name: count, Length: 449, dtype: int64

In [37]:
datasets['photos'][datasets['photos']['photographer_username'].isnull()]

Unnamed: 0,photo_id,photo_url,photo_image_url,photo_submitted_at,photo_featured,photo_width,photo_height,photo_aspect_ratio,photo_description,photographer_username,...,photo_location_country,photo_location_city,stats_views,stats_downloads,ai_description,ai_primary_landmark_name,ai_primary_landmark_latitude,ai_primary_landmark_longitude,ai_primary_landmark_confidence,blur_hash


In [10]:
df_conv.conversion_country.unique()

array(['DE', 'CN', 'RU', 'IN', 'CO', 'US', 'RE', 'AT', 'GB', 'BG', 'KR',
       'PL', 'MD', 'PH', 'AU', 'FR', 'NL', 'ID', 'SE', 'BR', 'CA', 'DK',
       'PK', 'BD', 'ES', 'KE', 'GR', 'ZA', 'IT', 'LU', 'MX', 'PT', 'AR',
       'RO', 'IE', 'GH', 'VE', 'JP', 'DZ', 'NZ', 'IQ', 'TR', 'VN', 'RS',
       'CH', nan, 'HR', 'FI', 'GN', 'LB', 'IL', 'MW', 'AE', 'UY', 'LT',
       'UA', 'LK', 'EG', 'NO', 'TH', 'SK', 'MY', 'NG', 'CR', 'HK', 'PY',
       'IS', 'NP', 'ZW', 'BE', 'TT', 'PS', 'SG', 'MM', 'MC', 'CG', 'CL',
       'TW', 'AZ', 'CZ', 'AL', 'GE', 'MN', 'KZ', 'BH', 'XK', 'UG', 'SA',
       'MA', 'BB', 'CU', 'HN', 'AF', 'PE', 'HU', 'AO', 'BN', 'TN', 'LV',
       'PA', 'EE', 'BY', 'IR', 'UZ', 'GT', 'EC', 'ME', 'MG', 'LC', 'MK',
       'KW', 'SI', 'YE', 'CY', 'LY', 'LA', 'OM', 'TZ', 'AM', 'DO', 'PG',
       'JO', 'BA', 'KH', 'SY', 'GU', 'MV', 'SN', 'MO', 'GY', 'MT', 'MZ',
       'JM', 'SO', 'ZM', 'QA', 'ET', 'HT', 'SV', 'KY', 'CM', 'GL', 'RW',
       'PR', 'BO', 'BW', 'MU', 'CI', 'NI', 'FJ', 'BM

In [37]:

from sklearn.metrics import ndcg_score

def precision_at_k(predicted, ground_truth):
    relevant = list(filter(lambda id: id in ground_truth, predicted))
    return len(relevant) / len(ground_truth)

num_keywords = 50
num_img_per_key = 200

top_keys = df_conv['keyword'].value_counts().head(num_keywords).index.tolist()
ndcg = []
p_at_k = []
for key in tqdm.tqdm(top_keys):
    df_conv_key = df_conv[df_conv['keyword'] == key]
    top_imgs = df_conv_key['photo_id'].value_counts().head(num_img_per_key).index.tolist()
    scores = search_model.predict(key, top_imgs)
    search_results = search_model.search(key, num_img_per_key)

    true_relevance = np.asarray(range(len(top_imgs), 0, -1))
    ndcg.append(ndcg_score([true_relevance], [scores]))
    p_at_k.append(precision_at_k(search_results, top_imgs))
    
print('NDCG:', np.mean(ndcg))
print('Precision at k:', np.mean(p_at_k))

100%|██████████| 50/50 [00:34<00:00,  1.43it/s]

NDCG: 0.8769388448788925
Precision at k: 0.1622816367265469





In [None]:
100%|██████████| 50/50 [00:36<00:00,  1.37it/s]
NDCG: 0.8366000973032921
Precision at k: 0.04800000000000001

In [39]:
df_conv = datasets['conversions']
df_train, df_test = df_conv[:int(len(df_conv)*0.8)], df_conv[int(len(df_conv)*0.8):]
df_test

Unnamed: 0,converted_at,conversion_type,keyword,photo_id,anonymous_user_id,conversion_country
7061825,2023-02-17 14:58:41.892,download,puppy,fk4tiMlDFF0,b6bd01ce-57c6-4262-b916-6814bef9e5ce,GB
7061826,2023-02-17 15:08:34.131,download,trees,Tu9NKqO4I70,23b3244c-7bc6-48f8-b85b-46ca7fd549dc,BE
7061827,2023-02-17 15:18:42.181,download,wolf,tauPAnOIGvE,4fb876b0-37f2-47a9-96ec-f85f9b1cec2d,FR
7061828,2023-02-17 15:28:01.358,download,prehistoric art,9VHCFWlvVzI,6e6845b7-5cf1-432a-b007-6679a8733d5d,PH
7061829,2023-02-17 15:29:52.944,download,scary,7CME6Wlgrdk,da2f370b-a751-4f57-ab1b-cf55a57ced28,US
...,...,...,...,...,...,...
8827277,2023-12-16 22:19:11.714,download,sunny background,yJJpSoCB5YE,850453d7-3606-4f5a-88fc-3b2a982a9b33,ZA
8827278,2023-12-16 23:45:55.757,download,horses,P6JRr7-FxLw,5ee21fcf-4dd7-494a-a5ae-eecfb1d08352,GB
8827279,2023-12-16 23:46:56.796,download,Frozen bubble,RgTI2KaQ5N4,eca8c283-0900-4885-9a06-65b034d438f0,US
8827280,2023-12-16 23:49:48.494,download,night portrait,JXUfF7HYfMo,344b0b91-cd1d-4475-b30a-2c351bd24091,IN


In [17]:
users = df_conv.anonymous_user_id.unique().tolist()
user_lookup = {user: i for i, user in enumerate(users)}
users[:5], user_lookup['344b0b91-cd1d-4475-b30a-2c351bd24091']

(['4589085a-75df-417b-93de-22adf2fc627d',
  'e05af0fe-4930-421d-b20d-f904f316e2c3',
  '64fd6739-db67-46e0-99f2-022efb498447',
  '2f9c6ac4-02c8-4d0f-82b3-0482a82ab0bf',
  'a7abbff5-4a50-4c65-b463-18139e2978e9'],
 4347533)

In [22]:
photo_ids = datasets['photos'].photo_id.unique().tolist()
photo_lookup = {photo: i for i, photo in enumerate(photo_ids)}
photo_ids[:5], photo_lookup['2EDjes2hlZo']

(['bygTaBey1Xk', 'gXSFnk2a9V4', 'grg6-DNJuaU', 'sO42hhChB1c', 'tkk8_HakQ98'],
 24701)

In [36]:
import pickle
with open('user_lookup.pkl', 'wb') as f:
    pickle.dump(users, f)
with open('image_lookup.pkl', 'wb') as f:
    pickle.dump(photo_ids, f)

In [23]:
len(users), len(photo_ids)

(4347535, 25000)

In [10]:
interactions = df_train.groupby(['anonymous_user_id', 'photo_id']).size()

In [24]:
interactions_ = interactions.copy()
interactions_ = interactions_.reset_index()
interactions_['anonymous_user_id'] = interactions_['anonymous_user_id'].map(user_lookup)
interactions_['photo_id'] = interactions_['photo_id'].map(photo_lookup)
interactions_


Unnamed: 0,anonymous_user_id,photo_id,0
0,3520741,13682,1
1,219061,14097,1
2,219061,11097,1
3,753596,9019,1
4,1150466,1400,1
...,...,...,...
6518695,894379,8911,1
6518696,894379,17277,1
6518697,1844122,18947,1
6518698,1844122,7520,1


In [38]:
df_test

Unnamed: 0,photo_id,anonymous_user_id
7061825,fk4tiMlDFF0,b6bd01ce-57c6-4262-b916-6814bef9e5ce
7061826,Tu9NKqO4I70,23b3244c-7bc6-48f8-b85b-46ca7fd549dc
7061827,tauPAnOIGvE,4fb876b0-37f2-47a9-96ec-f85f9b1cec2d
7061828,9VHCFWlvVzI,6e6845b7-5cf1-432a-b007-6679a8733d5d
7061829,7CME6Wlgrdk,da2f370b-a751-4f57-ab1b-cf55a57ced28
...,...,...
8827277,yJJpSoCB5YE,850453d7-3606-4f5a-88fc-3b2a982a9b33
8827278,P6JRr7-FxLw,5ee21fcf-4dd7-494a-a5ae-eecfb1d08352
8827279,RgTI2KaQ5N4,eca8c283-0900-4885-9a06-65b034d438f0
8827280,JXUfF7HYfMo,344b0b91-cd1d-4475-b30a-2c351bd24091


In [41]:
test_interactions = df_test.groupby(['anonymous_user_id', 'keyword', 'photo_id']).size()
test_interactions2 = test_interactions.copy().reset_index()
test_interactions2

Unnamed: 0,anonymous_user_id,keyword,photo_id,0
0,0000087e-aae8-4276-a26f-5b856a2ea8fb,photo,pDGNBK9A0sk,3
1,00000e78-8546-469c-8c52-1e39c767b54b,ocean,L-2p8fapOA8,1
2,0000261c-db4a-4040-b56e-1bac0446efc5,mountains,lpjb_UMOyx8,1
3,0000786a-db14-4e70-a8a5-5060e012543a,(Kings Canyon National Parks,svOht0lwbyk,1
4,0000786a-db14-4e70-a8a5-5060e012543a,Dunedin,nlF8qaFWI0w,1
...,...,...,...,...
1656629,ffffae90-7804-4fce-878f-1a3bed29ad18,new york,HhmCIJTLuGY,1
1656630,ffffb3a0-2c20-47dd-a871-7f87dc261e0d,fish sauce,wPJygs79jMA,1
1656631,ffffb520-e6de-4247-aeff-4648b3e8d82b,water background,XexawgzYOBc,1
1656632,ffffc7e8-51df-46e9-bf26-45916adc9ca3,fantasy world,pYyOZ8q7AII,2


In [56]:
test_interactions2 = df_test

In [57]:
test_interactions2['user_keyword'] = test_interactions2[['anonymous_user_id', 'keyword']].agg(tuple, axis=1)
test_interactions2

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_interactions2['user_keyword'] = test_interactions2[['anonymous_user_id', 'keyword']].agg(tuple, axis=1)


Unnamed: 0,converted_at,conversion_type,keyword,photo_id,anonymous_user_id,conversion_country,user_keyword
7061825,2023-02-17 14:58:41.892,download,puppy,fk4tiMlDFF0,b6bd01ce-57c6-4262-b916-6814bef9e5ce,GB,"(b6bd01ce-57c6-4262-b916-6814bef9e5ce, puppy)"
7061826,2023-02-17 15:08:34.131,download,trees,Tu9NKqO4I70,23b3244c-7bc6-48f8-b85b-46ca7fd549dc,BE,"(23b3244c-7bc6-48f8-b85b-46ca7fd549dc, trees)"
7061827,2023-02-17 15:18:42.181,download,wolf,tauPAnOIGvE,4fb876b0-37f2-47a9-96ec-f85f9b1cec2d,FR,"(4fb876b0-37f2-47a9-96ec-f85f9b1cec2d, wolf)"
7061828,2023-02-17 15:28:01.358,download,prehistoric art,9VHCFWlvVzI,6e6845b7-5cf1-432a-b007-6679a8733d5d,PH,"(6e6845b7-5cf1-432a-b007-6679a8733d5d, prehist..."
7061829,2023-02-17 15:29:52.944,download,scary,7CME6Wlgrdk,da2f370b-a751-4f57-ab1b-cf55a57ced28,US,"(da2f370b-a751-4f57-ab1b-cf55a57ced28, scary)"
...,...,...,...,...,...,...,...
8827277,2023-12-16 22:19:11.714,download,sunny background,yJJpSoCB5YE,850453d7-3606-4f5a-88fc-3b2a982a9b33,ZA,"(850453d7-3606-4f5a-88fc-3b2a982a9b33, sunny b..."
8827278,2023-12-16 23:45:55.757,download,horses,P6JRr7-FxLw,5ee21fcf-4dd7-494a-a5ae-eecfb1d08352,GB,"(5ee21fcf-4dd7-494a-a5ae-eecfb1d08352, horses)"
8827279,2023-12-16 23:46:56.796,download,Frozen bubble,RgTI2KaQ5N4,eca8c283-0900-4885-9a06-65b034d438f0,US,"(eca8c283-0900-4885-9a06-65b034d438f0, Frozen ..."
8827280,2023-12-16 23:49:48.494,download,night portrait,JXUfF7HYfMo,344b0b91-cd1d-4475-b30a-2c351bd24091,IN,"(344b0b91-cd1d-4475-b30a-2c351bd24091, night p..."


In [59]:
usercounts = test_interactions2.user_keyword.value_counts()
usercounts[usercounts >= 10]

user_keyword
(17a84ce7-79b4-4a60-9e35-4a30098c0264, flower)                    242
(3b4d0c29-c906-4b2d-969f-9c6326bf0d6e, Isländische Landschaft)    200
(0444b2dd-a370-42a6-85e4-022ff3619f0b, nature)                    184
(06126be7-064f-4be8-8680-d5f66f746d73, fish)                      167
(864bb1d0-5e02-47eb-83b1-d4e5c0c4cbb0, nature forest)             154
                                                                 ... 
(adf84e01-7eac-4e9d-993e-f388f6c16929, storm)                      10
(62dc8254-6dbd-420a-a482-9d0c33591b15, floral)                     10
(89243ee6-9c8a-4937-a55f-b6e94edb55d1, wood)                       10
(0ab59331-03f8-40bd-b27f-484755b28153, storm)                      10
(216e6deb-4983-4db1-ad9d-2038ca896d55, simple)                     10
Name: count, Length: 3426, dtype: int64

In [63]:
usercounts[usercounts >= 10].index.tolist()

[('17a84ce7-79b4-4a60-9e35-4a30098c0264', 'flower'),
 ('3b4d0c29-c906-4b2d-969f-9c6326bf0d6e', 'Isländische Landschaft'),
 ('0444b2dd-a370-42a6-85e4-022ff3619f0b', 'nature'),
 ('06126be7-064f-4be8-8680-d5f66f746d73', 'fish'),
 ('864bb1d0-5e02-47eb-83b1-d4e5c0c4cbb0', 'nature forest'),
 ('1dcf0643-a2a8-411a-8a50-692ceceedf7a', 'nature'),
 ('1f6e3590-b666-451f-9642-53e62e3237c4', 'landscape'),
 ('ccaba1b7-4507-49aa-b7ed-03d598c2c71d', 'fish'),
 ('bc0cd42b-49f6-4635-b3a6-e58aa257c8d7', 'butterfly'),
 ('8fb16a65-615e-4549-a0ae-f95985739b6b', 'plants'),
 ('c612676f-5967-4ced-8784-3dff9dce8aec', 'nature'),
 ('853b80e7-5bcd-4135-a0f4-2b36677b78b8', 'bale mountains'),
 ('599f67ae-89cc-4010-9d2a-05e185c434bc', '자연'),
 ('d9b0e965-4489-4489-84b1-c252aa4c851b', 'nature'),
 ('0ead30f1-45be-468a-86dc-241790827d4d', 'mountain'),
 ('42278d6a-5400-4631-b7e9-3f82b7b2d8f3', 'nature'),
 ('9de26553-505d-4dc2-9d92-db887b8f0f8a', 'nature'),
 ('a5da4fdb-9a17-4e01-b590-503edd6939c1', '自然'),
 ('b358c19f-db03-4b

In [65]:
user, kw = ('17a84ce7-79b4-4a60-9e35-4a30098c0264', 'flower') 
o = test_interactions2[(test_interactions2['anonymous_user_id'] == user) & (test_interactions2['keyword'] == kw)].photo_id.value_counts()
o.index, o.values

(Index(['YDNvydD1jAY', 'sZj8qOt3yTU', 'UrgYjACxvaI', 'HQOA0LA91As',
        'tv-d1R3FItE', 'MPaHCR6D7Po', '3qsrd-4-_pU', 'rxdNnhMPRGE',
        'qDdv33C-BSw', '9Z1KRIfpBTM',
        ...
        'cOQP9StWSUA', 'Fw55DEkH6pg', 'r2SY2zsBmgM', 'bYuI23mnmDQ',
        'NUX8vT_lkWI', '_MJzPKfbcSI', '6BykweIpp8s', 'zvqm-lkh_4s',
        'qZzJoiKHqmo', 'al_dEsITyBU'],
       dtype='object', name='photo_id', length=108),
 array([4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
        3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]))

In [52]:
top_50_user_ids = test_interactions2.anonymous_user_id.value_counts().head(50).index.tolist()
# user_id = top_50_user_ids[40]
user_id = 'ce4259ff-fb32-46ef-b3f0-0788cf0db441'
test_interactions2[test_interactions2['anonymous_user_id'] == user_id].keyword.value_counts().head(10)
test_interactions2[(test_interactions2['anonymous_user_id'] == user_id) & (test_interactions2['keyword'] == '자연')].photo_id.value_counts()

photo_id
1h2Pg97SXfA    1
6-C0VRsagUw    1
78A265wPiO4    1
7AcMUSYRZpU    1
9DHyVy-G1rM    1
9JxubXPaidg    1
DD1fSz2HF1s    1
MMJx78V7xS8    1
N7-wxyaIxFs    1
TApAkERW5pQ    1
T_Qe4QlMIvQ    1
V9rsiNN5flQ    1
YvkH8R1zoQM    1
ZVbv1akA-l4    1
_hpk_92Crhs    1
a8lTjWJJgLA    1
aaIN3y2zcMQ    1
bJHWJeiHfHc    1
buF62ewDLcQ    1
cssvEZacHvQ    1
eOpewngf68w    1
hOhlYhAiizc    1
hUp58GsPKAw    1
hnw3Al47-KE    1
i9Q9bc-WgfE    1
jFCViYFYcus    1
lpjb_UMOyx8    1
mFbrMEVKCkc    1
mOcdke2ZQoE    1
oyVwqeEi70o    1
p3OzJuT_Dks    1
phIFdC6lA4E    1
pp_oXEb2H48    1
tNDvFkxkBHo    1
ugnrXk1129g    1
v4e3JI7DDHI    1
vngzm4P2BTs    1
vwcxco7o564    1
wQImoykAwGs    1
xg8z_KhSorQ    1
y0i1lh-T0_w    1
Name: count, dtype: int64

In [35]:
import scipy
sparse_mat = scipy.sparse.coo_matrix((interactions.values, (interactions_.photo_id, interactions_.anonymous_user_id)),
                                     shape=(len(photo_ids), len(users)))
sparse_mat
scipy.sparse.save_npz('sparse_matrix.npz', sparse_mat)

In [34]:
from implicit.nearest_neighbours import bm25_weight

# weight the matrix, both to reduce impact of users that have played the same artist thousands of times
# and to reduce the weight given to popular items
interaction_mat = bm25_weight(sparse_mat.tocsr(), K1=100, B=0.8)

# get the transpose since the most of the functions in implicit expect (user, item) sparse matrices instead of (item, user)
user_downlaods = interaction_mat.T.tocsr()

uwer_downloads

ModuleNotFoundError: No module named 'implicit'

In [33]:
sparse_mat.tocsr()[0,2335029]

4

In [5]:
df_test = df_test[['photo_id', 'anonymous_user_id']]
df_test.sparse

AttributeError: Can only use the '.sparse' accessor with Sparse data.

In [8]:
# user_id = df_test.anonymous_user_id.value_counts().index[0]
df_test[df_test['anonymous_user_id'] == user_id].photo_id.value_counts()

photo_id
OQsxdghBKrU    5
phIFdC6lA4E    5
eUFfY6cwjSU    5
ig-lw0Dtz34    5
0juC5JIhPks    5
              ..
-3LtGq_RPcY    1
Q7QM2WSOTs4    1
uzwTVzXqZcg    1
QELYdk58wOo    1
IuSemNxGS88    1
Name: count, Length: 307, dtype: int64

In [3]:
from surprise import Dataset, NormalPredictor, Reader
from surprise.model_selection import cross_validate


# Creation of the dataframe. Column names are irrelevant.
ratings_dict = {
    "itemID": [1, 1, 1, 2, 2],
    "userID": [9, 32, 2, 45, "user_foo"],
    "rating": [3, 2, 4, 3, 1],
}
df = pd.DataFrame(ratings_dict)

# A reader is still needed but only the rating_scale param is requiered.
reader = Reader(rating_scale=(1, 5))

# The columns must correspond to user id, item id and ratings (in that order).
data = Dataset.load_from_df(df[["userID", "itemID", "rating"]], reader)

# We can now use this dataset as we please, e.g. calling cross_validate
cross_validate(NormalPredictor(), data, cv=2)

ModuleNotFoundError: No module named 'surprise'

In [None]:
datasets['collections'].head(30)

In [None]:
datasets['conversions'].groupby('photo_id').count()

In [None]:
datasets['conversions'].head(50)
df_conv = datasets['conversions']['keyword', 'photo_id']

In [None]:
conversion_photos = datasets['conversions']['photo_id'].value_counts()
conversion_photos[conversion_photos > 500]

In [None]:
id = 'oMpAz-DN-9I'
datasets['conversions'][datasets['conversions'].photo_id == id]

In [None]:
datasets['conversions'].describe()

In [None]:
datasets['conversions']['keyword'].value_counts()[:20].plot.bar()

In [None]:
datasets['conversions']['conversion_country'].value_counts()[:20].plot.bar()

In [None]:
datasets['photos'].boxplot(column=[ 'stats_downloads'])

In [None]:
datasets['conversions']['anonymous_user_id'].value_counts()

In [None]:
datasets['colors'].head()