In [42]:
from __future__ import print_function
from collections import defaultdict
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import seaborn as sns
%matplotlib inline
plt.rcParams["figure.figsize"] = (15, 8)
pd.options.display.float_format = '{:.2f}'.format

In [43]:
fp = '/media/d_500/repos/otus-dm/m2/lecture_11_dimred/orders.csv'
df = pd.read_csv(fp, nrows=100000)
df['date_insert'] = pd.to_datetime(df['date_insert'])

In [44]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 11 columns):
DISCOUNT_VALUE        99764 non-null object
NAME                  100000 non-null object
PRICE                 100000 non-null float64
QUANTITY              100000 non-null int64
date_insert           100000 non-null datetime64[ns]
discount              100000 non-null float64
itemID                100000 non-null int64
orderID               100000 non-null int64
ordered_item_count    100000 non-null int64
price                 100000 non-null float64
user_id               100000 non-null int64
dtypes: datetime64[ns](1), float64(3), int64(5), object(2)
memory usage: 8.4+ MB


In [45]:
df[['PRICE', 'QUANTITY', 'discount', 'ordered_item_count', 'price']].head()

Unnamed: 0,PRICE,QUANTITY,discount,ordered_item_count,price
0,357.0,1,110.0,1,527.0
1,448.0,1,19.0,1,618.0
2,414.0,1,65.0,4,1503.0
3,253.0,1,65.0,4,1503.0
4,422.0,1,65.0,4,1503.0


In [46]:
df.head()

Unnamed: 0,DISCOUNT_VALUE,NAME,PRICE,QUANTITY,date_insert,discount,itemID,orderID,ordered_item_count,price,user_id
0,23.55%,Хроники Амбера. Том 2,357.0,1,2017-03-19 22:25:20,110.0,169108,602051,1,527.0,911531
1,4.07%,Хроники Амбера. Том 1,448.0,1,2017-03-19 22:19:58,19.0,1394744,602047,1,618.0,911531
2,4.17%,Пищеводитель,414.0,1,2017-03-19 20:58:00,65.0,191276,602017,4,1503.0,911587
3,4.17%,Аудиокн. Мясников. &quot;Ржавчина&quot;: что д...,253.0,1,2017-03-19 20:58:00,65.0,157945,602017,4,1503.0,911587
4,4.09%,Инфекции. Как защитить себя и своего ребенка,422.0,1,2017-03-19 20:58:00,65.0,205173,602017,4,1503.0,911587


In [47]:
df.duplicated().sum()

6

In [48]:
df.drop_duplicates(inplace = True)

In [49]:
from collections import defaultdict
from itertools import combinations

In [50]:
def to_set(x):
    return set(x)

def to_set_len(x):
    return len(set(x))

product_orders = df[df['ordered_item_count'] > 1][['itemID', 'orderID']].groupby('itemID', as_index=False).agg([to_set, to_set_len]).reset_index()

In [51]:
product_orders.columns = ['.'.join(filter(None, pair)) for pair in zip(product_orders.columns.get_level_values(0), product_orders.columns.get_level_values(1))]


In [52]:
product_orders.head(10)

Unnamed: 0,itemID,orderID.to_set,orderID.to_set_len
0,131823,"{603849, 612671}",2
1,132062,"{615475, 617604}",2
2,132111,{601249},1
3,132255,{614717},1
4,132303,{607278},1
5,132331,{599088},1
6,132342,"{620440, 618404, 620062}",3
7,132413,{613705},1
8,132445,"{615442, 603003, 606932, 614554, 607055}",5
9,132493,{610926},1


In [53]:
l = len(product_orders[product_orders['orderID.to_set_len'] > 20])
l*(l-1)/2.

77421.0

In [55]:
product_orders_top = product_orders[product_orders['orderID.to_set_len'] > 20]
product_orders_top.head()

Unnamed: 0,itemID,orderID.to_set,orderID.to_set_len
54,133565,"{601477, 601992, 606606, 604944, 599057, 60200...",22
218,137437,"{608646, 600712, 600077, 622096, 610458, 60740...",25
237,137742,"{602786, 607458, 615068, 609444, 614917, 59834...",21
275,138302,"{599554, 599428, 620576, 599432, 600457, 61441...",43
377,139636,"{599172, 603910, 600200, 600457, 600336, 59880...",28


In [56]:
product_orders_dict = dict(zip(product_orders_top.itemID, product_orders_top['orderID.to_set']))

In [57]:
product_pairs = list(combinations(sorted(product_orders_dict.keys()), 2))

In [58]:
product_pairs[:10]

[(133565, 137437),
 (133565, 137742),
 (133565, 138302),
 (133565, 139636),
 (133565, 139768),
 (133565, 140821),
 (133565, 142253),
 (133565, 142263),
 (133565, 142398),
 (133565, 142514)]

In [59]:
def jaccard(s1, s2):
    return len(s1 & s2) * 1. / len(s1 | s2)

In [60]:
similarities = []
for i1, i2 in product_pairs:
    similarities.append((i1, i2, jaccard(product_orders_dict[i1], product_orders_dict[i2])))

In [61]:
len(similarities)

77421

In [62]:
len(filter(lambda x: x[2] > 0.001, similarities))

8045

In [63]:
similar = filter(lambda x: x[2] > 0.001, similarities)

In [64]:
similar.sort(key=lambda x: x[2], reverse=True)

In [75]:
similar[-10:]

[(238178, 1401635, 0.006289308176100629),
 (597726, 1401635, 0.006172839506172839),
 (1401635, 1440580, 0.006024096385542169),
 (1401635, 1446979, 0.006024096385542169),
 (1417661, 1423007, 0.005917159763313609),
 (724547, 1401635, 0.005780346820809248),
 (553103, 1401635, 0.005681818181818182),
 (713988, 1401635, 0.005649717514124294),
 (1392862, 1401635, 0.005555555555555556),
 (1401635, 1417661, 0.005154639175257732)]

In [29]:
df[df['itemID'] == 713147][:1]

Unnamed: 0,DISCOUNT_VALUE,NAME,PRICE,QUANTITY,date_insert,discount,itemID,orderID,ordered_item_count,price,user_id
6234,4.43%,Математика. 2 класс. Рабочая тетрадь №1,194.0,1,2017-03-19 15:52:08,138.0,713147,601904,13,3070.0,911496


In [66]:
df[df['itemID'] == 713149][:1]

Unnamed: 0,DISCOUNT_VALUE,NAME,PRICE,QUANTITY,date_insert,discount,itemID,orderID,ordered_item_count,price,user_id
6235,4.43%,Математика. 2 класс. Рабочая тетрадь №2,194.0,1,2017-03-19 15:52:08,138.0,713149,601904,13,3070.0,911496


In [67]:
idx = sorted(df['itemID'])

In [68]:
sim_df = pd.DataFrame(similarities + [(i, i, 1) for i in idx], columns=['a', 'b', 'distance'])
sim_df.head()

Unnamed: 0,a,b,distance
0,133565,137437,0.0
1,133565,137742,0.0
2,133565,138302,0.0
3,133565,139636,0.0
4,133565,139768,0.0


In [87]:
pd.options.display.float_format = '{:f}'.format
len(sim_df[sim_df['distance'] > 0])

108039

In [69]:
len(sim_df)

177415

In [90]:
sim_df[sim_df['distance'] > 0].to_csv('similarities.txt', header=False, index=False, sep='\t')

In [91]:
!head similarities.txt

133565	153305	0.0217391304348
133565	163395	0.02
133565	172618	0.0238095238095
133565	182854	0.0625
133565	183024	0.0181818181818
133565	206749	0.0238095238095
133565	272364	0.0138888888889
133565	594132	0.0169491525424
133565	662946	0.0217391304348
133565	712373	0.0909090909091


In [98]:
cl='147766	154793	175575	181786	182620	196952	142398	145798	147361	150152	150538	151493	151619	152063	153011	154814	155606	159205	166707	169853	169931	170330	172828	173746	174204	174571	175137	177154	178553	180965	190051	194789	147587	175772	178406	184593	186111	187439	143388	151819	168995	169292	170909	178625	179832	192371	195635	196154	178589	186216	179648	143487	144105	144288	148016	148947	164568	176089	183067	185840'
items = map(int, cl.split('\t'))
for i in items:
    print(df[df['itemID'] == i][:1].NAME)

18840    Пенелопа и Одиссей. «Жди меня…» (вес: 250гр.)
Name: NAME, dtype: object
7728    Тельняшка Джек
Name: NAME, dtype: object
3354    История Власа - лентяя и лоботряса (вес: 50гр.)
Name: NAME, dtype: object
8192    Храбрая Дракоша. Как защитить свой леденец
Name: NAME, dtype: object
7088    Хорошо, что есть я! (вес: 468гр.)
Name: NAME, dtype: object
18280    Записки prostitutki Ket
Name: NAME, dtype: object
4910    Древнегреческие мифы. Подвиги Геракла
Name: NAME, dtype: object
6589    Заколдованная принцесса
Name: NAME, dtype: object
3656    Русская рыбалка
Name: NAME, dtype: object
7679    Происшествие на кладбище Пер-Лашез
Name: NAME, dtype: object
2603    Мой дедушка - памятник (с продолжением)
Name: NAME, dtype: object
6647    Басни (вес: 397гр.)
Name: NAME, dtype: object
6254    Огненный бог Марранов
Name: NAME, dtype: object
7145    Древнегреческие мифы. Троянская война (вес: 35...
Name: NAME, dtype: object
3353    Научные забавы и занимательные опыты (вес: 235...
Name: NAM