# Community Detection

<hr style="border:2px solid black">

## Notebook 04 - Backfill small portion of missing community assignments


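Some recipes were left without a community because every neighboring recipe shared less than 8% of its users with them, and pairs below that threshold were not treated as edges in the graph analysis. This notebook backfills labels for those recipes with a KNN-like heuristic: each unlabeled recipe takes the most common community among its nearest already-labeled neighbors, ranked by percentage of shared users.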

---

### Import libraries

In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import time

In [2]:
def num_uniques(ser):
    """Return the number of distinct values in ``ser``.

    Parameters
    ----------
    ser : object exposing ``unique()`` (e.g. a pandas Series)

    Returns
    -------
    int or str
        ``len(ser.unique())``, or the string ``"Not unique check-able"`` when
        the unique values cannot be computed (for example, unhashable cells).
    """
    try:
        return len(ser.unique())
    # Catch ordinary errors only: a bare ``except`` would also swallow
    # KeyboardInterrupt/SystemExit and make a long summary impossible to abort.
    except Exception:
        return "Not unique check-able"

In [3]:
def summarize_df(df):
    """Print an overview of ``df``.

    Emits, in order: the row/column counts, the column dtypes, a rich preview
    of the first rows, ``describe()`` statistics, and the number of unique
    values in every column (via ``num_uniques``).
    """
    n_rows, n_cols = df.shape
    print("======DATA SUMMARY======")
    print(f"{n_rows} rows by {n_cols} columns")

    print("\n======COLUMNS======")
    print(df.dtypes)

    print("\n======PREVIEW======")
    # display() is the notebook's rich renderer, so the preview shows as a table.
    display(df.head())

    print("\n======NUMERICAL COL SUMMARY======")
    print(df.describe())
    print("\n")

    for col in df.columns:
        print(f"{col}: {num_uniques(df[col])} unique values")

---
### Import data

In [4]:
%%time
print(time.asctime(time.localtime(time.time())))
shared_users = pd.read_parquet('clustering_data_filtered')
summarize_df(shared_users)

Sun Apr 17 20:41:28 2022
118101546 rows by 3 columns

recipe_id        int64
recipe_id_2      int64
pct_users      float64
dtype: object



Unnamed: 0,recipe_id,recipe_id_2,pct_users
0,0,1118,0.166667
1,60,1118,0.125
2,1066,1118,0.166667
3,1086,1118,0.1
4,0,4446,0.066667



          recipe_id   recipe_id_2     pct_users
count  1.181015e+08  1.181015e+08  1.181015e+08
mean   5.919704e+04  1.186249e+05  8.935996e-02
std    4.195803e+04  4.210507e+04  7.323516e-02
min    0.000000e+00  1.400000e+01  8.756567e-04
25%    2.388100e+04  8.900700e+04  3.571429e-02
50%    5.175700e+04  1.260040e+05  7.142857e-02
75%    8.903400e+04  1.541630e+05  1.250000e-01
max    1.782560e+05  1.782620e+05  6.666667e-01


recipe_id: 85366 unique values
recipe_id_2: 85345 unique values
pct_users: 12414 unique values
CPU times: user 17.3 s, sys: 12.3 s, total: 29.5 s
Wall time: 20.8 s


In [5]:
%%time
print(time.asctime(time.localtime(time.time())))
recipes = pd.read_csv('../01_DataCleansing/users_in_count2_mean4.csv') \
            .groupby('recipe_id').first() \
            .reset_index() \
            .drop(['user_id', 'ratings'], axis=1)
summarize_df(recipes)

Sun Apr 17 20:41:49 2022
85519 rows by 1 columns

recipe_id    int64
dtype: object



Unnamed: 0,recipe_id
0,0
1,2
2,4
3,5
4,7



           recipe_id
count   85519.000000
mean    88101.463967
std     51286.277906
min         0.000000
25%     43529.500000
50%     87890.000000
75%    132326.500000
max    178262.000000


recipe_id: 85519 unique values
CPU times: user 157 ms, sys: 61.8 ms, total: 219 ms
Wall time: 229 ms


In [6]:
%%time
print(time.asctime(time.localtime(time.time())))
asgn = pd.read_parquet('community_assignments_pctthresh-08_split.parquet') \
            .rename({'recipe_id': 'id'}, axis=1)
summarize_df(asgn)

Sun Apr 17 20:41:49 2022
83360 rows by 3 columns

index           int64
id              int64
community_id    int64
dtype: object



Unnamed: 0,index,id,community_id
0,0,153600,0
1,1,75776,0
2,2,59397,0
3,3,18439,0
4,4,79879,0



              index             id  community_id
count  83360.000000   83360.000000  83360.000000
mean   18376.521557   88155.120250     26.944146
std    27901.295573   51262.963776     12.463246
min        0.000000       0.000000      0.000000
25%     1894.000000   43611.750000     16.000000
50%     3789.000000   87955.000000     32.000000
75%    29453.250000  132334.250000     36.000000
max    83359.000000  178262.000000     40.000000


index: 30150 unique values
id: 83360 unique values
community_id: 38 unique values
CPU times: user 26.8 ms, sys: 6.58 ms, total: 33.4 ms
Wall time: 30.2 ms


In [7]:
# Left join keeps every recipe; recipes without a label get NaN in community_id.
all_recipes = pd.merge(recipes, asgn, how='left', left_on='recipe_id', right_on='id')
# The unlabeled recipes are the ones this notebook backfills.
na_recipes = all_recipes.loc[all_recipes['community_id'].isnull()]

In [8]:
%%time
print(time.asctime(time.localtime(time.time())))
top_pairs_1 = shared_users.merge(asgn, left_on='recipe_id_2', right_on='id') \
                    .sort_values(by=['recipe_id', 'pct_users'], ascending=False) \
                    .drop('id', axis=1) \
                    .groupby('recipe_id').head(10)
top_pairs_2 = shared_users.merge(asgn, left_on='recipe_id', right_on='id') \
                    .sort_values(by=['recipe_id_2', 'pct_users'], ascending=False) \
                    .drop('id', axis=1) \
                    .groupby('recipe_id_2').head(10)
top_pairs = pd.concat([top_pairs_1, top_pairs_2.rename({'recipe_id_2':'recipe_id', 'recipe_id':'recipe_id_2'}, axis=1)]) \
                    .groupby(['recipe_id', 'recipe_id_2', 'pct_users']).first().reset_index()

Sun Apr 17 20:41:49 2022
CPU times: user 2min 9s, sys: 1min 22s, total: 3min 31s
Wall time: 3min 47s


In [9]:
%%time
print(time.asctime(time.localtime(time.time())))
nearest_10 = top_pairs[['recipe_id', 'community_id', 'recipe_id_2']].groupby(['recipe_id', 'community_id']).count() \
                                                    .reset_index() \
                                                    .sort_values(by='recipe_id_2', ascending=False) \
                                                    .drop('recipe_id_2', axis=1) \
                                                    .groupby('recipe_id').first().reset_index()

Sun Apr 17 20:45:37 2022
CPU times: user 292 ms, sys: 98.3 ms, total: 390 ms
Wall time: 390 ms


In [10]:
# Attach the voted community to every unlabeled recipe. The all-NaN
# community_id and the join helper 'id' are removed before merging, so the
# merge produces no suffixed columns to clean up. The merge is an inner join:
# a recipe with no row in nearest_10 is dropped here.
na_asgn = (
    na_recipes
    .drop(columns=['community_id', 'id'])
    .merge(nearest_10, on='recipe_id')
)

In [11]:
# Stack the recipes that already had a label on top of the backfilled ones.
# For the labeled part, the join helper 'id' is dropped and community_id is
# cast to int64 (in all_recipes the column also holds NaN for unlabeled rows).
gapfill_asgn = pd.concat([
    all_recipes.loc[all_recipes['community_id'].notna()]
               .drop(columns='id')
               .astype({'community_id': 'int64'}),
    na_asgn,
])

In [12]:
# Sanity check: the gap-filled table should have as many rows as all_recipes
# (column counts differ because the 'id' helper column was dropped).
print(gapfill_asgn.shape, all_recipes.shape, sep='\n')

(85519, 3)
(85519, 4)


In [13]:
# Number of distinct recipes that had a community label before backfilling.
asgn['id'].nunique(dropna=False)

83360

In [14]:
# Number of distinct recipes carrying a label after backfilling.
gapfill_asgn['recipe_id'].nunique(dropna=False)

85519

In [15]:
%%time
print(time.asctime(time.localtime(time.time())))
gapfill_asgn[['recipe_id', 'community_id']].to_parquet('community_assignments_pctthresh-08_split_filled.parquet')

Mon Apr 11 16:53:13 2022
CPU times: user 25.2 ms, sys: 16.8 ms, total: 42 ms
Wall time: 52.9 ms


In [17]:
# Community sizes: how many recipes are assigned to each community_id.
gapfill_asgn_gb = (
    gapfill_asgn[['recipe_id', 'community_id']]
    .groupby('community_id', as_index=False)
    .count()
    .rename(columns={'recipe_id': 'count'})
)

In [19]:
# Export the per-community recipe counts (the row index is written as well,
# since to_csv is called with its default index setting).
gapfill_asgn_gb.to_csv(path_or_buf='node_sizes.csv')

In [24]:
# Keep only communities with more than 40 recipes.
gapfill_asgn_gb_filt = gapfill_asgn_gb.loc[gapfill_asgn_gb['count'].gt(40)]

In [28]:
# Restrict the assignments to recipes whose community survived the size filter.
gapfill_asgn_filt = pd.merge(
    gapfill_asgn,
    gapfill_asgn_gb_filt[['community_id']],
    how='inner',
    on='community_id',
)

In [31]:
%%time
print(time.asctime(time.localtime(time.time())))
gapfill_asgn_filt[['recipe_id', 'community_id']].to_parquet('../05_RecipeExplorationTool/data/community_assignments_pctthresh-08_split_filled_filter.parquet')

Sun Apr 17 21:09:57 2022
CPU times: user 31.7 ms, sys: 12.2 ms, total: 43.9 ms
Wall time: 42.5 ms
