In [1]:
import warnings
warnings.filterwarnings("ignore")

## Importing Libraries

In [28]:
import cudf
import cupy
from cuml.feature_extraction.text import TfidfVectorizer
from cuml.neighbors import NearestNeighbors

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
# Print the installed cuDF version string.
cudf_version = cudf.__version__
print(f"CuDF: {cudf_version}")

CuDF: 0.16.0


In [5]:
# Plot defaults: 15x12 figure size and seaborn's "darkgrid" style.
plt.rc("figure", figsize=(15, 12))
sns.set_style("darkgrid")

In [29]:
# Remove the column-width limit so long text cells are displayed in full.
pd.options.display.max_colwidth = None

## Nearest Neighbors

For reviews with similar summaries, check how their scores are distributed.

In [6]:
# Read the food-reviews CSV into a cuDF DataFrame.
reviews_csv = "../input/intermediate-food-reviews/food_reviews.csv"
df = cudf.read_csv(reviews_csv)

In [7]:
# Display the shape tuple of the loaded frame.
df.shape

(395003, 5)

In [8]:
# Normalise the Summary text: strip surrounding whitespace, then apply
# str.capitalize(). The cleaned column replaces the original in `df`.
summary_stripped = df["Summary"].str.strip()
df["Summary"] = summary_stripped.str.capitalize()

In [9]:
# Preview the first rows of the frame, including the normalised Summary column.
df.head()

Unnamed: 0,ProductId,UserId,ProfileName,Score,Summary
0,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,5,Good quality dog food
1,B00813GRG4,A1D87F6ZCVE5NK,dll pa,1,Not as advertised
2,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",4,"""delight"" says it all"
3,B000UA0QIQ,A395BORC6FGVXV,Karl,2,Cough medicine
4,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",5,Great taffy


In [31]:
# Fit a TF-IDF vectoriser (English stop words removed, binary=True) on the
# first 50,000 summaries and convert the result to a dense array.
tfidf = TfidfVectorizer(binary=True, stop_words='english')
summary_sample = df["Summary"].head(50000)
summary_embeddings = tfidf.fit_transform(summary_sample).toarray()

In [32]:
# Report the shape of the dense embedding array.
embeddings_shape = summary_embeddings.shape
print(f"Embeddings Shape: {embeddings_shape}")

Embeddings Shape: (50000, 10950)


In [37]:
# Number of neighbours to retrieve for every query row.
num_neighbors = 100

# Pass n_neighbors by keyword rather than by position: it documents the intent
# and keeps the cell working on cuML releases whose estimator constructors
# accept keyword-only parameters.
knn = NearestNeighbors(n_neighbors=num_neighbors)

In [38]:
# Fit the nearest-neighbours model on the summary embeddings; the value
# returned by fit() is the cell's displayed output.
knn.fit(summary_embeddings)

NearestNeighbors(n_neighbors=100, verbose=4, handle=<cuml.raft.common.handle.Handle object at 0x7f3a3b455c50>, algorithm='brute', metric='euclidean', p=2, metric_params=None, output_type='cupy')

In [39]:
# Query the fitted model with the same embeddings it was fitted on and unpack
# the returned (distances, indices) pair.
distances, indices = knn.kneighbors(X=summary_embeddings)

In [44]:
# Spot-check 10 randomly chosen query rows: for each one, print the Summary and
# Score of its first 10 neighbours.
num_samples = 10
num_shown = 10

# A seeded, local generator makes the sampled rows reproducible across runs
# without touching CuPy's global random state.
sample_rng = cupy.random.RandomState(seed=42)

# Only the two columns needed for the comparison; selected once, outside the loop.
summary_scores = df[["Summary", "Score"]]

# The upper bound comes from the neighbour matrix itself, so it always matches
# the number of rows that were actually embedded and queried.
for k in sample_rng.randint(low=0, high=indices.shape[0], size=num_samples):
    # kneighbors() returns row positions, so index positionally with .iloc
    # rather than by label with .loc.
    neighbor_rows = cupy.asnumpy(indices[k, :num_shown])
    print(summary_scores.iloc[neighbor_rows], end="\n\n")

     Summary  Score
279    Yummy      5
28    Yummy!      5
260    Yummy      5
663    Yummy      5
770   Yummy!      5
2834  Yummy!      5
3079  Yummy!      5
2690   Yummy      5
2844   Yummy      5
3722   Yummy      5

              Summary  Score
1554      Hot hot hot      5
7172              Hot      1
19074      Not so hot      3
17689      Not so hot      2
27802  Hot, hot, hot!      5
32528  Hot! hot! hot!      5
7485        Very hot!      5
315              Hot!      5
38177    It is so hot      5
10473          Hot!!!      5

                                                      Summary  Score
9789   Totally awesome snack - low fat/high protein.  yum yum      5
22079                                   High protein low fat!      5
45255                            Awesome, high protein snack!      5
35134                          High protein low calorie snack      5
31219                                    Great low-fat snack!      5
17377           High protein, low fat, tast g