In [1]:
import numpy as np
import pandas as pd
import os
import sys
import pickle
import time
import datetime
import matplotlib.pyplot as plt
import seaborn as sns
from importlib import reload
%matplotlib inline
from IPython.core.display import display, HTML, clear_output
display(HTML("<style>.container { width:80% !important; }</style>")) 

# Load Embeddings Data

In [2]:
cwd = os.getcwd()
embeddings = pd.read_pickle(os.path.join(cwd, "..", "data", "movie_embeddings_1.pkl"))
movie_id = embeddings.index.values.tolist()
print(embeddings.shape)
embeddings.head()

(26744, 100)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,1.362656,1.223114,0.158125,0.098262,-0.511146,0.751987,-0.083907,-0.655189,0.124555,0.398819,...,0.252393,-0.72638,0.095431,0.080516,-0.01616,0.231056,-0.469438,0.385305,0.046974,0.750637
1,-2.101079,1.202782,-1.978461,-0.782652,-0.233708,-2.028353,-0.797505,2.007848,-1.210389,-0.513851,...,-0.320754,0.995709,0.110728,-0.372982,-1.763203,-1.167182,-0.712131,0.787086,0.321424,-1.016185
2,-0.739998,1.639906,-0.821546,-0.810773,0.262396,-2.618929,-1.665061,1.418108,-0.288657,-0.164132,...,-0.749115,0.654003,0.201651,0.267222,-0.420872,0.687322,0.186,2.218797,-0.221609,-0.206429
3,0.23362,1.345427,0.200785,-1.054063,-0.793839,-2.866071,-0.859098,2.171351,-1.371101,-0.124353,...,-0.355255,0.351026,0.25468,-0.528827,-0.650816,-0.855221,0.356243,1.785845,-1.214038,-0.027784
4,-0.642298,2.164207,-0.219435,-0.764872,-0.822317,-3.844967,-0.121182,3.398263,-1.629255,-0.188076,...,-1.022821,0.926114,-0.006677,-1.271328,-0.895705,-0.809579,1.378056,0.476175,-0.452644,-0.829564


In [3]:
# Load index mappung 
with open('../data/movie_to_idx.pkl', 'rb') as handle:
    movie2idx = pickle.load(handle)

In [4]:
movies = pd.read_csv(os.path.join(cwd, "..", "data", "movies.csv"))
print("{} unique movies in movies.csv".format(len(movies.movieId.unique())))

ratings = pd.read_csv(os.path.join(cwd, "..", "data", "ratings.csv"))
print("{} unique movies in ratings.csv".format(len(ratings.movieId.unique())))

movies = pd.merge(movies, ratings, on="movieId", how="inner")
movies.movieId = movies.movieId.apply(lambda x: movie2idx[x])

#get popularity
popularity = pd.DataFrame(movies[['userId', 'title', 'movieId']].groupby(['title', 'movieId']).agg(['count']))
popularity.reset_index(inplace=True)
popularity.columns = ['title', 'movieId', 'ratings_count']
popularity.sort_values('ratings_count', ascending=False, inplace=True)
movies = pd.merge(popularity[['movieId', 'ratings_count']], movies, on='movieId')
movies.reset_index(inplace=True)

#get average ratings
average_ratings = pd.DataFrame(movies[['rating', 'title', 'movieId']].groupby(['title', 'movieId']).agg(['mean']))
average_ratings.reset_index(inplace=True)
average_ratings.columns = ['title', 'movieId', 'avg_rating']
movies = pd.merge(average_ratings[['movieId', 'avg_rating']], movies, on='movieId')
movies.reset_index(inplace=True)


movies = movies[['movieId', 'title', 'genres', 'ratings_count', 'avg_rating']]
movies.drop_duplicates(inplace=True)
print("{} unique movies in embeddings".format(len(movies.movieId.unique())))
movies.set_index('movieId', inplace=True, drop=True)
movies.sort_index(ascending=True, inplace=True)
print(movies.shape)
movies.head(5)

27278 unique movies in movies.csv
26744 unique movies in ratings.csv
26744 unique movies in embeddings
(26744, 4)


Unnamed: 0_level_0,title,genres,ratings_count,avg_rating
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,Jumanji (1995),Adventure|Children|Fantasy,22243,3.211977
1,"City of Lost Children, The (Cité des enfants p...",Adventure|Drama|Fantasy|Mystery|Sci-Fi,8520,3.95223
2,Twelve Monkeys (a.k.a. 12 Monkeys) (1995),Mystery|Sci-Fi|Thriller,44980,3.898055
3,Seven (a.k.a. Se7en) (1995),Mystery|Thriller,43249,4.053493
4,"Usual Suspects, The (1995)",Crime|Mystery|Thriller,47006,4.334372


In [5]:
movies.query('title == "Zodiac (2007)"')

Unnamed: 0_level_0,title,genres,ratings_count,avg_rating
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
3995,Zodiac (2007),Crime|Drama|Thriller,3907,3.675454


# Load Tags Data

In [6]:
cwd = os.getcwd()
tags = pd.read_csv(os.path.join(cwd, "..", "data", "tags.csv"))
tags.drop('timestamp', axis=1, inplace=True)
tags['tag'] = tags['tag'].astype(str)

tags = pd.DataFrame(tags.groupby('movieId')['tag'].apply(lambda x: "{%s}" % ' '.join(x)))
tags.reset_index(inplace=True)

In [7]:
movies_tags = pd.read_csv(os.path.join(cwd, "..", "data", "movies.csv"))
movies_tags.set_index('movieId', inplace=True)
movies_tags['genres'] = movies_tags['genres'].str.replace(pat="|", repl=" ")
movies_tags['genres'] = movies_tags['genres'].str.replace(pat="-", repl="")

In [8]:
# add genres
tags = pd.merge(movies_tags, tags, left_index=True, right_on='movieId', how='right')
tags['document'] = tags[['tag', 'genres']].apply(lambda x: ' '.join(x), axis=1)
tags.head(3)

Unnamed: 0,title,genres,movieId,tag,document
0,Toy Story (1995),Adventure Animation Children Comedy Fantasy,1,{Watched computer animation Disney animated fe...,{Watched computer animation Disney animated fe...
1,Jumanji (1995),Adventure Children Fantasy,2,{time travel adapted from:book board game chil...,{time travel adapted from:book board game chil...
2,Grumpier Old Men (1995),Comedy Romance,3,{old people that is actually funny sequel feve...,{old people that is actually funny sequel feve...


In [9]:

#map index properly
print("{} unique movies in ratings.csv".format(len(ratings.movieId.unique())))
tags = pd.merge(tags, ratings, on="movieId", how="right")
tags = tags[['movieId', 'title', 'genres', 'tag', 'document']]
tags.drop_duplicates('movieId', inplace=True)
tags.movieId = tags.movieId.apply(lambda x: movie2idx[x])
print("There are {} unique movies in tags".format(len(tags.movieId)))
print(tags.dtypes)
tags.set_index('movieId', inplace=True, drop=True)
tags.sort_index(ascending=True, inplace=True)

tags.fillna(value=" ", inplace=True)
tags.head()

26744 unique movies in ratings.csv
There are 26744 unique movies in tags
movieId      int64
title       object
genres      object
tag         object
document    object
dtype: object


Unnamed: 0_level_0,title,genres,tag,document
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,Jumanji (1995),Adventure Children Fantasy,{time travel adapted from:book board game chil...,{time travel adapted from:book board game chil...
1,"City of Lost Children, The (Cité des enfants p...",Adventure Drama Fantasy Mystery SciFi,{children Santa Claus dystopia abused children...,{children Santa Claus dystopia abused children...
2,Twelve Monkeys (a.k.a. 12 Monkeys) (1995),Mystery SciFi Thriller,{post-apocalyptic psychology time travel Brad ...,{post-apocalyptic psychology time travel Brad ...
3,Seven (a.k.a. Se7en) (1995),Mystery Thriller,{atmospheric psychology serial killer BFI mode...,{atmospheric psychology serial killer BFI mode...
4,"Usual Suspects, The (1995)",Crime Mystery Thriller,{organized crime twist ending twists & turns t...,{organized crime twist ending twists & turns t...


# TFIDF and SVD for Tags
Compress TFIDF matrix with SVD to find latent topics in the text. 

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(ngram_range=(1,2),
                        min_df=0.001,
                        stop_words='english')
tfidf_matrix = tfidf.fit_transform(tags.document)
print(tfidf_matrix.shape)

(26744, 2795)


In [39]:
with open('../data/tfidf_matrix.pkl', 'wb') as handle:
    pickle.dump(tfidf_matrix, handle)

In [38]:
tfidf_matrix_df = pd.DataFrame(tfidf_matrix.toarray())
tfidf_matrix_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2785,2786,2787,2788,2789,2790,2791,2792,2793,2794
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
tfidf.vocabulary_

{'time': 2528,
 'travel': 2572,
 'adapted': 166,
 'book': 461,
 'game': 1191,
 'childhood': 592,
 'cgi': 566,
 'animals': 251,
 'scary': 2229,
 'fantasy': 1050,
 'robin': 2166,
 'williams': 2738,
 'joe': 1513,
 'kid': 1571,
 'flick': 1122,
 'jungle': 1547,
 'adventure': 182,
 'children': 594,
 'dynamic': 951,
 'action': 139,
 'childish': 593,
 'filmed': 1110,
 'bad': 332,
 'based': 350,
 'chris': 602,
 'van': 2640,
 'magic': 1680,
 'saturn': 2225,
 'award': 324,
 'best': 390,
 'special': 2365,
 'effects': 976,
 'supporting': 2453,
 'actress': 161,
 'clv': 646,
 'horror': 1375,
 'genre': 1208,
 'kids': 1575,
 'time travel': 2529,
 'adapted book': 167,
 'robin williams': 2167,
 'children fantasy': 597,
 'dynamic cgi': 952,
 'cgi action': 567,
 'based book': 351,
 'saturn award': 2226,
 'award best': 325,
 'special effects': 2366,
 'best supporting': 402,
 'supporting actress': 2455,
 'adventure children': 185,
 'dystopia': 956,
 'surreal': 2459,
 'bleak': 442,
 'dark': 794,
 'visually': 

In [12]:
from sklearn.decomposition import TruncatedSVD

svd = TruncatedSVD(n_components=100)
latent_matrix = svd.fit_transform(tfidf_matrix)
print(latent_matrix.shape)

explained_variance = svd.explained_variance_ratio_.sum()
print("Explained variance by the SVD: {}%".format(int(explained_variance * 100)))

(26744, 100)
Explained variance by the SVD: 38%


# Concatinate latent Tags and Embeddings

In [13]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn import preprocessing

#normalize ensembeled dimensions
x = np.concatenate((embeddings, latent_matrix), axis=1)
scaler = preprocessing.StandardScaler()
x_scaled = scaler.fit_transform(x)
latent_df = pd.DataFrame(x_scaled)
#similarity from ensemble of embeddigns and tfidf
cosine_sim_ensemble = pd.DataFrame(cosine_similarity(X=latent_df), index=movie_id)
cosine_sim_ensemble.columns = movie_id

In [14]:
cosine_sim_ensemble

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,26734,26735,26736,26737,26738,26739,26740,26741,26742,26743
0,1.000000,0.010441,0.140234,-0.040787,0.055763,0.303619,0.016998,0.072690,0.026322,0.161616,...,-0.085495,-0.030364,0.042906,0.057365,-0.224172,-0.177255,0.033764,-0.031725,0.015605,-0.035558
1,0.010441,1.000000,0.394977,0.310834,0.344366,0.095658,-0.024622,0.428942,0.233685,0.147677,...,-0.086837,-0.077002,-0.403863,-0.043808,-0.292007,-0.310443,-0.055490,-0.084479,-0.058493,-0.046490
2,0.140234,0.394977,1.000000,0.491396,0.447479,0.071340,0.007873,0.451274,0.212074,0.113355,...,-0.058296,-0.055909,-0.256576,-0.099636,-0.216593,-0.230224,-0.062803,-0.054072,-0.033267,-0.026876
3,-0.040787,0.310834,0.491396,1.000000,0.703385,-0.060698,0.215039,0.419241,0.334884,0.282502,...,-0.055224,-0.072290,-0.205546,-0.086790,-0.241017,-0.225489,-0.045676,-0.055380,-0.038277,-0.042397
4,0.055763,0.344366,0.447479,0.703385,1.000000,0.031912,0.227207,0.446506,0.268300,0.400017,...,-0.101023,-0.088872,-0.278265,-0.103679,-0.378274,-0.362687,-0.094762,-0.092272,-0.035091,-0.052785
5,0.303619,0.095658,0.071340,-0.060698,0.031912,1.000000,-0.013627,0.117496,0.095659,0.065113,...,-0.071113,-0.046236,-0.106160,-0.051149,-0.213574,-0.137451,-0.035817,-0.022317,-0.011157,-0.006052
6,0.016998,-0.024622,0.007873,0.215039,0.227207,-0.013627,1.000000,-0.086122,0.152291,0.231855,...,-0.088082,-0.067674,-0.184925,-0.042546,-0.327714,-0.290927,-0.071166,-0.060777,-0.045882,-0.033719
7,0.072690,0.428942,0.451274,0.419241,0.446506,0.117496,-0.086122,1.000000,0.295365,0.218956,...,-0.099442,-0.106364,-0.330761,-0.061626,-0.359204,-0.394499,-0.070536,-0.059804,-0.052238,-0.046308
8,0.026322,0.233685,0.212074,0.334884,0.268300,0.095659,0.152291,0.295365,1.000000,0.084549,...,-0.057877,0.029239,-0.178361,0.043189,-0.225294,-0.149579,0.008762,-0.066018,-0.020658,0.007122
9,0.161616,0.147677,0.113355,0.282502,0.400017,0.065113,0.231855,0.218956,0.084549,1.000000,...,-0.042589,-0.044973,-0.056871,-0.060821,-0.193390,-0.156518,-0.040194,-0.038809,-0.026102,-0.023518


In [15]:

#similarity from collab
latent_df = embeddings.copy()
cosine_sim_embeddings = pd.DataFrame(cosine_similarity(X=latent_df), index=movie_id)
cosine_sim_embeddings.columns = movie_id

#similarity from content
latent_df = pd.DataFrame(latent_matrix, index=movie_id)
cosine_sim_tfidf = pd.DataFrame(cosine_similarity(X=latent_df), index=movie_id)
cosine_sim_tfidf.columns = movie_id


# Find Similar Movies in Latent Space

In [16]:
def find_similar_movies(movie_id, cosine_sim, movies):
    
    #get similarity score vector for requested movie
    sim_series = pd.DataFrame(cosine_sim.loc[movie_id].sort_values(ascending=False))
    sim_series.columns = ["sim_score"]
    sim_series.head()
     
    #return detailed dataframe with similarity scores for reuqested movie
    sim_df = pd.merge(movies, sim_series, left_index=True, right_index=True)
    sim_df.sort_values(by="sim_score", ascending=False, inplace=True)
    return sim_df

In [30]:
movie_id = 3006 #primer
movie_id = 1195 #grease
#movie_id = 131 #LOTR
#movie_id = 2087 #inception
movie_id = 3995 #zodiac
#movie_id = 23877 #forgotton (1 rating)
#movie_id = 15816 #rated 19 times

find_similar_movies(movie_id, cosine_sim_ensemble, movies).head(25)

Unnamed: 0,title,genres,ratings_count,avg_rating,sim_score
3995,Zodiac (2007),Crime|Drama|Thriller,3907,3.675454,1.0
2797,"Social Network, The (2010)",Drama,5264,3.843275,0.68654
1423,"Insider, The (1999)",Drama|Thriller,8802,3.931266,0.619956
2829,All the President's Men (1976),Drama|Thriller,4390,4.039066,0.612164
3016,Munich (2005),Action|Crime|Drama|Thriller,4745,3.731718,0.608387
1051,Quiz Show (1994),Drama,20601,3.666691,0.588738
478,Donnie Brasco (1997),Crime|Drama,10585,3.804818,0.58501
2349,Dog Day Afternoon (1975),Crime|Drama,7045,3.983534,0.580087
1836,127 Hours (2010),Adventure|Drama|Thriller,2496,3.740986,0.572146
897,"Aviator, The (2004)",Drama,5842,3.609038,0.567107


In [31]:
sim_embeddings = find_similar_movies(movie_id, cosine_sim_embeddings, movies)
sim_embeddings

Unnamed: 0,title,genres,ratings_count,avg_rating,sim_score
3995,Zodiac (2007),Crime|Drama|Thriller,3907,3.675454,1.000000
2777,"Hurt Locker, The (2008)",Action|Drama|Thriller|War,3965,3.797982,0.852881
2797,"Social Network, The (2010)",Drama,5264,3.843275,0.843333
1913,Collateral (2004),Action|Crime|Drama|Thriller,7895,3.697973,0.836592
18816,"Citadel, The (1938)",Drama,9,3.333333,0.830760
9783,State of Play (2003),Crime|Drama|Mystery,27,3.518519,0.818647
897,"Aviator, The (2004)",Drama,5842,3.609038,0.811267
12976,Powers of Ten (1977),Documentary,11,4.000000,0.804881
3024,Gone Baby Gone (2007),Crime|Drama|Mystery,2817,3.824104,0.797357
2576,Michael Clayton (2007),Drama|Thriller,2934,3.756476,0.791343


In [32]:
sim_tfidf = find_similar_movies(movie_id, cosine_sim_tfidf, movies)
sim_tfidf

Unnamed: 0,title,genres,ratings_count,avg_rating,sim_score
3995,Zodiac (2007),Crime|Drama|Thriller,3907,3.675454,1.000000
7761,Karla (2006),Crime|Drama|Thriller,24,2.583333,0.800021
2396,Awakenings (1990),Drama|Mystery,6711,3.764640,0.780823
15326,Ted Bundy (2002),Crime|Drama|Thriller,39,2.705128,0.768786
1491,Sleepers (1996),Thriller,8740,3.635870,0.759398
1051,Quiz Show (1994),Drama,20601,3.666691,0.746929
12764,Deep Crimson (Profundo carmesí) (1996),Crime|Drama,15,3.400000,0.724743
11686,Burke and Hare (2010),Comedy|Thriller,95,3.231579,0.710049
3346,Without Limits (1998),Drama,400,3.657500,0.686629
1802,21 (2008),Crime|Drama|Romance|Thriller,2157,3.476588,0.682919


In [20]:
#average two sim scores to ensemble
test = pd.merge(sim_tfidf[['title', 'genres', 'sim_score']],
                sim_embeddings[['sim_score', 'avg_rating', 'ratings_count']],
                left_index=True,
                right_index=True)
test['average_sim_score'] = (test.sim_score_x + test.sim_score_y)/2
test.sort_values(by="average_sim_score", ascending=False, inplace=True)
test[['title', 'genres', 'avg_rating', 'ratings_count', 'average_sim_score']]

Unnamed: 0,title,genres,avg_rating,ratings_count,average_sim_score
3006,Primer (2004),Drama|Sci-Fi,3.852630,2643,1.000000
6480,"Timecrimes (Cronocrímenes, Los) (2007)",Sci-Fi|Thriller,3.794245,695,0.831529
125,Donnie Darko (2001),Drama|Mystery|Sci-Fi|Thriller,4.015990,18731,0.800422
3120,Frequently Asked Questions About Time Travel (...,Comedy|Sci-Fi,3.759300,457,0.782261
10397,Dragon Ball Z: The History of Trunks (Doragon ...,Action|Adventure|Animation,3.647059,17,0.751464
5620,"Day of the Doctor, The (2013)",Adventure|Drama|Sci-Fi,3.917266,417,0.748888
2663,Interstellar (2014),Sci-Fi|IMAX,4.023864,1739,0.744144
3071,Looper (2012),Action|Crime|Sci-Fi,3.770403,2855,0.729581
9185,Justice League: The Flashpoint Paradox (2013),Action|Adventure|Animation|Fantasy|Sci-Fi,3.649533,107,0.721463
6484,Triangle (2009),Drama|Horror|Mystery|Thriller,3.576271,295,0.712944


In [21]:
sim_stats = cosine_sim_embeddings.describe()

KeyboardInterrupt: 

In [None]:
sim_df = pd.merge(movies, sim_stats.T, left_index=True, right_index=True)
sim_df

In [None]:
No its the scores for the top 20 recs that should be higher. Not for everything
#so get top 20 recs for each movie
#then plot average scores from top 20 by volume of that movie. 

In [None]:
#Is the sim score for high volume movies always higher?
#No. nothing really happening here.
#SD is higher for high colume movies. Hmmm

plot_data = sim_df.sample(2000)
plot_data = plot_data.sort_values('ratings_count', ascending=False)

g = sns.catplot(x="ratings_count", y="std", data=plot_data)
g.set(xticklabels=[])
plt.show()



In [None]:
#are good movies rated more?
#plot volume by average rating

#not really. Actually the oposite. Lower volume movies have a big higher average ratings. 

plot_data = movies.sample(1000)
plot_data = plot_data.sort_values('ratings_count', ascending=False)

g = sns.catplot(x="ratings_count", y="avg_rating", data=plot_data)
g.set(xticklabels=[])
plt.show()

#TODO a hypothesis test for this?

In [None]:
# to evalaute work, look at 
#average rating of top recs
#average volume of top recs
#maybe a cool distribution plot from seaborn to show it. 

In [None]:
#basedd on primer results, it looks to me like recs with ratings less than like 100ish are totaly irrelevant trash
#IS there a way to show this? Should we limit the collaborative filter by that?
#only make collaborative filter recs if volue greater than n?
#Prove collaborative recs only operate in the head, not the tail. 

In [None]:
#plot average standard deviaation in movie sim score by volume.

#When we recommend high volume movie, it is meaningfull
#when we recommend low volume movie, it's all over the palce.
#at what volume do we lose certainty in the sim score?

In [None]:
@would average sim score by volume show it?
#I think so?
Every rec of primer is a low score, so low average sim socre
#every rec of LOTR is high. 

#so mean sim scor eby volume would show recs for low volume items are low score recs.

#but can be show they are uncertain?
#the low volume recs are noise, but how do I show that?