### Example
- rating dataset
- https://www.kaggle.com/rounakbanik/the-movies-dataset/data
    - ratings_small.csv

In [1]:
import recommend

In [2]:
rating_df = pd.read_csv("ratings_small.csv")
rating_df["movieId"] = rating_df["movieId"].astype("str")
rating_df.tail()

Unnamed: 0,userId,movieId,rating,timestamp
99999,671,6268,2.5,1065579370
100000,671,6269,4.0,1065149201
100001,671,6365,4.0,1070940363
100002,671,6385,2.5,1070979663
100003,671,6565,3.5,1074784724


In [3]:
unique_user = rating_df["userId"].unique()
unique_movie = rating_df["movieId"].unique()
unique_rating = rating_df["rating"].unique()
unique_rating = sorted(unique_rating)
print("sorted rating : {}".format(unique_rating))
print(
    "user:", len(unique_user), 
    "movie:", len(unique_movie), 
    "rating:", len(unique_rating),
)

sorted rating : [0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.0, 4.5, 5.0]
user: 671 movie: 9066 rating: 10


- 별점 분포

In [4]:
rating_df.groupby("rating").size().reset_index(name='rating_counts')

Unnamed: 0,rating,rating_counts
0,0.5,1101
1,1.0,3326
2,1.5,1687
3,2.0,7271
4,2.5,4449
5,3.0,20064
6,3.5,10538
7,4.0,28750
8,4.5,7723
9,5.0,15095


In [5]:
user_counts_df = rating_df.groupby("userId").size().reset_index(name='user_rating_count')
user_counts_df = user_counts_df.sort_values(by=['user_rating_count'], ascending=False)
user_counts_df.tail()

Unnamed: 0,userId,user_rating_count
295,296,20
288,289,20
248,249,20
220,221,20
0,1,20


In [6]:
movie_counts_df = rating_df.groupby("movieId").size().reset_index(name='movie_rating_count')
movie_counts_df = movie_counts_df.sort_values(by=['movie_rating_count'], ascending=False)
movie_counts_df.tail()

Unnamed: 0,movieId,movie_rating_count
4877,46855,1
4880,4687,1
4884,4691,1
4885,46919,1
9065,99992,1


- preprocessing dataframe

In [7]:
user_limit, movie_limit = 200, 100

In [8]:
# user_limit번 이상 평가한 UserId
filtered_userId = list(user_counts_df[user_counts_df["user_rating_count"] > user_limit]["userId"])
len(filtered_userId)

137

In [9]:
# movie_limit개 이상 평가 받은 movieId
filtered_movieId = list(movie_counts_df[movie_counts_df["movie_rating_count"] > movie_limit]["movieId"])
len(filtered_movieId)

149

In [10]:
# filtering userId
filterd_df = rating_df[rating_df['userId'].isin(filtered_userId)] 

# filtering movieId
filterd_df = filterd_df[filterd_df['movieId'].isin(filtered_movieId)] 

print(len(filterd_df))
filterd_df.tail()

10549


Unnamed: 0,userId,movieId,rating,timestamp
99660,665,4306,5.0,1010197453
99679,665,4896,5.0,1010197308
99681,665,4993,5.0,1046967408
99685,665,5445,3.0,1046967549
99690,665,5952,5.0,1046967408


In [11]:
filterd_df["movieId"] = filterd_df["movieId"].astype("str")

In [12]:
user_df = filterd_df.pivot_table(values="rating", index=["userId"], columns=["movieId"],\
                            aggfunc=np.average, fill_value=0, dropna=False)
user_df.tail()

movieId,1,10,1036,1073,1089,1097,110,111,1136,1193,...,7438,750,778,780,79132,858,8961,912,919,924
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
648,0.0,0.0,0.0,0.0,4.0,0.0,0.0,3.0,4.0,0.0,...,3.0,4.0,5.0,2.0,0.0,3.5,4.0,4.0,0.0,0.0
652,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0
654,5.0,4.0,4.5,3.5,5.0,5.0,4.5,0.0,5.0,3.5,...,4.0,0.0,0.0,4.0,0.0,4.5,5.0,0.0,5.0,4.0
664,3.5,0.0,4.0,0.0,4.5,0.0,4.0,4.0,4.5,0.0,...,4.0,0.0,4.5,3.5,5.0,4.0,4.0,0.0,0.0,4.0
665,0.0,0.0,3.0,5.0,0.0,5.0,0.0,0.0,5.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,4.0,0.0


In [13]:
DR = recommend.DssRecommend(user_df)

In [14]:
euclidean_sm = DR.similarity_matrix("euclidean")
euclidean_sm.head()

userId,4,15,17,19,22,23,30,48,56,57,...,605,607,608,615,624,648,652,654,664,665
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4,0.0,12.308534,8.01561,6.557439,6.422616,7.713624,7.81025,5.477226,8.660254,6.63325,...,7.681146,9.205976,6.082763,5.291503,8.703448,7.697402,2.236068,4.690416,5.220153,7.681146
15,12.308534,0.0,14.256577,11.0,13.35103,15.572412,17.31329,13.0,15.099669,9.26013,...,9.88686,18.330303,9.233093,14.071247,14.866069,13.856406,8.717798,21.142375,14.40486,14.3527
17,8.01561,14.256577,0.0,8.0,8.440972,9.26013,11.07926,7.36546,10.307764,5.545268,...,8.703448,9.66954,5.385165,8.972179,9.810708,9.082951,5.656854,11.989579,8.42615,9.420722
19,6.557439,11.0,8.0,0.0,6.745369,6.184658,9.539392,5.361903,8.306624,6.480741,...,7.681146,9.407444,5.744563,4.0,7.245688,6.480741,3.354102,9.124144,5.894913,7.483315
22,6.422616,13.35103,8.440972,6.745369,0.0,9.069179,9.137833,5.196152,7.858117,5.567764,...,6.763875,8.631338,4.444097,5.830952,8.717798,7.745967,3.464102,9.486833,6.22495,7.035624


In [15]:
euclidean_sm = DR.similarity_matrix("cosin")
euclidean_sm.head()

userId,4,15,17,19,22,23,30,48,56,57,...,605,607,608,615,624,648,652,654,664,665
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4,1.0,0.956762,0.948457,0.985932,0.981591,0.982744,0.979119,0.994869,0.971252,0.970303,...,0.973735,0.97577,0.974286,0.995005,0.975817,0.951088,0.986368,0.991149,0.994377,0.968998
15,0.956762,1.0,0.914953,0.950125,0.906975,0.923247,0.920392,0.905467,0.910803,0.950249,...,0.92495,0.90304,0.957091,0.913701,0.933455,0.856947,0.893839,0.917356,0.930106,0.903008
17,0.948457,0.914953,1.0,0.949537,0.939038,0.961024,0.94202,0.955486,0.936368,0.968238,...,0.90265,0.951974,0.980561,0.953481,0.948541,0.933889,0.869626,0.947757,0.964792,0.933463
19,0.985932,0.950125,0.949537,1.0,0.955135,0.980127,0.962846,0.972344,0.954495,0.968434,...,0.942949,0.955904,0.977367,0.986879,0.973056,0.9665,0.980166,0.979269,0.979273,0.95424
22,0.981591,0.906975,0.939038,0.955135,1.0,0.953184,0.951718,0.974718,0.948308,0.947654,...,0.921494,0.963369,0.97578,0.976946,0.958532,0.916472,0.941138,0.971286,0.978374,0.953578


In [16]:
DR.auto()

In [17]:
DR.recommand_matrix().head()

Unnamed: 0,recommend
4,"924, 1193, 3147, 2396, 3578, 3996, 4226, 4973,..."
15,
17,"1200, 1097, 750, 1, 1291, 589, 592, 1214, 34, ..."
19,"3147, 6874, 2959, 2571, 58559, 7153, 2028, 488..."
22,"2791, 1197, 1961, 223, 2797, 349, 34, 1393, 12..."


In [18]:
DR.recommand_user(4)[:5]

['924', ' 1193', ' 3147', ' 2396', ' 3578']

In [19]:
DR.evaluate()

{'mae': 0.6666247801494789,
 'mse': 0.8095053475304163,
 'rmse': 0.1755369201585486}

In [20]:
# find variable

In [21]:
DR = recommend.DssRecommend(user_df)

In [22]:
similarity_list = ["euclidean", "cosin"]
close_counts = range(5,10)

for similarity in similarity_list:
    for close_count in close_counts:
        DR.pred_matrix(similarity, close_count)
        print(similarity, close_count, DR.evaluate())

euclidean 5 {'mse': 1.958004721148223, 'rmse': 0.16990640390502953, 'mae': 1.1450983489519546}
euclidean 6 {'mse': 1.8512923271991912, 'rmse': 0.16470380815853844, 'mae': 1.117020128560687}
euclidean 7 {'mse': 1.7616534174622374, 'rmse': 0.16055859464089606, 'mae': 1.0911226814847077}
euclidean 8 {'mse': 1.71293484056357, 'rmse': 0.15830857729165101, 'mae': 1.0770459482535395}
euclidean 9 {'mse': 1.6591479948748014, 'rmse': 0.15556421570802964, 'mae': 1.059680470363245}
cosin 5 {'mse': 0.7908924890148493, 'rmse': 0.12138051728213664, 'mae': 0.6735729275911386}
cosin 6 {'mse': 0.7771995599000571, 'rmse': 0.11405603159302144, 'mae': 0.6695647644826646}
cosin 7 {'mse': 0.7618588991063704, 'rmse': 0.10901807785284866, 'mae': 0.6640732694533402}
cosin 8 {'mse': 0.7383027699774652, 'rmse': 0.104917890656866, 'mae': 0.6587337784414438}
cosin 9 {'mse': 0.7283127627191636, 'rmse': 0.10225708931259929, 'mae': 0.656816671272467}
