In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader, Dataset
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# Directory that holds the input CSV files; change it here to point the
# notebook at a different data location.
DATA_DIR = './data/'

# Load the raw tables used by the rest of the notebook.
users = pd.read_csv(DATA_DIR + 'users.csv')
books = pd.read_csv(DATA_DIR + 'books.csv')
train = pd.read_csv(DATA_DIR + 'train_ratings.csv')
test = pd.read_csv(DATA_DIR + 'test_ratings.csv')
sub = pd.read_csv(DATA_DIR + 'sample_submission.csv')

In [3]:
# Attach user attributes (joined on 'user_id') and book metadata (joined on
# 'isbn') to the rating rows; both joins are left joins starting from `train`.
# NOTE(review): despite its name, `test_df` is built from `train`, not `test`.
test_df = (
    train
    .merge(users, on='user_id', how="left")
    .merge(books, on='isbn', how="left")
)

In [7]:
# Fraction of distinct users that have at least one row with rating == 1.
(
    test_df.loc[test_df['rating'] == 1, 'user_id'].nunique()
    / len(set(test_df['user_id']))
)

0.12735147066200692

In [67]:
# Show at most 50 rows when pandas renders a frame or series.
pd.options.display.max_rows = 50

In [63]:
# Number of non-null ratings per ISBN, most-rated books first.
(
    test_df['rating']
    .groupby(test_df['isbn'])
    .count()
    .sort_values(ascending=False)
)

isbn
0316666343    566
0971880107    465
0385504209    390
0312195516    307
0060928336    256
             ... 
0520060199      1
0520059808      1
0520058852      1
0520058763      1
B000234N3A      1
Name: rating, Length: 129777, dtype: int64

In [69]:
# Rows whose rating is exactly 1.
one_df = test_df.loc[test_df['rating'].eq(1)]
# Per ISBN, how many of those rows have a non-null user_id.
one_book_df = one_df['user_id'].groupby(one_df['isbn']).count()

In [70]:
# Display the per-ISBN count of rating-1 rows computed in `one_book_df`.
one_book_df

isbn
0002231352    1
0002237857    1
0002243016    1
0002243962    1
0002550563    1
             ..
9972847012    1
9992059958    1
9999980538    1
B0000C7BNG    1
B0000T6KIM    1
Name: user_id, Length: 10613, dtype: int64

In [27]:
# Distribution of rating values for the book with ISBN '0971880107'.
test_df.loc[test_df['isbn'] == '0971880107', 'rating'].value_counts()

1     244
6      47
2      34
7      34
5      27
8      27
4      20
3      15
9      11
10      6
Name: rating, dtype: int64

In [29]:
# Total number of non-null ratings for the book with ISBN '0971880107'.
test_df.loc[test_df['isbn'] == '0971880107', 'rating'].count()

465

In [30]:
# Share of ratings equal to 1 for the book with ISBN '0971880107'
# (about 52% in the recorded output). Computed from the data rather than
# from the hand-copied counts 244 and 465, so it stays correct if the input changes.
book_ratings = test_df.loc[test_df['isbn'] == '0971880107', 'rating']
float(book_ratings.eq(1).sum() / book_ratings.count())

0.524731182795699

In [50]:
# Rows for ISBN '044109418X' where the rating is exactly 1.
test_df.loc[test_df['isbn'].eq('044109418X') & test_df['rating'].eq(1)]

Unnamed: 0,user_id,isbn,rating,location,age,book_title,book_author,year_of_publication,publisher,img_url,language,category,summary,img_path
201902,186850,044109418X,1,"wixom, michigan, usa",,Castle Perilous,John De Chancie,1988.0,Ace Books,http://images.amazon.com/images/P/044109418X.0...,,,,images/044109418X.01.THUMBZZZ.jpg


In [51]:
# All ratings by user 186850, lowest rating first, restricted to the columns
# needed to compare the rated books.
(
    test_df
    .loc[test_df['user_id'] == 186850, ['isbn', 'rating', 'category', 'book_author']]
    .sort_values('rating')
)

Unnamed: 0,isbn,rating,category,book_author
201902,044109418X,1,,John De Chancie
243989,0425151905,2,,Joseph Olshan
247240,0671578073,2,,James P. Hogan
18543,0345369947,6,['Fiction'],Anne Rice


In [87]:
# NOTE(review): these two statements repeat an earlier cell of this notebook
# that computes the same `one_df` / `one_book_df`; candidate for removal.
# Rows whose rating is exactly 1.
one_df = test_df.loc[test_df['rating'] == 1]
# Per ISBN, how many of those rows have a non-null user_id.
one_book_df = one_df.groupby('isbn')['user_id'].agg('count')

In [89]:
# Display the per-ISBN count of rating-1 rows held in `one_book_df`.
one_book_df

isbn
0002231352    1
0002237857    1
0002243016    1
0002243962    1
0002550563    1
             ..
9972847012    1
9992059958    1
9999980538    1
B0000C7BNG    1
B0000T6KIM    1
Name: user_id, Length: 10613, dtype: int64

In [101]:
# Number of non-null ratings per ISBN (a Series named 'rating', indexed by isbn).
count_df = test_df['rating'].groupby(test_df['isbn']).count()

In [102]:
# Display the per-ISBN rating counts held in `count_df`.
count_df

isbn
0000913154    1
000104687X    1
0001047213    1
0001047973    2
000104799X    1
             ..
B0001FZGBC    1
B0001FZGPI    1
B0001FZGRQ    1
B0001GMSV2    2
B000234N3A    1
Name: rating, Length: 129777, dtype: int64

In [107]:
# Number of ISBNs that have at most one rating.
# Selecting on the 'rating' column needs a frame, but `count_df` is created as
# a Series. Build the frame locally instead of rebinding `count_df`: this works
# on a fresh kernel, is safe to re-run, and leaves `count_df` unchanged for
# other cells. `pd.DataFrame(...)` accepts either a named Series or a frame.
rating_counts = pd.DataFrame(count_df)
rating_counts[rating_counts['rating'] <= 1].count()

rating    88392
dtype: int64

In [91]:
# Put the rating-1 count ('user_id' column) next to the total rating count
# ('rating' column) for every ISBN that has at least one rating of 1.
one_ratio_df = pd.merge(
    one_book_df.to_frame(),
    count_df,
    on='isbn',
    how='left',
)

Unnamed: 0_level_0,user_id,rating
isbn,Unnamed: 1_level_1,Unnamed: 2_level_1
0002231352,1,1
0002237857,1,1
0002243016,1,1
0002243962,1,3
0002550563,1,3
...,...,...
9972847012,1,1
9992059958,1,1
9999980538,1,1
B0000C7BNG,1,1


In [92]:
# Share of a book's ratings that are 1: rating-1 count divided by total count.
one_ratio_df['ratio'] = one_ratio_df['user_id'].div(one_ratio_df['rating'])

In [99]:
# Books ordered from the smallest to the largest share of rating-1 votes.
one_ratio_df.sort_values('ratio')

Unnamed: 0_level_0,user_id,rating,ratio
isbn,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0743418174,1,188,0.005319
0345370775,1,160,0.006250
0804106304,1,156,0.006410
0345361792,1,145,0.006897
059035342X,2,251,0.007968
...,...,...,...
0590448811,1,1,1.000000
0590446681,1,1,1.000000
0590443925,1,1,1.000000
059042856X,1,1,1.000000


In [100]:
# Inspect every row for ISBN '0590448811', one of the books whose ratio is 1.0.
test_df.loc[test_df['isbn'].eq('0590448811')]

Unnamed: 0,user_id,isbn,rating,location,age,book_title,book_author,year_of_publication,publisher,img_url,language,category,summary,img_path
195120,65398,590448811,1,"milford, michigan, usa",13.0,The Christmas Tree That Ate My Mother,Dean Marney,1992.0,Scholastic Paperbacks (Mm),http://images.amazon.com/images/P/0590448811.0...,en,['Christmas stories'],"Looked on by her parents as a bit of a flake, ...",images/0590448811.01.THUMBZZZ.jpg
