# Creates Dummies files

# Import Dependencies

In [1]:
import pandas as pd
import pickle as pk

In [2]:
# define a formula for calculation of average weighted score
def average_weighted(row, *, min_th=25, neutral_score=5):
    """Return the weighted (damped) average rating for one book.

    The book's own average is blended with ``min_th`` imaginary ratings of
    ``neutral_score``, so books with only a few ratings are pulled towards
    the neutral score while heavily rated books keep roughly their own
    average.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) providing
            'avg_book_rating' and 'count_book_rating'.
        min_th: Number of neutral ratings added to every book.
        neutral_score: Score assigned to each of those neutral ratings.

    Returns:
        The weighted average score.
    """
    rating_count = row['count_book_rating']
    rating_sum = row['avg_book_rating'] * rating_count
    return (rating_sum + min_th * neutral_score) / (rating_count + min_th)

In [3]:
# min_books_rated_by_user and min_rates_received_by_book define what we treat as statistically significant.
# Keep only the ratings_df records whose book has received more than min_rates_received_by_book ratings and whose user has left more than min_books_rated_by_user ratings.
def leave_stat_sign_data(ratings_df, min_books_rated_by_user=5, min_rates_received_by_book=5):
    """Return the rows of ``ratings_df`` that belong to well-covered books and active users.

    A row is kept when its 'Book-Title' has more than
    ``min_rates_received_by_book`` rows with a 'User-ID' and its 'User-ID'
    has more than ``min_books_rated_by_user`` rows with a 'Book-Rating'.

    NOTE: both comparisons are strict, so a count exactly equal to the
    threshold is filtered out.
    """
    # number of ratings left by every user / received by every title
    ratings_per_user = ratings_df.groupby('User-ID')['Book-Rating'].count()
    ratings_per_title = ratings_df.groupby('Book-Title')['User-ID'].count()

    # labels whose count is above the respective threshold
    active_users = ratings_per_user[ratings_per_user > min_books_rated_by_user].index
    covered_titles = ratings_per_title[ratings_per_title > min_rates_received_by_book].index

    # a row survives only if both its title and its user passed
    title_is_covered = ratings_df['Book-Title'].isin(covered_titles)
    user_is_active = ratings_df['User-ID'].isin(active_users)
    return ratings_df[title_is_covered & user_is_active]

In [4]:
# Load the raw books and ratings tables from the Resources folder.
# low_memory=False makes pandas infer each column's dtype from the whole
# Books.csv file instead of chunk by chunk, so a column cannot end up with
# mixed value types (the situation pandas reports with a DtypeWarning).
# NOTE(review): presumably the warning echoed for this line was that
# DtypeWarning -- confirm against a fresh run.
books_df_original = pd.read_csv('./Resources/Books.csv', low_memory=False)
ratings_df_original = pd.read_csv('./Resources/Ratings.csv')

  books_df_original = pd.read_csv('./Resources/Books.csv')


In [5]:
# De-duplicate the book catalogue: keep the first record for every ISBN.
# drop_duplicates returns a new frame, so books_df_original is left untouched.
books_df = books_df_original.drop_duplicates(subset='ISBN')

In [6]:
# Parse 'Year-Of-Publication' as a number; values that cannot be parsed become NaN
publication_year = pd.to_numeric(books_df['Year-Of-Publication'], errors='coerce')
# Keep only books with a positive publication year (NaN fails the comparison, so it is dropped too)
has_valid_year = publication_year > 0
books_df = books_df.loc[has_valid_year].copy()
# Every remaining year is a valid number, so it can be stored as an integer
books_df['Year-Of-Publication'] = publication_year[has_valid_year].astype(int)

In [7]:
# Independent snapshot of the cleaned books table, kept under its own name
# so later changes to books_df do not affect it.
books_df_algo_input = books_df.copy(deep=True)

In [8]:
# Work on a copy of the raw ratings in which 'Book-Rating' is parsed as a
# number (values that cannot be parsed become NaN); the raw frame is not modified.
ratings_df = ratings_df_original.assign(
    **{'Book-Rating': pd.to_numeric(ratings_df_original['Book-Rating'], errors='coerce')}
)

### Replace ISBNs with Titles
Merge the ratings with the books data so that each rating carries its book title instead of only an ISBN, keeping only the ratings for which title information is available.

In [9]:
# Inner-join books (left) with ratings (right) on ISBN: ratings whose ISBN
# has no matching book record are dropped.
ratings_df = books_df.merge(ratings_df, how='inner', on='ISBN')
# Show the resulting column names
ratings_df.columns

Index(['ISBN', 'Book-Title', 'Book-Author', 'Year-Of-Publication', 'Publisher',
       'Image-URL-S', 'Image-URL-M', 'Image-URL-L', 'User-ID', 'Book-Rating'],
      dtype='object')

In [10]:
# Drop every row that has a missing value in any column (title, rating,
# author, publisher, image URLs, ...)
ratings_df = ratings_df.dropna()
# Descriptive view of the ratings: title, user, rating, author and publisher
ratings_df_all_cols = ratings_df[
    ['Book-Title', 'User-ID', 'Book-Rating', 'Book-Author', 'Publisher']
].copy()
# Remove the columns that the machine learning algorithms are not going to use
ratings_df = ratings_df.drop(
    columns=[
        'ISBN',
        'Book-Author',
        'Year-Of-Publication',
        'Publisher',
        'Image-URL-S',
        'Image-URL-M',
        'Image-URL-L',
    ]
)

In [11]:
# Replace the HTML-escaped ampersand in publisher names with the word 'and'.
# regex=False states explicitly that the pattern is a plain substring.
ratings_df_all_cols['Publisher'] = ratings_df_all_cols['Publisher'].str.replace(
    '&amp;', 'and', regex=False
)
# Repair the mis-encoded 'é' in author names (again a plain substring replacement)
ratings_df_all_cols['Book-Author'] = ratings_df_all_cols['Book-Author'].str.replace(
    'Ã©', 'é', regex=False
)

In [12]:
# Independent snapshot of the reduced ratings table (title, user, rating),
# kept under its own name so later changes to ratings_df do not affect it.
ratings_df_algo_input = ratings_df.copy(deep=True)

In [13]:
# Keep only the rows whose rating is different from zero
ratings_df_no_zeros = ratings_df.loc[ratings_df['Book-Rating'].ne(0)]

In [14]:
# As an alternative to filtering out the zero ratings above, replace each 0 score with the book's weighted average

In [15]:
# Per-book statistics are computed over the non-zero ratings only
scored_ratings_by_title = ratings_df[ratings_df['Book-Rating'] > 0].groupby('Book-Title')['Book-Rating']
# average non-zero score per book
avg_ratings_scored = scored_ratings_by_title.mean()
# count of non-zero ratings given per book
count_ratings_scored = scored_ratings_by_title.count()
# one row per book title holding both statistics, most-rated books first
average_weighted_df = pd.DataFrame(avg_ratings_scored).rename(columns={'Book-Rating': 'avg_book_rating'})
count_ratings_scored_df = pd.DataFrame(count_ratings_scored).rename(columns={'Book-Rating': 'count_book_rating'})
average_weighted_df = pd.merge(average_weighted_df, count_ratings_scored_df, on='Book-Title', how='inner')
average_weighted_df = average_weighted_df.sort_values(by='count_book_rating', ascending=False)
# weighted average per book
average_weighted_df['avg_weighted'] = average_weighted_df.apply(average_weighted, axis=1)

# Replace every zero rating with the weighted average of the rated book
ratings_df_adj = ratings_df.copy()
# Weighted averages are fractional, so store ratings as floats before writing them in
ratings_df_adj['Book-Rating'] = ratings_df_adj['Book-Rating'].astype(float)
zero_rating_mask = ratings_df_adj['Book-Rating'] == 0
# average_weighted_df is indexed by book title, so the lookup key must be the
# 'Book-Title' column. Mapping the frame's integer row index (as was done
# before) never matches a title and turns every zero rating into NaN.
ratings_df_adj.loc[zero_rating_mask, 'Book-Rating'] = (
    ratings_df_adj.loc[zero_rating_mask, 'Book-Title'].map(average_weighted_df['avg_weighted'])
)
# Books that only ever received 0 scores have no weighted average: drop their rows
ratings_df_adj = ratings_df_adj.dropna(subset=['Book-Rating'])

In [16]:
# Collapse duplicate (book title, user) pairs into a single row carrying the
# mean rating, for each of the three rating sets.
# NOTE: ratings_df_original is rebound here and from now on refers to the
# de-duplicated ratings, no longer to the raw frame read from Ratings.csv.
ratings_df_original, ratings_df_mean, ratings_df_no_zeros = (
    frame.groupby(['Book-Title', 'User-ID'])['Book-Rating'].mean().reset_index()
    for frame in (ratings_df, ratings_df_adj, ratings_df_no_zeros)
)

In [17]:
# Keep only statistically significant data in each of the three de-duplicated rating sets
rating_final_original_df = leave_stat_sign_data(ratings_df_original)
# Use ratings_df_mean (one row per book title / user pair) rather than
# ratings_df_adj: the latter still contains duplicate pairs, which would
# inflate the per-book and per-user counts used by the filter and would make
# this set inconsistent with the other two.
rating_final_mean_df = leave_stat_sign_data(ratings_df_mean)
rating_final_no_zeros_df = leave_stat_sign_data(ratings_df_no_zeros)

In [18]:
# Persist the descriptive ratings table. The with-block guarantees the file
# is flushed and closed even if pickling fails.
with open('ratings_df_all_cols.pkl', 'wb') as pickle_file:
    pk.dump(ratings_df_all_cols, pickle_file)

In [19]:
# Persist the algorithm inputs. Each with-block guarantees its file is
# flushed and closed even if pickling fails.
with open('ratings_df_algo_input.pkl', 'wb') as pickle_file:
    pk.dump(ratings_df_algo_input, pickle_file)
with open('books_df_algo_input.pkl', 'wb') as pickle_file:
    pk.dump(books_df_algo_input, pickle_file)