In [None]:
import pandas as pd
import numpy as np

In [None]:
class Book_Processing:
    """Load the books, users and training-ratings tables and clean them.

    The three tables are read from CSV in ``__init__``.  ``preprocess_data``
    runs every cleaning step in order; ``get_processed_data`` returns the
    resulting ``(books, users)`` pair.
    """

    def __init__(self, books_path, users_path, train_ratings_path):
        """Read the three CSV files into DataFrames.

        Args:
            books_path: Path of the books CSV.
            users_path: Path of the users CSV.
            train_ratings_path: Path of the training ratings CSV.
        """
        self.books = pd.read_csv(books_path)
        self.users = pd.read_csv(users_path)
        self.train_ratings = pd.read_csv(train_ratings_path)

    def calculate_user_statistics(self) -> None:
        """Add per-user ``N_ratings`` and ``avg_rating`` columns to ``users``.

        Users with no row in ``train_ratings`` get NaN in both columns
        (see ``fill_na_values``).
        """
        # One pass over the ratings; the keyword order fixes the column order
        # (N_ratings first, then avg_rating).
        user_stats = self.train_ratings.groupby("user_id")["rating"].agg(
            N_ratings="count",
            avg_rating="mean",
        )
        self.users = self.users.join(user_stats, on="user_id")

    def fill_na_values(self) -> None:
        """Fill missing user statistics and cast both columns to ``int64``.

        Missing ``N_ratings`` become 0 and missing ``avg_rating`` become 5.
        """
        self.users["N_ratings"] = self.users["N_ratings"].fillna(0).astype("int64")
        # NOTE(review): the int64 cast truncates fractional averages
        # (e.g. 6.9 -> 6); kept as-is in case downstream code relies on an
        # integer column -- confirm whether this is intended.
        self.users["avg_rating"] = self.users["avg_rating"].fillna(5).astype("int64")

    def split_location_parts(self, location):
        """Split a ``"city, state, country"`` string into its three parts.

        Args:
            location: Location text whose parts are separated by ``", "``.

        Returns:
            A ``(city, state, country)`` tuple of strings.  Parts that are
            not present are empty strings; a non-string input (e.g. NaN for
            a missing location) yields three empty strings.
        """
        if not isinstance(location, str):
            # Missing locations arrive as float NaN, which has no .split().
            return '', '', ''

        parts = location.split(", ")
        city, state, country = '', '', ''
        if len(parts) == 1:
            country = parts[0]
        elif len(parts) == 2:
            city, country = parts
        elif len(parts) == 3:
            city, state, country = parts
        else:
            # More than three parts: the country is still the last token;
            # everything between the city and the country is kept as state.
            city = parts[0]
            state = ", ".join(parts[1:-1])
            country = parts[-1]
        return city, state, country

    def split_location(self) -> None:
        """Derive ``city``, ``state`` and ``country`` columns from ``location``.

        Punctuation is stripped from ``country`` afterwards.
        """
        new_columns = ['city', 'state', 'country']
        parts = self.users['location'].apply(self.split_location_parts).tolist()
        # Reuse the users index: column assignment aligns on index, so a
        # fresh RangeIndex would produce NaN for any non-default index.
        parts_frame = pd.DataFrame(parts, columns=new_columns, index=self.users.index)
        for column in new_columns:
            self.users[column] = parts_frame[column]
        self.users['country'] = self.users['country'].str.replace(r'[^\w\s]', '', regex=True)

    def fill_missing_age(self) -> None:
        """Fill missing ages with the country mean, then the overall mean.

        A missing age first takes the mean age of the user's country.  Ages
        that are still missing (country with no known age) take the mean of
        the ``age`` column after that first step.
        """
        country_mean_age = self.users.groupby('country')['age'].transform('mean')
        self.users['age'] = self.users['age'].fillna(country_mean_age)
        self.users['age'] = self.users['age'].fillna(self.users['age'].mean())

    @staticmethod
    def _fill_missing_at_random(column, candidates):
        """Return ``column`` with each NaN replaced by a random candidate.

        One independent draw (via ``np.random.choice``) is made per missing
        entry.  The column is returned unchanged when nothing is missing or
        when there are no candidates to draw from.
        """
        missing = column.isna()
        n_missing = int(missing.sum())
        if n_missing == 0 or len(candidates) == 0:
            return column

        # dtype=object keeps the drawn values as plain Python objects.
        pool = np.asarray(candidates, dtype=object)
        filled = column.copy()
        filled.loc[missing] = np.random.choice(pool, size=n_missing)
        return filled

    def generate_random_values(self) -> None:
        """Impute missing categories, authors and countries at random.

        Candidates are categories occurring at least 50 times, authors
        occurring at least 5 times, and every non-missing country.  Each
        missing entry gets its own random draw.
        """
        category_counts = self.books['category'].value_counts()
        author_counts = self.books['book_author'].value_counts()

        categories = list(category_counts[category_counts >= 50].index)
        authors = list(author_counts[author_counts >= 5].index)
        # dropna() so that NaN itself can never be drawn as a replacement.
        countries = list(self.users['country'].dropna().unique())

        self.books['category'] = self._fill_missing_at_random(self.books['category'], categories)
        self.books['book_author'] = self._fill_missing_at_random(self.books['book_author'], authors)
        self.users['country'] = self._fill_missing_at_random(self.users['country'], countries)

    def drop_unnecessary_columns(self) -> None:
        """Drop the text/image book columns and the raw location columns."""
        self.books.drop(['book_title', 'img_url', 'language', 'summary', 'img_path'], axis=1, inplace=True)
        self.users.drop(['location', 'city', 'state'], axis=1, inplace=True)

    def preprocess_data(self) -> None:
        """Run every cleaning step in order.

        The order matters: ``fill_na_values`` needs the statistics columns,
        ``fill_missing_age`` needs ``country`` from ``split_location``, and
        ``drop_unnecessary_columns`` removes columns the earlier steps read.
        """
        self.calculate_user_statistics()
        self.fill_na_values()
        self.split_location()
        self.fill_missing_age()
        self.generate_random_values()
        self.drop_unnecessary_columns()

    def get_processed_data(self):
        """Return the ``(books, users)`` DataFrames in their current state."""
        return self.books, self.users