In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import re

from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin

In [3]:
path=""
users = pd.read_csv(path+'/users.csv')
books=pd.read_csv(path+'/books.csv')
rating=pd.read_csv(path+'/train_ratings.csv')
test=pd.read_csv(path+'/train_ratings.csv')

print('users shape: ', users.shape)
print('books shape: ', books.shape)
print('rating shape: ', rating.shape)
print('test shape: ', test.shape)

users shape:  (68092, 3)
books shape:  (149570, 10)
rating shape:  (306795, 3)
test shape:  (306795, 3)


# users

In [4]:
users

Unnamed: 0,user_id,location,age
0,8,"timmins, ontario, canada",
1,11400,"ottawa, ontario, canada",49.0
2,11676,"n/a, n/a, n/a",
3,67544,"toronto, ontario, canada",30.0
4,85526,"victoria, british columbia, canada",36.0
...,...,...,...
68087,278376,"danville, pennsylvania, usa",54.0
68088,278621,"victoria, delaware, canada",74.0
68089,278636,"irvington, alabama, usa",
68090,278659,"vancouver, washington, usa",33.0


In [5]:
users.describe()

Unnamed: 0,user_id,age
count,68092.0,40259.0
mean,139381.329539,36.069873
std,80523.969862,13.842571
min,8.0,5.0
25%,69008.75,25.0
50%,138845.5,34.0
75%,209388.25,45.0
max,278854.0,99.0


In [6]:
users.info()  

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 68092 entries, 0 to 68091
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   user_id   68092 non-null  int64  
 1   location  68092 non-null  object 
 2   age       40259 non-null  float64
dtypes: float64(1), int64(1), object(1)
memory usage: 1.6+ MB


In [7]:
users.isna().sum()

user_id         0
location        0
age         27833
dtype: int64

In [15]:
class Spliter(BaseEstimator, TransformerMixin):   #location split
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        X['location']=X['location'].apply(lambda x: re.sub('[^0-9a-zA-Z:,]+', '', x))
        X['location_city']=X['location'].apply(lambda x: x.split(',')[0])
        X['location_state']=X['location'].apply(lambda x: x.split(',')[1])
        X['location_country']=X['location'].apply(lambda x: x.split(',')[-1])
        
        #usa replace
        usa_repl=['america', 'us', 'unitedstate', 'unitedstates']
        X.loc[X[X['location_country'].isin(usa_repl)].index, 'location_country']="usa"
        
        X=X.replace('na', np.nan) 
        X=X.replace('', np.nan)
        return X
        
class Filllocbycity(BaseEstimator, TransformerMixin):  #city를 통해 다른 결측 처리
    def __init__(self, col):
        self.col=col
        i = 1 if col=="location_state" else -1
        self.i=i
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        right=[]
        null_city=X[(X[self.col].isna())&(X["location_city"].notnull())]["location_city"].values
        for city in null_city:
            try:
                right.append(X[(X["location_city"]==city)&(X[self.col].notnull())]['location'].value_counts().index[0])
            except:
                pass
        for r in right:
            X.loc[X[X['location_city']==r.split(',')[0]].index, self.col]=r.split(',')[self.i]
        return X
    
class Filllocbystate(BaseEstimator, TransformerMixin):   #state를 통해 다른 결측 처리
    def __init__(self, col):
        self.col=col
        i = 0 if col=="location_city" else -1
        self.i=i
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        right=[]
        null_state=X[(X[self.col].isna())&(X["location_state"].notnull())]["location_state"].values
        for state in null_state:
            try:
                right.append(X[(X["location_state"]==state)&(X[self.col].notnull())]['location'].value_counts().index[0])
            except:
                pass
        for r in right:
            X.loc[X[X['location_state']==r.split(',')[1]].index, self.col]=r.split(',')[self.i]
        return X
    
class Fillna(BaseEstimator, TransformerMixin):    #그외 na는 location 최빈값, others로 처리
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        loc=X["location"].value_counts().index[0]
        allna=X[X[['location_city', 'location_state', 'location_country']].isna().all(1)]
        X.loc[allna.index, 'location_city']=loc.split(',')[0]
        X.loc[allna.index, 'location_state']=loc.split(',')[1]
        X.loc[allna.index, 'location_country']=loc.split(',')[-1]
        
        X['location_state']=X['location_state'].fillna(X['location_country'])
        X['location_city']=X['location_city'].fillna(X['location_state'])
        
        X['location_state']=X['location_state'].fillna("others")
        X['location_country']=X['location_country'].fillna("others")
        return X
        
class Fillage(BaseEstimator, TransformerMixin):  #국가별 나이 평균으로 결측 처리
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        agect=pd.DataFrame(X.groupby("location_country")["age"].mean())
        agectdict=agect.to_dict()['age']
        X["age"]=X["age"].fillna(X['location_country'].map(agectdict))
        X["age"]=X["age"].fillna(X["age"].mean())
        return X
    
class AddAvg(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        user_avg=rating.merge(X, on="user_id", how="left").groupby("user_id")["rating"].agg(["mean"])
        X=X.merge(user_avg, on="user_id", how="left")
        X.rename(columns = {'mean' : 'avg'}, inplace = True)        
        return X


In [16]:
preprocess=Pipeline([
            ("spliter", Spliter()), 
            ("fillstcty", Filllocbycity('location_state')), 
            ("fillctcty", Filllocbycity('location_country')), 
            ("fillctyst", Filllocbystate('location_city')),
            ("fillctst", Filllocbystate('location_country')),
            ("fillna", Fillna()),
            ("fillage", Fillage()), 
            ("addavg", AddAvg())
])

In [17]:
users_ppd=preprocess.fit_transform(users)
users_ppd

Unnamed: 0,user_id,location,age,location_city,location_state,location_country,avg
0,8,"timmins,ontario,canada",35.733836,timmins,ontario,canada,4.428571
1,11400,"ottawa,ontario,canada",49.000000,ottawa,ontario,canada,6.750000
2,11676,"na,na,na",35.733836,toronto,ontario,canada,6.779891
3,67544,"toronto,ontario,canada",30.000000,toronto,ontario,canada,7.285714
4,85526,"victoria,britishcolumbia,canada",36.000000,vancouver,britishcolumbia,canada,7.666667
...,...,...,...,...,...,...,...
68087,278376,"danville,pennsylvania,usa",54.000000,danville,pennsylvania,usa,7.000000
68088,278621,"victoria,delaware,canada",74.000000,vancouver,britishcolumbia,canada,8.000000
68089,278636,"irvington,alabama,usa",37.845259,irvington,alabama,usa,2.000000
68090,278659,"vancouver,washington,usa",33.000000,vancouver,britishcolumbia,canada,10.000000


In [18]:
users_ppd.isna().sum()

user_id                0
location               0
age                    0
location_city          0
location_state         0
location_country       0
avg                 8289
dtype: int64

## rating 기록이 없는 user avg 예측

In [19]:
#training

In [21]:
from sklearn.model_selection import train_test_split
from lightgbm import LGBMRegressor

def rmse(real, predict):
            return np.sqrt(np.mean((real-predict) ** 2))
    
Xwrate=users_ppd[users_ppd.avg.notnull()][["age", "location_city", "location_country"]]

for c in Xwrate.columns:
    if Xwrate[c].dtype=="object":
        Xwrate[c]=Xwrate[c].astype("category")
        
X_train, X_valid, y_train, y_valid = train_test_split = train_test_split(Xwrate, users_ppd[users_ppd.avg.notnull()]["avg"]
                ,test_size=0.2, random_state=42, shuffle=True)
clf=LGBMRegressor()
clf.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], eval_metric='rmse')
y_pred=clf.predict(X_valid)
print(rmse(y_valid, y_pred))



[1]	valid_0's rmse: 2.13185	valid_0's l2: 4.5448
[2]	valid_0's rmse: 2.12981	valid_0's l2: 4.53608
[3]	valid_0's rmse: 2.12829	valid_0's l2: 4.5296
[4]	valid_0's rmse: 2.12736	valid_0's l2: 4.52565
[5]	valid_0's rmse: 2.12654	valid_0's l2: 4.52219
[6]	valid_0's rmse: 2.12606	valid_0's l2: 4.52012
[7]	valid_0's rmse: 2.1259	valid_0's l2: 4.51943
[8]	valid_0's rmse: 2.126	valid_0's l2: 4.51987
[9]	valid_0's rmse: 2.12621	valid_0's l2: 4.52076
[10]	valid_0's rmse: 2.12643	valid_0's l2: 4.52169
[11]	valid_0's rmse: 2.12659	valid_0's l2: 4.52239
[12]	valid_0's rmse: 2.12691	valid_0's l2: 4.52374
[13]	valid_0's rmse: 2.12718	valid_0's l2: 4.52489
[14]	valid_0's rmse: 2.12745	valid_0's l2: 4.52604
[15]	valid_0's rmse: 2.1278	valid_0's l2: 4.52753
[16]	valid_0's rmse: 2.12808	valid_0's l2: 4.52874
[17]	valid_0's rmse: 2.1284	valid_0's l2: 4.53007
[18]	valid_0's rmse: 2.12871	valid_0's l2: 4.5314
[19]	valid_0's rmse: 2.12909	valid_0's l2: 4.53301
[20]	valid_0's rmse: 2.12942	valid_0's l2: 4.534

In [20]:
#predict

In [22]:
worate=users_ppd[users_ppd.avg.isna()]
X_test=worate[["age", "location_city", "location_country"]]

for c in X_test.columns:
    if X_test[c].dtype=="object":
        X_test[c]=X_test[c].astype("category")
        
predicts=clf.predict(X_test)
worate.loc[:, "avg"]=predicts
worateavgdict=worate[["user_id", "avg"]].set_index("user_id").to_dict()["avg"]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test[c]=X_test[c].astype("category")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test[c]=X_test[c].astype("category")


In [24]:
users_ppd["avg"]=users_ppd["avg"].fillna(users_ppd["user_id"].map(worateavgdict))

In [25]:
users_ppd

Unnamed: 0,user_id,location,age,location_city,location_state,location_country,avg
0,8,"timmins,ontario,canada",35.733836,timmins,ontario,canada,4.428571
1,11400,"ottawa,ontario,canada",49.000000,ottawa,ontario,canada,6.750000
2,11676,"na,na,na",35.733836,toronto,ontario,canada,6.779891
3,67544,"toronto,ontario,canada",30.000000,toronto,ontario,canada,7.285714
4,85526,"victoria,britishcolumbia,canada",36.000000,vancouver,britishcolumbia,canada,7.666667
...,...,...,...,...,...,...,...
68087,278376,"danville,pennsylvania,usa",54.000000,danville,pennsylvania,usa,7.000000
68088,278621,"victoria,delaware,canada",74.000000,vancouver,britishcolumbia,canada,8.000000
68089,278636,"irvington,alabama,usa",37.845259,irvington,alabama,usa,2.000000
68090,278659,"vancouver,washington,usa",33.000000,vancouver,britishcolumbia,canada,10.000000


In [26]:
users_ppd.to_csv("users_ppd_test.csv", index=False)

# Books

In [27]:
books

Unnamed: 0,isbn,book_title,book_author,year_of_publication,publisher,img_url,language,category,summary,img_path
0,0002005018,Clara Callan,Richard Bruce Wright,2001.0,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,en,['Actresses'],"In a small town in Canada, Clara Callan reluct...",images/0002005018.01.THUMBZZZ.jpg
1,0060973129,Decision in Normandy,Carlo D'Este,1991.0,HarperPerennial,http://images.amazon.com/images/P/0060973129.0...,en,['1940-1949'],"Here, for the first time in paperback, is an o...",images/0060973129.01.THUMBZZZ.jpg
2,0374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999.0,Farrar Straus Giroux,http://images.amazon.com/images/P/0374157065.0...,en,['Medical'],"Describes the great flu epidemic of 1918, an o...",images/0374157065.01.THUMBZZZ.jpg
3,0399135782,The Kitchen God's Wife,Amy Tan,1991.0,Putnam Pub Group,http://images.amazon.com/images/P/0399135782.0...,en,['Fiction'],A Chinese immigrant who is convinced she is dy...,images/0399135782.01.THUMBZZZ.jpg
4,0425176428,What If?: The World's Foremost Military Histor...,Robert Cowley,2000.0,Berkley Publishing Group,http://images.amazon.com/images/P/0425176428.0...,en,['History'],"Essays by respected military historians, inclu...",images/0425176428.01.THUMBZZZ.jpg
...,...,...,...,...,...,...,...,...,...,...
149565,067161746X,The Bachelor Home Companion: A Practical Guide...,P.J. O'Rourke,1987.0,Pocket Books,http://images.amazon.com/images/P/067161746X.0...,en,['Humor'],A tongue-in-cheek survival guide for single pe...,images/067161746X.01.THUMBZZZ.jpg
149566,0767907566,All Elevations Unknown: An Adventure in the He...,Sam Lightner,2001.0,Broadway Books,http://images.amazon.com/images/P/0767907566.0...,en,['Nature'],A daring twist on the travel-adventure genre t...,images/0767907566.01.THUMBZZZ.jpg
149567,0884159221,Why stop?: A guide to Texas historical roadsid...,Claude Dooley,1985.0,Lone Star Books,http://images.amazon.com/images/P/0884159221.0...,,,,images/0884159221.01.THUMBZZZ.jpg
149568,0912333022,The Are You Being Served? Stories: 'Camping In...,Jeremy Lloyd,1997.0,Kqed Books,http://images.amazon.com/images/P/0912333022.0...,en,['Fiction'],These hilarious stories by the creator of publ...,images/0912333022.01.THUMBZZZ.jpg


In [28]:
books.isna().sum()

isbn                       0
book_title                 0
book_author                1
year_of_publication        0
publisher                  0
img_url                    0
language               67227
category               68851
summary                67227
img_path                   0
dtype: int64

In [29]:
class Groupingpublisher(BaseEstimator, TransformerMixin):  #대표 publisher로 통합
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        X["publisher"]=X["publisher"].apply(lambda x: re.sub('[\W_]+',' ',x).lower()).str.strip()
        publisher_dict=(X['publisher'].value_counts()).to_dict()
        publisher_count_df= pd.DataFrame(list(publisher_dict.items()),columns = ['publisher','count'])
        modify_list = publisher_count_df[publisher_count_df['count']>1].publisher.values
        for publisher in modify_list:
            try:
                number = X[X['publisher']==publisher]['isbn'].apply(lambda x: x[:4]).value_counts().index[0]
                right_publisher = X[X['isbn'].apply(lambda x: x[:4])==number]['publisher'].value_counts().index[0]
                X.loc[X[X['isbn'].apply(lambda x: x[:4])==number].index,'publisher'] = right_publisher
            except: 
                pass
        return X
    
class Fillcate(BaseEstimator, TransformerMixin):  #책 title로 category 결측 처리
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        X.loc[X[X['category'].notnull()].index, 'category'] = X[X['category'].notnull()]['category'].apply(lambda x: re.sub('[\W_]+','',x).strip())
        X['category'] = X['category'].str.lower()
        X['category']=X['category'].fillna(X["book_title"].map(X[["book_title", "category"]].dropna().drop_duplicates('book_title').set_index('book_title')['category']))
        return X
    
class Fillcatebyauthor(BaseEstimator, TransformerMixin):  #author별 최다 category로 결측 처리
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        X["book_author"]=X["book_author"].fillna("none")
        author_cate=X[X["category"].notnull()].groupby("book_author")["category"].agg(lambda x: x.value_counts().index[0]).to_dict()
        X['category']=X['category'].fillna(X['book_author'].map(author_cate))
        X['category']=X['category'].fillna("others")  #나머지 다 other로
        return X
    
class Updatecate(BaseEstimator, TransformerMixin):     #category 통합
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        categories = {   
            'animal'         : ['animal', 'bird', 'pets', 'cats', 'dogs', 'bears', 'dino', 'horse', 'elephant', 'frog'],
            'arts'           : ['art', 'photography', 'architecture', 'music', 'criticism', 'perform', 'design', 'paint', 
                                'decorat', 'draw', 'act', 'picture', 'author', 'composer', 'antiques'],
            'biographies'    : ['biography', 'memoir', 'presidents'],
            'business'       : ['business', 'money', 'economic', 'finance', 'invest', 'management', 'sales', 'marketing', 'authorship'],
            'comic'          : ['comic', 'graphic', 'cartoons', 'comedians'],
            'computer'       : ['computer', 'technology', 'software', 'artificial intelligence'],
            'cook'           : ['cook', 'food', 'wine', 'baking', 'desserts', 'beverage', 'Gooseberry Patch'],
            'crime'          : ['crime', 'murder', 'child abuse', 'kidnapping'],
            'education'      : ['education', 'teach', 'test', 'study', 'book'],
            'engineering'    : ['engineer', 'transportation', 'electronic', 'aeronautics', 'robots'],
            'entertainment'  : ['humor', 'entertainment', 'game', 'entertainers'],
            'family'         : ['child', 'famil', 'parent', 'marriage', 'baby', 'wedding', 'brother', 
                                'sister', 'boy', 'girl', 'aunt', 'courtship', 'adoption', 'infants', 'grandmothers'],
            'health'         : ['health', 'fitness', 'diet', 'body'],
            'history'        : ['history', 'war', 'archaeology', 'civilization', 'atomic bomb'],
            'hobby'          : ['craft', 'hobby', 'home', 'garden', 'landscape', 'collect'],
            'juvenile'       : ['student', 'school', 'teen', 'young', 'juvenile',  'adolescence', 'bildungsromans'],
            'knowledge'      : ['curiosit','encyclopedias'],
            'law'            : ['law', 'legal', 'divorce'],
            'life'           : ['life', 'friendship', 'relation', 'death', 'love', 'aging'],
            'medical'        : ['medical', 'pharmacology', 'medicine', 'dentistry', 'disease', 'cancer', 'drug'],
            'mystery'        : ['mystery', 'extraterrestrial', 'wonder', 'magic',  'dragon', 'monsters', 'unidentified flying objects', 'aliens', 'elves'],
            'philosophy'     : ['cabala', 'ethics'],
            'psychology'     : ['dream', 'mind', 'tarot', 'anger', 'fear', 'consciousness', 'emotions', 'depressions'],
            'reference'      : ['reference'],
            'religion'       : ['christian', 'bible', 'religion', 'spirit', 'church', 'catholic', 'angel', 'buddhism', 
                                'bereavement', 'cults', 'spiritual', 'clergy'],
            'self_help'      : ['help', 'behavior', 'beauty personal', 'home economics', 'communication', 'clothing'],
            'sports'         : ['sport', 'outdoor', 'baseball', 'exercise', 'soccer', 'ballet'],
            'thriller'       : ['thriller', 'suspense','ghost'],
            'travel'         : ['travel', 'voyage'],
            'world'          : ['english', 'england', 'australia', 'brit', 'africa', 'states', 'france', 'canada', 'america','chile' 'spain', 'italy'
                                'china', 'egypt', 'germa', 'ireland', 'california', 'europe', 'russia', 'netherlands' 'hollywood', 'brazil'],
            'social_science' : ['social', 'politic', 'psychology', 'philosophy', 'politic', 'government', 'geography','feminism', 'women'],
            'science'        : ['science', 'nature', 'math', 'astronomy', 'astrology', 'brain', 'cosmology', 'chemistry'],
            'nature'         : ['nature', 'arctic regions', 'alaska'],
            'science fiction' : ['interplanetary', 'extraterrestrial', 'dune imaginary place', 'vampire'], 
            'fantasy fiction' : ['fantasy', 'fairy', 'tale', 'fairies'],
            'love fiction'   : ['romance', 'adultery', 'erotic'],
            'literature'     : ['literature', 'fiction','fictitious',  'drama', 'poetry', 'stories', 
                                'collections', 'horror', 'adventure', 'actresses', 'actor', 'blind', 
                                'folklore', 'books and reading', 'discworld', 'intelligence service', 'cowboys'],
        }  #https://github.com/boostcampaitech4recsys2/level1_bookratingprediction_recsys-level1-recsys-12/blob/db154476d51cd94f962fabd57b1789f1b1775caa/preprocess.py 참고함 

        for high in categories:
            for cate in categories[high]:
                X.loc[X[X['category'].str.contains(cate,na=False)].index,'new_cate'] = high
                X.loc[X[X['category'].str.contains(cate,na=False)].index,'category'] = cate
        X['new_cate']=X['new_cate'].fillna("others")
        return X
    
class Filllang(BaseEstimator, TransformerMixin):  #language random sampling
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        rch=np.random.choice(['en', 'de', 'es', 'fr', 'it'], p=[0.9, 0.05, 0.02, 0.02, 0.01], size=X["language"].isna().sum())
        X.loc[X["language"].isna(), "language"]=rch
        return X
        


In [30]:
bpreprocess=Pipeline([
            ("fillct", Fillcate()), 
            ('fillcatebyat', Fillcatebyauthor()), 
            ("Updatecate", Updatecate()),
            ("filllang", Filllang()),
            ('groupingpb', Groupingpublisher()),
            
])

In [32]:
books_ppd=bpreprocess.fit_transform(books)
books_ppd

Unnamed: 0,isbn,book_title,book_author,year_of_publication,publisher,img_url,language,category,summary,img_path,new_cate
0,0002005018,Clara Callan,Richard Bruce Wright,2001.0,harpercollins,http://images.amazon.com/images/P/0002005018.0...,en,act,"In a small town in Canada, Clara Callan reluct...",images/0002005018.01.THUMBZZZ.jpg,arts
1,0060973129,Decision in Normandy,Carlo D'Este,1991.0,harpercollins,http://images.amazon.com/images/P/0060973129.0...,en,19401949,"Here, for the first time in paperback, is an o...",images/0060973129.01.THUMBZZZ.jpg,others
2,0374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999.0,farrar straus giroux,http://images.amazon.com/images/P/0374157065.0...,en,medical,"Describes the great flu epidemic of 1918, an o...",images/0374157065.01.THUMBZZZ.jpg,medical
3,0399135782,The Kitchen God's Wife,Amy Tan,1991.0,putnam pub group,http://images.amazon.com/images/P/0399135782.0...,en,fiction,A Chinese immigrant who is convinced she is dy...,images/0399135782.01.THUMBZZZ.jpg,literature
4,0425176428,What If?: The World's Foremost Military Histor...,Robert Cowley,2000.0,berkley publishing group,http://images.amazon.com/images/P/0425176428.0...,en,history,"Essays by respected military historians, inclu...",images/0425176428.01.THUMBZZZ.jpg,history
...,...,...,...,...,...,...,...,...,...,...,...
149565,067161746X,The Bachelor Home Companion: A Practical Guide...,P.J. O'Rourke,1987.0,pocket,http://images.amazon.com/images/P/067161746X.0...,en,humor,A tongue-in-cheek survival guide for single pe...,images/067161746X.01.THUMBZZZ.jpg,entertainment
149566,0767907566,All Elevations Unknown: An Adventure in the He...,Sam Lightner,2001.0,broadway books,http://images.amazon.com/images/P/0767907566.0...,en,nature,A daring twist on the travel-adventure genre t...,images/0767907566.01.THUMBZZZ.jpg,nature
149567,0884159221,Why stop?: A guide to Texas historical roadsid...,Claude Dooley,1985.0,bridge publications,http://images.amazon.com/images/P/0884159221.0...,en,others,,images/0884159221.01.THUMBZZZ.jpg,others
149568,0912333022,The Are You Being Served? Stories: 'Camping In...,Jeremy Lloyd,1997.0,pub group west,http://images.amazon.com/images/P/0912333022.0...,en,fiction,These hilarious stories by the creator of publ...,images/0912333022.01.THUMBZZZ.jpg,literature


In [33]:
books_ppd.isna().sum()

isbn                       0
book_title                 0
book_author                0
year_of_publication        0
publisher                  0
img_url                    0
language                   0
category                   0
summary                67227
img_path                   0
new_cate                   0
dtype: int64

In [34]:
books_ppd.to_csv("books_ppd_test.csv", index=False)