#  IMPORTING LIBRARIES, LOADING AND MERGING THE DATA

In [51]:
import pandas as pd
import numpy as np
pd.options.display.max_rows=350
pd.options.display.max_columns=40

pd.options.mode.chained_assignment = None 
import warnings
warnings.filterwarnings('ignore')

In [3]:
descriptions=pd.read_csv('descriptionsdfclean.csv')

In [4]:
descriptions.shape

(62395, 15)

In [5]:
descriptions.publisher.replace({"HarperCollins Publishers": "HarperCollins", 
                      'Scholastic Press':'Scholastic', 'Scholastic Inc.':'Scholastic',
                      'Scholastic, Inc.':'Scholastic', 'Scholastic Paperbacks':'Scholastic','Puffin':'Puffin Books', 
                      'Knopf Books for Young Readers': 'Knopf',
                     'Candlewick Press': 'Candlewick', }, inplace=True)

In [6]:
book_id=pd.read_csv('book_id_total.csv')

In [7]:
book_id.shape

(60004, 1)

In [8]:
df60= pd.merge(book_id,descriptions, on=["book_id"])

In [9]:
df60.shape

(60004, 15)

In [10]:
df60.publisher.value_counts()

Scholastic                     3349
HarperCollins                  2640
HMH Books for Young Readers    1882
Puffin Books                   1509
Candlewick                     1439
                               ... 
Marshall Editions                 1
Secant Publishing                 1
Hester Pub.                       1
Sonlight Curriculum               1
Shining Star Press                1
Name: publisher, Length: 5234, dtype: int64

# PREPARING THE DATA FOR WEBSCRAPING

In [11]:
df60['title_2'] = df60['title'].str.replace(' ', '_')

In [12]:
df60['book_id_str'] = df60['book_id'].apply(str)

In [13]:
df60['book_id_title']=df60['book_id_str'] + '.' + df60['title_2']

In [14]:
df60.columns

Index(['book_id', 'isbn', 'text_reviews_count', 'is_ebook', 'average_rating',
       'description', 'format', 'publisher', 'num_pages', 'isbn13',
       'publication_year', 'ratings_count', 'title', 'descriptiondetect',
       'titledetect', 'title_2', 'book_id_str', 'book_id_title'],
      dtype='object')

In [15]:
df60.book_id_title

0                           3636.The_Giver_(The_Giver,_#1)
1                              11387515.Wonder_(Wonder_#1)
2        5.Harry_Potter_and_the_Prisoner_of_Azkaban_(Ha...
3                                 157993.The_Little_Prince
4                                  38709.Holes_(Holes,_#1)
                               ...                        
59999                             18223803.The_Cold_People
60000      34381539.Beep_Beep_Robot!_A_Spinning_Gears_Book
60001                       34381587.Touch_and_Feel_Winter
60002                            33938951.Peek-a-Boo_Baby!
60003                    4614268.The_Story_Of_Three_Whales
Name: book_id_title, Length: 60004, dtype: object

In [17]:
def get_uniques(df,lim=20):
    for col in df.columns:
        array=list(df[col].values)
        uniques=set(array)
        if len(uniques) < lim:
            print(col, ":", uniques)
        else:
            print(col, ":", len(uniques), "unique values.")

In [18]:
get_uniques(df60)

book_id : 60004 unique values.
isbn : 58707 unique values.
text_reviews_count : 971 unique values.
is_ebook : {False, True}
average_rating : 284 unique values.
description : 60004 unique values.
format : 70 unique values.
publisher : 5234 unique values.
num_pages : 602 unique values.
isbn13 : 59750 unique values.
publication_year : 106 unique values.
ratings_count : 3824 unique values.
title : 60004 unique values.
descriptiondetect : 60004 unique values.
titledetect : 60004 unique values.
title_2 : 60004 unique values.
book_id_str : 60004 unique values.
book_id_title : 60004 unique values.


In [19]:
popular_books=df60.sort_values(['text_reviews_count','ratings_count','average_rating'], ascending= False)

In [20]:
popular_books.head(5)

Unnamed: 0,book_id,isbn,text_reviews_count,is_ebook,average_rating,description,format,publisher,num_pages,isbn13,publication_year,ratings_count,title,descriptiondetect,titledetect,title_2,book_id_str,book_id_title
0,3636,0385732554,49850,False,4.12,Twelve-year-old Jonas lives in a seemingly ide...,Paperback,Ember,208.0,9780385732550,2006,1311422,"The Giver (The Giver, #1)",twelve-year-old jonas lives in a seemingly ide...,"the giver (the giver, #1)","The_Giver_(The_Giver,_#1)",3636,"3636.The_Giver_(The_Giver,_#1)"
1,11387515,0375869026,31536,False,4.43,I won't describe what I look like. Whatever yo...,Hardcover,Knopf,316.0,9780375869020,2012,255461,Wonder (Wonder #1),i won't describe what i look like. whatever yo...,wonder (wonder #1),Wonder_(Wonder_#1),11387515,11387515.Wonder_(Wonder_#1)
2,5,043965548X,28561,False,4.53,Harry Potter's third year at Hogwarts is full ...,Mass Market Paperback,Scholastic,435.0,9780439655484,2004,1876252,Harry Potter and the Prisoner of Azkaban (Harr...,harry potter's third year at hogwarts is full ...,harry potter and the prisoner of azkaban (harr...,Harry_Potter_and_the_Prisoner_of_Azkaban_(Harr...,5,5.Harry_Potter_and_the_Prisoner_of_Azkaban_(Ha...
3,157993,0156012197,16639,False,4.28,"Moral allegory and spiritual autobiography, Th...",Paperback,"Harcourt, Inc.",93.0,9780156012195,2000,763309,The Little Prince,"moral allegory and spiritual autobiography, th...",the little prince,The_Little_Prince,157993,157993.The_Little_Prince
4,38709,0439244196,14851,False,3.93,Stanley tries to dig up the truth in this inve...,Paperback,Scholastic,233.0,9780439244190,2000,766680,"Holes (Holes, #1)",stanley tries to dig up the truth in this inve...,"holes (holes, #1)","Holes_(Holes,_#1)",38709,"38709.Holes_(Holes,_#1)"


In [21]:
popular_books.tail(5)

Unnamed: 0,book_id,isbn,text_reviews_count,is_ebook,average_rating,description,format,publisher,num_pages,isbn13,publication_year,ratings_count,title,descriptiondetect,titledetect,title_2,book_id_str,book_id_title
56826,788020,0874490715,1,False,2.0,Juxtaposes the traditional tale of Henny Penny...,Hardcover,Modern Publishing,0.0,9780874490718,1987,1,"Henny Penny, the Gingerbread Boy, Three Billy ...",juxtaposes the traditional tale of henny penny...,"henny penny, the gingerbread boy, three billy ...","Henny_Penny,_the_Gingerbread_Boy,_Three_Billy_...",788020,"788020.Henny_Penny,_the_Gingerbread_Boy,_Three..."
58646,3644037,1570913994,1,False,2.0,A baby's face is a study in motion: one minute...,Board Book,Charlesbridge,10.0,9781570913990,2001,1,Baby Face,a baby's face is a study in motion: one minute...,baby face,Baby_Face,3644037,3644037.Baby_Face
58078,30079968,0778725766,1,False,1.5,"The great outdoors is the perfect ""Maker"" spac...",Hardcover,Crabtree Publishing Company,32.0,9780778725763,2016,1,Maker Projects for Kids Who Love Exploring the...,"the great outdoors is the perfect ""maker"" spac...",maker projects for kids who love exploring the...,Maker_Projects_for_Kids_Who_Love_Exploring_the...,30079968,30079968.Maker_Projects_for_Kids_Who_Love_Expl...
55833,5128656,1588381471,1,False,1.0,Little Brother Real Snake is the coming-of-age...,Paperback,JuneBug Books,141.0,9781588381477,2004,1,Little Brother Real Snake,little brother real snake is the coming-of-age...,little brother real snake,Little_Brother_Real_Snake,5128656,5128656.Little_Brother_Real_Snake
59830,5809680,089565203X,1,False,1.0,Examples from daily life characterize honesty.,not defined,Child's World,0.0,9780895652034,1981,1,Honesty (What is it?) Values to Live By,examples from daily life characterize honesty.,honesty (what is it?) values to live by,Honesty_(What_is_it?)_Values_to_Live_By,5809680,5809680.Honesty_(What_is_it?)_Values_to_Live_By


In [None]:
#popular_books2=df60.sort_values(['text_reviews_count','ratings_count'], ascending= False)

In [None]:
#popular_books2.head(5)

In [None]:
#popular_books2.tail()

In [22]:
pop1000books=popular_books.head(1000)

In [23]:
pop1000books.shape

(1000, 18)

## Saving the data

In [24]:
file_name='60004titlestoscrap.csv'
df60.book_id_title.to_csv(file_name, index=False)

In [25]:
file_name2='descriptionsdf_Final.csv'
df60.to_csv(file_name2,index=False)

In [26]:
file_name3='1000titlestoscrap.csv'
pop1000books.book_id_title.to_csv(file_name3,index=False)

In [27]:
pop1000books.book_id_title

0                         3636.The_Giver_(The_Giver,_#1)
1                            11387515.Wonder_(Wonder_#1)
2      5.Harry_Potter_and_the_Prisoner_of_Azkaban_(Ha...
3                               157993.The_Little_Prince
4                                38709.Holes_(Holes,_#1)
                             ...                        
992                               3231457.Chicken_Cheeks
991                             22504710.Castle_Hangnail
998    774600.In_the_Year_of_the_Boar_and_Jackie_Robi...
997        6081980.The_Chestnut_King_(100_Cupboards,_#3)
999                      773514.Emily_Climbs_(Emily,_#2)
Name: book_id_title, Length: 1000, dtype: object

# WEBSCRAPING THE AUTHORS WITH BEAUTIFULSOUP

In [28]:
import argparse
from datetime import datetime
import os
import re
import time

from urllib.request import urlopen
from urllib.error import HTTPError
import bs4

In [29]:
data=pd.read_csv('1000titlestoscrap.csv')

In [27]:
data

Unnamed: 0,book_id_title
0,"3636.The_Giver_(The_Giver,_#1)"
1,11387515.Wonder_(Wonder_#1)
2,5.Harry_Potter_and_the_Prisoner_of_Azkaban_(Ha...
3,157993.The_Little_Prince
4,"38709.Holes_(Holes,_#1)"
...,...
995,3231457.Chicken_Cheeks
996,22504710.Castle_Hangnail
997,774600.In_the_Year_of_the_Boar_and_Jackie_Robi...
998,"6081980.The_Chestnut_King_(100_Cupboards,_#3)"


In [32]:
data

Unnamed: 0,book_id_title,scraped
0,"3636.The_Giver_(The_Giver,_#1)",[Lois Lowry]
1,11387515.Wonder_(Wonder_#1),[R.J. Palacio]
2,5.Harry_Potter_and_the_Prisoner_of_Azkaban_(Ha...,[J.K. Rowling]
3,157993.The_Little_Prince,[Antoine de Saint-Exupéry]
4,"38709.Holes_(Holes,_#1)",[None]
...,...,...
995,3231457.Chicken_Cheeks,[Michael Ian Black]
996,22504710.Castle_Hangnail,[Ursula Vernon]
997,774600.In_the_Year_of_the_Boar_and_Jackie_Robi...,[Bette Bao Lord]
998,"6081980.The_Chestnut_King_(100_Cupboards,_#3)",[N.D. Wilson]


In [33]:
data_b = data.scraped.apply(pd.Series)
data_b.columns = ['author']

In [34]:
data_b

Unnamed: 0,author
0,Lois Lowry
1,R.J. Palacio
2,J.K. Rowling
3,Antoine de Saint-Exupéry
4,
...,...
995,Michael Ian Black
996,Ursula Vernon
997,Bette Bao Lord
998,N.D. Wilson


In [35]:
data_c = data_b["author"].str.split(expand=True)
data_c.columns = ['first_name', 'second_name', 'third_name','last_name']

In [36]:
data_c

Unnamed: 0,first_name,second_name,third_name,last_name
0,Lois,Lowry,,
1,R.J.,Palacio,,
2,J.K.,Rowling,,
3,Antoine,de,Saint-Exupéry,
4,,,,
...,...,...,...,...
995,Michael,Ian,Black,
996,Ursula,Vernon,,
997,Bette,Bao,Lord,
998,N.D.,Wilson,,


In [37]:
data1=data_c.drop(['second_name', 'third_name','last_name'],axis=1)

In [38]:
data1

Unnamed: 0,first_name
0,Lois
1,R.J.
2,J.K.
3,Antoine
4,
...,...
995,Michael
996,Ursula
997,Bette
998,N.D.


In [39]:
pop1000authors=pd.concat([data,data_b,data1],axis=1)

In [40]:
pop1000authors=pop1000authors.drop(['scraped'],axis=1)

# IMPORTING THE 1000 BOOKS WITH SCRAPED AUTHORS

In [30]:
pop1000authors = pd.read_csv('pop1000authorsfinal2.csv')

In [31]:
pop1000authors.tail(3)

Unnamed: 0,book_id_title,author,first_name
997,774600.In_the_Year_of_the_Boar_and_Jackie_Robi...,Bette Bao Lord,Bette
998,"6081980.The_Chestnut_King_(100_Cupboards,_#3)",N.D. Wilson,N.D.
999,"773514.Emily_Climbs_(Emily,_#2)",L.M. Montgomery,L.M.


## Filling manually some authors not correctly scraped

In [32]:
pop1000authors[pop1000authors.author=='None'].count()

book_id_title    27
author           27
first_name       27
dtype: int64

In [33]:
pop1000authors[pop1000authors.author=='None']

Unnamed: 0,book_id_title,author,first_name
4,"38709.Holes_(Holes,_#1)",,
49,16054808.Escape_from_Mr._Lemoncello's_Library_...,,
91,"7783920.Because_of_Mr._Terupt_(Mr._Terupt,_#1)",,
154,207802.Ida_B._._._and_Her_Plans_to_Maximize_Fu...,,
190,13083239.The_Fantastic_Flying_Books_of_Mr._Mor...,,
200,958277.Junie_B._Jones_and_the__Stupid_Smelly_B...,,
285,9642662.Me...Jane,,
293,17165875.Mr._Wuffles!,,
303,"25051.Mrs._Piggle-Wiggle_(Mrs._Piggle_Wiggle,_#1)",,
309,160943.Martin's_Big_Words:_The_Life_of_Dr._Mar...,,


In [34]:
export27authorstofill=pop1000authors[pop1000authors.author=='None']

In [35]:
export27authorstofill.to_csv('27authorstofill.csv')

In [36]:
pop27authors = pd.read_csv('27authorsupdated.csv')

In [37]:
pop27authors

Unnamed: 0.1,Unnamed: 0,book_id_title,author,first_name
0,4,"38709.Holes_(Holes,_#1)",Louis Sachar,Louis
1,49,16054808-escape-from-mr-lemoncello-s-library,Chris Grabenstein,Chris
2,91,7783920-because-of-mr-terupt,Rob Buyea,Rob
3,154,207802.Ida_B_and_Her_Plans_to_Maximize_Fun_Avo...,Katherine Hannigan,Katherine
4,190,13083239-the-fantastic-flying-books-of-mr-morr...,William Joyce,William
5,200,958277.Junie_B_Jones_and_the_Stupid_Smelly_Bus,Barbara Park,Barbara
6,285,9642662-me-jane,Patrick McDonnell,Patrick
7,293,17165875-mr-wuffles,David Wiesner,David
8,303,25051.Mrs_Piggle_Wiggle,Betty MacDonald,Betty
9,309,160943.Martin_s_Big_Words,Doreen Rappaport,Doreen


In [38]:
pop27authors.set_index('Unnamed: 0',inplace=True)

In [39]:
pop27authors=pop27authors.rename_axis(None, axis = 0)

In [40]:
pop27authors

Unnamed: 0,book_id_title,author,first_name
4,"38709.Holes_(Holes,_#1)",Louis Sachar,Louis
49,16054808-escape-from-mr-lemoncello-s-library,Chris Grabenstein,Chris
91,7783920-because-of-mr-terupt,Rob Buyea,Rob
154,207802.Ida_B_and_Her_Plans_to_Maximize_Fun_Avo...,Katherine Hannigan,Katherine
190,13083239-the-fantastic-flying-books-of-mr-morr...,William Joyce,William
200,958277.Junie_B_Jones_and_the_Stupid_Smelly_Bus,Barbara Park,Barbara
285,9642662-me-jane,Patrick McDonnell,Patrick
293,17165875-mr-wuffles,David Wiesner,David
303,25051.Mrs_Piggle_Wiggle,Betty MacDonald,Betty
309,160943.Martin_s_Big_Words,Doreen Rappaport,Doreen


In [41]:
pop1000authors.loc[pop27authors.index] = pop27authors

In [42]:
pop1000authors

Unnamed: 0,book_id_title,author,first_name
0,"3636.The_Giver_(The_Giver,_#1)",Lois Lowry,Lois
1,11387515.Wonder_(Wonder_#1),R.J. Palacio,R.J.
2,5.Harry_Potter_and_the_Prisoner_of_Azkaban_(Ha...,J.K. Rowling,J.K.
3,157993.The_Little_Prince,Antoine de Saint-Exupéry,Antoine
4,"38709.Holes_(Holes,_#1)",Louis Sachar,Louis
...,...,...,...
995,3231457.Chicken_Cheeks,Michael Ian Black,Michael
996,22504710.Castle_Hangnail,Ursula Vernon,Ursula
997,774600.In_the_Year_of_the_Boar_and_Jackie_Robi...,Bette Bao Lord,Bette
998,"6081980.The_Chestnut_King_(100_Cupboards,_#3)",N.D. Wilson,N.D.


In [43]:
pop1000authors.shape

(1000, 3)

In [44]:
pop1000authors[pop1000authors.author=='None']

Unnamed: 0,book_id_title,author,first_name


# GENDERIZE

In [None]:
#!pip install genderize

In [61]:
from genderize import Genderize

In [62]:
import os
directory = '../output'

if not os.path.exists(directory):
    os.makedirs(directory)

In [63]:
pop1000authors.first_name

0         Lois
1         R.J.
2         J.K.
3      Antoine
4        Louis
        ...   
995    Michael
996     Ursula
997      Bette
998       N.D.
999       L.M.
Name: first_name, Length: 1000, dtype: object

In [65]:
# Run this code just once a day
gender = Genderize().get(list(pop1000authors.first_name))

In [66]:
gender

[{'name': 'Lois', 'gender': 'female', 'probability': 0.58, 'count': 2510},
 {'name': 'R.J.', 'gender': None, 'probability': 0.0, 'count': 0},
 {'name': 'J.K.', 'gender': None, 'probability': 0.0, 'count': 0},
 {'name': 'Antoine', 'gender': 'male', 'probability': 0.99, 'count': 38236},
 {'name': 'Louis', 'gender': 'male', 'probability': 0.97, 'count': 26805},
 {'name': 'Shel', 'gender': 'female', 'probability': 0.78, 'count': 169},
 {'name': 'Jeff', 'gender': 'male', 'probability': 0.99, 'count': 32948},
 {'name': 'Brian', 'gender': 'male', 'probability': 0.99, 'count': 51483},
 {'name': 'E.B.', 'gender': None, 'probability': 0.0, 'count': 0},
 {'name': 'Lemony', 'gender': 'female', 'probability': 0.59, 'count': 17},
 {'name': 'L.M.', 'gender': None, 'probability': 0.0, 'count': 0},
 {'name': 'Katherine', 'gender': 'female', 'probability': 0.98, 'count': 8451},
 {'name': 'Frances', 'gender': 'female', 'probability': 0.88, 'count': 3173},
 {'name': 'Gary', 'gender': 'male', 'probability'

In [67]:
#Re-arrange eveything into a dataframe
gender = pd.DataFrame(gender).iloc[:,0:3]
gender.columns = ['first_name', 'gender', 'probability']

In [68]:
gender

Unnamed: 0,first_name,gender,probability
0,Lois,female,0.58
1,R.J.,,0.00
2,J.K.,,0.00
3,Antoine,male,0.99
4,Louis,male,0.97
...,...,...,...
995,Michael,male,0.99
996,Ursula,female,0.98
997,Bette,female,0.89
998,N.D.,,0.00


In [75]:
gender.to_csv('gender2.csv', index=False)

In [45]:
gender=pd.read_csv('gender2.csv')

In [46]:
pop1000authors.head()

Unnamed: 0,book_id_title,author,first_name
0,"3636.The_Giver_(The_Giver,_#1)",Lois Lowry,Lois
1,11387515.Wonder_(Wonder_#1),R.J. Palacio,R.J.
2,5.Harry_Potter_and_the_Prisoner_of_Azkaban_(Ha...,J.K. Rowling,J.K.
3,157993.The_Little_Prince,Antoine de Saint-Exupéry,Antoine
4,"38709.Holes_(Holes,_#1)",Louis Sachar,Louis


In [47]:
# Merge both dataframes
final_1000authors = pd.concat([pop1000authors,gender],axis=1)
final_1000authors

Unnamed: 0,book_id_title,author,first_name,first_name.1,gender,probability
0,"3636.The_Giver_(The_Giver,_#1)",Lois Lowry,Lois,Lois,female,0.58
1,11387515.Wonder_(Wonder_#1),R.J. Palacio,R.J.,R.J.,,0.00
2,5.Harry_Potter_and_the_Prisoner_of_Azkaban_(Ha...,J.K. Rowling,J.K.,J.K.,,0.00
3,157993.The_Little_Prince,Antoine de Saint-Exupéry,Antoine,Antoine,male,0.99
4,"38709.Holes_(Holes,_#1)",Louis Sachar,Louis,Louis,male,0.97
...,...,...,...,...,...,...
995,3231457.Chicken_Cheeks,Michael Ian Black,Michael,Michael,male,0.99
996,22504710.Castle_Hangnail,Ursula Vernon,Ursula,Ursula,female,0.98
997,774600.In_the_Year_of_the_Boar_and_Jackie_Robi...,Bette Bao Lord,Bette,Bette,female,0.89
998,"6081980.The_Chestnut_King_(100_Cupboards,_#3)",N.D. Wilson,N.D.,N.D.,,0.00


In [48]:
final_1000authors= final_1000authors.loc[:,~final_1000authors.columns.duplicated()]

In [49]:
final_1000authors.gender.isnull().sum()

71

In [52]:
final_1000authors.gender=final_1000authors.gender.fillna('unknown')

In [53]:
final_1000authors.gender.value_counts()

female     470
male       459
unknown     71
Name: gender, dtype: int64

In [54]:
final_1000authors.to_csv('final_1000authors_gender2', index=False)

In [55]:
final_1000authors[(final_1000authors.probability > 0) & (final_1000authors.probability < 0.80)]

Unnamed: 0,book_id_title,author,first_name,gender,probability
0,"3636.The_Giver_(The_Giver,_#1)",Lois Lowry,Lois,female,0.58
5,370493.The_Giving_Tree,Shel Silverstein,Shel,female,0.78
9,78411.The_Bad_Beginning_(A_Series_of_Unfortuna...,Lemony Snicket,Lemony,female,0.59
14,30119.Where_the_Sidewalk_Ends,Shel Silverstein,Shel,female,0.78
42,78418.The_Reptile_Room_(A_Series_of_Unfortunat...,Lemony Snicket,Lemony,female,0.59
55,438492.The_Wide_Window_(A_Series_of_Unfortunat...,Lemony Snicket,Lemony,female,0.59
67,"65112.The_End_(A_Series_of_Unfortunate_Events,...",Lemony Snicket,Lemony,female,0.59
84,65119.The_Miserable_Mill_(A_Series_of_Unfortun...,Lemony Snicket,Lemony,female,0.59
90,131123.The_Austere_Academy_(A_Series_of_Unfort...,Lemony Snicket,Lemony,female,0.59
94,310258.The_Snowy_Day,Ezra Jack Keats,Ezra,male,0.75


In [133]:
#final_1000authors.loc[final_1000authors.first_name =='Shel','gender'] ='male'

In [56]:
list_of_values = ['Blue','Tui','Joan','Andrea','Sydney','Karma','Pat','Guojing','Mem','Jody']

final_1000authors.loc[final_1000authors.first_name.isin (list_of_values),'gender']='female'


In [57]:
list_of_values2 = ['Shel','Lemony','Ezra','Tomie','Jory']

final_1000authors.loc[final_1000authors.first_name.isin (list_of_values2),'gender']='male'

In [58]:
final_1000authors[(final_1000authors.probability > 0) & (final_1000authors.probability < 0.80)]

Unnamed: 0,book_id_title,author,first_name,gender,probability
0,"3636.The_Giver_(The_Giver,_#1)",Lois Lowry,Lois,female,0.58
5,370493.The_Giving_Tree,Shel Silverstein,Shel,male,0.78
9,78411.The_Bad_Beginning_(A_Series_of_Unfortuna...,Lemony Snicket,Lemony,male,0.59
14,30119.Where_the_Sidewalk_Ends,Shel Silverstein,Shel,male,0.78
42,78418.The_Reptile_Room_(A_Series_of_Unfortunat...,Lemony Snicket,Lemony,male,0.59
55,438492.The_Wide_Window_(A_Series_of_Unfortunat...,Lemony Snicket,Lemony,male,0.59
67,"65112.The_End_(A_Series_of_Unfortunate_Events,...",Lemony Snicket,Lemony,male,0.59
84,65119.The_Miserable_Mill_(A_Series_of_Unfortun...,Lemony Snicket,Lemony,male,0.59
90,131123.The_Austere_Academy_(A_Series_of_Unfort...,Lemony Snicket,Lemony,male,0.59
94,310258.The_Snowy_Day,Ezra Jack Keats,Ezra,male,0.75


In [59]:
final_1000authors[final_1000authors.probability == 0]

Unnamed: 0,book_id_title,author,first_name,gender,probability
1,11387515.Wonder_(Wonder_#1),R.J. Palacio,R.J.,unknown,0.0
2,5.Harry_Potter_and_the_Prisoner_of_Azkaban_(Ha...,J.K. Rowling,J.K.,unknown,0.0
8,24178.Charlotte's_Web,E.B. White,E.B.,unknown,0.0
10,8127.Anne_of_Green_Gables_(Anne_of_Green_Gable...,L.M. Montgomery,L.M.,unknown,0.0
20,23772.Green_Eggs_and_Ham,Dr. Seuss,Dr.,unknown,0.0
24,"236093.The_Wonderful_Wizard_of_Oz_(Oz,_#1)",L. Frank Baum,L.,unknown,0.0
38,233093.The_Cat_in_the_Hat,Dr. Seuss,Dr.,unknown,0.0
46,"191139.Oh,_The_Places_You'll_Go!",Dr. Seuss,Dr.,unknown,0.0
54,140225.The_Voyage_of_the_Dawn_Treader_(Chronic...,C.S. Lewis,C.S.,unknown,0.0
64,7784.The_Lorax,Dr. Seuss,Dr.,unknown,0.0


In [60]:
list_of_values3 = ['R.J.','J.K.','L.M.','P.L.','E.L.','Esphyr','M.M.','E.', 'Mélanie']

final_1000authors.loc[final_1000authors.first_name.isin (list_of_values3),'gender']='female'

In [61]:
list_of_values4 = ['E.B.','Dr.','L.','C.S.','Pseudonymous','B.J.','A.A.','P.D.','Patrick', 'H.A.', 
                  'J.A.','JonArno', 'R.L.', 'Javaka', 'J.R.R.', 'A.F.', 'S.D.', ' Patrick', 'N.D.']

final_1000authors.loc[final_1000authors.first_name.isin (list_of_values4),'gender']='male'

In [62]:
final_1000authors.gender.value_counts()

male      517
female    483
Name: gender, dtype: int64

In [63]:
final_1000authors[final_1000authors.probability >= 0.80].sample(50)

Unnamed: 0,book_id_title,author,first_name,gender,probability
711,16074748.Tales_from_a_Not-So-Happy_Heartbreake...,Rachel Renée Russell,Rachel,female,0.98
544,10508526.The_Chronicles_of_Harris_Burdick:_14_...,Chris Van Allsburg,Chris,male,0.9
451,37191.Mercy_Watson_to_the_Rescue_(Mercy_Watson...,Kate DiCamillo,Kate,female,0.98
444,88455.The_Chocolate_Touch,Patrick Skene Catling,Patrick,male,0.99
97,20575434.Rain_Reign,Ann M. Martin,Ann,female,0.97
985,23399241.Woof_(Bowser_and_Birdie_#1),Spencer Quinn,Spencer,male,0.96
672,295801.The_House_with_a_Clock_in_Its_Walls_(Le...,John Bellairs,John,male,0.99
44,"2921082.The_Maze_of_Bones__(The_39_Clues,_#1)",Rick Riordan,Rick,male,0.99
155,18295821.The_Adventures_of_Beekle:_The_Unimagi...,Dan Santat,Dan,male,0.95
347,10803806.Darth_Paper_Strikes_Back_(Origami_Yod...,Tom Angleberger,Tom,male,0.99


In [64]:
final_1000authors.loc[final_1000authors.first_name =='Jean','gender'] ='female'

In [65]:
final_1000authors.gender.value_counts()

male      513
female    487
Name: gender, dtype: int64

In [66]:
final_1000authors.author.nunique()

554

In [67]:
final_1000authors[final_1000authors.gender=='female'].author.nunique()

298

In [68]:
final_1000authors[final_1000authors.gender=='male'].author.nunique()

256

In [69]:
final_1000authors

Unnamed: 0,book_id_title,author,first_name,gender,probability
0,"3636.The_Giver_(The_Giver,_#1)",Lois Lowry,Lois,female,0.58
1,11387515.Wonder_(Wonder_#1),R.J. Palacio,R.J.,female,0.00
2,5.Harry_Potter_and_the_Prisoner_of_Azkaban_(Ha...,J.K. Rowling,J.K.,female,0.00
3,157993.The_Little_Prince,Antoine de Saint-Exupéry,Antoine,male,0.99
4,"38709.Holes_(Holes,_#1)",Louis Sachar,Louis,male,0.97
...,...,...,...,...,...
995,3231457.Chicken_Cheeks,Michael Ian Black,Michael,male,0.99
996,22504710.Castle_Hangnail,Ursula Vernon,Ursula,female,0.98
997,774600.In_the_Year_of_the_Boar_and_Jackie_Robi...,Bette Bao Lord,Bette,female,0.89
998,"6081980.The_Chestnut_King_(100_Cupboards,_#3)",N.D. Wilson,N.D.,male,0.00


In [70]:
pop1000books.head(2)

Unnamed: 0,book_id,isbn,text_reviews_count,is_ebook,average_rating,description,format,publisher,num_pages,isbn13,publication_year,ratings_count,title,descriptiondetect,titledetect,title_2,book_id_str,book_id_title
0,3636,385732554,49850,False,4.12,Twelve-year-old Jonas lives in a seemingly ide...,Paperback,Ember,208.0,9780385732550,2006,1311422,"The Giver (The Giver, #1)",twelve-year-old jonas lives in a seemingly ide...,"the giver (the giver, #1)","The_Giver_(The_Giver,_#1)",3636,"3636.The_Giver_(The_Giver,_#1)"
1,11387515,375869026,31536,False,4.43,I won't describe what I look like. Whatever yo...,Hardcover,Knopf,316.0,9780375869020,2012,255461,Wonder (Wonder #1),i won't describe what i look like. whatever yo...,wonder (wonder #1),Wonder_(Wonder_#1),11387515,11387515.Wonder_(Wonder_#1)


In [71]:
pop1000books.columns

Index(['book_id', 'isbn', 'text_reviews_count', 'is_ebook', 'average_rating',
       'description', 'format', 'publisher', 'num_pages', 'isbn13',
       'publication_year', 'ratings_count', 'title', 'descriptiondetect',
       'titledetect', 'title_2', 'book_id_str', 'book_id_title'],
      dtype='object')

In [72]:
pop1000books.shape

(1000, 18)

In [73]:
pop1000books_clean=pop1000books.drop(['title_2','book_id_str', 'book_id_title'], axis=1)

In [74]:
pop1000books_clean.shape

(1000, 15)

In [75]:
#pop1000books_clean2=pop1000books_clean.reset_index()

In [76]:
# Merge both dataframes
df_1000 = pd.concat([pop1000books_clean, final_1000authors],axis=1)
df_1000

Unnamed: 0,book_id,isbn,text_reviews_count,is_ebook,average_rating,description,format,publisher,num_pages,isbn13,publication_year,ratings_count,title,descriptiondetect,titledetect,book_id_title,author,first_name,gender,probability
0,3636,0385732554,49850,False,4.12,Twelve-year-old Jonas lives in a seemingly ide...,Paperback,Ember,208.0,9780385732550,2006,1311422,"The Giver (The Giver, #1)",twelve-year-old jonas lives in a seemingly ide...,"the giver (the giver, #1)","3636.The_Giver_(The_Giver,_#1)",Lois Lowry,Lois,female,0.58
1,11387515,0375869026,31536,False,4.43,I won't describe what I look like. Whatever yo...,Hardcover,Knopf,316.0,9780375869020,2012,255461,Wonder (Wonder #1),i won't describe what i look like. whatever yo...,wonder (wonder #1),11387515.Wonder_(Wonder_#1),R.J. Palacio,R.J.,female,0.00
2,5,043965548X,28561,False,4.53,Harry Potter's third year at Hogwarts is full ...,Mass Market Paperback,Scholastic,435.0,9780439655484,2004,1876252,Harry Potter and the Prisoner of Azkaban (Harr...,harry potter's third year at hogwarts is full ...,harry potter and the prisoner of azkaban (harr...,5.Harry_Potter_and_the_Prisoner_of_Azkaban_(Ha...,J.K. Rowling,J.K.,female,0.00
3,157993,0156012197,16639,False,4.28,"Moral allegory and spiritual autobiography, Th...",Paperback,"Harcourt, Inc.",93.0,9780156012195,2000,763309,The Little Prince,"moral allegory and spiritual autobiography, th...",the little prince,157993.The_Little_Prince,Antoine de Saint-Exupéry,Antoine,male,0.99
4,38709,0439244196,14851,False,3.93,Stanley tries to dig up the truth in this inve...,Paperback,Scholastic,233.0,9780439244190,2000,766680,"Holes (Holes, #1)",stanley tries to dig up the truth in this inve...,"holes (holes, #1)","38709.Holes_(Holes,_#1)",Louis Sachar,Louis,male,0.97
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,67243,1428045961,333,False,3.85,Excerpt from book: Sup't John Grier Home. Dear...,Paperback,IndyPublish.com,350.0,9781428045965,2006,3696,"Dear Enemy (Daddy-Long-Legs, #2)",excerpt from book: sup't john grier home. dear...,"dear enemy (daddy-long-legs, #2)",3231457.Chicken_Cheeks,Michael Ian Black,Michael,male,0.99
996,153866,0439998190,333,False,4.13,Professor Pippy P. (Pee-Pee) Poopypants is a b...,Paperback,Scholastic,160.0,9780439998192,2001,10005,Captain Underpants and the Perilous Plot of Pr...,professor pippy p. (pee-pee) poopypants is a b...,captain underpants and the perilous plot of pr...,22504710.Castle_Hangnail,Ursula Vernon,Ursula,female,0.98
997,6081980,0375838856,332,False,4.12,When Henry York found 99 cupboards hidden behi...,Mass Market Paperback,Random House,482.0,9780375838859,2010,3293,"The Chestnut King (100 Cupboards, #3)",when henry york found 99 cupboards hidden behi...,"the chestnut king (100 cupboards, #3)",774600.In_the_Year_of_the_Boar_and_Jackie_Robi...,Bette Bao Lord,Bette,female,0.89
998,774600,0064401758,332,False,3.81,Shirley Temple Wong sails from China to Americ...,Paperback,HarperCollins,176.0,9780064401753,2003,4465,In the Year of the Boar and Jackie Robinson,shirley temple wong sails from china to americ...,in the year of the boar and jackie robinson,"6081980.The_Chestnut_King_(100_Cupboards,_#3)",N.D. Wilson,N.D.,male,0.00


In [77]:
df_1000.to_csv('df_1000final.csv',index=False)