In [None]:
import pandas as pd
import numpy as np
import re
import csv
from sklearn.model_selection import train_test_split

In [None]:
def createCSV(nameOfFile, data):
    """Write an iterable of rows to ``<nameOfFile>.csv``.

    Args:
        nameOfFile: Output path without the ``.csv`` extension, which is
            appended here.
        data: Iterable of rows; each row is an iterable of field values and
            is handed to ``csv.writer.writerows``.
    """
    # newline='' leaves line endings to the csv module; without it, platforms
    # that translate '\n' on write produce '\r\r\n' (a blank line per row).
    # The encoding is pinned so the output does not depend on the OS locale.
    with open(nameOfFile + '.csv', 'w', newline='', encoding='utf-8') as csvFile:
        writer = csv.writer(csvFile)
        writer.writerows(data)

In [None]:
# Mount Google Drive at /content/drive (requires the google.colab runtime).
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Read the full book table and key every row by its ISBN in one step.
data = pd.read_csv('./drive/My Drive/ML Project/data/alldata.csv').set_index('isbn')
# Display the feature columns that remain once 'isbn' is the index.
data.columns

Index(['title', 'author', 'imageLink', 'bookSummary', 'bookFormat',
       'yearPublished', 'genre', 'noOfPages', 'rating', 'ratingCount',
       'reviewCount', 'language', 'noOfAwards', 'authorFollowers'],
      dtype='object')

# Data Preparation

In [None]:
# Work on the raw value array; positions follow the order of data.columns
# (5 = 'yearPublished', 6 = 'genre').
all_data = data.values

# Turn the genre text into a list of genres and reduce the publication field
# to a single integer year.
for row in all_data:
    # Genres are '//'-separated text. Anything that is not a string (e.g. NaN
    # for a missing value) becomes an empty list instead of being caught by a
    # blanket except.
    row[6] = row[6].split('//') if isinstance(row[6], str) else []

    # str() keeps re.findall from raising TypeError on non-string values such
    # as NaN; those simply yield no match and end up as None.
    year = re.findall(r"[0-9]{4,4}", str(row[5]))
    # When several four-digit groups are present the last one is kept.
    row[5] = int(year[-1]) if year else None


## Train Test Split

In [None]:
# train_test_split is fed a placeholder target: one dummy label per book.
dummy_Y = np.ones(len(data))
# 80/20 partition with a fixed seed so the split is repeatable.
(data_train, data_test,
 y_dummyTrain, y_dummyTest) = train_test_split(
    all_data,
    dummy_Y,
    random_state=5929,
    test_size=0.2,
)
# Show column 11 ('language') of the first training row.
print(data_train[0][11])

English


# Preparing data

## Genre

In [None]:
# Count how many training books carry each genre.
genres = {}
for row in data_train:
    for genre in row[6]:
        genres[genre] = genres.get(genre, 0) + 1

# Genres seen on fewer training books than this are discarded.
MIN_GENRE_COUNT = 3000

# The position of a genre in final_genres is later used as its one-hot column
# index, so the order must be reproducible. Building the list from a set gave
# an arbitrary order per kernel session; sorting makes it stable.
final_genres = sorted(
    genre for genre, count in genres.items() if count >= MIN_GENRE_COUNT
)
print(final_genres)


['Contemporary', 'Paranormal', 'Sequential Art', 'Audiobook', 'Thriller', 'Childrens', 'Fiction', 'Adult', 'Mystery', 'Young Adult', 'Fantasy', 'Literature', 'Science Fiction', 'Historical', 'Nonfiction', 'Romance']


In [None]:
# Positions of training rows whose column 7 ('noOfPages') is 80000 or more.
drop_samples = [
    position
    for position, sample in enumerate(data_train)
    if sample[7] >= 80000
]

# Remove those rows from the training split only.
data_train = np.delete(data_train, drop_samples, 0)

## Language and book format

In [None]:
book_formats = ['mass', 'hardcover', 'paperback', 'ebook', 'kindle edition', 'audio', 'other']


def _encode_language_and_format(rows):
    """Normalise the language and book-format columns of ``rows`` in place.

    Column 11 (language) becomes 1 when its text equals 'english'
    case-insensitively and 0 otherwise. Column 4 (book format) is replaced by
    the first entry of ``book_formats`` contained in its lower-cased text, or
    by 'other' when no entry matches.

    NOTE: calling this again on rows that are already encoded resets the
    language flag to 0, because the stored flag is no longer a string.

    Args:
        rows: 2-D object array (or iterable of mutable rows) indexed by the
            original column positions.
    """
    for row in rows:
        language = row[11]
        # Non-string values (e.g. NaN for a missing language) count as
        # "not English" rather than being swallowed by a bare except.
        is_english = isinstance(language, str) and language.lower() == 'english'
        row[11] = 1 if is_english else 0

        # A non-string format would crash on .lower(); treat it as unknown.
        raw_format = row[4].lower() if isinstance(row[4], str) else ''
        # book_formats is scanned in order, so text containing both 'mass'
        # and 'paperback' maps to 'mass'.
        row[4] = next((fmt for fmt in book_formats if fmt in raw_format), 'other')


# Both splits receive exactly the same treatment.
_encode_language_and_format(data_train)
_encode_language_and_format(data_test)

print(book_formats)

['mass', 'hardcover', 'paperback', 'ebook', 'kindle edition', 'audio', 'other']


# One hot encoding

## Book Format

In [None]:
# One indicator column per entry of book_formats (width derived from the list
# instead of a hard-coded 7), one row per training sample.
one_hot_encode = np.zeros((data_train.shape[0], len(book_formats)), dtype=int)
for row_idx, row in enumerate(data_train):
    # row[4] holds the normalised format label; mark its column.
    one_hot_encode[row_idx, book_formats.index(row[4])] = 1

# Replace the textual format column (index 4) with the indicator columns,
# which are appended at the end of every row.
data_train = np.delete(data_train, 4, 1)
data_train = np.concatenate((data_train, one_hot_encode), axis=1)

In [None]:
# One indicator column per entry of book_formats (width derived from the list
# instead of a hard-coded 7), one row per test sample.
one_hot_encode = np.zeros((data_test.shape[0], len(book_formats)), dtype=int)
for row_idx, row in enumerate(data_test):
    # row[4] holds the normalised format label; mark its column.
    one_hot_encode[row_idx, book_formats.index(row[4])] = 1

# Replace the textual format column (index 4) with the indicator columns,
# which are appended at the end of every row.
data_test = np.delete(data_test, 4, 1)
data_test = np.concatenate((data_test, one_hot_encode), axis=1)

## Genre

In [None]:
# Indicator matrix: rows follow data_train, columns follow final_genres.
one_hot_encode = np.zeros((data_train.shape[0], len(final_genres)), dtype=int)
for row_idx, row in enumerate(data_train):
    # Index 5 is expected to hold the genre list once the book-format column
    # (index 4) has been removed.
    book_genres = row[5]
    for genre_idx, genre in enumerate(final_genres):
        if genre in book_genres:
            one_hot_encode[row_idx, genre_idx] = 1

# Drop the genre-list column and append the indicator columns.
data_train = np.concatenate((np.delete(data_train, 5, 1), one_hot_encode), axis=1)

In [None]:
# Indicator matrix: rows follow data_test, columns follow final_genres.
one_hot_encode = np.zeros((data_test.shape[0], len(final_genres)), dtype=int)
for row_idx, row in enumerate(data_test):
    # Index 5 is expected to hold the genre list once the book-format column
    # (index 4) has been removed.
    book_genres = row[5]
    for genre_idx, genre in enumerate(final_genres):
        if genre in book_genres:
            one_hot_encode[row_idx, genre_idx] = 1

# Drop the genre-list column and append the indicator columns.
data_test = np.concatenate((np.delete(data_test, 5, 1), one_hot_encode), axis=1)

In [None]:
# Print the first training row to inspect the layout after both encodings.
print(data_train[0])

['\nQ-Ko-Chan 1: The Earth Invader Girl (Q-Ko-Chan: The Earth Invader Girl, #1)\n'
 'Hajime Ueda'
 'https://i.gr-assets.com/images/S/compressed.photo.goodreads.com/books/1320470721l/404537.jpg'
 'GREETINGS, EARTHLINGIn the near-future on planet Earth, a world gone mad where never-ending war is a fact of life, Kirio is the coolest kid at school. Up in the sky, a giant robot is fighting a fleet of gunships, but the brilliant and distant Kirio is far from fazed–until the battling ’bot makes an unexpected landing in Kirio’s front yard and rings the bell. But the worst threat for Kirio could be what stands on the other side of the door: an alien invader robot with the face of an adorable girl!'
 2006 208 3.24 128 17 1 0 13 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0]


In [None]:
# Columns that survive unchanged, in the order they appear in each row once
# 'bookFormat' and 'genre' have been removed.
base_columns = ['title', 'author', 'imageLink', 'bookSummary',
                'yearPublished', 'noOfPages', 'rating', 'ratingCount',
                'reviewCount', 'language', 'noOfAwards', 'authorFollowers']
# Indicator labels are derived from the lists that drove the one-hot encoding.
# A hand-written genre list mislabels columns whenever its order differs from
# final_genres, whose positions are the actual column indices.
format_columns = ['bookFormat: ' + book_format for book_format in book_formats]
genre_columns = ['genre: ' + genre for genre in final_genres]

df_train = pd.DataFrame(
    data_train, columns=base_columns + format_columns + genre_columns
)
# Discard rows with any missing value (e.g. a year that could not be parsed
# and was stored as None).
df_train = df_train.dropna()
df_train.to_csv('./drive/My Drive/ML Project/data/data_train.csv', index=False)


In [None]:
# Columns that survive unchanged, in the order they appear in each row once
# 'bookFormat' and 'genre' have been removed.
base_columns = ['title', 'author', 'imageLink', 'bookSummary',
                'yearPublished', 'noOfPages', 'rating', 'ratingCount',
                'reviewCount', 'language', 'noOfAwards', 'authorFollowers']
# Indicator labels are derived from the lists that drove the one-hot encoding.
# A hand-written genre list mislabels columns whenever its order differs from
# final_genres, whose positions are the actual column indices.
format_columns = ['bookFormat: ' + book_format for book_format in book_formats]
genre_columns = ['genre: ' + genre for genre in final_genres]

df_test = pd.DataFrame(
    data_test, columns=base_columns + format_columns + genre_columns
)
# Discard rows with any missing value (e.g. a year that could not be parsed
# and was stored as None).
df_test = df_test.dropna()
df_test.to_csv('./drive/My Drive/ML Project/data/data_test.csv', index=False)


In [None]:
# Preview the first rows of the training frame after dropna().
df_train.head()

Unnamed: 0,title,author,imageLink,bookSummary,yearPublished,noOfPages,rating,ratingCount,reviewCount,language,noOfAwards,authorFollowers,bookFormat: mass,bookFormat: hardcover,bookFormat: paperback,bookFormat: ebook,bookFormat: kindle edition,bookFormat: audio,bookFormat: other,genre: Science Fiction,genre: Adult,genre: Young Adult,genre: Thriller,genre: Nonfiction,genre: Audiobook,genre: Historical,genre: Contemporary,genre: Paranormal,genre: Childrens,genre: Fiction,genre: Literature,genre: Sequential Art,genre: Fantasy,genre: Mystery,genre: Romance
0,\nQ-Ko-Chan 1: The Earth Invader Girl (Q-Ko-Ch...,Hajime Ueda,https://i.gr-assets.com/images/S/compressed.ph...,"GREETINGS, EARTHLINGIn the near-future on plan...",2006,208,3.24,128,17,1,0,13,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1,"\nFables, Vol. 2: Animal Farm\n",Malala Yousafzai,https://i.gr-assets.com/images/S/compressed.ph...,Ever since they were driven from their homelan...,2003,128,4.11,29232,1374,1,0,2770,0,0,1,0,0,0,0,0,0,1,0,0,0,1,1,0,0,1,0,0,0,0,0
2,"\nMemorial Day (Mitch Rapp, #7)\n",Vince Flynn,https://i.gr-assets.com/images/S/compressed.ph...,"Fighting terrorism on foreign ground, CIA supe...",2004,574,4.3,44750,1007,1,0,5037,0,0,1,0,0,0,0,0,0,0,0,1,0,1,0,1,0,0,0,0,0,0,0
3,\nThe Diving Bell and the Butterfly\n,Jean-Dominique Bauby,https://i.gr-assets.com/images/S/compressed.ph...,‘Locked-in syndrome: paralysed from head to to...,1997,132,4.0,58863,4587,1,0,201,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
4,\nScience Ink: Tattoos of the Science Obsessed\n,G.E. Swanson,https://i.gr-assets.com/images/S/compressed.ph...,Body art meets popular science in this elegant...,2011,271,3.87,1158,145,1,0,1138,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
