# ML Analysis
Will attempt to do some sort of clustering.
## Set up Environment

In [2]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re

# vectorizing the book info column using TFidf Vectorizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
path = os.path.join(os.path.normpath(os.getcwd() + os.sep + os.pardir) + "/data/")

In [4]:
os.listdir(path + 'cleaned')

['.DS_Store',
 'BX-Books.csv',
 'BX-Ratings.csv',
 'BX-Users.csv',
 'BX-NewBooksRatings.csv',
 '.ipynb_checkpoints']

## Pull in required data:

In [32]:
books = pd.read_csv(path + 'cleaned/BX-Books.csv')
users = pd.read_csv(path + 'cleaned/BX-Users.csv')
ratings = pd.read_csv(path + 'cleaned/BX-Ratings.csv')

In [33]:
books.head()

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Book-Publisher,Book-Info,Book-Vector
0,2005018,clara callan,richard bruce wright,2001.0,harperflamingo canada,clara callan by richard bruce wright,[0. 0. 0. ... 0. 0. 0.]
1,374157065,flu story great influenza pandemic 1918 search...,gina bari kolata,1999.0,farrar straus giroux,flu story great influenza pandemic 1918 search...,[0. 0. 0. ... 0. 0. 0.]
2,399135782,kitchen god wife,amy tan,1991.0,putnam pub group,kitchen god wife by amy tan,[0. 0. 0. ... 0. 0. 0.]
3,440234743,testament,john grisham,1999.0,dell,testament by john grisham,[0. 0. 0. ... 0. 0. 0.]
4,452264464,beloved plume contemporary fiction,toni morrison,1994.0,plume,beloved plume contemporary fiction by toni mor...,[0. 0. 0. ... 0. 0. 0.]


In [34]:
tf = TfidfVectorizer(analyzer = "word", ngram_range=(1,2), min_df=0, stop_words='english')

In [35]:
tfidf_matrix = tf.fit(books['Book-Info'])

In [36]:
book_vector = tfidf_matrix.transform(books['Book-Info']).toarray()

In [37]:
books['Book-Vector'] = list(book_vector)

In [38]:
book = tfidf_matrix.transform(['pet sematary by stephen king']).toarray()

In [None]:
%%time
similarities = {}
for idx, row in books.iterrows():
    book_vector = row['Book-Vector']
    book_vector = book_vector.reshape(1, -1)
    similarity_score = cosine_similarity(book[0].reshape(1,-1), book_vector)[0][0]
    similarities[row['Book-Info']] = similarity_score

In [None]:
top_5_books = sorted(similarities.items(), key=lambda item: item[1], reverse=True)[:5]

In [None]:
top_5_books

In [28]:

books[books['Book-Title'] == 'It']

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Book-Publisher
1764,450411435,It,Stephen King,0,Trafalgar Square
2370,670813028,It,Stephen King,1986,Viking Books
6573,451149513,It,Stephen King,1987,New Amer Library
8961,451169514,It,Stephen King,1997,Signet Book


In [24]:
books = pd.read_csv(path + 'raw/BX-Books.csv')

In [30]:
books.isna().sum()

ISBN                    0
Book-Title             31
Book-Author             0
Year-Of-Publication     0
Book-Publisher          0
Book-Info               0
Book-Vector             0
dtype: int64

In [31]:
books[books['Book-Title'].isna()]

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Book-Publisher,Book-Info,Book-Vector
666,0330376136,,james herbert,2002.0,pan,by james herbert,[0. 0. 0. ... 0. 0. 0.]
1764,0450411435,,stephen king,1996.5,trafalgar square,by stephen king,[0. 0. 0. ... 0. 0. 0.]
2049,0440216540,,rosellen brown,1993.0,dell,by rosellen brown,[0. 0. 0. ... 0. 0. 0.]
2370,0670813028,,stephen king,1986.0,viking books,by stephen king,[0. 0. 0. ... 0. 0. 0.]
3432,0449220605,,marge piercy,1993.0,fawcett books,by marge piercy,[0. 0. 0. ... 0. 0. 0.]
3522,0140115773,,bruce chatwin,1990.0,penguin books,by bruce chatwin,[0. 0. 0. ... 0. 0. 0.]
4585,0553279300,,jerzy kosinski,1985.0,bantam books,by jerzy kosinski,[0. 0. 0. ... 0. 0. 0.]
5360,0449216527,,john updike,1994.0,fawcett books,by john updike,[0. 0. 0. ... 0. 0. 0.]
5551,0849916836,,max lucado,2001.0,w publishing group,by max lucado,[0. 0. 0. ... 0. 0. 0.]
6173,0802136346,,jerzy kosinski,1999.0,grove press,by jerzy kosinski,[0. 0. 0. ... 0. 0. 0.]
