In [1]:
import pyspark
import random
import findspark

findspark.init()
sc = pyspark.SparkContext.getOrCreate()

In [2]:
import pandas as pd

from pyspark.sql import SQLContext, SparkSession
sparkSession = SparkSession.builder.getOrCreate()

In [3]:
books = sparkSession.read.csv('./BX-Books.csv', sep=';', header=True)
users = sparkSession.read.csv('./BX-Users.csv', sep=';', header=True)
books_ratings = sparkSession.read.csv('./BX-Book-Ratings.csv', sep=';', header=True)

In [5]:
books_ratings.show(5)

+-------+----------+-----------+
|User-ID|      ISBN|Book-Rating|
+-------+----------+-----------+
| 276725|034545104X|          0|
| 276726|0155061224|          5|
| 276727|0446520802|          0|
| 276729|052165615X|          3|
| 276729|0521795028|          6|
+-------+----------+-----------+
only showing top 5 rows



In [6]:
type(books)

pyspark.sql.dataframe.DataFrame

In [7]:
pd.DataFrame(books.take(5))

Unnamed: 0,0,1,2,3,4,5,6,7
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...
4,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...


In [8]:
top_rated_books = books_ratings.groupBy('ISBN').count().orderBy('count', ascending=False)
top_rated_books.show(5)

+----------+-----+
|      ISBN|count|
+----------+-----+
|0971880107| 2502|
|0316666343| 1295|
|0385504209|  883|
|0060928336|  732|
|0312195516|  723|
+----------+-----+
only showing top 5 rows



In [9]:
pd.DataFrame(books.filter('ISBN in ("0971880107","0316666343", "0385504209", "0060928336", "0312195516")').take(5))

Unnamed: 0,0,1,2,3,4,5,6,7
0,971880107,Wild Animus,Rich Shapero,2004,Too Far,http://images.amazon.com/images/P/0971880107.0...,http://images.amazon.com/images/P/0971880107.0...,http://images.amazon.com/images/P/0971880107.0...
1,316666343,The Lovely Bones: A Novel,Alice Sebold,2002,"Little, Brown",http://images.amazon.com/images/P/0316666343.0...,http://images.amazon.com/images/P/0316666343.0...,http://images.amazon.com/images/P/0316666343.0...
2,312195516,The Red Tent (Bestselling Backlist),Anita Diamant,1998,Picador USA,http://images.amazon.com/images/P/0312195516.0...,http://images.amazon.com/images/P/0312195516.0...,http://images.amazon.com/images/P/0312195516.0...
3,385504209,The Da Vinci Code,Dan Brown,2003,Doubleday,http://images.amazon.com/images/P/0385504209.0...,http://images.amazon.com/images/P/0385504209.0...,http://images.amazon.com/images/P/0385504209.0...
4,60928336,Divine Secrets of the Ya-Ya Sisterhood: A Novel,Rebecca Wells,1997,Perennial,http://images.amazon.com/images/P/0060928336.0...,http://images.amazon.com/images/P/0060928336.0...,http://images.amazon.com/images/P/0060928336.0...


In [10]:
from pyspark.sql import functions as F

average_rating = books_ratings.groupBy('ISBN').agg(F.mean('book-rating'), F.count('book-rating')).orderBy('count(book-rating)', ascending=False).show(5)

+----------+------------------+------------------+
|      ISBN|  avg(book-rating)|count(book-rating)|
+----------+------------------+------------------+
|0971880107|1.0195843325339728|              2502|
|0316666343| 4.468725868725869|              1295|
|0385504209| 4.652321630804077|               883|
|0060928336| 3.448087431693989|               732|
|0312195516| 4.334716459197787|               723|
+----------+------------------+------------------+
only showing top 5 rows



In [20]:
user_activity = books_ratings.groupBy('User-ID').count().orderBy('count', ascending=False)
books_ratings = books_ratings.join(user_activity, "User-ID", 'left').filter("count >= 200").drop('count')


+-------+----------+-----------+
|User-ID|      ISBN|Book-Rating|
+-------+----------+-----------+
| 252827|0006512135|          0|
| 252827|000636988X|          0|
| 252827|0006479286|          0|
| 252827|0006345530|          0|
| 252827|0006713602|          0|
+-------+----------+-----------+
only showing top 5 rows



In [23]:
books_popularity = books_ratings.groupBy('ISBN').count().orderBy('count', ascending=False)
books_ratings = books_ratings.join(books_popularity, 'ISBN', 'left').filter("count >= 200").drop('count')

In [30]:
from pyspark.sql.types import DoubleType

books_ratings = books_ratings.withColumn('Book-Rating', books_ratings["Book-Rating"].cast(DoubleType()))

In [99]:
ratings_pivot = books_ratings.groupby('User-ID').pivot('ISBN').max('Book-Rating')
ratings_pivot = ratings_pivot.na.fill(0)

In [71]:
ratings_pivot.toPandas()

Unnamed: 0,User-ID,0060392452,0060502258,0060915544,0060928336,0060930535,0060934417,0060938455,0060959037,0060976845,...,0804114986,080411868X,0805063897,0842329129,0971880107,1400031354,1400034779,155874262X,1558743669,1573229326
0,131182,,,,,,,,,,...,,,,0.0,,,,,,
1,163202,,,,,,,,,,...,,,,0.0,,,,,,
2,223190,,,,0.0,,,,,,...,,,,9.0,,,,,,
3,32773,,,,0.0,,,,,,...,,,,,0.0,,,,,
4,230249,,,,,,,,,,...,,,,,0.0,,,0.0,,
5,104665,,,,0.0,,,,,,...,,,,,,,,0.0,,
6,108800,,,,,,,,,,...,,,,,,,,,,
7,250006,,,,5.0,,,,,,...,,,,,,,,,,
8,100227,,,,,,,,,,...,,,,,,,,,,
9,124717,,,,,,,,,,...,,,,,0.0,,,,,


In [72]:
ratings_pivot.select('0316666343').show()

+----------+
|0316666343|
+----------+
|      null|
|      null|
|      null|
|      null|
|      null|
|       0.0|
|      null|
|      null|
|       0.0|
|      null|
|      null|
|      null|
|      null|
|      null|
|      null|
|      null|
|      10.0|
|      null|
|      null|
|      null|
+----------+
only showing top 20 rows



In [103]:
from pyspark.mllib.stat import Statistics
from pyspark.mllib.linalg import Vectors

rdd = ratings_pivot.rdd.map(lambda data: Vectors.dense([float(c) for c in data]))
particular_book_rdd = ratings_pivot.select('0316666343').rdd.map(lambda data: Vectors.dense([float(c) for c in data]))


AttributeError: 'PipelinedRDD' object has no attribute 'dtypes'

In [None]:
particular_book_rdd.dtypes

In [102]:
correlation = Statistics.corr(ratings_pivot.rdd, y=ratings_pivot.select('0316666343').rdd)

AttributeError: 'itertools.chain' object has no attribute 'map'

In [82]:
pd.DataFrame(correlation)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,186,187,188,189,190,191,192,193,194,195
0,1.000000,-0.007453,-0.002534,-0.009606,-0.009454,-0.006458,-0.001645,0.003903,-0.000927,-0.006867,...,0.000483,0.010109,-0.000474,-0.001234,-0.002132,-0.007801,0.005446,-0.021781,-0.010014,-0.000368
1,-0.007453,1.000000,0.009849,0.024846,0.008044,0.011420,0.008324,0.070826,0.001590,0.031089,...,0.045671,0.013104,0.025589,-0.001032,-0.010844,0.032073,0.033739,-0.005334,0.000279,0.011186
2,-0.002534,0.009849,1.000000,-0.006272,0.026420,0.008781,0.012796,0.003464,-0.006803,0.040965,...,0.008625,0.010180,-0.006687,0.024483,-0.001530,0.054312,0.037568,0.020546,0.008221,0.012075
3,-0.009606,0.024846,-0.006272,1.000000,0.030720,0.042599,0.047963,0.027231,0.071534,0.038753,...,0.001656,0.005887,0.007221,0.008728,-0.002529,0.064639,0.016847,0.036102,0.000502,-0.004940
4,-0.009454,0.008044,0.026420,0.030720,1.000000,0.054086,0.018543,0.006242,0.025085,0.122220,...,0.045606,0.028883,0.022019,0.026763,-0.007847,0.015229,0.015792,0.034449,0.010058,0.009369
5,-0.006458,0.011420,0.008781,0.042599,0.054086,1.000000,0.012273,0.001838,0.018777,0.038942,...,0.020892,0.009115,-0.003666,0.007051,0.000343,0.027236,0.030120,0.000985,0.022455,-0.002494
6,-0.001645,0.008324,0.012796,0.047963,0.018543,0.012273,1.000000,0.005283,0.032089,0.024526,...,0.024272,0.027223,0.034517,0.028572,-0.006721,0.080185,0.040723,0.000735,0.009368,0.041225
7,0.003903,0.070826,0.003464,0.027231,0.006242,0.001838,0.005283,1.000000,0.004534,0.003678,...,0.002987,-0.004316,0.040128,0.011242,-0.005040,0.023117,0.006985,0.025765,0.002951,0.007080
8,-0.000927,0.001590,-0.006803,0.071534,0.025085,0.018777,0.032089,0.004534,1.000000,0.015368,...,0.024788,-0.004092,0.016277,-0.006489,-0.008961,0.048534,0.007492,-0.005103,-0.004358,0.013117
9,-0.006867,0.031089,0.040965,0.038753,0.122220,0.038942,0.024526,0.003678,0.015368,1.000000,...,0.024041,0.041988,-0.007093,0.013840,-0.013280,0.019169,0.025556,0.040364,0.021410,0.034689


In [74]:
ratings_pivot.foreach(lambda x: print(x))