# Collaborative Filtering - Headphone Dataset

### Import Modules

In [1]:
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""
Collaborative Filtering Classification Example.
"""
from __future__ import print_function

from pyspark import SparkContext

# $example on$
from pyspark.mllib.recommendation import ALS, MatrixFactorizationModel, Rating
# $example off$



### Read and Parse Data from Marketplace A

In [2]:
import csv

firstTime = True
products = []
     
userCounter = 0

# Data Format in CSV: columns[userNum, productNum(1-46), Hiking, Museums...]

with open('ourData/marketplaceAFixMistake.csv', 'rb') as csvfile:
    marketplaceA = csv.reader(csvfile, delimiter=',', quotechar='|')
    for row in marketplaceA:
        if(firstTime): # remove first row with titles and create key chart
            for productNum in range (1,46):
                products.append(row[productNum])
            firstTime = False
            # print(products)
        else:
            for product in range (1,46):
                # print(row[product]) # all user products
                tempRating = [row[0], products[product-1], row[product]] # [userNum, productNum, Rating]
                print(tempRating)
                with open("ourData/allDataFinal.csv", "a") as fp:   #write to new csv
                    wr = csv.writer(fp, dialect='excel')
                    wr.writerow(tempRating)

['1', '1', '0']
['1', '2', '1']
['1', '3', '0']
['1', '4', '0']
['1', '5', '0']
['1', '6', '0']
['1', '7', '1']
['1', '8', '0']
['1', '9', '0']
['1', '10', '0']
['1', '11', '1']
['1', '12', '0']
['1', '13', '1']
['1', '14', '0']
['1', '15', '0']
['1', '16', '0']
['1', '17', '0']
['1', '18', '1']
['1', '19', '0']
['1', '20', '0']
['1', '21', '1']
['1', '22', '0']
['1', '23', '0']
['1', '24', '0']
['1', '25', '0']
['1', '26', '0']
['1', '27', '0']
['1', '28', '1']
['1', '29', '0']
['1', '30', '1']
['1', '31', '0']
['1', '32', '1']
['1', '33', '0']
['1', '34', '1']
['1', '35', '0']
['1', '36', '0']
['1', '37', '0']
['1', '38', '1']
['1', '39', '1']
['1', '40', '0']
['1', '41', '0']
['1', '42', '1']
['1', '43', '0']
['1', '44', '0']
['1', '45', '1']
['2', '1', '0']
['2', '2', '1']
['2', '3', '0']
['2', '4', '1']
['2', '5', '1']
['2', '6', '0']
['2', '7', '1']
['2', '8', '0']
['2', '9', '1']
['2', '10', '0']
['2', '11', '1']
['2', '12', '0']
['2', '13', '1']
['2', '14', '0']
['2', '15', '0'

['19', '25', '0']
['19', '26', '0']
['19', '27', '0']
['19', '28', '0']
['19', '29', '0']
['19', '30', '0']
['19', '31', '1']
['19', '32', '0']
['19', '33', '0']
['19', '34', '0']
['19', '35', '0']
['19', '36', '0']
['19', '37', '0']
['19', '38', '0']
['19', '39', '0']
['19', '40', '0']
['19', '41', '1']
['19', '42', '0']
['19', '43', '0']
['19', '44', '0']
['19', '45', '0']
['20', '1', '0']
['20', '2', '1']
['20', '3', '1']
['20', '4', '0']
['20', '5', '0']
['20', '6', '0']
['20', '7', '0']
['20', '8', '0']
['20', '9', '1']
['20', '10', '0']
['20', '11', '0']
['20', '12', '0']
['20', '13', '0']
['20', '14', '0']
['20', '15', '0']
['20', '16', '0']
['20', '17', '0']
['20', '18', '0']
['20', '19', '1']
['20', '20', '1']
['20', '21', '1']
['20', '22', '0']
['20', '23', '0']
['20', '24', '1']
['20', '25', '1']
['20', '26', '1']
['20', '27', '1']
['20', '28', '0']
['20', '29', '1']
['20', '30', '0']
['20', '31', '0']
['20', '32', '0']
['20', '33', '1']
['20', '34', '0']
['20', '35', '0']
[

['39', '24', '1']
['39', '25', '0']
['39', '26', '0']
['39', '27', '0']
['39', '28', '0']
['39', '29', '0']
['39', '30', '0']
['39', '31', '1']
['39', '32', '0']
['39', '33', '0']
['39', '34', '0']
['39', '35', '0']
['39', '36', '0']
['39', '37', '0']
['39', '38', '0']
['39', '39', '1']
['39', '40', '0']
['39', '41', '0']
['39', '42', '0']
['39', '43', '1']
['39', '44', '1']
['39', '45', '0']
['40', '1', '0']
['40', '2', '1']
['40', '3', '0']
['40', '4', '0']
['40', '5', '0']
['40', '6', '0']
['40', '7', '0']
['40', '8', '0']
['40', '9', '1']
['40', '10', '0']
['40', '11', '0']
['40', '12', '1']
['40', '13', '0']
['40', '14', '0']
['40', '15', '0']
['40', '16', '0']
['40', '17', '0']
['40', '18', '0']
['40', '19', '1']
['40', '20', '0']
['40', '21', '0']
['40', '22', '0']
['40', '23', '0']
['40', '24', '1']
['40', '25', '0']
['40', '26', '0']
['40', '27', '0']
['40', '28', '0']
['40', '29', '0']
['40', '30', '0']
['40', '31', '0']
['40', '32', '0']
['40', '33', '0']
['40', '34', '0']
[

['58', '44', '1']
['58', '45', '1']
['59', '1', '0']
['59', '2', '1']
['59', '3', '0']
['59', '4', '0']
['59', '5', '0']
['59', '6', '0']
['59', '7', '0']
['59', '8', '0']
['59', '9', '0']
['59', '10', '0']
['59', '11', '0']
['59', '12', '1']
['59', '13', '0']
['59', '14', '0']
['59', '15', '0']
['59', '16', '0']
['59', '17', '0']
['59', '18', '0']
['59', '19', '0']
['59', '20', '0']
['59', '21', '1']
['59', '22', '0']
['59', '23', '0']
['59', '24', '0']
['59', '25', '0']
['59', '26', '1']
['59', '27', '1']
['59', '28', '0']
['59', '29', '1']
['59', '30', '0']
['59', '31', '1']
['59', '32', '1']
['59', '33', '0']
['59', '34', '1']
['59', '35', '0']
['59', '36', '0']
['59', '37', '0']
['59', '38', '1']
['59', '39', '0']
['59', '40', '0']
['59', '41', '0']
['59', '42', '0']
['59', '43', '1']
['59', '44', '1']
['59', '45', '0']
['60', '1', '0']
['60', '2', '0']
['60', '3', '1']
['60', '4', '0']
['60', '5', '0']
['60', '6', '0']
['60', '7', '0']
['60', '8', '0']
['60', '9', '0']
['60', '10

['83', '31', '0']
['83', '32', '0']
['83', '33', '0']
['83', '34', '0']
['83', '35', '0']
['83', '36', '1']
['83', '37', '1']
['83', '38', '0']
['83', '39', '0']
['83', '40', '0']
['83', '41', '1']
['83', '42', '0']
['83', '43', '1']
['83', '44', '1']
['83', '45', '0']
['84', '1', '0']
['84', '2', '0']
['84', '3', '1']
['84', '4', '1']
['84', '5', '0']
['84', '6', '0']
['84', '7', '0']
['84', '8', '0']
['84', '9', '0']
['84', '10', '0']
['84', '11', '0']
['84', '12', '0']
['84', '13', '0']
['84', '14', '0']
['84', '15', '0']
['84', '16', '0']
['84', '17', '0']
['84', '18', '0']
['84', '19', '0']
['84', '20', '0']
['84', '21', '0']
['84', '22', '0']
['84', '23', '0']
['84', '24', '0']
['84', '25', '0']
['84', '26', '0']
['84', '27', '0']
['84', '28', '0']
['84', '29', '0']
['84', '30', '0']
['84', '31', '0']
['84', '32', '0']
['84', '33', '1']
['84', '34', '0']
['84', '35', '0']
['84', '36', '0']
['84', '37', '0']
['84', '38', '0']
['84', '39', '0']
['84', '40', '1']
['84', '41', '1']
[

### Parse only certain people types!

In [5]:
import csv

firstTime = True
products = []
     
userCounter = 0
# 54 - Finding deals/ couponing , 58 - Fashion blogs / OOTD
# 53 - Coding, 56 - Foodie/ fine dining

# Data Format in CSV: columns[userNum, productNum(1-46), Hiking, Museums...]
testCounter = 0
with open('ourData/marketplaceAFixMistake.csv', 'rb') as csvfile:
    marketplaceA = csv.reader(csvfile, delimiter=',', quotechar='|')
    for row in marketplaceA:
        if(firstTime): # remove first row with titles and create key chart
            for productNum in range (1,46):
                products.append(row[productNum])
            firstTime = False
            # print(products)
        else:
            for product in range (1,46):
                # print(row[product]) # all user products
                if(row[53] == 'Coding' and row[56] == 'Foodie/ fine dining'):
                    tempRating = [row[0], products[product-1], row[product]] # [userNum, productNum, Rating]
                    # print(tempRating)
                    with open("ourData/foodiesAndCoding.csv", "a") as fp:   #write to new csv
                        wr = csv.writer(fp, dialect='excel')
                        wr.writerow(tempRating)

### Train Model on Data Set

In [2]:
if __name__ == "__main__":
    sc = SparkContext(appName="PythonCollaborativeFilteringExample")
    # $example on$
    # Load and parse the data
    data = sc.textFile("ourData/fashionAndDeals.csv")
    ratings = data.map(lambda l: l.split(','))\
        .map(lambda l: Rating(int(l[0]), int(l[1]), float(l[2])))

    # Build the recommendation model using Alternating Least Squares
    rank = 10
    numIterations = 10
    model = ALS.train(ratings, rank, numIterations)

    # Evaluate the model on training data
    testdata = ratings.map(lambda p: (p[0], p[1]))
    predictions = model.predictAll(testdata).map(lambda r: ((r[0], r[1]), r[2]))
    ratesAndPreds = ratings.map(lambda r: ((r[0], r[1]), r[2])).join(predictions)
    MSE = ratesAndPreds.map(lambda r: (r[1][0] - r[1][1])**2).mean()
    print("Mean Squared Error = " + str(MSE))

    # Save and load model
#     model.save(sc, "target/tmp/myCollaborativeFilter")
#     sameModel = MatrixFactorizationModel.load(sc, "target/tmp/myCollaborativeFilter")
    # $example off$

Mean Squared Error = 0.000801930816719


### Run Test Set and Calculate Predictions

In [3]:
    import numpy
    numpy.random.seed(7)

    # load test dataset
    dataset = numpy.loadtxt("ourData/fashionAndDealsTest.csv", delimiter=",")
    data = dataset[:,0:3]
    for x in range(0, len(data)):
        prediction = model.predict(data[x][0], data[x][1])
        print("your prediction is: ", prediction, " compared to the actual of: ", data[x][2])



your prediction is:  0.166887638465  compared to the actual of:  0.0
your prediction is:  0.731329659936  compared to the actual of:  1.0
your prediction is:  0.437556571185  compared to the actual of:  1.0
your prediction is:  0.402370892413  compared to the actual of:  0.0
your prediction is:  0.287300769743  compared to the actual of:  1.0
your prediction is:  0.168356868247  compared to the actual of:  0.0
your prediction is:  0.191841900456  compared to the actual of:  1.0
your prediction is:  0.397096058484  compared to the actual of:  1.0
your prediction is:  0.397547204196  compared to the actual of:  1.0
your prediction is:  0.31977036321  compared to the actual of:  1.0
your prediction is:  0.456423101302  compared to the actual of:  1.0
your prediction is:  0.0  compared to the actual of:  1.0
your prediction is:  0.18829981312  compared to the actual of:  0.0
your prediction is:  0.529921808463  compared to the actual of:  1.0
your prediction is:  0.0922000000841  compared 

### Test Product Owner Categories 

In [42]:
import csv

firstTime = True
allProducts = []
     
# Data Format in CSV: columns[userNum, productNum(1-46), Hiking, Museums...]
testCounter = 0
with open('ourData/marketplaceAFixMistake.csv', 'rb') as csvfile:
    marketplaceA = csv.reader(csvfile, delimiter=',', quotechar='|')
    for productNum in range (1,46): # start empty array
        allProducts.append([productNum,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0])        
    for user in marketplaceA:
        for product in range (1,46):
            if(user[product] == '1'): # has product
                for tag in range(46,61):
                    if(user[tag] != ""):
                        allProducts[product-1][tag-45] += 1
                        
    with open("ourData/allProductRatings.csv", "a") as fp:   #write to new csv
        wr = csv.writer(fp, dialect='excel')
        for productRow in allProducts:
            wr.writerow(productRow)

[1, 13, 9, 13, 16, 11, 13, 5, 9, 8, 7, 9, 4, 6, 5, 7]
[26, 14, 12, 8, 19, 15, 23, 15, 7, 13, 4, 14, 3, 6, 4, 12]
[27, 14, 10, 9, 16, 13, 18, 9, 7, 10, 3, 12, 1, 6, 4, 9]


### Reset Model

In [2]:
model = None
