# Collaborative Filtering - Headphone Dataset

### Import Modules

In [2]:
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

"""
Collaborative Filtering Classification Example.
"""
from __future__ import print_function

from pyspark import SparkContext

# $example on$
from pyspark.mllib.recommendation import ALS, MatrixFactorizationModel, Rating
# $example off$



### Read and Parse Data from Marketplace A - save in new file

In [4]:
import csv

# Flatten the wide marketplace sheet into one "user, product, rating" row per
# (user, product) pair and append those rows to ourData/allDataFinal.csv.
#
# Data Format in CSV: column 0 is userNum, columns 1-45 hold one rating per
# product, and the columns after that hold persona tags (Hiking, Museums...).
# The first row carries the column titles.
products = []  # product titles from the title row, in column order (key chart)

userCounter = 0

with open('ourData/marketplaceAFixMistake.csv', 'rb') as csvfile:
    marketplaceA = csv.reader(csvfile, delimiter=',', quotechar='|')

    # Consume the title row up front; an empty input simply yields no products
    # and no data rows.
    header = next(marketplaceA, None)
    if header is not None:
        products = [header[productNum] for productNum in range(1, 46)]

    # Open the output once for the whole pass rather than once per rating.
    # NOTE: append mode means re-running this cell adds the same rows again;
    # remove the output file first when regenerating it.
    with open("ourData/allDataFinal.csv", "a") as fp:   # write to new csv
        wr = csv.writer(fp, dialect='excel')
        for row in marketplaceA:
            for product in range(1, 46):
                # [userNum, productNum, Rating]
                wr.writerow([row[0], products[product - 1], row[product]])

### Parse Only Selected Persona Types - filter users by persona tags

In [5]:
import csv

# Flatten the marketplace sheet into "user, product, rating" rows, keeping only
# users whose persona tag columns match the filter below, and append the rows
# to ourData/foodiesAndCoding.csv.
products = []  # product titles from the title row, in column order (key chart)

userCounter = 0
# Persona tag columns of interest:
# 54 - Finding deals/ couponing , 58 - Fashion blogs / OOTD
# 53 - Coding, 56 - Foodie/ fine dining

# Data Format in CSV: column 0 is userNum, columns 1-45 hold one rating per
# product, and the columns after that hold persona tags (Hiking, Museums...).
testCounter = 0
with open('ourData/marketplaceAFixMistake.csv', 'rb') as csvfile:
    marketplaceA = csv.reader(csvfile, delimiter=',', quotechar='|')

    # Consume the title row up front; an empty input simply yields no products
    # and no data rows.
    header = next(marketplaceA, None)
    if header is not None:
        products = [header[productNum] for productNum in range(1, 46)]

    # Open the output once for the whole pass rather than once per rating.
    # NOTE: append mode means re-running this cell adds the same rows again.
    with open("ourData/foodiesAndCoding.csv", "a") as fp:   # write to new csv
        wr = csv.writer(fp, dialect='excel')
        for row in marketplaceA:
            # The persona test depends only on the user row, so evaluate it
            # once per user instead of once per product column.
            if row[53] != 'Coding' or row[56] != 'Foodie/ fine dining':
                continue
            for product in range(1, 46):
                # [userNum, productNum, Rating]
                wr.writerow([row[0], products[product - 1], row[product]])

### Train Model on Data Set

In [5]:
if __name__ == "__main__":
    sc = SparkContext(appName="PythonCollaborativeFilteringExample")
    # $example on$

    def parseRating(line):
        """Turn one "user,product,rating" CSV line into a Rating."""
        fields = line.split(',')
        return Rating(int(fields[0]), int(fields[1]), float(fields[2]))

    def userProduct(rating):
        """Reduce a rating to its (user, product) pair."""
        return (rating[0], rating[1])

    def keyedByUserProduct(rating):
        """Re-shape a rating as ((user, product), value) so it can be joined."""
        return ((rating[0], rating[1]), rating[2])

    def squaredError(joined):
        """Squared difference between the two values joined under one key."""
        actual, predicted = joined[1]
        return (actual - predicted)**2

    # Load the ratings file and parse every line
    data = sc.textFile("ourData/fashionAndDeals.csv")
    ratings = data.map(parseRating)

    # Fit the recommendation model with Alternating Least Squares
    rank = 10
    numIterations = 10
    model = ALS.train(ratings, rank, numIterations)

    # Score the model on the same ratings it was trained on
    testdata = ratings.map(userProduct)
    predictions = model.predictAll(testdata).map(keyedByUserProduct)
    ratesAndPreds = ratings.map(keyedByUserProduct).join(predictions)
    MSE = ratesAndPreds.map(squaredError).mean()
    print("Mean Squared Error = " + str(MSE))

    # Save and load model
#     model.save(sc, "target/tmp/myCollaborativeFilter")
#     sameModel = MatrixFactorizationModel.load(sc, "target/tmp/myCollaborativeFilter")
    # $example off$

Mean Squared Error = 0.00075280203869


### Run Test Set and Calculate Predictions

In [6]:
    import numpy
    numpy.random.seed(7)

    # load test dataset
    # ndmin=2 keeps a one-line file as a 1 x N table; without it loadtxt
    # returns a flat vector and the two-axis slice below raises IndexError.
    dataset = numpy.loadtxt("ourData/fashionAndDealsTest.csv", delimiter=",", ndmin=2)
    data = dataset[:,0:3]
    # Each row is (user, product, actual rating); compare the model's
    # prediction for the (user, product) pair against the recorded rating.
    for user, product, actual in data:
        prediction = model.predict(user, product)
        print("your prediction is: ", prediction, " compared to the actual of: ", actual)



your prediction is:  0.236488258385  compared to the actual of:  0.0
your prediction is:  0.372567283968  compared to the actual of:  1.0
your prediction is:  0.250441821435  compared to the actual of:  1.0
your prediction is:  0.174959825931  compared to the actual of:  0.0
your prediction is:  0.195660687861  compared to the actual of:  1.0
your prediction is:  0.233911381488  compared to the actual of:  0.0
your prediction is:  0.217385394727  compared to the actual of:  1.0
your prediction is:  0.388681641443  compared to the actual of:  1.0
your prediction is:  0.370575649831  compared to the actual of:  1.0
your prediction is:  0.339555905658  compared to the actual of:  1.0
your prediction is:  0.272667262926  compared to the actual of:  1.0
your prediction is:  0.0  compared to the actual of:  1.0
your prediction is:  0.185987427103  compared to the actual of:  0.0
your prediction is:  0.332815886396  compared to the actual of:  1.0
your prediction is:  0.0151556863951  compare

### Test Product Owner Categories 

In [43]:
import csv

# For every product, count how many of its owners filled in each persona tag
# column, then append one counter row per product to
# ourData/allProductRatings.csv.
allProducts = []

# Data Format in CSV: column 0 is userNum, columns 1-45 are products,
# columns 46-60 are the persona tags read below (Hiking, Museums...).
testCounter = 0
with open('ourData/marketplaceAFixMistake.csv', 'rb') as csvfile:
    marketplaceA = csv.reader(csvfile, delimiter=',', quotechar='|')
    # One counter row per product: [productNum, <15 tag counters>], where
    # counter index i (1-15) corresponds to CSV column 45 + i.
    for productNum in range(1, 46):
        allProducts.append([productNum] + [0] * 15)

    # The first row holds the column titles, not a user. Skip it so its
    # non-empty tag titles are never tallied as if they were a user's tags.
    next(marketplaceA, None)

    for user in marketplaceA:
        for product in range(1, 46):
            if user[product] == '1':  # has product
                for tag in range(46, 61):
                    if user[tag] != "":
                        allProducts[product - 1][tag - 45] += 1

    # NOTE: append mode means re-running this cell adds the same rows again.
    with open("ourData/allProductRatings.csv", "a") as fp:   # write to new csv
        wr = csv.writer(fp, dialect='excel')
        for productRow in allProducts:
            wr.writerow(productRow)

### Reset Model

In [2]:
# Rebind `model` to None so this name no longer refers to a trained model.
model = None
