In [1]:
# general
import pandas as pd
from pandas.plotting import scatter_matrix
import numpy as np
from collections import Counter
import collections
from pandas import DataFrame
from scipy.cluster.hierarchy import linkage, dendrogram
# spark
from pyspark.ml.recommendation import ALS
from pyspark.sql.types import *
import pyspark
from pyspark.sql import SQLContext, Row
# sk learn
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
import matplotlib.pyplot as plt 
# similarity
from sklearn.metrics import jaccard_similarity_score

In [2]:
# personality scores
#data1 = pd.read_excel('data/pers_scores_1098.xlsx')
# articles by some traveler
#data2 = pd.read_excel('data/articles_159.xlsx')
# traveler profile
data3 = pd.read_excel('data/users_full_7034.xlsx')
# traveler and reviews
data4 = pd.read_excel('data/reviews_32618_for_1098_users_with_location.xlsx')

In [3]:
null_age = data3.ageRange.isnull()
null_gender = data3.gender.isnull()
null_style = data3.travelStyle.isnull()

In [4]:
filtered_data3 = data3[data3.totalPoints > 1000][~null_age][~null_gender][~null_style]


  """Entry point for launching an IPython kernel.


In [5]:
filtered_data3 = filtered_data3[['username', 'ageRange', 'gender', 'travelStyle']]
filtered_data3.head()

Unnamed: 0,username,ageRange,gender,travelStyle
1,0BKI0,25-34,female,"Foodie, Nature Lover, Urban Explorer"
7,14beacon,35-49,male,"Foodie, Trendsetter, Like a Local, Luxury Trav..."
12,1975mark,25-34,male,"Foodie, Beach Goer, Thrifty Traveller, Nightli..."
15,19Cam,25-34,male,"Vegetarian, Urban Explorer, Art and Architectu..."
17,1Angie,18-24,female,"Beach Goer, Vegetarian, Urban Explorer"


In [6]:
filtered_data4 = data4[['id', 'username', 'type', 'text', 'rating', 'taObjectCity']]
filtered_data4;


In [7]:
attraction_only = filtered_data4.type == 'Attractions'
filtered_data4 = filtered_data4[attraction_only]

In [8]:
filtered_data4.head()

Unnamed: 0,id,username,type,text,rating,taObjectCity
2,3,007solotraveler,Attractions,Great Museum - abslutely worth making the time...,5,Stockholm
87,88,124_10,Attractions,Noting to dislike about this village. It has e...,5,Corsham
95,96,12ReasonsWhy,Attractions,We were lucky enough to be the only two people...,5,Takayama
102,103,14beacon,Attractions,"Went with my wife, parents, two kids (7) & (12...",5,Charleston
103,104,14beacon,Attractions,My family and I just came back from spending a...,4,Farmingdale


In [9]:
data3_merge_4 = pd.merge(filtered_data4, filtered_data3, on=['username'])
data3_merge_4.head()

Unnamed: 0,id,username,type,text,rating,taObjectCity,ageRange,gender,travelStyle
0,103,14beacon,Attractions,"Went with my wife, parents, two kids (7) & (12...",5,Charleston,35-49,male,"Foodie, Trendsetter, Like a Local, Luxury Trav..."
1,104,14beacon,Attractions,My family and I just came back from spending a...,4,Farmingdale,35-49,male,"Foodie, Trendsetter, Like a Local, Luxury Trav..."
2,362,19Cam,Attractions,The box is only a temporary exhibit while cons...,4,Berlin,25-34,male,"Vegetarian, Urban Explorer, Art and Architectu..."
3,363,19Cam,Attractions,"A large collection, presented in a stunning bu...",3,Berlin,25-34,male,"Vegetarian, Urban Explorer, Art and Architectu..."
4,378,19Cam,Attractions,"After reading all the hype about this show, I ...",5,London,25-34,male,"Vegetarian, Urban Explorer, Art and Architectu..."


In [10]:
span_mask1 = (data3_merge_4.username == 'AnaS1')
span_mask2 = (data3_merge_4.username == 'DaniLK')
span_mask3 = (data3_merge_4.username == 'Aprile_24')
non_city_mask = (data3_merge_4.taObjectCity == 'California')

In [11]:
data3_merge_4 = data3_merge_4[~span_mask1][~span_mask2][~span_mask3][~non_city_mask]

  """Entry point for launching an IPython kernel.


In [12]:
# cities with more than 4 reviews
popular_city = []
for item, value in Counter(data3_merge_4.taObjectCity).items():
    if value > 7:
        popular_city.append(item)
print(len(popular_city))

56


In [13]:
final_df = data3_merge_4[data3_merge_4.taObjectCity.isin(popular_city)]
final_df.head()

Unnamed: 0,id,username,type,text,rating,taObjectCity,ageRange,gender,travelStyle
2,362,19Cam,Attractions,The box is only a temporary exhibit while cons...,4,Berlin,25-34,male,"Vegetarian, Urban Explorer, Art and Architectu..."
3,363,19Cam,Attractions,"A large collection, presented in a stunning bu...",3,Berlin,25-34,male,"Vegetarian, Urban Explorer, Art and Architectu..."
4,378,19Cam,Attractions,"After reading all the hype about this show, I ...",5,London,25-34,male,"Vegetarian, Urban Explorer, Art and Architectu..."
5,414,19Cam,Attractions,Sited within the main botanical gardens area o...,4,Kuala Lumpur,25-34,male,"Vegetarian, Urban Explorer, Art and Architectu..."
18,778,1oldseagull,Attractions,"First thing, this visiter center was not easy ...",5,Chattanooga,65+,male,"Foodie, Beach Goer, History Buff, 60+ Traveler..."


In [14]:
final_df.shape

(963, 9)

# ALS model
## create spark dataframe

In [15]:
# Build our Spark Session and Context

spark = pyspark.sql.SparkSession.builder.getOrCreate()
sc = spark.sparkContext
spark, sc
sqlContext = SQLContext(sc)

In [63]:
temp_rdd = final_df[['username', 'taObjectCity', 'rating']]


In [64]:
# dictionary contains unique value for each user
user_dict = {}
for idx, user in enumerate(temp_rdd.username.unique()):
    user_dict[user] = idx
    

    
user_id_list = [user_dict[item] 
                  for user in temp_rdd.username for item, key in user_dict.items() 
                  if item == user]
print(user_id_list)
print(temp_rdd.head())
temp_rdd.loc[:, 'user_id'] = pd.Series(user_id_list)
print(temp_rdd.head()) 
#print('total unique users:', len(user_dict))

[0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 6, 7, 7, 7, 7, 7, 7, 7, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 9, 9, 10, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 12, 12, 12, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 15, 15, 16, 16, 16, 17, 18, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 20, 20, 20, 20, 20, 20, 20, 20, 20, 21, 22, 22, 22, 23, 23, 23, 23, 24, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 26, 26, 26, 26, 26, 26, 26, 27, 27, 27, 27, 27, 27, 27, 28, 29, 29, 29, 29, 29, 29, 29, 30, 30, 31, 31, 31, 32, 32, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 33, 34, 34, 34, 35, 35, 35, 35, 36, 36, 36, 36, 36, 36, 36, 36, 37, 38, 38, 38, 39, 40, 41, 42, 42, 42, 42, 42, 42, 42, 43, 43, 43, 43, 43, 43, 43, 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [42]:
temp_rdd.head()

Unnamed: 0,username,taObjectCity,rating,user_id
2,19Cam,Berlin,4,0
3,19Cam,Berlin,3,0
4,19Cam,London,5,0
5,19Cam,Kuala Lumpur,4,0
18,1oldseagull,Chattanooga,5,1


In [33]:
user_dict

{'19Cam': 0,
 '1oldseagull': 1,
 2093: 2,
 '23stationroad': 3,
 '29grapes': 4,
 '2toots': 5,
 422: 6,
 '4bigdogsCalgary': 7,
 '6079max': 8,
 678855: 9,
 'AFamilyYorkshire': 10,
 'AHM': 11,
 'AJBoston5996': 12,
 'ALiEN2108': 13,
 'ASN1972': 14,
 'ASWh': 15,
 'ATP1': 16,
 'AV_Surrey': 17,
 'AdrianandLoretta': 18,
 'Aelwyn': 19,
 'AffairsinMiami': 20,
 'AfricanHeather': 21,
 'AhmetD': 22,
 'Akramdash': 23,
 'AlabamaTraveler0': 24,
 'Alanis628': 25,
 'Alfanumeric': 26,
 'AliDUk': 27,
 'Alkeshmehta': 28,
 'AlvinTSingapore': 29,
 'Amber9': 30,
 'Andrea63': 31,
 'Anecdotal': 32,
 'Angela075': 33,
 'AngelaBax': 34,
 'AngelenePenguin': 35,
 'AngusAsh': 36,
 'Anrmd': 37,
 'Antoinette78': 38,
 'Antsy280': 39,
 'AradhanaVerma': 40,
 'Armymed': 41,
 'ArtRussianMom': 42,
 'Arthur010': 43,
 'AspiringFoodie': 44,
 'Auli51': 45,
 'Aussie_Andrew': 46,
 'Aussienando': 47,
 'Averbuch': 48,
 'Awful_Kgp': 49,
 'BMWM5Driver': 50,
 'BabsLimerick_Ireland': 51,
 'Backfixer': 52,
 'Badgerjim': 53,
 'Baidoo': 54,

In [18]:
temp_rdd.head()

Unnamed: 0,username,taObjectCity,rating,user_id
2,19Cam,Berlin,4,0
3,19Cam,Berlin,3,0
4,19Cam,London,5,0
5,19Cam,Kuala Lumpur,4,0
18,1oldseagull,Chattanooga,5,1


In [19]:
# user_id_list = [user_dict[item] 
#                   for user in temp_rdd.username for item, key in user_dict.items() 
#                   if item == user]


# temp_rdd['user_id'] = user_id_list

In [20]:
# dictionary contains unique value for each city
city_dict = {}
for idx, city in enumerate(temp_rdd.taObjectCity.unique()):
    city_dict[city] = idx
print('total unique cities:', len(city_dict))    

total unique cities: 56


In [21]:
city_id_list_2 = [city_dict[item] for city in temp_rdd.taObjectCity 
                  for item, key in city_dict.items() if item == city]

In [22]:
#temp_rdd['user_id'] = user_id_list
temp_rdd['city_id'] = city_id_list_2
temp_rdd['rating_float'] = pd.to_numeric(temp_rdd.rating, downcast='float')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [23]:
final_rdd = temp_rdd[['user_id', 'city_id', 'rating_float']]

In [24]:
util_matrix.head()

NameError: name 'util_matrix' is not defined

In [None]:
schema = StructType( [ 
    StructField('user', IntegerType(), True),
    StructField('city', IntegerType(), True),
    StructField('rating', FloatType(), True)]
)

In [None]:
# creating spark dataframe
#spark_df = sqlContext.createDataFrame(final_rdd, schema)

spark_df = sqlContext.createDataFrame(util_matrix, schema)


In [None]:
train, test = spark_df.randomSplit([0.85, 0.15], seed=427471138)

In [None]:
test.show()

In [None]:
als_model = ALS(userCol='user',
                itemCol='city',
                ratingCol='rating',
                nonnegative=True,
                regParam=0.1,
                rank=15
               )

In [None]:
als_recommender = als_model.fit(train)
als_recommender

In [None]:
predictions = als_recommender.transform(test)


In [None]:
predictions.show(20)

In [None]:
predictions[(predictions.user == '18')].show()

In [None]:
pred = predictions[(predictions.user == '18') & (predictions.city == '20')].select("prediction").collect()

In [None]:
pred[0][0]

In [None]:
from sklearn.metrics import mean_squared_error
df_pred = predictions.toPandas()
pred = df_pred.prediction

actual = df_pred.rating
mse = mean_squared_error(actual, pred.fillna(pred.median()))
print("ALS model''s MSE: ", np.sqrt(mse))

In [None]:
#df_pred.head()

# Hybrid model
## user-only matrix

In [None]:
feature_temp = final_df[['username', 'ageRange', 'gender', 'travelStyle']]
feature_temp.head()

In [None]:
style_lst = [item.split(', ') for item in feature_temp.travelStyle]
feature_temp['new_travel'] = style_lst


In [None]:
style_matrix = feature_temp['new_travel'].apply(pd.Series) # 19 styles
style_df = pd.get_dummies(style_matrix.apply(pd.Series).stack()).sum(level=0).rename(columns = lambda x : x)

In [None]:
right = style_df

In [None]:
left = feature_temp

In [None]:
feature_temp = left.join(right)

In [None]:
feature_temp = pd.get_dummies(feature_temp, columns = ['ageRange', 'gender'])

In [None]:
feature_temp = feature_temp.drop(['travelStyle', 'new_travel', 'gender_male','60+ Traveler'], axis =1)

In [None]:
feature_temp.head()

In [None]:
feature_final = feature_temp.drop_duplicates()


In [None]:
feature_final = feature_final.drop(['username'], axis =1) 


In [None]:
feature_final.reset_index(drop=True, inplace=True)

In [None]:
feature_final.head()

In [None]:
invert_feature = feature_final.T

In [None]:
invert_feature.head()

In [None]:
jaccard_similarity_score(invert_feature[0], invert_feature[1])

In [None]:
#final_rdd.groupby(['city_id', 'user_id'])
final_rdd.head()

In [None]:
# df = final_rdd.copy()
# final_rdd.groupby(['city_id', 'user_id']).rating_float.mean()

In [None]:
agg_dict = {'rating_float':'median'}
util_matrix = final_rdd.groupby(['user_id','city_id']).agg(agg_dict).reset_index()
util_matrix.head(10)

In [None]:
def get_rating(df,uid,cid):
    try:
        return df[(df.user_id == uid) & (df.city_id == cid)].rating_float.values[0]
    except:
        return None

In [None]:
x = get_rating(util_matrix,0,34)
print(x)

In [None]:
abc = util_matrix[util_matrix.city_id == 5]
abc

In [None]:
for i in abc.user_id.values:
    print (i)
    

In [None]:
def jaccard_sim_score(udi, cid, user_matrix, util_matrix):
    ''' 
    takes in user(index) and item
    returns jaccard similarity score
    '''
    overall_rating = 0
    overall_sim = 0
    final_score = 0
    
    filtered_user = util_matrix[util_matrix.city_id == cid]
    print(filtered_user)
    for user in filtered_user.user_id.values: 
        sim_score = jaccard_similarity_score(invert_feature[udi], invert_feature[user])
        rating = filtered_user[(filtered_user.user_id == user)].rating_float.values[0]
        overall_rating += sim_score * rating
        overall_sim +=sim_score


    final_score = overall_rating / overall_sim
        
    return final_score

In [None]:
user_i = 18
item = 20 
user_matrix = invert_feature
utility_matrix = util_matrix
jaccard_sim_score(user_i, item, invert_feature, util_matrix)

In [None]:
final = (3.626964569091797 + 4.084) / 2

In [None]:
final

In [None]:
# class similarity():
    
#     def __init__():
        
#         self.
        
        
#     def _similarity():
        
        
        
        
        

In [None]:
#df_grouped.describe().T

In [None]:
#df_combined = pd.DataFrame(data = df_grouped.rating_float, index = df_grouped.user_id, columns = df_grouped.city_id )

In [None]:
#df_combined.head()

In [None]:
df_combined = df_combined.groupby(['user_id']).sum()

In [None]:
df_combined.index[1]