In [2]:
%matplotlib inline
import pandas as pd
import numpy as np
import re

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from sklearn.neighbors import NearestNeighbors
from scipy.sparse import csr_matrix

from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import *

import warnings; warnings.simplefilter('ignore')

In [7]:
# HELPING FUNCTIONS

# Shared Porter stemmer instance, reused by lemmatize_stemming() for every token.
stemmer = PorterStemmer()

def lemmatize_stemming(text):
    """Lemmatize ``text`` as a verb with WordNet, then Porter-stem the lemma.

    Args:
        text: a single token.

    Returns:
        The stemmed lemma produced by the module-level ``stemmer``.
    """
    verb_lemma = WordNetLemmatizer().lemmatize(text, pos='v')
    return stemmer.stem(verb_lemma)

def clean_(sen):
    """Normalise a free-text string for vectorisation.

    Steps:
      1. every run of whitespace (tabs, newlines, ...) becomes a single space,
      2. every character other than ASCII letters, digits, '.' and space is removed,
      3. the text is lower-cased and leading/trailing/repeated spaces are collapsed.

    Args:
        sen: the raw string to clean.

    Returns:
        The cleaned, lower-cased string with single-space separated tokens.
    """
    # Whitespace must be turned into plain spaces BEFORE filtering: the filter
    # below only keeps ' ', so a tab or newline between two words would
    # otherwise be deleted and the words fused together ("a\tb" -> "ab").
    sen = re.sub(r'\s+', ' ', sen)
    sen = re.sub(r'[^a-zA-Z\.0-9 ]+', '', sen)
    tokens = sen.lower().split()
    # Lemmatisation/stemming via lemmatize_stemming() is currently disabled:
    #     tokens = [lemmatize_stemming(x) for x in tokens]
    return ' '.join(tokens)



def get_events(path):
    """Load the events spreadsheet and rebuild 'Description' as a cleaned text blob.

    The 'Type' column is lower-cased, and the 'Description' column is
    overwritten with the cleaned (see ``clean_``) concatenation of
    'Event time', 'Venue', 'Description', 'Source', 'Type' and 'City'.

    Args:
        path: location of the Excel file, passed straight to ``pd.read_excel``.

    Returns:
        The loaded DataFrame with the modified 'Type' and 'Description' columns.
    """
    metadata = pd.read_excel(path)

    # Leave missing / non-string values untouched instead of crashing on NaN.
    metadata['Type'] = [x.lower() if isinstance(x, str) else x for x in metadata['Type']]

    text_columns = ['Event time', 'Venue', 'Description', 'Source', 'Type', 'City']
    # Fill missing values per column *before* joining: adding NaN to a string
    # yields NaN, so filling afterwards would blank the whole row's text
    # whenever a single field is missing.
    parts = metadata[text_columns].fillna('').astype(str)

    # Join with a space so the last word of one field is not fused with the
    # first word of the next one.
    metadata['Description'] = [
        clean_(' '.join(fields))
        for fields in parts.itertuples(index=False, name=None)
    ]
    return metadata

    

def first_ele(x):
    """Return the item stored at position 1 of ``x``.

    Used as a sort key for ``(position, similarity)`` pairs, where it selects
    the similarity. Despite the name, this is the *second* item of ``x``.
    """
    item_at_one = x[1]
    return item_at_one

# def get_recommendations(title,headings,mapping_,n=10):
#     """
#     Finds 10 most similar events using the pairwise cosine similarity.
#     """
#     index = (mapping_[title])
# #     print(index)
    
#     simi = list((cosine_similarity[index]))
#     indexes = list(range(len(simi)))
#     simi= list(zip(indexes,simi))
# #     print (sim_scores)
    
#     simi = sorted(simi, key=first_ele, reverse=True)
    
# #     ignoring first = itself
#     simi=simi[1:]
    
#     recommendations = metadata.iloc[[i[0] for i in simi]][headings]
    
#     return recommendations.head(n)


def get_recommendations_pop(metadata,title,mapping,similarity=None): #popularity based context
    """Recommend items similar to ``title``, re-ranked by a weighted rating.

    The 59 items most similar to ``title`` (the top hit, assumed to be the item
    itself, is skipped) are kept. Of those, only items with a vote count at or
    above the 60th percentile are retained; they are ordered by
    ``weighted_rating`` and the best 10 are returned.

    Args:
        metadata: DataFrame with 'title', 'vote_count' and 'vote_average'
            columns, addressed by position.
        title: key looked up in ``mapping``.
        mapping: maps a title to its row position in ``metadata`` / ``similarity``.
        similarity: square matrix of pairwise similarities. When omitted, the
            module-level name ``cosine_similarity`` is used, as before.

    Returns:
        DataFrame with columns 'title' and 'vote_average' (at most 10 rows).

    Raises:
        TypeError: if no similarity matrix is available (the fallback name is
            still bound to the scikit-learn function, which is not subscriptable).
    """
    if similarity is None:
        # Historical behaviour: rely on a global matrix stored under this name.
        similarity = cosine_similarity
    if callable(similarity):
        raise TypeError(
            "get_recommendations_pop needs a similarity matrix; pass it via "
            "`similarity=` (the global `cosine_similarity` is a function)"
        )

    index = mapping[title]
    ranked = sorted(enumerate(similarity[index]), key=first_ele, reverse=True)

    # Skip the best match (assumed to be the item itself) and keep the next 59.
    positions = [pos for pos, _ in ranked[1:60]]
    items = metadata.iloc[positions]

    #calculate vote_count and average before doing this
    vote_counts = items['vote_count'].dropna().astype('int')
    vote_averages = items['vote_average'].dropna().astype('float')

    C = vote_averages.mean()
    m = vote_counts.quantile(0.60)

    has_votes = items['vote_count'].notnull() & items['vote_average'].notnull()
    # .copy() so the column assignments below modify an independent frame
    # rather than a slice of `metadata` (chained assignment).
    recommendations = items[has_votes & (items['vote_count'] >= m)].copy()

    if recommendations.empty:
        # Row-wise apply on an empty frame does not produce a column.
        return recommendations[['title', 'vote_average']]

    recommendations['vote_count'] = recommendations['vote_count'].astype('int')
    recommendations['vote_average'] = recommendations['vote_average'].astype('float')

    recommendations['weighted_rating'] = recommendations.apply(weighted_rating, args=(C, m), axis=1)
    recommendations = recommendations.sort_values('weighted_rating', ascending=False)

    return recommendations[['title', 'vote_average']].head(10)


def weighted_rating(x,C,m):
    """Blend an item's own average rating with the overall mean ``C``.

    The item's 'vote_average' is weighted by ``v / (v + m)`` and ``C`` by
    ``m / (v + m)``, where ``v`` is the item's 'vote_count'; at ``v == m`` both
    contribute equally.

    Args:
        x: mapping/row exposing 'vote_count' and 'vote_average'.
        C: mean rating to shrink towards.
        m: vote-count weight given to ``C``.

    Returns:
        The weighted rating.
    """
    votes = x['vote_count']
    rating = x['vote_average']
    total = votes + m
    own_part = votes / total * rating
    prior_part = m / total * C
    return own_part + prior_part



def recommend_eventss(userID, preds_matrix , items_matrix, rating_matrix, n=10):
    """Return the ``n`` items with the highest predicted rating for one user.

    Args:
        userID: user identifier; row ``userID - 1`` of ``preds_matrix`` is read
            by position.
        preds_matrix: DataFrame of predicted ratings, one row per user.
            NOTE(review): the code assumes row position ``userID - 1`` belongs
            to ``userID`` and that the row's label is also ``userID - 1``
            (i.e. a default integer index) - confirm against the caller.
        items_matrix: item table; the code reads 'ItemID' and drops 'Genres'.
        rating_matrix: long-format ratings with 'userId', 'itemId' and 'rating'.
        n: number of rows to return.

    Returns:
        The first ``n`` rows of the item table joined with the user's
        predictions, sorted by the 'Predictions' column in descending order.
    """
    
    
    # Predicted ratings for this user, selected by row position.
    user_predictions = preds_matrix.iloc[userID - 1 ]
    
    # Ratings the user has already given, joined with item details.
    indexes= rating_matrix.userId == userID
    user_rating = rating_matrix[indexes]
    
    # NOTE(review): `ratings` is not used after this point in the function.
    ratings = user_rating.merge(items_matrix, how = 'left', left_on = 'itemId', right_on = 'ItemID')
    ratings=ratings.sort_values('rating', ascending = False)
    
    # Items with any missing field are excluded from the candidates.
    items_matrix=items_matrix.dropna()

    # The right-hand key 'itemId' is not a column of the one-column frame built
    # here; presumably it resolves to the name of its index (the columns axis
    # of preds_matrix) - TODO confirm.
    recommendations = items_matrix.merge(pd.DataFrame(user_predictions), how = 'left',left_on = 'ItemID',right_on = 'itemId')
   
    recommendations=recommendations.drop(columns=['Genres'])
    # The prediction column carries the label of the selected row, which the
    # code expects to be `userID - 1`.
    recommendations=recommendations.sort_values(userID - 1, ascending = False)
    recommendations=recommendations.rename(columns = {userID - 1: 'Predictions'})
                    

    return recommendations.head(n)



In [8]:
# Load the movies table; its 'movieId' column is reused further down as the
# 'eventId' of the events.
m = pd.read_csv('movies/movies.csv')
m

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
9739,193585,Flint (2017),Drama
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


In [9]:
#APPLICATION DATA

# User table: only the schema is created here, no rows.
df_users = pd.DataFrame(columns=['userId', 'userName'])

# Events, with ids taken from the movies table (assignment aligns on the row index).
df_events = get_events("activities.xls")
df_events['eventId'] = m['movieId']

# Ratings, with the movie id column renamed so it matches the events' key.
df_rating = pd.read_csv("movies/ratings.csv").rename(columns={'movieId': 'eventId'})


In [10]:
# Preview the ratings table after 'movieId' was renamed to 'eventId'.
df_rating.head(2)

Unnamed: 0,userId,eventId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247


In [11]:
# Preview the events table, including the rebuilt 'Description' and the assigned 'eventId'.
df_events.head(2)

Unnamed: 0,Event name,Event time,Venue,Description,Source,Picture,Type,City,Rated,eventId
0,6 Days Tour to Hunza & Khunjrab. 10_15 Dec.,\t\t\t\t\t\tTue Dec 10 2019 at 03:00 am to Sun...,"Tour De Nature, Office No # 07, Plot No #24, ...",tue dec 10 2019 at 0300 am to sun dec 15 2019 ...,Tour De Nature,https://cdn-az.allevents.in/events1/banners/c6...,entertainment,islamabad,False,1
1,Girls Rising - HR Reel Fest 2019,\t\t\t\t\t\tTue Dec 10 2019 at 11:00 am to 02:...,"UN Information Centre, ILO Building, G-5/2, I...",tue dec 10 2019 at 1100 am to 0200 pm un infor...,Human Rights Reel Film Festival,https://cdn-az.allevents.in/events8/banners/0d...,entertainment,islamabad,False,2


In [12]:
#knn based similar users
def get_similar_users(input_user_id,df_rating,df_events,n= 6):
    """Return the input user followed by its nearest neighbours.

    A user x event rating matrix is built from the ratings of events present
    in ``df_events`` (0 = not rated) and searched with a brute-force cosine
    k-nearest-neighbours query.

    Args:
        input_user_id: the ``userId`` to find neighbours for.
        df_rating: long-format ratings with 'userId', 'eventId' and 'rating'.
        df_events: events table with 'Event name' and 'eventId'.
        n: size of the neighbourhood *including* the input user, so at most
            ``n - 1`` other users are returned.

    Returns:
        A list whose first element is ``input_user_id`` and whose remaining
        elements are the neighbouring user ids, nearest first.

    Raises:
        KeyError: if ``input_user_id`` has no rating for any event in ``df_events``.
    """
    #use recent please
    new_ratings = pd.merge(df_rating, df_events[['Event name','eventId']], on='eventId')

    #zero means that user has never voted for this item.
    rating_matrix = new_ratings.pivot(index='userId',columns='eventId',values='rating').fillna(0)

    knn = NearestNeighbors(metric='cosine',algorithm='brute')
    # DataFrame.as_matrix() was removed in pandas 1.0; .values works on all versions.
    knn.fit(rating_matrix.values)

    userid = input_user_id

    # Look the user up by *label*: the matrix is indexed by userId, so a
    # positional lookup with the id would silently select a different user.
    value = rating_matrix.loc[userid].values.reshape(1,-1)

    # kneighbors() rejects n_neighbors larger than the number of fitted rows.
    k = min(n, len(rating_matrix))
    _, indices = knn.kneighbors(value, n_neighbors=k)
    neighbour_ids = [rating_matrix.index[i] for i in indices.flatten()]

    print("Five Nearest Neighbors of user{0}:".format(userid))

    # Drop the user itself by id rather than by assuming it is the first hit:
    # another user with an identical rating vector can tie at distance 0.
    others = [uid for uid in neighbour_ids if uid != userid]
    return [userid] + others[:max(n - 1, 0)]

#get_df( df_new_events,df_rating, df_user_rating, similar_users_id, input_user_id )
def get_df(df_new_events,df_rating,df_user_rating,similar_users_id,input_user_id):
    """Placeholder: not implemented yet, always returns ``None``.

    system() calls this to obtain the ratings of the similar users, so that
    branch cannot work until this function is written.
    """
    return None
    


In [None]:
from scipy.sparse.linalg import svds


# system() switches from content-based to collaborative filtering only when the
# number of distinct active users is strictly greater than this threshold.
# NOTE(review): 10,000,000 looks deliberately unreachable, which would keep
# system() on the content-based path - confirm.
min_active_users = 10000000
# NOTE(review): not read by the code visible in this notebook (system() is
# called with a literal user id further down) - confirm it is still needed.
input_user_id =837


def system(input_user_id,df_new_events, df_old_events,df_rating):
    """Recommend events from ``df_new_events``.

    Two strategies are available:

    * Collaborative filtering (truncated SVD on the ratings of similar users),
      used when more than ``min_active_users`` distinct users have rated at
      least one of the new events.
    * Content-based filtering otherwise: for every event in ``df_old_events``
      the new event with the most similar TF-IDF 'Description' is picked.

    Args:
        input_user_id: id of the user to recommend for. It is only used by the
            collaborative branch.
        df_new_events: candidate events ('eventId', 'Event name', 'Description').
        df_old_events: reference events, same columns.
        df_rating: long-format ratings with 'userId', 'eventId' and 'rating'.

    Returns:
        Collaborative branch: whatever ``recommend_eventss`` returns.
        Content-based branch: a list of event names, one per old event, in the
        order of ``df_old_events`` (the same name may appear several times).
    """
    #new = future events not rated by this user
    #old = future + past events rated by this user

    df_user_rating = df_rating[df_rating['userId'] == input_user_id]

    #active users: everybody who rated at least one of the new events.
    # isin() compares against the eventId *values*; `x in series` would test
    # the Series index labels instead.
    rated_new = df_rating['eventId'].isin(df_new_events['eventId'])
    app_active_users = df_rating.loc[rated_new, 'userId'].unique()

    #Post cold start
    if len(app_active_users) > min_active_users:

        df_subset_ratings = df_rating[df_rating['userId'].isin(app_active_users)]

        # The returned list already starts with input_user_id. list.append()
        # returns None, so its result must never be assigned back.
        similar_users_id = get_similar_users(input_user_id, df_subset_ratings, df_old_events)

        #get future events + those past events ratings that are also recommended by similar users and input user (recent)
        # NOTE(review): get_df is still a stub returning None, so this branch
        # cannot complete until it is implemented.
        df_rating_similar_users = get_df(df_new_events, df_rating, df_user_rating, similar_users_id, input_user_id)

        rating_matrix = df_rating_similar_users.pivot(index='userId', columns='eventId', values='rating').fillna(0)

        # DataFrame.as_matrix() was removed in pandas 1.0.
        R = rating_matrix.values
        mean_ = np.mean(R, axis=1).reshape(-1, 1)

        # svds needs k strictly smaller than both matrix dimensions.
        k = min(50, min(R.shape) - 1)
        U, sigma, E = svds(R - mean_, k=k)
        sigma = np.diag(sigma)

        ratings_pred = np.dot(np.dot(U, sigma), E) + mean_

        # NOTE(review): the userId index is not carried over, while
        # recommend_eventss reads row `userID - 1` by position - confirm the
        # rows really line up with user ids.
        preds_matrix = pd.DataFrame(ratings_pred, columns=rating_matrix.columns)
        predictions = recommend_eventss(input_user_id, preds_matrix, df_new_events, df_rating_similar_users)

        return predictions

    #cold Start for new events (item)
    # Content Based filtering
    n_old = len(df_old_events)
    if n_old == 0 or len(df_new_events) == 0:
        # Nothing to compare, hence nothing to recommend.
        return []

    # Old events occupy rows [0, n_old) of the corpus, new events the rest.
    # Rows are addressed by position, so repeated eventIds cannot confuse the lookup.
    corpus = pd.concat([df_old_events, df_new_events])['Description']

    # min_df=1 keeps every term, exactly like the former min_df=0, but is also
    # accepted by recent scikit-learn releases (an integer min_df must be >= 1).
    tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), min_df=1, stop_words='english')
    tfidf = tf.fit_transform(corpus)

    # Only the old x new block of the similarity matrix is needed.
    similarity = linear_kernel(tfidf[:n_old], tfidf[n_old:])

    # Most similar new event per old event; on ties argmax keeps the first new
    # event, matching the previous stable descending sort.
    best_new = similarity.argmax(axis=1)

    new_names = list(df_new_events['Event name'])
    new_ids = list(df_new_events['eventId'])

    recommendations_name = []
    for new_pos in best_new:
        recommendations_name.append(new_names[new_pos])
        print(new_ids[new_pos])

    return recommendations_name




# User table: only the schema is created here, no rows.
df_users = pd.DataFrame(columns=['userId', 'userName'])

# Events: load, drop exact duplicate rows, then number the remaining rows 0..N-1.
df_events = get_events("data/activities.xls")
df_events.drop_duplicates(keep="first", inplace=True)
df_events['eventId'] = np.arange(len(df_events))

# Ratings, with the movie id column renamed so it matches the events' key.
df_rating = pd.read_csv("data/ratings.csv").rename(columns={'movieId': 'eventId'})

# All but the last 10 events act as the "new" candidates, the last 10 as the "old" ones.
system(
    input_user_id=2,
    df_new_events=df_events[:-10],
    df_old_events=df_events[-10:],
    df_rating=df_rating,
)

In [None]:
# Inspect the slice that is passed to system() as df_new_events.
df_events[:-10]