In [1]:
import _pickle as cPickle 
import scipy.sparse as ss
import scipy.io as sio
import numpy as np
# Load the lookup tables and matrices that the feature functions below read
# as module-level globals.
#
# NOTE(review): unpickling can execute arbitrary code embedded in the file.
# Only load index files produced by this project's own preprocessing step.
# The `with` blocks make sure the pickle file handles are closed right away.
with open("user_index.pkl", 'rb') as user_index_file:
    user_index = cPickle.load(user_index_file)    # indexed by user id -> matrix position
with open("event_index.pkl", 'rb') as event_index_file:
    event_index = cPickle.load(event_index_file)  # indexed by event id -> matrix position

# Matrix Market files, densified so they can be indexed with [row, col].
user_event_scores = sio.mmread("user_event_response").todense()
user_sim_matrix = sio.mmread("similaritymatrix").todense()
event_prop_sim = sio.mmread("EV_eventPropSim").todense()
event_cont_sim = sio.mmread("EV_eventContSim").todense()
# num_friends is kept exactly as mmread returns it (no .todense() call).
num_friends = sio.mmread("num_friends")
user_friends = sio.mmread("user_friends").todense()
event_popularity = sio.mmread("EA_eventPopularity").todense()
#print(user_event_scores)   
# FEATURE EXTRACTION:
# Now that all our data is stored in a convenient format, we extract new features from the
# information collected so far and use these features in our final model.

# The functions below take each entry of the training (or test) dataset, convert it into a set of
# new features, and store the result in a new dataset. We will later use these newly formed
# features as the input to our model.


In [2]:

import _pickle as cPickle
import scipy.io
import scipy.sparse as ss
import numpy as np


def userReco(user_id, event_id):
    """
    User-based collaborative-filtering score for one (user, event) pair.

    Idea:
        for item i
          for every other user v that has a preference for i
            compute similarity s between u and v
            fold v's preference for i, weighted by s, into a running average
        return top items ranked by weighted average

    Steps performed here:
    1. Take the column of scores that all users gave to the queried event.
    2. Take the row of similarities between the queried user and all users.
    3. Multiply the row by the column to get the similarity-weighted score,
       then subtract the queried user's own score for that event.

    Returns 0 if reading either value raises IndexError.
    """
    row = user_index[user_id]
    col = event_index[event_id]
    event_scores = user_event_scores[:, col]
    similarities = user_sim_matrix[row, :]
    # `*` is a matrix product for the np.matrix objects produced by .todense(),
    # so a (1 x n) row times an (n x 1) column yields a 1 x 1 result.
    weighted = similarities * event_scores
    try:
        weighted_total = weighted[0, 0]
        own_score = user_event_scores[row, col]
    except IndexError:
        return 0
    return weighted_total - own_score

def eventReco(user_id, event_id):
    """
    Content-based scores for one (user, event) pair.

    Idea:
        for item i
          for every item j that u has a preference for
            compute similarity s between i and j
            add u's preference for j, weighted by s, to a running average
        return top items ranked by weighted average

    Steps performed here:
    1. Take the row of scores the queried user gave to all events.
    2. Take the queried event's column from each event-event similarity
       matrix (event_prop_sim and event_cont_sim).
    3. Multiply the row by each column separately and subtract the user's
       own score for the queried event from each product.

    Returns a tuple (pscore, cscore); either entry is 0 if reading its
    value raises IndexError.
    """
    row = user_index[user_id]
    col = event_index[event_id]
    user_scores = user_event_scores[row, :]
    prop_column = event_prop_sim[:, col]
    cont_column = event_cont_sim[:, col]
    # `*` is a matrix product for np.matrix operands: (1 x m) * (m x 1) -> 1 x 1.
    prop_product = user_scores * prop_column
    cont_product = user_scores * cont_column

    def _minus_own_score(product):
        # Same fallback for both similarity flavours: 0 on IndexError.
        try:
            return product[0, 0] - user_event_scores[row, col]
        except IndexError:
            return 0

    return _minus_own_score(prop_product), _minus_own_score(cont_product)


def userPop(user_id):
    """
    User popularity: the friend count stored for the user in num_friends.

    Rationale: people with more friends tend to be more outgoing and are
    therefore more likely to go to events.

    Returns 0 when the user is not in user_index, or when the lookup in
    num_friends raises IndexError.
    """
    if user_id not in user_index:
        return 0
    position = user_index[user_id]
    try:
        friend_count = num_friends[0, position]
    except IndexError:
        friend_count = 0
    return friend_count

def friendInfluence(user_id):
    """
    Friend-influence score of a user.

    Reads the user's row from the precomputed user_friends matrix and
    averages it: the sum of every entry in the row divided by the number of
    columns of user_friends. Per the notebook's description, the entries
    reflect friends known (from the training set) to go or not go to events.

    Parameters:
        user_id: key into user_index identifying the user's row.

    Returns:
        The row sum divided by the column count, as a scalar.

    Raises:
        KeyError: if user_id is not present in user_index.
    """
    nusers = np.shape(user_friends)[1]
    i = user_index[user_id]
    # Sum the whole row. Summing with axis=0 on a 1 x n matrix row returns
    # the row itself, and taking [0, 0] of that would only read column 0.
    return user_friends[i, :].sum() / nusers

def eventPop(event_id):
    """
    Look up the precomputed popularity value of an event.

    The value is read from column 0 of event_popularity at the row given by
    event_index. (Presumably derived upstream from attendance responses such
    as yes / maybe / no; that computation is not visible here.)
    """
    return event_popularity[event_index[event_id], 0]


In [4]:
def getfeatures(start=1, train=True, head=True):
    """
    Rewrite train.csv or test.csv as a CSV of engineered features.

    Each processed input line is split on commas; columns 0, 1 and 2 are read
    as user id, event id and the invited flag. The feature functions
    (userReco, eventReco, userPop, friendInfluence, eventPop) are evaluated
    for that pair and one output row is written to newdata/<input name>.

    Parameters:
        start: 1-based number of the first input line to process; earlier
            lines are skipped (start=2 skips a header line).
        train: True reads "train.csv" and appends input columns 4 and 5
            (interested / not_interested) to every row; False reads
            "test.csv" and writes features only.
        head: when True, write a header row with the column names first.

    Blank input lines are skipped. The "newdata" directory is created if it
    does not exist yet.
    """
    import os  # function-scope import: only needed here to create the output folder

    file = "train.csv" if train else "test.csv"
    os.makedirs("newdata", exist_ok=True)
    # Context managers close both files even if a feature lookup raises mid-way.
    with open(file, 'r') as fin, open("newdata/" + file, 'w') as fout:
        if head:
            columns = ["invited", "user_reco", "evt_p_reco", "evt_c_reco",
                       "user_pop", "frnd_infl", "evt_pop"]
            if train:
                # the label columns only exist in the training set
                columns += ["interested", "not_interested"]
            fout.write(",".join(columns) + "\n")
        for r, each_line in enumerate(fin, start=1):
            if r < start:
                continue
            stripped = each_line.strip()
            if not stripped:
                # e.g. a trailing empty line; it has no user/event columns
                continue
            cols = stripped.split(",")
            user_id, event_id, invited = cols[0], cols[1], cols[2]
            user_reco = userReco(user_id, event_id)
            evt_p_reco, evt_c_reco = eventReco(user_id, event_id)
            new_columns = [
                invited,
                user_reco,
                evt_p_reco,
                evt_c_reco,
                userPop(user_id),
                friendInfluence(user_id),
                eventPop(event_id),
            ]
            if train:
                new_columns.append(cols[4])  # interested
                new_columns.append(cols[5])  # not_interested
            fout.write(",".join(map(str, new_columns)) + "\n")
    
    

# Regenerate the feature files for both splits: the training set first, then
# the test set. start=2 skips the header line of each input CSV.
for progress_message, is_training_split in (
    ("rewriting training data...", True),
    ("rewriting test data...", False),
):
    print(progress_message)
    getfeatures(train=is_training_split, start=2, head=True)

rewriting training data...
rewriting test data...
