In [1]:
# To be optimize: Last get house may not be changed much.
# Can save some intermediate scores to the database so that don't need to calculate all scores again.

In [2]:
import numpy as np
import pandas as pd
import math
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from lightgbm import LGBMClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.cluster import KMeans

In [31]:
# this cell contains methods that can be reused.
def save_to_database(database, index_to_save):
    pass

def get_top_num(vec, num):
    # assuming num is small, we use a O(n*num) solution instead of a full sort
    index_arr = [] # stores the index we have get 
    print(vec)
    for i in range(num):
        current_best = -math.inf
        current_best_index = 0
        for j in range(len(vec)):
            if(j not in index_arr):
                if(vec[j] > current_best):
                    current_best = vec[j]
                    current_best_index = j
        index_arr.append(current_best_index)
    return index_arr

In [34]:
# cold start model
# return this user's preference features and other users' features matrix
def read_from_database_cold(user, database):
    
    pass

# calculate cosine similarity between a vector and all columns of a matrix
def get_similarity_cold(vec, mat):
    dot = np.dot(mat,vec)
    norm_product = np.linalg.norm(vec, ord = 2) * (np.sum(mat ** 2,axis=1)**(1./2))
    norm_product = np.reshape(norm_product,(len(norm_product),1))
    sim_vec = dot / norm_product
    sim_vec = np.reshape(sim_vec,(len(sim_vec),))
    return sim_vec
        
# main function for cold start model, num is the number of posts to be posted
def get_house_cold(user, num):
    database = " "
    user, others = read_from_database_cold(user, database)
    sim_vector = get_similarity_cold(user, others)
    index_arr = get_top_num(sim_vector, num)
    save_to_database(database, index_arr)

def encoding():
    pass

In [39]:
# test for cold start, after encoding stage

user_test_person_1 = np.array([0.3,0.2,0.95])  
user_test_preference_1 = np.array([0.25,0.3,0.2])

house_test_person_1 = np.array([0.2,0.4,0.7])
house_test_preference_1 = np.array([0.1,0.7,0.3])

house_test_person_2 = np.array([0.3,0.2,0.95])
house_test_preference_2 = np.array([0.25,0.3,0.2])

house_test_person_3 = np.array([0.1,0.7,0.2])
house_test_preference_3 = np.array([0.7,0.8,0.9])

user_1 = np.concatenate((user_test_person_1,user_test_preference_1))
user_1 = np.reshape(user_1,(len(user_1),1))
house_1 = np.concatenate((house_test_person_1,house_test_preference_1))
house_2 = np.concatenate((house_test_person_2,house_test_preference_2))
house_3 = np.concatenate((house_test_person_3,house_test_preference_3))
house_mat = np.array([house_1,house_2,house_3])

sim_vec = get_similarity_cold(user_1, house_mat)
print(get_top_num(sim_vec,3))

[0.87845528 1.         0.54791033]
[1, 0, 2]


In [None]:
# Test for cold start
user_test_person_1 = {
    "gender": "male",
    "major" : "Computer Science",
    "age"   : 20,
}

user_test_preference_1 = {
    'M':{
    "bedroom_want" = 1,
    "payment_want" = "500-1000",
    "laundary_want" = "in unit",
    "parking_want" = "garage",
    "other_feature_want" = ["hard wood", "dish washer","air conditioning"],
    "pets_want" = "dogs"
    }
    
}

house_test_1 = {
    
}
house_test_2 = {}

In [4]:
#THIS CALCULATION MAY BE FASTER IF VECTORIZED. DEPENDS ON EXPERIMENT#

# collaborative filter model
# return this use and other uses as dictionary(ies).

def read_from_database_collab(user, database):
    pass
    
# this calculates this user and other users' similarity based on posts they viewed. These weights are used to weight the scores of houses
def get_user_similarity_collab(vec,others):
    scores = []
    norm_user = np.linalg.norm(vec.values(), ord = 2)  # norm of this user
    user_set = set(vec.keys())
    for i in range(len(others)):
        cur_dict = others[i]  # current dictionary, representation of the current other user being processed.
        norm_cur = np.linalg.norm(cur_dict.values(),ord = 2)
        
        # get the common elements
        cur_set = set(cur_dict.keys())
        common = user_set & cur_set
        
        score = [vec[c]*cur_dict[c] for c in common]
        score /= (norm_user * norm_cur)
        scores.append(score)  # append current score to the scores list
    return scores

# main function for collaborative filter, num is the number of posts to be posted
def get_house_collab(user, num):
    database = " "
    user, others = read_from_database_collab(user, database)
    sim_vector = get_user_similarity_collab(user,others)
    index_arr = get_top_num(sim_vector, num)
    save_to_database(database,index_arr)

In [5]:
# now that we have two models, Assume we have a threshold t, when user has pass through this threshold, we need to merge two models together
# return the weight alpha and the time user viewed posts last time and
# the posts information as a dict of list as values, ID as keys, which stores the time users view the post.
#(Only posts get recommended last time). also, return who recommends these items 

def read_from_database_two(user, database):
    return alpha, user_time, all_time, cold_recommend, collab_recommend

def save_to_database_two(database, index_to_save, cold_recommend, collab_recommend, alpha):
    pass

# fit Gaussian to return the new alpha
def update_alpha(alpha, user_time, all_time, cold_recommend, collab_recommend, learning_rate = 0.1):
    # all_time is a dict with ID as the key and time as the values
    keys = all_time.keys()
    num_posts = len(keys)
    p = np.zeros(len(keys))  # p stores the percentage of corresponding posts
    
    for i in range(num_posts):
        cur_all_time = all_time[i]
        cur_user_time = user_time[i]
        
        # compute the Gaussian parameters
        mean = np.mean(cur_all_time)
        var = np.var(cur_all_time)
        p[i] = (cur_user_time - mean) / var
    
    cold_weight = 0
    collab_weight = 0
    cold_keys = cold
    for i in range(len(p)):
        if(keys[i] in cold_recommend):
            cold_weight += p[i]
        if(keys[i] in collab_recommend):
            collab_weight += p[i]
            
    # Simply use the ratio for temp_alpha
    temp_alpha = cold_weight / collab_weight
    # interpolate to get the new alpha
    new_alpha = alpha + learning_rate * (temp_alpha - alpha)
    
    return new_alpha
        
        
# main function for merging two algorithms
def get_house_two(user, num):
    database = " "
    alpha, user_time, all_time, cold_recommend, collab_recommend = read_from_database_two(user, database)
    
    # update alpha first and then use this alpha to recommend posts this time.
    alpha = update_alpha(alpha,user_time,all_time, cold_recommend, collab_recommend)
    
    num_cold = int(alpha * num) + 1
    num_collab = num - num_cold
    
    array_ind_1 = cold_get_house(user, num)
    array_ind_2 = get_house_collab(user, num)
    
    array_ind = np.asarray(list(set(array_ind_1 + array_ind_2)))
    np.random.shuffle(array_ind)
    array_ind = array_ind[:num]
    
    save_to_database_two(database, array_ind, array_ind_1, array_ind_2, alpha)

In [16]:

# cell saved for matrix factorization. HOW TO APPLY MATRIX FACTORIZATION FOR ONLINE LEARNING???


In [9]:
# community detection and  classification two-stage approach
def read_from_database_community_clf(database):
    pass

def save_to_database_community_clf(database):
    pass

def community_update():
    pass

# update cv later
def classification(X_train, y_train, X_test,clf = "logistic"):
    clf = None
    if(clf == "logistic"):
        clf = LogisticRegression()
    elif(clf == "svm"):
        clf = LinearSVC()
    elif(clf == "gbm"):
        clf = LGBMClassifier()
    
    clf.fit(X_train, y_train)
    model = CalibratedClassifierCV(clf, cv='prefit')
    model.fit(X_train, y_train)
    score = model.predict(X_test)

def get_house_community_clf(database):
    database = ""
    