### Import required libraries

In [1]:
import sqlite3
import pandas as pd
import numpy as np


### Connect to sqlite3 database

In [2]:
# connect function
conn = sqlite3.connect("SimilarUsr.db")
user_id = []

### Define pickled model name and path saved

In [48]:
model_directory = 'model'
model_file_name_eu = '%s/knn_eu.pkl' % model_directory
model_file_name_cos = '%s/knn_cosine.pkl' % model_directory
model_columns_file_name = '%s/knn_columns.pkl' % model_directory

### Dataset - handling data

In [4]:
# file 1: assessment_scores
df_assess = pd.read_sql_query("select user_handle, assessment_tag,user_assessment_score from user_assessment_scores", conn)
user_id_assess = df_assess['user_handle']
assess_tags = df_assess['assessment_tag']
user_id_assess = list(set(user_id_assess))
assess_tags = list(set(assess_tags))

user_id += user_id_assess
print ("number of user_id_asses: %s" %len(user_id_assess))
print ("number of assess_tags: %s" %len(assess_tags))
print (assess_tags[:5])


number of user_id_asses: 3114
number of assess_tags: 54
['using-microsoft-office-2016', 'html5', 'javascript', 'game-environment-art', 'group-policy-administration-and-troubleshooting']


In [5]:
# file 2 user_interests
df_interest = pd.read_sql_query("select user_handle, interest_tag from user_interests", conn)
user_id_interest = df_interest['user_handle']
interest_tags = df_interest['interest_tag']

user_id_interest = list(set(user_id_interest))
interest_tags = list(set(interest_tags))
print ("number of user_id_interest: %s" %len(user_id_interest))
user_id += user_id_interest
print ("number of interest tags: %s" %len(interest_tags))

number of user_id_interest: 10000
number of interest tags: 748


In [6]:
# file 3 course_views
df_view = pd.read_sql_query("select user_handle, course_id,view_time_seconds, level from user_course_views", conn)
user_id_view = df_view['user_handle']
view_tags = df_view['course_id']

user_id_view = list(set(user_id_view))
view_tags = list(set(view_tags))
print ("number of user_id_view: %s"%len(user_id_view))
user_id += user_id_view
print ("number of view_tags: %s"%len(view_tags))

number of user_id_view: 8760
number of view_tags: 5942


In [7]:
# set user_id
user_id = list(set(user_id))
print ("number of users: %s"%len(user_id))
print (user_id[:5])

number of users: 10000
['8075', '1649', '6329', '5965', '7900']


### Build Feature Matrix

#### Step1: Initialize three matrix with initialized value zero

In [8]:
## Construct three feature matrix: assess_X, interest_X, view_X
import numpy as np

M = len(user_id)
# Initialize Feature Matrix
assess_X = np.zeros(shape = (M, len(assess_tags)))
interest_X = np.zeros(shape = (M, len(interest_tags)))
view_X = np.zeros(shape = (M, len(view_tags)))

print (assess_X.shape)
print (interest_X.shape)
print (view_X.shape)


(10000, 54)
(10000, 748)
(10000, 5942)


#### Step2:  fill matrix value

In [9]:
# Assess_X
assess_m = df_assess['user_handle']
assess_n = df_assess['assessment_tag']
assess_score = df_assess['user_assessment_score']
for i in range(len(assess_m)):
    m = user_id.index(assess_m[i])
    n = assess_tags.index(assess_n[i])
    assess_X[m][n] = assess_score[i]
print (assess_X)
df_assess_new = pd.DataFrame(assess_X, index = user_id, columns = assess_tags)
df_assess_new[:5]

[[  0.   0.   0. ...   0.   0.   0.]
 [  0.   0.   0. ...   0.   0.   0.]
 [  0.   0.   0. ...   0.   0.   0.]
 ...
 [  0.   0.   0. ...   0.   0.   0.]
 [  0.   0. 119. ...   0.   0.   0.]
 [  0. 228. 198. ...   0.   0.   0.]]


Unnamed: 0,using-microsoft-office-2016,html5,javascript,game-environment-art,group-policy-administration-and-troubleshooting,angular-js,python,maya-core-skills,autocad,sharepoint,...,css,angular,puppet,maya-modeling,getting-started-in-houdini,premiere-pro-cc,maya-character-modeling,security-for-hackers-and-developers,jquery,windows-server-management
8075,0,0,0,0,0,0,83,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1649,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6329,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5965,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7900,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [13]:
# pickled assess column tags
from sklearn.externals import joblib
model_assess_columns = list(df_assess_new.columns)
print (model_assess_columns)
joblib.dump(model_assess_columns, 'model/assess_columns.pkl')

['using-microsoft-office-2016', 'html5', 'javascript', 'game-environment-art', 'group-policy-administration-and-troubleshooting', 'angular-js', 'python', 'maya-core-skills', 'autocad', 'sharepoint', 'solidworks-core-skills', 'after-effects-cc', 'c-plus-plus', 'azure-for-developers', 'android', 'maya-getting-started-with-modeling-in-maya', 'android-associate-developer', 'video-production', 'photoshop', 'c-sharp', 'illustrator', 'aspnet-mvc-5', 'powershell', 'unity-game-development-core-skills', 'zbrush-core-skills', 'azure-administrator', 'the-scrum-framework', 'docker', 'maya-environment-modeling', 'fusion-360-core-skills', 'react', 'revit-architecture-modeling-families', 'mysql', 'photoshop-photo-manipulation', 'java', 'node-js', 'azure-infrastructure-as-a-service', 'graphic-design', 'indesign-cc-creative-professionals', 'active-directory-administration', 'nuke-core-skills', 'revit-architecture', '3ds-max-environment-modeling', 'azure-solutions-architect', 'css', 'angular', 'puppet', 

['model/assess_columns.pkl']

In [14]:
# interest_X
interest_m = df_interest['user_handle']
interest_n = df_interest['interest_tag']
for i in range(len(interest_m)):
    m = user_id.index(interest_m[i])
    n = interest_tags.index(interest_n[i])
    interest_X[m][n] = 1
print (interest_X.shape)
df_interest_new = pd.DataFrame(interest_X, index = user_id, columns = interest_tags)
df_interest_new[:5]

(10000, 748)


Unnamed: 0,ios,unit-testing,windows-debugging,data,cryptography,excel,career-path,vb.net,photography,windows-8-development,...,database-design,ansys,rethinkdb,3d-sculpting,routing,wireless-networking,javascript-frameworks,unreal-engine,algorithms,speedtree
8075,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1649,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
6329,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5965,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
7900,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [15]:
model_interest_columns = list(df_interest_new.columns)
# print (model_interest_columns)
joblib.dump(model_interest_columns, 'model/interest_columns.pkl')

['model/interest_columns.pkl']

In [16]:
# view_X
level_d = {'Beginner':1, 'Intermediate':2, 'Advanced':3}
view_m = df_view['user_handle']
view_n = df_view['course_id']
view_level = df_view['level']
view_time = df_view['view_time_seconds']
# print (view_time[0])
for i in range(len(view_m)):
    m = user_id.index(view_m[i])
    n = view_tags.index(view_n[i])
    level = level_d[view_level[i]]
    time = float(view_time[i])
    view_X[m][n] += level * time
df_view_new = pd.DataFrame(view_X, index = user_id, columns = view_tags)
df_view_new[:5]

Unnamed: 0,troubleshooting-exam-prep-cisco-ccna-200-125-200-105,business-catalyst-fundamentals,java-patterns-concurrency-multi-threading,scrum-master-fundamentals-scrum-master,aws-developer-getting-started,developing-python-tools-nuke-2219,clojure-fundamentals-part-one,building-software-that-lasts-guide-to-maintainable-software,ocp-12c-using-new-security-features,building-mobile-apps-ionic-framework-angularjs,...,tcp-ip-networking-for-devs,csharp-language-internals-part2,energizing-logos-3d-animation-photoshop-1061,play-by-play-just-enough-administration,nuke-channel-fundamentals,professional-series-texturing-military-vehicles-mari-171,quick-start-unreal-engine-4-3-1750,solidworks-essentials-in-depth-sweeps,penetration-testing-sharepoint,mstest
8075,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1649,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6329,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5965,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7900,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [141]:
df_view_new.values[5350]

array([0., 0., 0., ..., 0., 0., 0.])

In [17]:
model_view_columns = list(df_view_new.columns)
# print (model_view_columns)
joblib.dump(model_view_columns, 'model/view_columns.pkl')

['model/view_columns.pkl']

#### Normalize Feature Vector to range[0,1]

In [18]:
# Normalize all feature Matrix value using sklearn to range(0,1)
from sklearn import preprocessing
# assess_X
assess_X_min = np.min(assess_X, axis = 0)
assess_X_max = np.max(assess_X, axis = 0)
print (assess_X_min)
print (assess_X_max)
min_max_scaler = preprocessing.MinMaxScaler()
assess_x = min_max_scaler.fit_transform(assess_X)
# print (assess_x)

df_assess_new_norm = pd.DataFrame(assess_x, index = user_id, columns = assess_tags)
df_assess_new_norm[:5]



[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0.]
[245. 292. 275. 260. 273. 270. 277. 224. 241. 246. 137. 230. 298. 262.
 276. 170. 242. 166. 253. 275. 238. 292. 252. 220. 240. 241. 284. 289.
 239. 144. 264. 127. 270. 234. 280. 269. 239. 275. 237. 252. 172. 148.
 255. 247. 270. 259. 179. 233. 230. 176. 227. 218. 268. 168.]


Unnamed: 0,using-microsoft-office-2016,html5,javascript,game-environment-art,group-policy-administration-and-troubleshooting,angular-js,python,maya-core-skills,autocad,sharepoint,...,css,angular,puppet,maya-modeling,getting-started-in-houdini,premiere-pro-cc,maya-character-modeling,security-for-hackers-and-developers,jquery,windows-server-management
8075,0,0,0,0,0,0,0.299639,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1649,0,0,0,0,0,0,0.0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6329,0,0,0,0,0,0,0.0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5965,0,0,0,0,0,0,0.0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7900,0,0,0,0,0,0,0.0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [19]:
# view_X Normalization
view_X_min = np.min(view_X, axis = 0)
view_X_max = np.max(view_X, axis = 0)
print (view_X_min)
print (view_X_max)

view_x = min_max_scaler.fit_transform(view_X)
df_view_new_norm = pd.DataFrame(view_x, index = user_id, columns = view_tags)
df_view_new_norm[:5]
# print (view_X)

[0. 0. 0. ... 0. 0. 0.]
[21644.  2714. 69644. ...  1876. 27652. 36252.]


Unnamed: 0,troubleshooting-exam-prep-cisco-ccna-200-125-200-105,business-catalyst-fundamentals,java-patterns-concurrency-multi-threading,scrum-master-fundamentals-scrum-master,aws-developer-getting-started,developing-python-tools-nuke-2219,clojure-fundamentals-part-one,building-software-that-lasts-guide-to-maintainable-software,ocp-12c-using-new-security-features,building-mobile-apps-ionic-framework-angularjs,...,tcp-ip-networking-for-devs,csharp-language-internals-part2,energizing-logos-3d-animation-photoshop-1061,play-by-play-just-enough-administration,nuke-channel-fundamentals,professional-series-texturing-military-vehicles-mari-171,quick-start-unreal-engine-4-3-1750,solidworks-essentials-in-depth-sweeps,penetration-testing-sharepoint,mstest
8075,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1649,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6329,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5965,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7900,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


#### Concatenate three submaxtrix, Build one Feature Matrix

In [66]:
# Concatenate Feature Matrix with weight 1:1:1
feature_X = np.hstack((assess_x, view_x, interest_X))
print (feature_X.shape)
df_feature = pd.DataFrame(feature_X, index = user_id, columns = assess_tags + view_tags + interest_tags)
df_feature[:5]

(10000, 6744)


Unnamed: 0,using-microsoft-office-2016,html5,javascript,game-environment-art,group-policy-administration-and-troubleshooting,angular-js,python,maya-core-skills,autocad,sharepoint,...,database-design,ansys,rethinkdb,3d-sculpting,routing,wireless-networking,javascript-frameworks,unreal-engine,algorithms,speedtree
8075,0,0,0,0,0,0,0.299639,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1649,0,0,0,0,0,0,0.0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
6329,0,0,0,0,0,0,0.0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5965,0,0,0,0,0,0,0.0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
7900,0,0,0,0,0,0,0.0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [21]:
# pickle model columns
module_columns = None
# print (df_feature.columns)
model_columns = list(df_feature.columns)
print (len(model_columns))
joblib.dump(model_columns, model_columns_file_name)

6744


['model/knn_columns.pkl']

### Train Nearest Neighbors model

In [74]:
# KNeighbor model
from sklearn.neighbors import NearestNeighbors
from sklearn.neighbors.ball_tree import BallTree

neigh_eu = NearestNeighbors(n_neighbors=5, metric= 'euclidean')
neigh_eu.fit(feature_X)

neigh_cos = NearestNeighbors(n_neighbors=5, algorithm = 'brute', metric='cosine')
neigh_cos.fit(feature_X)

test = np.array([feature_X[0]])

res_eu = neigh_eu.kneighbors(test)
res_cos = neigh_cos.kneighbors(test)   
print ("similar users using metric Euclidean: ") 
print (res_eu)
print ("similar users using metric Cosine: ")
print (res_cos)

similar users using metric Euclidean: 
(array([[0.        , 4.25320862, 4.59887012, 4.7006226 , 4.72601181]]), array([[   0, 7378, 8671, 3336, 9625]]))
similar users using metric Cosine: 
(array([[0.        , 0.20494449, 0.2418678 , 0.24483844, 0.25845499]]), array([[   0, 7378, 8671, 8829, 3336]]))


### Save pickled model into model directory

In [70]:
from sklearn.externals import joblib
joblib.dump(neigh_eu, model_file_name_eu)


['model/knn_eu.pkl',
 'model/knn_eu.pkl_01.npy',
 'model/knn_eu.pkl_02.npy',
 'model/knn_eu.pkl_03.npy',
 'model/knn_eu.pkl_04.npy',
 'model/knn_eu.pkl_05.npy',
 'model/knn_eu.pkl_06.npy',
 'model/knn_eu.pkl_07.npy']

In [71]:
joblib.dump(neigh_cos, model_file_name_cos)

['model/knn_cosine.pkl', 'model/knn_cosine.pkl_01.npy']

In [24]:
print (feature_X[1081])

[0. 0. 0. ... 0. 0. 0.]


In [None]:
# Cosine Similarity Matrix


### Given user handle, to get this user's feature vector 
#### e.g. User_handle = 7487 

#### Select feature from assessment table

In [25]:
# assess_test
test_usr_assess = pd.read_sql_query("select assessment_tag,user_assessment_score from user_assessment_scores where user_handle = 7487", conn)
# test_usr_assess
s_ = test_usr_assess['user_assessment_score'].tolist()
tag_ = test_usr_assess['assessment_tag'].tolist()
# print (tag_)
df_test_assess = pd.DataFrame([s_], columns = tag_ )
df_test_assess

Unnamed: 0,angular-js,css,html5,java,javascript,python
0,134,38,84,149,92,139


#### Load assessment feature columns, reindx

In [101]:
# reindex test user assessment columns
df_test_assess = df_test_assess.reindex(columns=joblib.load('model/assess_columns.pkl'), fill_value=0)
df_test_assess

[0.         0.28767123 0.33454545 0.         0.         0.4962963
 0.50180505 0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.53214286 0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.14074074 0.         0.         0.
 0.         0.         0.         0.         0.         0.        ]


Unnamed: 0,using-microsoft-office-2016,html5,javascript,game-environment-art,group-policy-administration-and-troubleshooting,angular-js,python,maya-core-skills,autocad,sharepoint,...,css,angular,puppet,maya-modeling,getting-started-in-houdini,premiere-pro-cc,maya-character-modeling,security-for-hackers-and-developers,jquery,windows-server-management
0,0,84,92,0,0,134,139,0,0,0,...,38,0,0,0,0,0,0,0,0,0


#### Normalize assessment feature to range[0,1]

In [102]:
# Normalization value in test_assess
test_assess = df_test_assess.values
test_assess = test_assess.astype(float)
test_assess_norm = (test_assess - assess_X_min) / (assess_X_max - assess_X_min)
print (test_assess)
print (test_assess_norm)
# print (feature_X[5350][:54])
# (test_assess_norm[0] == feature_X[5350][:54]).all()

[[  0.  84.  92.   0.   0. 134. 139.   0.   0.   0.   0.   0.   0.   0.
    0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.
    0.   0.   0.   0.   0.   0. 149.   0.   0.   0.   0.   0.   0.   0.
    0.   0.  38.   0.   0.   0.   0.   0.   0.   0.   0.   0.]]
[[0.         0.28767123 0.33454545 0.         0.         0.4962963
  0.50180505 0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.53214286 0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.14074074 0.         0.         0.
  0.         0.         0.         0.         0.         0.        ]]
[0.         0.28767123 0.33454545 0.         0.         0.4962963
 0.50180505 0.         0.         0.         0.         0.
 0.         0.         0.         0.  

In [103]:
(test_assess_norm[0] == feature_X[5350][:54]).all()

True

#### Select Data From user_interests table 

In [28]:
# test user interest
test_usr_interest = pd.read_sql_query("select interest_tag from user_interests where user_handle = 7487", conn)
test_usr_interest

Unnamed: 0,interest_tag
0,angular
1,javascript-frameworks
2,javascript-libraries
3,javascript
4,java
5,react.js
6,python
7,nodejs
8,ice-cream-sandwich
9,android


#### Numerize user interests feature vector value

In [29]:
tag_2 = test_usr_interest['interest_tag'].tolist()
df_test_interest = pd.DataFrame([[1]*len(tag_2)], columns = tag_2 )
df_test_interest

Unnamed: 0,angular,javascript-frameworks,javascript-libraries,javascript,java,react.js,python,nodejs,ice-cream-sandwich,android,...,hibernate,elixir,rethinkdb,nosql,cassandra,gradle,java-ee,groovy,ember.js,linux
0,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1


#### load interests feature column and reindex

In [30]:
# reindex test_interest
df_test_interest = df_test_interest.reindex(columns=joblib.load('model/interest_columns.pkl'), fill_value=0)
df_test_interest

Unnamed: 0,ios,unit-testing,windows-debugging,data,cryptography,excel,career-path,vb.net,photography,windows-8-development,...,database-design,ansys,rethinkdb,3d-sculpting,routing,wireless-networking,javascript-frameworks,unreal-engine,algorithms,speedtree
0,0,1,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,1,0,0,0


In [31]:
test_interest_norm = df_test_interest.values
test_interest_norm

array([[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
        1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
        1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 

In [133]:
(test_interest_norm[0] == feature_X[5350][-748:]).all()

True

#### Select Data From user_views table and Numeraize Feature Vector

In [163]:
# Extract feature for test_view
test_usr_view = pd.read_sql_query("select course_id,view_time_seconds, level from user_course_views where user_handle = 7487", conn)
test_usr_view 

level_test = test_usr_view['level'].tolist()
level_test = [level_d[j] for j in level_test]
score_test = test_usr_view['view_time_seconds'].tolist()
p = [level_test[i] * float(score_test[i]) for i in range(len(level_test))]
tag_3 = test_usr_view['course_id'].tolist()
# print (tag_)
df_test_view = pd.DataFrame([p], columns = tag_3 )
# print (df_test_view['java-fundamentals-core-platform'].values)
df_test_view = df_test_view.groupby(lambda x:x, axis=1).sum()
df_test_view

Unnamed: 0,apex-absolute-beginner-guide-coding-salesforce,cloud-computing,how-git-works,java-fundamentals-core-platform,java-fundamentals-language,mastering-git,maven-fundamentals,rest-fundamentals,spring-fundamentals
0,977,4074,6536,15489,28461,22476,18024,22610,10836


In [158]:
len(df_test_view.values[0])
print (list(set(tag_3)))
df_view_new[list(set(tag_3))].values[5350]

['java-fundamentals-core-platform', 'cloud-computing', 'how-git-works', 'maven-fundamentals', 'spring-fundamentals', 'java-fundamentals-language', 'apex-absolute-beginner-guide-coding-salesforce', 'rest-fundamentals', 'mastering-git']


array([30978.,  8148., 13072., 36048., 21672., 56922.,  1954., 45220.,
       44952.])

In [159]:
df_test_view[tag_3]

Unnamed: 0,spring-fundamentals,spring-fundamentals.1,spring-fundamentals.2,how-git-works,spring-fundamentals.3,how-git-works.1,cloud-computing,how-git-works.2,mastering-git,mastering-git.1,...,java-fundamentals-core-platform,java-fundamentals-language,java-fundamentals-core-platform.1,java-fundamentals-core-platform.2,java-fundamentals-core-platform.3,java-fundamentals-core-platform.4,java-fundamentals-core-platform.5,apex-absolute-beginner-guide-coding-salesforce,java-fundamentals-core-platform.6,java-fundamentals-core-platform.7
0,10836,10836,10836,6536,10836,6536,4074,6536,22476,22476,...,15489,28461,15489,15489,15489,15489,15489,977,15489,15489


In [160]:
# reindex test_view
df_test_view = df_test_view.reindex(columns=joblib.load('model/view_columns.pkl'), fill_value=0)
df_test_view


Unnamed: 0,troubleshooting-exam-prep-cisco-ccna-200-125-200-105,business-catalyst-fundamentals,java-patterns-concurrency-multi-threading,scrum-master-fundamentals-scrum-master,aws-developer-getting-started,developing-python-tools-nuke-2219,clojure-fundamentals-part-one,building-software-that-lasts-guide-to-maintainable-software,ocp-12c-using-new-security-features,building-mobile-apps-ionic-framework-angularjs,...,tcp-ip-networking-for-devs,csharp-language-internals-part2,energizing-logos-3d-animation-photoshop-1061,play-by-play-just-enough-administration,nuke-channel-fundamentals,professional-series-texturing-military-vehicles-mari-171,quick-start-unreal-engine-4-3-1750,solidworks-essentials-in-depth-sweeps,penetration-testing-sharepoint,mstest
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


#### Normalize user_views feature vector

In [161]:
# Normalization test_view
test_view = df_test_view.values
test_view = test_view.astype(float)
test_view_norm = (test_view - view_X_min) / (view_X_max - view_X_min)
print (test_view)
print (test_view_norm)

[[0. 0. 0. ... 0. 0. 0.]]
[[0. 0. 0. ... 0. 0. 0.]]


In [162]:
(test_view_norm[0] == feature_X[5350][54:-748]).all()

# len(feature_X[5350][54:-748])
# feature_X[5350][812:815]

False

#### Concatenate three feature vector to build one user feature vector

In [164]:
# concatenate columns to get test_user 
test_user = np.concatenate((test_assess_norm, test_view_norm, test_interest_norm), axis = 1)
# test_user
print (len(test_user[0]))
# test_user = feature_X[5350]

6744


In [165]:
# dump test_user into test_user,pkl, can be loaded in ap.py file 
test_pkl = joblib.dump(test_user, 'test_user.pkl')

In [166]:
test_u = joblib.load('test_user.pkl')
test_u

array([0.        , 0.28767123, 0.33454545, ..., 0.        , 0.        ,
       0.        ])

In [76]:
res_eu_1 = neigh_eu.kneighbors(test_u)
print (res_eu_1)
res_cos_1 = neigh_cos.kneighbors(test_u) 
print (res_cos_1)

(array([[5.64559396e+26, 5.64559396e+26, 5.64559396e+26, 5.64559396e+26,
        5.64559396e+26]]), array([[ 158, 1357,  682,   75,  327]]))
(array([[0.74223469, 0.78135093, 0.78941475, 0.81580758, 0.81787627]]), array([[9367, 6997, 9589, 9183,  382]]))


In [79]:
print (user_id.index('7487'))

5350


In [97]:
t = feature_X[5350]
t

array([0.        , 0.28767123, 0.33454545, ..., 0.        , 0.        ,
       0.        ])

In [167]:
res_eu_1 = neigh_eu.kneighbors(t)
print (res_eu_1)
res_cos_1 = neigh_cos.kneighbors(t) 
print (res_cos_1)



(array([[0.        , 6.58804389, 6.62030446, 6.6456036 , 6.73316383]]), array([[5350, 4019, 2919,  745,  898]]))




(array([[2.22044605e-16, 3.73865545e-01, 3.74451352e-01, 3.76506090e-01,
        3.77091097e-01]]), array([[5350, 4019, 2919, 9334, 4295]]))


In [168]:
(test_u == feature_X[5350]).all()

True

In [169]:
user_pkl = joblib.dump(user_id, 'users.pkl')