# Note

context: Python

effective library: xgboost

In [1]:
# import libraries
import json
import os  # read connection settings from the environment
from datetime import timezone  # timezone-aware timestamps for stored artifacts

import pandas as pd
import pymongo # connect to MongoDB
from pymongo import MongoClient # client connection to MongoDB
import sklearn
import xgboost as xgb

In [2]:
# Scratch example: a slash-delimited category path with a leading '/'.
string = '/Sports/Team Sports/Soccer'

In [3]:
# Split the path into its levels; the leading '/' produces an empty first element.
string.split('/')

['', 'Sports', 'Team Sports', 'Soccer']

In [4]:
# connect to MongoDB
## The connection URI (including the account name and password) is read from
## the MONGODB_URI environment variable so that no credential is stored in the
## notebook. Any password that was previously committed in this cell should be
## rotated.
connectionUri = os.environ.get('MONGODB_URI')
if not connectionUri:
    raise RuntimeError(
        'Set the MONGODB_URI environment variable to the MongoDB connection '
        'string before running this notebook.'
    )

## assign client
client = pymongo.MongoClient(connectionUri)

## assign databases
appDb = client['app-db']
analyticsDb = client['analytics-db']

## assign collections
### source collections
users = appDb['users']
contents = appDb['contents']

### destination collections
userStats = analyticsDb['userStats']
hashtagStats = analyticsDb['hashtagStats']

In [5]:
## Pull every collection used below into a pandas DataFrame.
## For the first four collections the handle keeps the plural name and the
## DataFrame gets the singular name; for the last three only the DataFrame
## is kept, under the collection's own name.

# app-db: users
users = appDb['users']
user = pd.DataFrame(list(users.find()))

# app-db: contents
contents = appDb['contents']
content = pd.DataFrame(list(contents.find()))

# analytics-db: creator statistics
creatorStats = analyticsDb['creatorStats']
creatorStat = pd.DataFrame(list(creatorStats.find()))

# analytics-db: hashtag statistics
hashtagStats = analyticsDb['hashtagStats']
hashtagStat = pd.DataFrame(list(hashtagStats.find()))

# analytics-db: engagement transactions
trans = pd.DataFrame(list(analyticsDb['transactionEngagements'].find()))

# analytics-db: content statistics
contentStats = pd.DataFrame(list(analyticsDb['contentStats'].find()))

# analytics-db: content features
contentFeatures = pd.DataFrame(list(analyticsDb['contentFeatures'].find()))

## additional setup

In [6]:
# additional imports and handles used by the modelling cells below
import bson.objectid
import pickle
from datetime import datetime
from pprint import pprint
import numpy as np

# collection that receives the pickled per-user models (see save_model_to_mongodb)
mlArtifacts = analyticsDb['mlArtifacts']

In [7]:
# Content feature table prepared for joining: missing values become 0, the
# Mongo `_id` is exposed as `contentId`, and the `userId` column is removed.
contentFeatures_1 = (
    contentFeatures
    .fillna(0)
    .rename(columns={'_id': 'contentId'})
    .drop(columns='userId')
)

In [8]:
# Preview the engagement transactions: one row per (userId, contentId) with like/comment/recast/quote columns.
trans.head()

Unnamed: 0,_id,userId,contentId,like,comment,recast,quote
0,617fb3c9e884e32f1e45ff29,6170eb21e5ddcb429e04e7d7,617bcb1a6ddb631e0c7ed63e,0,0,1,0
1,617fb3c9e884e32f1e45ff2a,6170eb21e5ddcb429e04e7d7,617ba5fc47928dfe564d15c1,0,0,1,0
2,617fb3c9e884e32f1e45ff2b,6170eb21e5ddcb429e04e7d7,617ba4a947928dbe1f4d15b4,0,0,1,0
3,617fb3c9e884e32f1e45ff2c,6170eb21e5ddcb429e04e7d7,617ba62347928d94564d15ce,0,0,1,0


In [9]:
# Attach the content features to every engagement row (left join, so rows
# without a matching feature record are kept with NaN features).
# The join key is compared as text on both sides: `contentFeatures_1.contentId`
# comes from the Mongo `_id`, and a str/ObjectId mismatch with
# `trans.contentId` would silently leave every feature NaN.
# NOTE(review): the saved preview of this frame shows all features as NaN for
# the first rows, which is consistent with such a mismatch - confirm.
trans_add = trans.assign(contentId=trans['contentId'].astype(str)).merge(
    contentFeatures_1.assign(contentId=contentFeatures_1['contentId'].astype(str)),
    on='contentId',
    how='left',
)
# label = sum of the four engagement columns of the row
trans_add['label'] = (
    trans_add['like'] + trans_add['comment'] + trans_add['recast'] + trans_add['quote']
)
# the transaction's own Mongo `_id` is not needed downstream
trans_add = trans_add.drop(['_id'], axis=1)

In [10]:
# Preview the joined frame: engagement columns, content feature columns and the derived `label`.
trans_add.head()

Unnamed: 0,userId,contentId,like,comment,recast,quote,countComment,countLike,countQuote,countRecast,creatorContentComment,creatorContentCount,creatorContentLike,seen,characterCount,imageCount,ageScore,label
0,6170eb21e5ddcb429e04e7d7,617bcb1a6ddb631e0c7ed63e,0,0,1,0,,,,,,,,,,,,1
1,6170eb21e5ddcb429e04e7d7,617ba5fc47928dfe564d15c1,0,0,1,0,,,,,,,,,,,,1
2,6170eb21e5ddcb429e04e7d7,617ba4a947928dbe1f4d15b4,0,0,1,0,,,,,,,,,,,,1
3,6170eb21e5ddcb429e04e7d7,617ba62347928d94564d15ce,0,0,1,0,,,,,,,,,,,,1


In [None]:
# Users with more than two engagement rows. After the count, the `contentId`
# column holds the number of rows per user rather than an identifier.
select_user = (
    trans_add
    .groupby('userId')['contentId']
    .count()
    .reset_index()
    .loc[lambda per_user: per_user['contentId'] > 2]
)

In [None]:
# define upsert function
def save_model_to_mongodb(collection, model_name, account, model):
    """Pickle ``model`` and upsert it into ``collection``.

    The document is keyed by ``account`` and ``str(model_name)``, so saving
    again for the same pair overwrites the previously stored artifact.

    Parameters
    ----------
    collection
        Object exposing ``update_one(filter, update, upsert=...)``
        (a pymongo collection in this notebook).
    model_name
        Stored as ``str(model_name)`` in the ``model`` field.
    account
        Identifier stored in the ``account`` field.
    model
        Any picklable object.

    Returns
    -------
    Whatever ``collection.update_one`` returns.
    """
    pickled_model = pickle.dumps(model)  # serialise the model to bytes

    key = {'account': account, 'model': str(model_name)}
    return collection.update_one(
        key,
        {
            '$set': {
                **key,
                'artifact': pickled_model,
                # Timezone-aware UTC: a naive local timestamp would be
                # stored as if it were already UTC.
                'trainedAt': datetime.now(timezone.utc),
            }
        },
        upsert=True,
    )

In [None]:
ml_artifacts = [] # pre-define model artifacts

# the four engagement types that are re-weighted per user
ENGAGEMENT_COLS = ['like', 'comment', 'recast', 'quote']

# Train and store one regressor per selected user.
for n in select_user.userId.unique():

    # Explicit copy: the rows are modified below and must not alias trans_add.
    focus_trans = trans_add[trans_add['userId'] == n].copy()

    # Per-type engagement totals for this user; a total of 0 is replaced by 1.
    # Only the four count columns are summed - the user id is not part of the
    # denominator.
    counts = focus_trans[ENGAGEMENT_COLS].sum().replace(0, 1)

    # weight = 1 - (type total / overall total): the smaller a type's share of
    # this user's engagements, the larger its weight.
    weights = 1 - counts / counts.sum()

    # Replace each engagement column by its weighted version (whole-column
    # assignment, so the integer column can become float without warnings).
    for col in ENGAGEMENT_COLS:
        focus_trans[col] = focus_trans[col] * weights[col]

    # label = sum of the weighted engagement columns
    focus_trans['label'] = (
        focus_trans['like'] + focus_trans['comment']
        + focus_trans['recast'] + focus_trans['quote']
    )

    # features = every remaining column; identifiers and targets are removed
    Xlr = focus_trans.drop(['label', 'userId', 'contentId'] + ENGAGEMENT_COLS, axis=1)
    ylr = focus_trans.label

    xg_reg = xgb.XGBRegressor()
    xg_reg.fit(Xlr, ylr)

    pprint(n)  # progress: id of the user whose model was just trained
    ml_artifacts.append(xg_reg) # collect list of artifacts

    # upsert
    save_model_to_mongodb(collection=mlArtifacts,
                          account=n,
                          model_name='xgboost',
                          model=xg_reg)

In [None]:
# define loading model artifact
def load_model_from_mongodb(collection, model_name, account):
    """Return the unpickled model stored for ``account`` / ``model_name``.

    Parameters
    ----------
    collection
        Object exposing ``find(filter)`` that yields dict-like documents
        (a pymongo collection in this notebook).
    model_name
        Value matched against the ``model`` field.
    account
        Value matched against the ``account`` field.

    Returns
    -------
    The object obtained by unpickling the document's ``artifact`` field.
    If several documents match, the last one yielded by ``find`` is used.

    Raises
    ------
    KeyError
        If no document matches, or the matching document has no
        ``artifact`` field.
    """
    # find user's model artifact, keeping the last match
    latest = None
    for doc in collection.find({'account': account, 'model': model_name}):
        latest = doc

    if latest is None or 'artifact' not in latest:
        raise KeyError(
            f"no stored artifact for account={account!r}, model={model_name!r}"
        )

    # SECURITY: pickle.loads runs code embedded in the payload - only load
    # artifacts from a collection whose writers are trusted.
    return pickle.loads(latest['artifact'])

In [None]:
# Fetch the stored xgboost model of the second user listed in `select_user`.
xg_reg = load_model_from_mongodb(
    mlArtifacts,
    model_name='xgboost',
    account=select_user['userId'].unique()[1],
)

In [None]:
# Model input: the content feature table without its `contentId` column.
content_test = contentFeatures_1.drop(columns=['contentId'])

# Score every row of the feature table with the model loaded from MongoDB.
loaded_results = xg_reg.predict(content_test)

In [None]:
select_user.userId.unique()[1] # userId used as `account` when the model was loaded above

In [None]:
np.sort(loaded_results) # predictions in ascending order (returns a sorted copy)

In [None]:
len(loaded_results) # number of predictions returned

In [None]:
# NOTE(review): `result` is not assigned in any cell visible in this notebook - confirm it is defined before this cell runs.
# Filters `result` to the rows of one user; the contentId condition is commented out.
result[(result['userId'] == '614988493e62699352abc8eb')]#&(result['contentId'] == '6160036da52e7254ae170b59')]