########################################################################################

## Upsert hashtagStats

########################################################################################

## (optional) libraries in trigger function

In [1]:
import pymongo # connect to MongoDB
from pymongo import MongoClient
from IPython.display import clear_output # clear output everytime running
from pprint import pprint

## mandatory libraries in trigger function

In [2]:
import json
import math
import os
import re
import sys
from datetime import datetime, timedelta

# from mongo_client import mongo_client
from bson import regex
from bson.objectid import ObjectId

In [3]:
# connect to MongoDB
## read the connection URI (role: analytics-admin) from the environment.
## The URI embeds a username and password, so it must never be written into the
## notebook itself; export MONGODB_URI before starting the kernel.
## NOTE(review): a password was previously committed in this cell and should be rotated.
connectionUri = os.environ.get('MONGODB_URI')
if not connectionUri:
    raise RuntimeError(
        'MONGODB_URI is not set; export the analytics-admin connection string '
        'before running this notebook'
    )

## assign client
client = pymongo.MongoClient(connectionUri)

## assign databases
appDb = client['app-db']
analyticsDb = client['analytics-db']

## assign collections
### source collections
contents = appDb['contents']

### destination collections
userStats = analyticsDb['userStats']
hashtagStats = analyticsDb['hashtagStats']


########################################################################################

########################################################################################

## Hashtag parsing

### Note: be careful! Running this cell causes collection changes

### The cell below is only used for filtering during testing

In [4]:
# define RegEx (BSON flavour, built from native Python patterns)
## namepattern: the word following a '#' (the '#' itself is excluded by the look-behind)
## tagpattern:  the full tag, including the leading '#'
namepattern = regex.Regex.from_native(re.compile(r"(?<=#)\w+"))
tagpattern = regex.Regex.from_native(re.compile(r"#\w+"))

# re.compile() on a str pattern always sets re.UNICODE, and from_native() copies
# that flag across; clear it on both patterns before sending them to the server
namepattern.flags &= ~re.UNICODE
tagpattern.flags &= ~re.UNICODE

In [5]:
# define RegEx
## hashtag name = the word characters directly after a '#'
pattern = regex.Regex.from_native(re.compile(r"(?<=#)\w+"))
# drop the re.UNICODE flag that re.compile() sets implicitly on str patterns
pattern.flags &= ~re.UNICODE

# define content parameters
## only contents created within this many days are aggregated
contentDateThreshold = 14

## half-life (in hours) of the exponential age decay
halfLifeHours = 24

## per-engagement weights of the linear engagement score
likedWeight = 1
commentedWeight = 1
recastedWeight = 1
quotedWeight = 1

In [8]:
# define cursor
## NOTE: despite its name, `cursor` holds the aggregation pipeline (a list of stages)
## that is later passed to contents.aggregate().
##
## Pipeline outline:
##   1. keep contents created within the last `contentDateThreshold` days
##   2. extract every hashtag from payload.message and lower-case it
##   3. group by (hashtag, author), then by hashtag
##   4. compute hastagDiversityScore, engagementScore, ageScore and the final score

# take a single timestamp so the age filter and the decay score share the same "now"
pipelineNow = datetime.utcnow()

cursor = [
    {
        # filter contents for newer than specific age
        '$match': {
            'createdAt': {
                '$gte': pipelineNow - timedelta(days=contentDateThreshold)
            }
        }
#     }, {
#             # !!! TESTING filter
#         '$match': {
#             'payload.message': {
#                     '$regex': tagpattern
#                 }
#         }
    }, {
        # extract hashtag => array
        '$addFields': {
            'hashtags': {
                '$regexFindAll': {
                    'input': '$payload.message',
                    'regex': pattern
                }
            }
        }
    }, {
        # deconstruct hashtags array => hashtag object
        # (contents without any hashtag are kept here and removed two stages below)
        '$unwind': {
            'path': '$hashtags',
            'preserveNullAndEmptyArrays': True
        }
    }, {
        # extract hashtag object => field
        '$addFields': {
            'name': {
                '$toLower': '$hashtags.match'
            }
        }
    }, {
        # filter non-hashtag contents out
        '$match': {
            'name': {
                '$ne': ''
            }
        }
    }, {
        # summarize by user (not account)
        # collect contentId as array
        '$group': {
            '_id': {
                'name': '$name',
                'authorId': '$author.id'
            },
            'contributionCount': {
                '$count': {}
            },
            'createdAt': {
                '$min': '$createdAt'
            },
            'updatedAt': {
                '$max': '$updatedAt'
            },
            'contents': {
                '$push': '$_id'
            },
            # ! follow app-db.hashtags schema
            '__v': {
                '$max': '$__v'
            },
            'likedCount': {
                '$sum': '$engagements.like.count'
            },
            'commentedCount': {
                '$sum': '$engagements.comment.count'
            },
            # FIX: was '$engagements.recast.recast' / '$engagements.quote.recast', which
            # does not follow the '.count' layout used for like and comment above.
            # TODO confirm against the app-db.contents schema
            'recastedCount': {
                '$sum': '$engagements.recast.count'
            },
            'quotedCount': {
                '$sum': '$engagements.quote.count'
            }
        }
    }, {
        # summarize by hashtag
        '$group': {
            '_id': '$_id.name',
            'hashtagCount': {
                '$sum': '$contributionCount'
            },
            # one input document per (hashtag, author) => number of distinct authors
            'contributorCount': {'$count': {}},
            'createdAt': {
                '$min': '$createdAt'
            },
            'updatedAt': {
                '$max': '$updatedAt'
            },
            # ! follow app-db.hashtags schema
            '__v': {
                '$max': '$__v'
            },
            'contributions': {
                '$push': {
                    '_id': '$_id.authorId',
                    'contributionCount': '$contributionCount',
                    'contents': '$contents',
                }
            },
            'likedCount': {
                '$sum': '$likedCount'
            },
            'commentedCount': {
                '$sum': '$commentedCount'
            },
            'recastedCount': {
                '$sum': '$recastedCount'
            },
            'quotedCount': {
                '$sum': '$quotedCount'
            }
        }
    }, {
        # setting output format
        '$project': {
            '_id': 0,
            'name': '$_id',
            '__v': '$__v',
            'createdAt': 1,
            'updatedAt': 1,
            'aggregator.contributions': '$contributions',
            # calculate fraction of hashtag diversity
            ## equation: hastagDiversityScore = n_{user|hashtag}/n_{content|hashtag}
            'aggregator.hastagDiversityScore': {
                '$divide': [
                    '$contributorCount', '$hashtagCount'
                ]
            },
            # calculate linear combination of engagements
            ## equation: engagementScore = {\sigma}_{k}({\beta}_{k}*x_{k})
            ## FIX: the liked term was missing and the quoted term was summed twice
            'aggregator.engagementScore': {
                '$sum': [
                    {
                        '$multiply': [
                            '$likedCount', likedWeight
                        ]
                    }, {
                        '$multiply': [
                            '$commentedCount', commentedWeight
                        ]
                    }, {
                        '$multiply': [
                            '$recastedCount', recastedWeight
                        ]
                    }, {
                        '$multiply': [
                            '$quotedCount', quotedWeight
                        ]
                    }
                ]
            },
            # calculate decay from last update time
            ## equation: ageScore = e^(-{\lambda}*t)
            'aggregator.ageScore': {
                '$exp': {
                    '$multiply': [
                        {
                            '$multiply': [
                                {
                                    # calculate age from last update time
                                    # (date difference is in milliseconds => hours)
                                    '$divide': [
                                        {
                                            '$subtract': [pipelineNow, '$updatedAt']
                                        }, 60*60*1000
                                    ]
                                }, {
                                    # define lambda value: ln(2) / half-life
                                    '$divide': [{'$ln': 2}, halfLifeHours]
                                }
                            ]
                        }, -1
                    ]
                }
            }
        }
    }, {
        # summarize all scores
        ## equation: score = ageScore*(engagementScore + 1)*(hastagDiversityScore)
        '$addFields': {
            'score': {
                '$multiply': [
                    '$aggregator.hastagDiversityScore',
                    # add 1 as bias
                    {'$add': ['$aggregator.engagementScore', 1]},
                    '$aggregator.ageScore'
                ]
            }
        }
#     }, {
#         # upsert to 'hashtagStats' collection
#         # NOTE(review): the $project stage above drops '_id', so merging 'on': '_id'
#         # can never match an existing document; presumably this should be
#         # 'on': 'name' (with a unique index on hashtagStats.name) - confirm before enabling
#         '$merge': {
#             'into': {
#                 'db': 'analytics-db', 
#                 'coll': 'hashtagStats'
#             }, 
#             'on': '_id', 
#             'whenMatched': 'replace', 
#             'whenNotMatched': 'insert'
#         }
    }
]

In [9]:
# wipe whatever this cell rendered on a previous run
clear_output()

# run the aggregation pipeline and pretty-print every resulting document
pprint([doc for doc in contents.aggregate(cursor)])

[{'__v': 0,
  'aggregator': {'ageScore': 0.06951420964228329,
                 'contributions': [{'_id': ObjectId('614addffec903a9d987eb580'),
                                    'contents': [ObjectId('615ac3bfae203552b62d9805'),
                                                 ObjectId('6163c5fda52e7265ff17189b'),
                                                 ObjectId('6163fada17d825e87a079483'),
                                                 ObjectId('6163ff8b17d825444207953d')],
                                    'contributionCount': 4}],
                 'engagementScore': 0,
                 'hastagDiversityScore': 0.25},
  'createdAt': datetime.datetime(2021, 10, 4, 9, 5, 3, 152000),
  'name': 'mondaymotivation',
  'score': 0.017378552410570822,
  'updatedAt': datetime.datetime(2021, 10, 11, 9, 10, 35, 337000)},
 {'__v': 0,
  'aggregator': {'ageScore': 0.6620041810167421,
                 'contributions': [{'_id': ObjectId('6149a7c33e6269b3adabc900'),
                      

  'updatedAt': datetime.datetime(2021, 10, 14, 15, 12, 41, 525000)},
 {'__v': 0,
  'aggregator': {'ageScore': 0.6620041810167421,
                 'contributions': [{'_id': ObjectId('6149a7c33e6269b3adabc900'),
                                    'contents': [ObjectId('6161cbbda52e72448017136d')],
                                    'contributionCount': 1}],
                 'engagementScore': 0,
                 'hastagDiversityScore': 1.0},
  'createdAt': datetime.datetime(2021, 10, 9, 17, 5, 1, 573000),
  'name': 'messi',
  'score': 0.6620041810167421,
  'updatedAt': datetime.datetime(2021, 10, 14, 15, 12, 41, 525000)},
 {'__v': 0,
  'aggregator': {'ageScore': 0.002850502242636976,
                 'contributions': [{'_id': ObjectId('614addffec903a9d987eb580'),
                                    'contents': [ObjectId('615dec56db45d0092d925857')],
                                    'contributionCount': 1}],
                 'engagementScore': 0,
                 'hastagDiversitySco