########################################################################################

## Upsert hashtagStats

########################################################################################

## (optional) libraries in trigger function

In [1]:
import pymongo # connect to MongoDB
from pymongo import MongoClient
from IPython.display import clear_output # clear output everytime running
from pprint import pprint

## mandatory libraries in trigger function

In [2]:
import json
import sys
# from mongo_client import mongo_client
from bson.objectid import ObjectId
from bson import regex
from datetime import datetime, timedelta
import re
import math

In [3]:
# connect to MongoDB
## The connection URI (role: analytics-admin) is read from the environment so
## that no credential is stored in the notebook or its version history.
## NOTE(review): the password that used to be hard-coded here has been exposed
## and should be rotated.
import os

connectionUri = os.environ.get('MONGODB_URI')
if not connectionUri:
    raise RuntimeError(
        'Set the MONGODB_URI environment variable to the analytics-admin '
        'MongoDB connection string before running this notebook.'
    )

## assign client
client = pymongo.MongoClient(connectionUri)

## assign databases
appDb = client['app-db']
analyticsDb = client['analytics-db']

## assign collections
### source collections
contents = appDb['contents']

### destination collections
userStats = analyticsDb['userStats']
hashtagStats = analyticsDb['hashtagStats']


########################################################################################

########################################################################################

## Hashtag parsing

### Note: be careful! Running this cell causes collection changes

### The cell below is only used for filtering during testing

In [4]:
# Build the BSON regular expressions sent to MongoDB.
#   namepattern - the word characters that follow a '#' (the tag name only)
#   tagpattern  - the '#' together with the word characters after it
namepattern = regex.Regex.from_native(re.compile(r"(?<=#)\w+"))
tagpattern = regex.Regex.from_native(re.compile(r"#\w+"))

# Python adds re.UNICODE to every compiled str pattern and from_native copies
# it over; toggle that flag back off on both BSON regexes.
for bsonRegex in (namepattern, tagpattern):
    bsonRegex.flags ^= re.UNICODE

In [5]:
# Hashtag-name regex used by the aggregation pipeline: the word characters
# that directly follow a '#'.
hashtagNameRe = re.compile(r"(?<=#)\w+")
pattern = regex.Regex.from_native(hashtagNameRe)
# from_native copies Python's implicit re.UNICODE flag; toggle it back off
pattern.flags ^= re.UNICODE

# Tunable parameters of the pipeline
## only contents created within this many days are aggregated
contentDateThreshold = 14
## weight of each engagement type in the engagement score
likedWeight = commentedWeight = recastedWeight = quotedWeight = 1
## half-life, in hours, of the exponential age decay
halfLifeHours = 24

In [6]:
# define cursor
# NOTE: despite its name, `cursor` is the aggregation pipeline (a list of
# stages); it scores every hashtag found in recent contents by
#   score = ageScore * (engagementScore + 1) * hastagDiversityScore

# Take one timestamp so the age filter and the decay score share the same "now".
pipelineRunAt = datetime.utcnow()

cursor = [
    {
        # filter contents for newer than specific age
        '$match': {
            'createdAt': {
                '$gte': pipelineRunAt - timedelta(days=contentDateThreshold)
            }
        }
#     }, {
#             # !!! TESTING filter
#         '$match': {
#             'payload.message': {
#                     '$regex': tagpattern
#                 }
#         }
    }, {
        # extract hashtag => array
        '$addFields': {
            'hashtags': {
                '$regexFindAll': {
                    'input': '$payload.message',
                    'regex': pattern
                }
            }
        }
    }, {
        # deconstruct hashtags array => hashtag object
        # NOTE(review): with preserveNullAndEmptyArrays=True, contents without
        # any hashtag are kept; `$toLower` of the missing match yields '', so
        # they appear to be grouped under an empty hashtag name - confirm that
        # this bucket is wanted.
        '$unwind': {
            'path': '$hashtags',
            'preserveNullAndEmptyArrays': True
        }
    }, {
        # extract hashtag object => field
        '$addFields': {
            'name': {
                '$toLower': '$hashtags.match'
            }
        }
    }, {
        # summarize by user (not account)
        # collect contentId as array
        '$group': {
            '_id': {
                'name': '$name',
                'authorId': '$author.id'
            },
            'contributionCount': {
                '$count': {}
            },
            'createdAt': {
                '$min': '$createdAt'
            },
            'updatedAt': {
                '$max': '$updatedAt'
            },
            'contents': {
                '$push': "$_id"
            },
            # ! follow app-db.hashtags schema
            '__v': {
                '$max': '$__v'
            },
            'likedCount': {
                '$sum': '$engagements.like.count'
            },
            'commentedCount': {
                '$sum': '$engagements.comment.count'
            },
            # FIX: recast/quote previously summed '...recast.recast' and
            # '...quote.recast'; they now read the same `.count` sub-field as
            # like/comment.
            # NOTE(review): assumes engagements.recast / engagements.quote
            # carry a `count` field like the other two - confirm against the
            # contents schema.
            'recastedCount': {
                '$sum': '$engagements.recast.count'
            },
            'quotedCount': {
                '$sum': '$engagements.quote.count'
            }
        }
    }, {
        # summarize by hashtag
        '$group': {
            '_id': '$_id.name',
            'hashtagCount': {
                '$sum': '$contributionCount'
            },
            'contributorCount': {'$count': {}},
            'createdAt': {
                '$min': '$createdAt'
            },
            'updatedAt': {
                '$max': '$updatedAt'
            },
            # ! follow app-db.hashtags schema
            '__v': {
                '$max': '$__v'
            },
            'contributions': {
                '$push': {
                    '_id': '$_id.authorId',
                    'contributionCount': '$contributionCount',
                    'contents': "$contents",
                }
            },
            'likedCount': {
                '$sum': '$likedCount'
            },
            'commentedCount': {
                '$sum': '$commentedCount'
            },
            'recastedCount': {
                '$sum': '$recastedCount'
            },
            'quotedCount': {
                '$sum': '$quotedCount'
            }
        }
    }, {
        # setting output format
        '$project': {
            '_id': 0,
            'name': '$_id',
            '__v': '$__v',
            'createdAt': 1,
            'updatedAt': 1,
            'aggregator.contributions': '$contributions',
            # calculate fraction of hashtag diversity
            ## equation: hastagDiversityScore = n_{user|hashtag}/n_{content|hashtag}
            'aggregator.hastagDiversityScore': {
                '$divide': [
                    '$contributorCount', '$hashtagCount'
                ]
            },
            # calculate linear combination of engagements
            ## equation: engagementScore = {\sigma}_{k}({\beta}_{k}*x_{k})
            ## FIX: the liked term was missing and the quoted term was added
            ## twice; each engagement type now contributes exactly once.
            'aggregator.engagementScore': {
                '$sum': [
                    {
                        '$multiply': [
                            '$likedCount', likedWeight
                        ]
                    }, {
                        '$multiply': [
                            '$commentedCount', commentedWeight
                        ]
                    }, {
                        '$multiply': [
                            '$recastedCount', recastedWeight
                        ]
                    }, {
                        '$multiply': [
                            '$quotedCount', quotedWeight
                        ]
                    }
                ]
            },
            # calculate decay from last update time
            ## equation: ageScore = e^(-{\lambda}*t)
            'aggregator.ageScore': {
                '$exp': {
                    '$multiply': [
                        {
                            '$multiply': [
                                {
                                    # calculate age from last update time
                                    # (date difference is in milliseconds => hours)
                                    '$divide': [
                                        {
                                            '$subtract': [pipelineRunAt, "$updatedAt"]
                                        }, 60*60*1000
                                    ]
                                }, {
                                    # define lambda value: ln(2) / half-life
                                    '$divide': [{'$ln': 2}, halfLifeHours]
                                }
                            ]
                        }, -1
                    ]
                }
            }
        }
    }, {
        # summarize all scores
        ## equation: score = ageScore*(engagementScore + 1)*(hastagDiversityScore)
        '$addFields': {
            'score': {
                '$multiply': [
                    '$aggregator.hastagDiversityScore',
                    # add 1 as bias
                    {'$add': ['$aggregator.engagementScore', 1]},
                    '$aggregator.ageScore'
                ]
            }
        }
#     }, {
#         # upsert to 'hashtagStats' collection
#         # NOTE(review): `_id` is projected out above, so merging on '_id'
#         # would presumably insert a new document on every run instead of
#         # replacing; consider keying on 'name' - confirm before enabling.
#         '$merge': {
#             'into': {
#                 'db': 'analytics-db',
#                 'coll': 'hashtagStats'
#             },
#             'on': '_id',
#             'whenMatched': 'replace',
#             'whenNotMatched': 'insert'
#         }
    }
]

In [7]:
# wipe whatever this cell displayed on its previous run
clear_output()

# run the pipeline against the source `contents` collection and pretty-print
# every resulting document
aggregationResult = contents.aggregate(cursor)
pprint(list(aggregationResult))

[{'__v': 0,
  'aggregator': {'ageScore': 0.0441625540176349,
                 'contributions': [{'_id': ObjectId('6149a7c33e6269b3adabc900'),
                                    'contents': [ObjectId('615ad8d5ae2035074d2d988f'),
                                                 ObjectId('615b2d37ae203546822d9b51'),
                                                 ObjectId('615b7a8dae2035647c2d9cf9'),
                                                 ObjectId('615b7a8eae2035813c2d9d01'),
                                                 ObjectId('615b8195ae20356d0b2d9d21'),
                                                 ObjectId('615bd5f5ae20354c922d9e55'),
                                                 ObjectId('615bd5f5ae20353e402d9e5d'),
                                                 ObjectId('615c3865ae20350b232da08a'),
                                                 ObjectId('615c3865ae203542ac2da092'),
                                                 ObjectId('615c699dae203512

                                    'contents': [ObjectId('6155b5fd8bcb97424b487272')],
                                    'contributionCount': 1}],
                 'engagementScore': 0,
                 'hastagDiversityScore': 1.0},
  'createdAt': datetime.datetime(2021, 9, 30, 13, 5, 1, 554000),
  'name': 'napoli',
  'score': 7.684421069608669e-05,
  'updatedAt': datetime.datetime(2021, 9, 30, 13, 5, 1, 554000)},
 {'__v': 0,
  'aggregator': {'ageScore': 0.07983309942626282,
                 'contributions': [{'_id': ObjectId('614addffec903a9d987eb580'),
                                    'contents': [ObjectId('6162a5b5a52e723df817156b'),
                                                 ObjectId('6162ec05a52e72f657171651')],
                                    'contributionCount': 2}],
                 'engagementScore': 0,
                 'hastagDiversityScore': 0.5},
  'createdAt': datetime.datetime(2021, 10, 10, 8, 35, 1, 247000),
  'name': 'worldmentalhealthday',
  'score': 0.

  'createdAt': datetime.datetime(2021, 10, 5, 22, 5, 1, 676000),
  'name': 'lautaro',
  'score': 0.003188950704434235,
  'updatedAt': datetime.datetime(2021, 10, 5, 22, 5, 1, 676000)},
 {'__v': 0,
  'aggregator': {'ageScore': 0.1982830679208155,
                 'contributions': [{'_id': ObjectId('614addffec903a9d987eb580'),
                                    'contents': [ObjectId('6159c69e248bfd6c1bf852e0'),
                                                 ObjectId('615c0025ae20354e792d9f13'),
                                                 ObjectId('615e9516db45d07a7c925b2b'),
                                                 ObjectId('6161a18ea52e7269d7171289'),
                                                 ObjectId('6164a6fd17d82556df0799b3')],
                                    'contributionCount': 5}],
                 'engagementScore': 0,
                 'hastagDiversityScore': 0.2},
  'createdAt': datetime.datetime(2021, 10, 3, 15, 5, 2, 505000),
  'name': 'photooftheday

                                                 ObjectId('6163c4d1a52e7230ff17188b'),
                                                 ObjectId('6163c4d1a52e72c835171893'),
                                                 ObjectId('6163c856a52e72ab531718a5'),
                                                 ObjectId('6163cbda25d4859788fc9851'),
                                                 ObjectId('6163cbda25d4857429fc9859'),
                                                 ObjectId('6163cbdb25d485b893fc9861'),
                                                 ObjectId('6163cf5e25d485df9cfc9869'),
                                                 ObjectId('6163cf5e25d4852e11fc9871'),
                                                 ObjectId('6163d2e125d4857458fc9881'),
                                                 ObjectId('6163d66525d485d034fc9889'),
                                                 ObjectId('6163d66525d485a9f9fc9891'),
                                           