# Using tweets to evaluate the performance of an IR system

In [1]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm

In [2]:
import pymongo

## Create dataset and ground truth

In [3]:
# Handle to the 'tweets' collection of the 'twitter' database, reached through a
# MongoClient created with no arguments (i.e. the client's default connection).
# Note: despite its name, `db` is the collection, not the database.
db = pymongo.MongoClient()['twitter']['tweets']

In [4]:
# Materialize every document of the collection (unfiltered find()) as an in-memory list.
tweets = list(db.find())

In [6]:
# Map each tweet id to its text; if an id occurs more than once, the last text wins.
corpus = {record['id']: record['text'] for record in tweets}

In [12]:
# Build the ground-truth tables from the tweets' context annotations:
#   entities / domains : id -> full annotation record (first record seen is kept)
#   metadata           : one {entity, domain, tweet} row per annotation
entities, domains = {}, {}
metadata = []
for tweet in tweets:
    # Tweets without a 'context_annotations' key contribute no rows.
    for annotation in tweet.get('context_annotations', ()):
        domain, entity = annotation['domain'], annotation['entity']
        # setdefault only stores the record the first time an id is encountered.
        domains.setdefault(domain['id'], domain)
        entities.setdefault(entity['id'], entity)
        metadata.append({
            'entity': entity['id'], 'domain': domain['id'], 'tweet': tweet['id']
        })
# Explicit columns keep M.entity / M.domain / M.tweet usable even when no tweet
# carries annotations (an empty record list would otherwise yield no columns).
M = pd.DataFrame(metadata, columns=['entity', 'domain', 'tweet'])

In [19]:
# The dicts are keyed by id, so the constructor puts one record per column;
# transposing yields one row per entity / domain instead.
E = pd.DataFrame(entities).transpose()
D = pd.DataFrame(domains).transpose()

In [20]:
# Preview the first two rows of the entity table.
E.head(2)

Unnamed: 0,id,name,description
1220701888179359745,1220701888179359745,COVID-19,
10000277815,10000277815,English Premier League Soccer,Action from English Premier League Soccer matc...


In [21]:
# Preview the first two rows of the domain table.
D.head(2)

Unnamed: 0,id,name,description
123,123,Ongoing News Story,Ongoing News Stories like 'Brexit'
3,3,TV Shows,Television shows from around the world


In [22]:
# Preview the first two rows of the entity/domain/tweet link table.
M.head(2)

Unnamed: 0,entity,domain,tweet
0,1220701888179359745,123,1463847474049789952
1,10000277815,3,1463839920204787716


## Expected results

In [26]:
# Distinct entity / domain names, used as the evaluation queries.
# dict.fromkeys de-duplicates while keeping first-appearance order. A plain set
# of strings iterates in an order that can change between interpreter sessions
# (string hash randomization), which made the query lists irreproducible.
entity_queries = list(dict.fromkeys(E.name.values))
domain_queries = list(dict.fromkeys(D.name.values))

In [31]:
def _tweets_for_name(names, link_column, query):
    """Return the set of tweet ids in ``M`` linked to rows of ``names`` called ``query``.

    names: DataFrame with ``id`` and ``name`` columns (``E`` or ``D``).
    link_column: column of ``M`` that stores the ids taken from ``names``.
    query: name to look up; several ids may share the same name.
    """
    # Bracket access avoids any clash between column names and DataFrame attributes.
    matching_ids = names.loc[names['name'] == query, 'id'].values
    # One membership filter over M replaces a full scan (and a set union) per id.
    linked = M.loc[M[link_column].isin(matching_ids), 'tweet']
    return set(linked.values)


def get_entity_results(query):
    """Return the set of tweet ids annotated with an entity named ``query``.

    Reads the module-level tables ``E`` (entities) and ``M`` (links).
    An unknown name yields an empty set.
    """
    return _tweets_for_name(E, 'entity', query)


def get_domain_results(query):
    """Return the set of tweet ids annotated with a domain named ``query``.

    Reads the module-level tables ``D`` (domains) and ``M`` (links).
    An unknown name yields an empty set.
    """
    return _tweets_for_name(D, 'domain', query)

In [35]:
# Spot check: count the tweets linked to the 'TV Shows' domain.
len(get_domain_results('TV Shows'))

2042

In [36]:
# Pair every tweet id listed in the link table with its text from the corpus.
# NOTE(review): M holds one row per annotation, so a tweet with several
# annotations appears once per row here — confirm the duplicates are intended.
search_base = [(tweet_id, corpus[tweet_id]) for tweet_id in M.tweet.values]

In [37]:
# Number of (tweet id, text) pairs in the search base.
len(search_base)

63005