In [1]:
import pymongo
import os
import pandas as pd
from pandas.io.json import json_normalize

import yaml
from collections import Counter
from datetime import datetime
import sys
# Path to the `src` directory that sits two levels above the current working
# directory (presumably the repository root when the notebook is run from its
# own folder — TODO confirm).
SRC = os.path.join(os.path.dirname(os.path.dirname(os.getcwd())), "src")
# Guard the append so re-running this cell does not keep adding duplicate
# entries to sys.path.
if SRC not in sys.path:
    sys.path.append(SRC)
from content_api.details_utils import extract_from_details, cs_extract_text, cs_extract_links

In [2]:
### Resolve the data and config directories

In [3]:
# Data directory is taken from the environment; this is None when DATA_DIR is unset.
DATA_DIR = os.environ.get("DATA_DIR")
# Config folder under SRC, and the YAML file naming the document types that are
# excluded from the topic taxonomy.
config = os.path.join(SRC, "config")
black_list_path = os.path.join(
    config,
    "document_types_excluded_from_the_topic_taxonomy.yml",
)

In [4]:
# Read the excluded document types from the YAML config and sort them.
# NOTE: `yaml.safe_load` deliberately replaces the bare `yaml.load(stream)`:
# calling `yaml.load` without a Loader is deprecated since PyYAML 5.1 and
# raises TypeError in PyYAML 6+, and the safe loader never constructs
# arbitrary Python objects from the file.
with open(black_list_path, 'r') as stream:
    blacklisted_content_page = sorted(yaml.safe_load(stream)['document_types'])
# Preview the first five entries.
blacklisted_content_page[0:5]

['about',
 'about_our_services',
 'access_and_opening',
 'business_support_finder',
 'coming_soon']

In [5]:
### Connect to the locally running MongoDB instance

In [6]:
# Create a client for the MongoDB server at localhost, port 27017.
myclient = pymongo.MongoClient("mongodb://localhost:27017/")

In [7]:
# Print the names of the databases available on the server
# (sanity check before selecting one).
print(myclient.list_database_names())

['admin', 'config', 'content_items', 'local']


In [8]:
# Select the `content_items` database and, inside it, the collection that
# carries the same name.
mydb = myclient["content_items"]
mycol = mydb["content_items"]

In [10]:
def _field_exists(field):
    """Return a clause that matches documents in which `field` is present."""
    return {field: {"$exists": True}}


def _live_and_whitelisted(link_clause):
    """Return `link_clause` AND document type not blacklisted AND phase is live.

    A fresh dict is built on every call, so the resulting filters do not share
    nested clause objects with one another.
    """
    return {
        "$and": [
            link_clause,
            {"document_type": {"$nin": blacklisted_content_page}},
            {"phase": "live"},
        ]
    }


# Stand-alone clause: document type is not one of the blacklisted ones.
whitelisted_doctypes_filter = {"document_type": {"$nin": blacklisted_content_page}}

# Documents that carry related items OR collection documents.
filter_ = _live_and_whitelisted(
    {
        "$or": [
            _field_exists("expanded_links.ordered_related_items"),
            _field_exists("expanded_links.documents"),
        ]
    }
)

# Documents that carry related items.
filter_related_links = _live_and_whitelisted(
    _field_exists("expanded_links.ordered_related_items")
)

# Documents that carry collection documents.
filter_collection_links = _live_and_whitelisted(
    _field_exists("expanded_links.documents")
)

In [11]:
# Projection for the queries: keep the document's own content_id plus the
# base_path and content_id of every linked item; each listed field is set to 1.
keep_projection = dict.fromkeys(
    (
        "expanded_links.ordered_related_items.base_path",
        "expanded_links.documents.base_path",
        "expanded_links.ordered_related_items.content_id",
        "expanded_links.documents.content_id",
        "content_id",
    ),
    1,
)

In [12]:
# Materialise every document matching the related-links filter, trimmed to the
# projected fields, as a list.
related_links_json = list(
    mycol.find(filter=filter_related_links, projection=keep_projection)
)

In [13]:
# Materialise every document matching the collection-links filter, trimmed to
# the projected fields, as a list.
collection_links_json = list(
    mycol.find(filter=filter_collection_links, projection=keep_projection)
)

In [20]:
# Flatten to one row per related item: the item's own fields become columns,
# and the parent document's `_id` and `content_id` are carried along with a
# `source_` prefix.
related_links_df = json_normalize(
    data=related_links_json,
    record_path=[['expanded_links', 'ordered_related_items']],
    meta_prefix='source_',
    meta=['_id', 'content_id'],
)

In [21]:
# Relabel by column name rather than by position. The labels then stay correct
# whatever order json_normalize emits the columns in, and re-running this cell
# is a no-op instead of a length-mismatch error once `link_type` exists.
# `source__id` is the meta prefix 'source_' + the `_id` meta field; it is
# labelled as the source base path, exactly as the positional labelling did
# (presumably `_id` holds the document's base path — confirm against the store).
related_links_df = related_links_df.rename(columns={
    'base_path': 'destination_base_path',
    'content_id': 'destination_content_id',
    'source__id': 'source_base_path',
})
related_links_df['link_type'] = 'related_link'
# Fail loudly if the normalised frame did not have the expected shape.
assert set(related_links_df.columns) == {
    'destination_base_path', 'destination_content_id',
    'source_base_path', 'source_content_id', 'link_type',
}, "unexpected columns in related_links_df after relabelling"

In [22]:
# Flatten to one row per collection document: the linked document's own fields
# become columns, and the parent document's `_id` and `content_id` are carried
# along with a `source_` prefix.
collection_links_df = json_normalize(
    data=collection_links_json,
    record_path=[['expanded_links', 'documents']],
    meta_prefix='source_',
    meta=['_id', 'content_id'],
)

In [29]:
# Relabel by column name rather than by position. The labels then stay correct
# whatever order json_normalize emits the columns in, and re-running this cell
# is a no-op instead of a length-mismatch error once `link_type` exists.
# `source__id` is the meta prefix 'source_' + the `_id` meta field; it is
# labelled as the source base path, exactly as the positional labelling did
# (presumably `_id` holds the document's base path — confirm against the store).
collection_links_df = collection_links_df.rename(columns={
    'base_path': 'destination_base_path',
    'content_id': 'destination_content_id',
    'source__id': 'source_base_path',
})
collection_links_df['link_type'] = 'collection_link'
# Fail loudly if the normalised frame did not have the expected shape.
assert set(collection_links_df.columns) == {
    'destination_base_path', 'destination_content_id',
    'source_base_path', 'source_content_id', 'link_type',
}, "unexpected columns in collection_links_df after relabelling"

In [30]:
# Stack the related-link rows on top of the collection-link rows.
# sort=False leaves the columns unsorted; each frame keeps its own row index.
all_links = pd.concat(
    objs=[related_links_df, collection_links_df],
    axis=0,
    sort=False,
)

In [35]:
# Display the combined table of related links and collection links.
all_links

Unnamed: 0,destination_base_path,destination_content_id,source_base_path,source_content_id,link_type
0,/courses-qualifications,b4294926-5ff6-4349-8d0d-4a7d54e5de52,/1619-bursary-fund,f4b96a38-5247-4afd-b554-8a258a0e8c93,related_link
1,/learner-support,14bfeca4-b490-4e73-8149-08e8a30d36e9,/1619-bursary-fund,f4b96a38-5247-4afd-b554-8a258a0e8c93,related_link
2,/subsidised-college-transport-16-19,486159ab-fac6-4397-afd4-cc3b994d8ed6,/1619-bursary-fund,f4b96a38-5247-4afd-b554-8a258a0e8c93,related_link
3,/education-maintenance-allowance-ema,8ff35813-edb9-4727-b669-661956ce3d52,/1619-bursary-fund,f4b96a38-5247-4afd-b554-8a258a0e8c93,related_link
4,/unfair-terms-in-sales-contracts,d81a9f29-dffc-4de9-9ba0-b4bdac3d1d89,/accepting-returns-and-giving-refunds,87c4cd96-bd6c-4151-911f-79b2cdf21c46,related_link
5,/online-and-distance-selling-for-businesses,38bc84a8-98a4-42ae-942d-87568d2a83d4,/accepting-returns-and-giving-refunds,87c4cd96-bd6c-4151-911f-79b2cdf21c46,related_link
6,/product-labelling-the-law,8f86935f-2864-414e-98a3-1ebcfd76cc97,/accepting-returns-and-giving-refunds,87c4cd96-bd6c-4151-911f-79b2cdf21c46,related_link
7,/business-support-helpline,42b1ec32-8258-4f42-8d1c-3851082d83c1,/accepting-returns-and-giving-refunds,87c4cd96-bd6c-4151-911f-79b2cdf21c46,related_link
8,/definition-of-disability-under-equality-act-2010,a9969b73-6217-42a2-b40f-f532d668388d,/access-to-elected-office-fund,e12e3c54-b544-4d94-ba1f-9846144374d2,related_link
9,/rights-disabled-person,ed9c6ec7-5465-40cf-b2a6-2803f5287d8d,/access-to-elected-office-fund,e12e3c54-b544-4d94-ba1f-9846144374d2,related_link
