# Analyze and prepare the Facebook dataset

### 1. Import the libraries

In [2]:
# This Source Code Form is subject to the terms of the MPL
# License. If a copy of the same was not distributed with this
# file, You can obtain one at
# https://github.com/akhilpandey95/altpred/blob/master/LICENSE.

import csv
import glob
import json
import numpy as np
from tqdm import tqdm
import dask.dataframe as dd

%matplotlib inline

### 2. Read the data

In [3]:
# input: wildcard pattern covering every .txt key file one directory level below keys/
path = '/media/hector/DATA/datalab-data/combined_file/keys/*/*.txt'
# output: location of the combined csv produced later in the notebook
final = '/media/hector/DATA/datalab-data/facebook_j2018_full.csv'

# expand the wildcard pattern into a concrete list of matching file paths
files = list(glob.iglob(path))

### 3. Analyze the data

#### 3.1 Print out the prospective column names for the `posts` category

In [4]:
## read a sample file
# the context manager closes the handle as soon as the lines are loaded
# (the previous bare open(...).readlines() left the file open)
with open('/media/hector/DATA/datalab-data/combined_file/keys/100/10029320.txt',
          encoding='utf-8') as sample_file:
    f = sample_file.readlines()

## check the keys in the dict
# every line is parsed as a standalone JSON document; index 29 is simply the
# record picked for inspection here
d_1 = json.loads(f[29])

## print the keys for facebook
# note: d_1['posts']['facebook'] is a list of post dicts, hence the [0]
print("Keys in the dict 'posts{facebook}'", d_1['posts']['facebook'][0].keys())

Keys in the dict 'posts{facebook}' dict_keys(['license', 'title', 'url', 'author', 'summary', 'citation_ids', 'posted_on'])


### 4. Script for writing the data into a csv

In [5]:
# write the file to a csv
# sample of ONE entry of data['posts']['facebook'] (the value itself is a
# list of such dicts, see the d_1['posts']['facebook'][0] inspection above):
'''
{
  'license': 'public',
  'title': 'Transfer of Methylprednisolone into Breast Milk in a Mother with Multiple Sclerosis',
  'url': 'https://www.facebook.com/permalink.php?story_fbid=1027446043938962&id=508976565785915',
  'author': {'url': 'https://www.facebook.com/508976565785915',
   'facebook_wall_name': 'Journal of Human Lactation',
   'image': 'https://graph.facebook.com/508976565785915/picture',
   'id_on_source': '508976565785915',
   'name': 'Journal of Human Lactation'},
  'summary': 'New Papers OnlineFirst at JHL!\n\nCase Report\n\nTransfer of Methylprednisolone into Breast Milk in a Mother with Multiple Sclerosis\n\nBy Cooper et al.\nhttp://jhl.sagepub.com/content/early/2015/02/12/0890334415570970.full\n\n\nInsights in Policy\n\nHow Research on C',
  'citation_ids': [3713571, 3713588],
  'posted_on': '2015-02-18T23:00:01+00:00'
}
'''


def _text(mapping, key):
    """Return ``mapping[key]`` if ``mapping`` is a dict and the value is a str, else ''.

    Centralises the "key present and value is a string, otherwise empty"
    rule that every csv column follows.
    """
    if isinstance(mapping, dict):
        value = mapping.get(key)
        if isinstance(value, str):
            return value
    return ''


def _facebook_posts(posts):
    """Return the facebook post dicts stored under ``posts['facebook']``.

    The value is a list of post dicts in the sample record; a single dict is
    accepted as well. When no usable post exists a list holding one empty
    dict is returned, so the caller still emits one row with blank fb_*
    columns for that record.
    """
    facebook = posts.get('facebook') if isinstance(posts, dict) else None
    if isinstance(facebook, dict):
        return [facebook]
    if isinstance(facebook, list):
        found = [post for post in facebook if isinstance(post, dict)]
        if found:
            return found
    return [{}]


def _record_rows(data):
    """Yield the csv rows for one parsed JSON record.

    Nothing is yielded unless the record is a dict with an 'altmetric_id'
    and a non-empty 'posts' value. Otherwise one row is yielded per facebook
    post, each carrying the same paper_* columns taken from 'citation'.
    """
    if not isinstance(data, dict) or 'altmetric_id' not in data:
        return
    posts = data.get('posts')
    if not posts:
        return

    # a missing 'citation' yields blank paper_* columns instead of a KeyError
    citation = data.get('citation')
    # NOTE(review): only str values are kept, so if the *_subjects fields are
    # lists in the raw data they end up as '' here -- confirm against the data.
    paper_columns = [_text(citation, key) for key in (
        'title', 'abstract', 'doi', 'pubdate',
        'subjects', 'publisher_subjects', 'scopus_subjects')]

    for post in _facebook_posts(posts):
        author = post.get('author')
        yield [data['altmetric_id'],
               _text(post, 'title'), _text(post, 'url'),
               _text(post, 'summary'), _text(post, 'posted_on'),
               _text(author, 'url'), _text(author, 'name')] + paper_columns


# newline='' is what the csv module expects for files handed to csv.writer;
# utf-8 keeps non-ASCII post text writable regardless of the locale
with open(final, 'w', newline='', encoding='utf-8') as final_file:
    datawriter = csv.writer(final_file, delimiter=',', quotechar='|',
                            quoting=csv.QUOTE_MINIMAL)
    datawriter.writerow(['altmetric_id', 'fb_post_title', 'fb_post_url',
                         'fb_post_description', 'fb_post_date', 'fb_author_url',
                         'fb_author_name', 'paper_title', 'paper_abstract',
                         'paper_doi', 'paper_pubdate', 'paper_subjects',
                         'paper_publisher_subjects', 'paper_scopus_subjects'])

    for file in tqdm(files):
        try:
            with open(file, encoding='utf-8') as f:
                # stream line by line instead of loading the whole file
                for text_data in f:
                    if not text_data.strip():
                        # json.loads would fail on a blank line
                        continue
                    datawriter.writerows(_record_rows(json.loads(text_data)))
        except IsADirectoryError:
            # the glob pattern can match a directory; skip it. Every other
            # I/O error still propagates (IsADirectoryError is the EISDIR
            # case, so the un-imported errno module is no longer needed).
            continue

100%|██████████| 380518/380518 [2:34:52<00:00, 40.95it/s]  
