# Yahoo Groups message scraper

This notebook provides code that will download messages from *private* Yahoo Groups. Another notebook, `YahooScra.ipynb`, uses [Selenium](http://selenium-python.readthedocs.io/). This notebook uses [requests](http://docs.python-requests.org/en/latest/user/install/).

Another GitHub.com repository ([YahooGroups-Archiver](https://github.com/andrewferguson/YahooGroups-Archiver)) provided guidance related to accessing *private* Yahoo Groups. After logging into the private group you need to provide two pieces of information from the cookies:

> Cookie information can be found through the use of a plug-in for
> your web browser. (I use 'Cookie Manager' on FireFox, although
> there are many other options for FireFox and other browsers). The
> two cookies you are looking for are called Y and T, and they are 
> linked to the domain yahoo.com. Extract the data from these 
> cookies, and paste it into the appropriate variables... a cookie
> will expire after a certain amount of time, which varies between 
> computers. This means that you may have to re-fetch the Y and T 
> cookie data every few days, or you will not be able to archive 
> private groups. ([YahooGroups-Archiver](https://github.com/andrewferguson/YahooGroups-Archiver))

(Last tested April 21, 2018)

In [None]:
import pandas as pd
from pandas import Series, DataFrame
from bs4 import BeautifulSoup
import requests
import json
import datetime
import time
import os

In [None]:
# Ask the user which Yahoo Group to scrape.
# The prompt is passed positionally: Python's built-in input() accepts no
# keyword arguments, so input(prompt=...) raises TypeError wherever input()
# has not been replaced by a more permissive implementation.
grp_name = input('What is the group name you seek to scrape?  ')
# Fall back to a default group name when nothing was entered.
if not grp_name:
    grp_name = 'concatenative'

In [None]:
# These variables take the values of the 'T' and 'Y' cookies (as strings)
# obtained from a browser cookie manager, as discussed above.
cookie_T = ''
cookie_Y = ''

# Log of fetch attempts. One timestamped entry is appended *before* each
# request, so an entry records an attempt, not a confirmed success.
fetch_log = []

# Make sure the output folder exists; open(..., 'w') does not create it.
os.makedirs('msgs', exist_ok=True)

# Iterate through the private group messages. The most recent group message number
# can be found at: https://groups.yahoo.com/api/v1/groups/<groupname>/messages?count=10
# Parse the results to discern the most recent message number.
for i in range(120, 100, -1):
    fetch_log.append(
        '{} Working on message {}.'.format(datetime.datetime.now(), i))

    # For demonstration purposes use a generic group such as 'concatenative'.
    # The timeout keeps one stalled connection from hanging the whole run;
    # without it requests waits indefinitely.
    post = requests.get(
        'https://groups.yahoo.com/api/v1/groups/{}/messages/{}/'.format(
            grp_name, i),
        cookies={'T': cookie_T, 'Y': cookie_Y},
        timeout=30)

    # Yahoo Groups api returns JSON.
    post_parsed = json.loads(post.text)

    # The api result uses html in the messageBody; keep only its text.
    soup = BeautifulSoup(post_parsed['ygData']['messageBody'], 'html.parser')
    soupstring = soup.get_text()

    # Optionally remove comments from html
    # pullstring = soupstring[soupstring.find('<!--'):soupstring.find('-->')+3]
    # cleanstring = soupstring.replace(pullstring,'')

    # Save the message body as a .txt file. The context manager closes the
    # file even if the write fails.
    with open(
            os.path.join('msgs', '{}_{}_post.txt'.format(grp_name, i)),
            'w', encoding='utf-8') as post_file:
        # post_file.write(cleanstring)
        post_file.write(soupstring)

    # Save the raw api result as a .json file.
    with open(
            os.path.join('msgs', '{}_{}_json.json'.format(grp_name, i)),
            'w', encoding='utf-8') as json_file:
        json_file.write(post.text)

    # Optionally pause to assist in avoiding CAPTCHA and other anti-robot features.
    time.sleep(.1)

In [None]:
# Save fetch_log for later reference.
# The file name is <group>_fetch_log_<YY-MM-DD-HHMM>.log, using the current
# local time. The `with` statement closes the file, so no explicit close is
# needed (the former trailing `logfile.close` lacked parentheses and did
# nothing).
with open(grp_name + '_fetch_log_' +
          datetime.datetime.now().strftime('%y-%m-%d-%H%M') +
          '.log', mode='w') as logfile:
    # Two header lines, then one line per logged fetch attempt.
    print('This is the log of fetched messages file from {}'.format(
        str(datetime.datetime.now())), file=logfile)
    print('Yahoo Group name {}.'.format(grp_name), file=logfile)
    for fetch_line in fetch_log:
        print(fetch_line, file=logfile)

In [None]:
# Ask the user for the folder that holds the downloaded message files.
# The prompt is passed positionally: Python's built-in input() accepts no
# keyword arguments, so input(prompt=...) raises TypeError wherever input()
# has not been replaced by a more permissive implementation.
folder_location = input(
    'What is the location of messages (no input will default to msgs)?  ')
# Give folder_location a default if no input.
if not folder_location:
    folder_location = 'msgs'

In [None]:
# Build a dataset from the files created above.

# Define a list to log errors.
error_log = []

# Define a list to hold structured data (one inner list per message).
grandlist = []

# Iterate through the message files.
# NOTE(review): this range is hard-coded and differs from the range used when
# fetching; adjust it to the message numbers actually saved in the folder.
for i in range(32811, 32800, -1):
    try:
        # Read the saved api result. The context manager closes the file
        # even when parsing or a key lookup below raises.
        with open(
                os.path.join(
                    folder_location,
                    '{}_{}_json.json'.format(grp_name, i)),
                'r', encoding='utf-8') as work_file:
            work_parse = json.loads(work_file.read())

        record = work_parse['ygData']

        # Current record: the metadata fields in the order later used for the
        # DataFrame columns. postDate is kept both raw and as a formatted
        # local-time string (it is converted via int() as a Unix timestamp).
        mylist = [
            record['userId'],
            record['authorName'],
            record['subject'],
            record['postDate'],
            str(datetime.datetime.fromtimestamp(
                int(record['postDate'])).strftime('%Y-%m-%d %H:%M:%S')),
            record['msgId'],
            record['prevInTopic'],
            record['nextInTopic'],
            record['prevInTime'],
            record['nextInTime'],
            record['topicId'],
            record['numMessagesInTopic']]

        # Optionally add the message body to the current observation.
        # My use case involved adding the message body using Stata instead of Python.
        with open(
                os.path.join(
                    folder_location,
                    '{}_{}_post.txt'.format(grp_name, i)),
                'r', encoding='utf-8') as work_file:
            mylist.append(work_file.read())

        # Add the current observation to the structured data set.
        grandlist.append(mylist)
    except FileNotFoundError:
        # If file not found, provide output and log error.
        print('Message number ' + str(i) + ' - Not found.')
        error_log.append('FileNotFoundError. Message number {}.'.format(i))
    except KeyError:
        # If any of the JSON keys (variables) not found, provide output and log error.
        print('Message number ' + str(i) + ' - KeyError.')
        # The placeholder was missing before, so the number never reached the log.
        error_log.append('KeyError. Message number {}.'.format(i))
    except OSError:
        # If OSError, provide output and log error.
        print('Message number ' + str(i) + ' - OSError.')
        error_log.append('OSError. Message number {}.'.format(i))

In [None]:
# Save error_log for later reference.
# The file name is <group>_err_log_<YY-MM-DD-HHMM>.log, using the current
# local time. The `with` statement closes the file, so no explicit close is
# needed (the former trailing `logfile.close` lacked parentheses and did
# nothing).
with open(grp_name + '_err_log_' +
          datetime.datetime.now().strftime('%y-%m-%d-%H%M') +
          '.log', mode='w') as logfile:
    # Two header lines, then one line per logged error.
    print('This is the error log file from {}'.format(
        str(datetime.datetime.now())), file=logfile)
    print('Yahoo Group name {}.'.format(grp_name), file=logfile)
    for error_line in error_log:
        print(error_line, file=logfile)

In [None]:
# Turn the list of per-message records into a pandas DataFrame.
# Column labels follow the order in which each record's fields were collected.
grand_df = pd.DataFrame(
    data=grandlist,
    columns=[
        'userId',
        'authName',
        'subject',
        'Unix',
        'Date',
        'msgId',
        'preInTpc',
        'nxtInTpc',
        'preInTime',
        'nxtInTime',
        'topicId',
        'MssgsInTopic',
        'msgBody',
    ],
)

In [None]:
# Preview the first five rows to check the result.
grand_df.head(n=5)

In [None]:
# Write the DataFrame to <group>_messages.csv in the working directory.
grand_df.to_csv('{}_messages.csv'.format(grp_name))

In [None]:
# Write the DataFrame to <group>_messages.dta (Stata format).
# Problems on this. See: https://github.com/pandas-dev/pandas/issues/16450
grand_df.to_stata('{}_messages.dta'.format(grp_name))

In [None]:
# Save to Excel as <group>_messages.xlsx, on a sheet named 'Sheet1'.
# The writer is used as a context manager so the workbook is written and
# closed on exit, even if to_excel raises. This replaces the explicit
# writer.save() call, which newer pandas releases no longer provide.
with pd.ExcelWriter(grp_name + '_messages.xlsx', engine='xlsxwriter') as writer:
    grand_df.to_excel(writer, sheet_name='Sheet1')