This repository has been archived by the owner. It is now read-only.
Permalink
Cannot retrieve contributors at this time
146 lines (125 sloc)
5.65 KB
| ''' | |
| Yahoo-Groups-Archiver Copyright 2015, 2017, 2018 Andrew Ferguson and others | |
| YahooGroups-Archiver, a simple python script that allows for all | |
| messages in a public Yahoo Group to be archived. | |
| This program is free software: you can redistribute it and/or modify | |
| it under the terms of the GNU General Public License as published by | |
| the Free Software Foundation, either version 3 of the License, or | |
| (at your option) any later version. | |
| This program is distributed in the hope that it will be useful | |
| but WITHOUT ANY WARRANTY; without even the implied warranty of | |
| MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
| GNU General Public License for more details. | |
| You should have received a copy of the GNU General Public License | |
| along with this program. If not, see <http://www.gnu.org/licenses/>. | |
| ''' | |
# Values of the Yahoo cookies named "T" and "Y". Both are sent with every
# request made by group_messages_max() and archive_message(). To archive a
# private group, replace the placeholders with the cookie data of a logged-in
# Yahoo account that has access to the group (taken from the yahoo.com domain).
cookie_T = 'COOKIE_T_DATA_GOES_HERE'
cookie_Y = 'COOKIE_Y_DATA_GOES_HERE'
| import json #required for reading various JSON attributes from the content | |
| import requests #required for fetching the raw messages | |
| import os #required for checking if a file exists locally | |
| import time #required if Yahoo blocks access temporarily (to wait) | |
| import sys #required to cancel script if blocked by Yahoo | |
| import shutil #required for deleting an old folder | |
| import glob #required to find the most recent message downloaded | |
| import time #required to log the date and time of run | |
def _most_recent_archived(groupName):
    """Return the highest message number already saved in ``groupName/``.

    Looks for files named ``<number>.json`` directly inside the folder and
    returns the largest ``<number>``, or 1 when the folder is missing or
    holds no such file. Files whose stem is not an integer are ignored.
    """
    mostRecent = 1
    if not os.path.isdir(groupName):
        return mostRecent
    # List the folder instead of chdir-ing into it, so the process working
    # directory is never left changed if something goes wrong mid-scan.
    for fileName in os.listdir(groupName):
        stem, extension = os.path.splitext(fileName)
        if extension != ".json":
            continue
        try:
            number = int(stem)
        except ValueError:
            # Not one of our "<number>.json" message files; skip it.
            continue
        if number > mostRecent:
            mostRecent = number
    return mostRecent


def archive_group(groupName, mode="update"):
    """Archive the messages of a Yahoo Group into ``<groupName>/<n>.json``.

    Parameters:
        groupName: name of the group; also used as the output folder name
            (relative to the current working directory) and log file stem.
        mode: one of
            "update"  - start at the highest message number already saved,
            "retry"   - start at message 1 and fetch every message that is
                        not on disk yet (including earlier failures),
            "restart" - delete the existing folder and fetch everything.

    Messages whose file already exists are never fetched again. A start and
    a finish line (with the count archived and the elapsed seconds) are
    passed to log(). An unknown mode prints the list of valid modes and
    ends the script via sys.exit().
    """
    log("\nArchiving group '" + groupName + "', mode: " + mode + " , on " + time.strftime("%c"), groupName)
    startTime = time.time()
    msgsArchived = 0

    if mode == "retry":
        # Existing files are skipped in the loop below, so starting at 1
        # re-attempts exactly the messages that are missing.
        firstMsg = 1
    elif mode == "update":
        firstMsg = _most_recent_archived(groupName)
    elif mode == "restart":
        if os.path.exists(groupName):
            shutil.rmtree(groupName)
        firstMsg = 1
    else:
        print ("You have specified an invalid mode (" + mode + ").")
        print ("Valid modes are:\nupdate - add any new messages to the archive\nretry - attempt to get all messages that are not in the archive\nrestart - delete archive and start from scratch")
        sys.exit()

    if not os.path.exists(groupName):
        os.makedirs(groupName)

    lastMsg = group_messages_max(groupName)
    for msgNumber in range(firstMsg, lastMsg + 1):
        if os.path.isfile(groupName + '/' + str(msgNumber) + ".json"):
            continue
        print ("Archiving message " + str(msgNumber) + " of " + str(lastMsg))
        if archive_message(groupName, msgNumber):
            msgsArchived = msgsArchived + 1

    log("Archive finished, archived " + str(msgsArchived) + ", time taken is " + str(time.time() - startTime) + " seconds", groupName)
def group_messages_max(groupName):
    """Return the ``totalRecords`` value Yahoo reports for ``groupName``.

    Requests the newest message entry from the Yahoo Groups messages API,
    sending the module-level cookie_T / cookie_Y values, and returns
    ``ygData.totalRecords`` from the JSON reply. archive_group() uses this
    value as the upper bound of the message numbers to fetch.

    If the reply is not JSON and looks like Yahoo's sign-in page, an
    explanation about private groups is printed and the script ends via
    sys.exit(). Any other non-JSON reply re-raises the ValueError from
    json.loads().
    """
    url = 'https://groups.yahoo.com/api/v1/groups/' + groupName + '/messages?count=1&sortOrder=desc&direction=-1'
    # The session is needed for this single request only; the with-block
    # closes it afterwards instead of leaving its connections open.
    with requests.Session() as s:
        resp = s.get(url, cookies={'T': cookie_T, 'Y': cookie_Y})
        pageHTML = resp.text

    try:
        pageJson = json.loads(pageHTML)
    except ValueError:
        if "Stay signed in" in pageHTML and "Trouble signing in" in pageHTML:
            #the user needs to be signed in to Yahoo
            print ("Error. The group you are trying to archive is a private group. To archive a private group using this tool, login to a Yahoo account that has access to the private groups, then extract the data from the cookies Y and T from the domain yahoo.com . Paste this data into the appropriate variables (cookie_Y and cookie_T) at the top of this script, and run the script again.")
            sys.exit()
        # Unknown non-JSON reply: let the original error (and traceback) through.
        raise
    return pageJson["ygData"]["totalRecords"]
def archive_message(groupName, msgNumber, depth=0):
    """Download one raw message and save it as ``<groupName>/<msgNumber>.json``.

    Parameters:
        groupName: group to read from; also the folder the file is written to.
        msgNumber: number of the message to fetch.
        depth: number of failed attempts already made (callers normally
            leave this at 0). A request is retried while depth < 3.

    Returns True when the message was fetched (HTTP 200) and written, False
    when every attempt returned a non-200 status other than 500. If the
    final attempt returns HTTP 500 the script assumes Yahoo is blocking
    access, logs an explanation and ends via sys.exit().

    Only a response with status 200 is ever written to disk. The previous
    recursive implementation discarded the result of its retry and then
    wrote the body of the *failed* response over the file the successful
    retry had just saved.
    """
    url = 'https://groups.yahoo.com/api/v1/groups/' + groupName + '/messages/' + str(msgNumber) + '/raw'

    with requests.Session() as s:
        while True:
            resp = s.get(url, cookies={'T': cookie_T, 'Y': cookie_Y})
            if resp.status_code == 200:
                break

            #some other problem, perhaps being refused access by Yahoo?
            #retry for a max of 3 times anyway
            if depth < 3:
                print ("Cannot get message " + str(msgNumber) + ", attempt " + str(depth+1) + " of 3 due to HTTP status code " + str(resp.status_code))
                time.sleep(0.1)
                depth = depth + 1
                continue

            if resp.status_code == 500:
                #we are most likely being blocked by Yahoo
                log("Archive halted - it appears Yahoo has blocked you.", groupName)
                log("Check if you can access the group's homepage from your browser. If you can't, you have been blocked.", groupName)
                log("Don't worry, in a few hours (normally less than 3) you'll be unblocked and you can run this script again - it'll continue where you left off." ,groupName)
                sys.exit()
            log("Failed to retrive message " + str(msgNumber) + " due to HTTP status code " + str(resp.status_code), groupName )
            return False

    # Written as UTF-8 bytes so the result does not depend on the
    # platform's default text encoding.
    with open(groupName + "/" + str(msgNumber) + ".json", "wb") as writeFile:
        writeFile.write(resp.text.encode('utf-8'))
    return True
# The module-level flag "writeLogFile" is assigned only when this file runs
# as a script. log() therefore looks it up defensively, so the functions in
# this module also work when imported.
def log(msg, groupName):
    """Print ``msg`` and, unless disabled, append it to ``<groupName>.txt``.

    Parameters:
        msg: text to report.
        groupName: stem of the log file; the line is appended to
            ``groupName + ".txt"``, preceded by a newline.

    File logging is controlled by the module-level ``writeLogFile`` flag.
    When the flag has never been set, file logging is on, which is the
    same default the script itself uses.
    """
    print (msg)
    if globals().get("writeLogFile", True):
        # The with-block closes the file even if the write fails.
        with open(groupName + ".txt", "a") as logF:
            logF.write("\n" + msg)
if __name__ == "__main__":
    # Command line: <groupName> [mode] [nologs]
    # File logging (read by log()) is on unless "nologs" is given.
    writeLogFile = True
    # Work from the script's own directory so the archive folder and log
    # file are created next to it, wherever the script was started from.
    os.chdir(os.path.dirname(os.path.abspath(__file__)))
    if "nologs" in sys.argv:
        print ("Logging mode OFF")
        writeLogFile = False
        sys.argv.remove("nologs")
    if len(sys.argv) < 2:
        # Without a group name there is nothing to archive; explain the
        # expected arguments instead of failing with an IndexError.
        print ("Usage: " + sys.argv[0] + " <groupName> [update|retry|restart] [nologs]")
        sys.exit(1)
    if len(sys.argv) > 2:
        archive_group(sys.argv[1], sys.argv[2])
    else:
        archive_group(sys.argv[1])