# Web Scraping Code

This is the code that will be used for the various python scrapers. Each section will detail out its own scenario-specific code. I expect we'll have specific code for each type of scraper.

In [1]:
import requests
import json
from time import sleep, time
import random # to set sleep values

# Meetup

In [2]:
sources = ["https://www.meetup.com/ApolloMeteorNYC/events/", "https://www.hellomonday.com/", "https://www.meetup.com/ApolloMeteorNYC/members/"]
mgroups = ["Data-Science-Panel","ApolloMeteorNYC","sfomug","New-York-MongoDB-User-Group"]

all_members = []
group_listing =[]

total_members = [1505,2866,3146,6575]
total_members = [25,100,42,13]


PAGESIZE = 30

### Meetup member listings

A short description here.

In [3]:
def getOne():
    
    web_r = requests.get("https://www.meetup.com/mu_api/urlname/members?queries=%28endpoint%3Agroups%2FApolloMeteorNYC%2Fmembers%2Clist%3A%28dynamicRef%3Alist_groupMembers_apollometeornyc_all%2Cmerge%3A%28isReverse%3A%21f%29%29%2Cmeta%3A%28method%3Aget%29%2Cparams%3A%28filter%3Aall%2Cpage%3A1%2Csort%3Ajoined%29%2Cref%3AgroupMembers_apollometeornyc_all%29")
    # what we lose in going so deep is the assocition with the specific group
    # so it may be better to keep it at just web_r.json().get("responses")[0]
    # outer_list = web_r.json().get("responses")[0].get('value').get('value')
    outer_list = web_r.json().get("responses")[0]
    
    print(len(outer_list))
    return outer_list

all_members.append(getOne())

3


In [4]:
len(all_members)
#all_members[1]
len(all_members[0])

3

In [5]:
# requires a match pattern and a meetup name.

# page_grabs calculates the number of calls required to the XHR URL to get all members.

def page_grabs(total_members):
    total_pages = total_members//PAGESIZE
    if total_members%PAGESIZE > 0:
        total_pages = total_pages + 1
    print(f"Making {total_pages} fetches\n")
    return total_pages

def makeXhrUrl(meetup_group, page):
    source_url = "https://www.meetup.com/mu_api/urlname/members?queries=%28endpoint%3Agroups%2F{meetup_group}%2Fmembers%2Clist%3A%28dynamicRef%3Alist_groupMembers_{meetup_group_lc}_all%2Cmerge%3A%28isReverse%3A%21f%29%29%2Cmeta%3A%28method%3Aget%29%2Cparams%3A%28filter%3Aall%2Cpage%3A{page}%2Csort%3Ajoined%29%2Cref%3AgroupMembers_{meetup_group_lc}_all%29".format(meetup_group=meetup_group, page=page, meetup_group_lc=meetup_group.lower())
    #print("Assemble the URL and inject the appropriate parameters; start on the first page")
    return source_url
    

def download_listing(meetup_group, total_members):
    filename = "data/{group}.json".format(group=meetup_group)
    total_pages = page_grabs(total_members)
    try:
        with open("{fn}".format(fn=filename),"w") as outfile:
            for page in range(total_pages):
                web_req = requests.get(makeXhrUrl(meetup_group, page))
                all_members.append(web_req.json().get("responses")[0])
                duration = random.randint(2, 13)
                sleep(duration)
                print(f"Slept {duration} after fetching {meetup_group} page {page} listing!")
            json.dump(all_members, outfile)

    except:
        print("Encountered a problem writing while iterating.")
        
    print(f"Saved all_members data to {outfile.name}.\n\n")        
    print(all_members)

    return all_members
    

### Meetup classic profile page details

Make sure to scrape these pages with a logged-in meetup.com account, and you will get the full member names. So use Selenium.

In [6]:
from selenium import webdriver
from bs4 import BeautifulSoup

In [7]:
# some code here
def grabThePage(classic_profile, pattern):
    pass

### Meetup new profile page details

Make sure to scrape these pages with a logged-in meetup.com account, and you will get the full member names. So use Selenium.

In [8]:
# some code here too

## Test drivers

The code below will be used to thest the functions written above. They also include test data 

In [9]:

for i, mgroup in enumerate(mgroups):
    print(i, mgroup, total_members[i], page_grabs(total_members[i]))

Making 1 fetches

0 Data-Science-Panel 25 1
Making 4 fetches

1 ApolloMeteorNYC 100 4
Making 2 fetches

2 sfomug 42 2
Making 1 fetches

3 New-York-MongoDB-User-Group 13 1


In [10]:
page_grabs(total_members[1])

Making 4 fetches



4

In [11]:
duration = random.randint(2, 13)
sleep(duration)
print(f"Slept {duration}!")

Slept 11!


In [12]:
# makeXhrUrl(mgroups[0],1)
all_members = []

for i, mgroup in enumerate(mgroups):
    download_listing(mgroup,total_members[i])

Making 1 fetches

Slept 10 after fetching Data-Science-Panel page 0 listing!
Saved all_members data to data/Data-Science-Panel.json.


[{'ref': 'groupMembers_data-science-panel_all', 'value': {'value': [{'id': '196257809', 'joined': '2020-04-12T00:13:50.000Z', 'name': 'Anshul V.', 'role': '', 'status': 'active', 'intro': '', 'title': ''}, {'id': '277521108', 'joined': '2020-03-17T00:33:46.000Z', 'name': 'Anu K.', 'role': '', 'status': 'active', 'intro': '', 'title': '', 'photo': {'id': '286543558', 'highres_link': 'https://secure.meetupstatic.com/photos/member/a/a/2/6/highres_286543558.jpeg', 'thumb_link': 'https://secure.meetupstatic.com/photos/member/a/a/2/6/thumb_286543558.jpeg'}}, {'id': '301447934', 'joined': '2020-02-27T03:47:41.000Z', 'name': 'James M.', 'role': '', 'status': 'active', 'intro': '', 'title': '', 'photo': {'id': '296899633', 'highres_link': 'https://secure.meetupstatic.com/photos/member/4/c/b/1/highres_296899633.jpeg', 'thumb_link': 'https://secure.meetupstatic.co

Slept 7 after fetching ApolloMeteorNYC page 0 listing!
Slept 6 after fetching ApolloMeteorNYC page 1 listing!
Slept 5 after fetching ApolloMeteorNYC page 2 listing!
Slept 8 after fetching ApolloMeteorNYC page 3 listing!
Saved all_members data to data/ApolloMeteorNYC.json.


[{'ref': 'groupMembers_data-science-panel_all', 'value': {'value': [{'id': '196257809', 'joined': '2020-04-12T00:13:50.000Z', 'name': 'Anshul V.', 'role': '', 'status': 'active', 'intro': '', 'title': ''}, {'id': '277521108', 'joined': '2020-03-17T00:33:46.000Z', 'name': 'Anu K.', 'role': '', 'status': 'active', 'intro': '', 'title': '', 'photo': {'id': '286543558', 'highres_link': 'https://secure.meetupstatic.com/photos/member/a/a/2/6/highres_286543558.jpeg', 'thumb_link': 'https://secure.meetupstatic.com/photos/member/a/a/2/6/thumb_286543558.jpeg'}}, {'id': '301447934', 'joined': '2020-02-27T03:47:41.000Z', 'name': 'James M.', 'role': '', 'status': 'active', 'intro': '', 'title': '', 'photo': {'id': '296899633', '

Slept 7 after fetching sfomug page 0 listing!
Slept 4 after fetching sfomug page 1 listing!
Saved all_members data to data/sfomug.json.


[{'ref': 'groupMembers_data-science-panel_all', 'value': {'value': [{'id': '196257809', 'joined': '2020-04-12T00:13:50.000Z', 'name': 'Anshul V.', 'role': '', 'status': 'active', 'intro': '', 'title': ''}, {'id': '277521108', 'joined': '2020-03-17T00:33:46.000Z', 'name': 'Anu K.', 'role': '', 'status': 'active', 'intro': '', 'title': '', 'photo': {'id': '286543558', 'highres_link': 'https://secure.meetupstatic.com/photos/member/a/a/2/6/highres_286543558.jpeg', 'thumb_link': 'https://secure.meetupstatic.com/photos/member/a/a/2/6/thumb_286543558.jpeg'}}, {'id': '301447934', 'joined': '2020-02-27T03:47:41.000Z', 'name': 'James M.', 'role': '', 'status': 'active', 'intro': '', 'title': '', 'photo': {'id': '296899633', 'highres_link': 'https://secure.meetupstatic.com/photos/member/4/c/b/1/highres_296899633.jpeg', 'thumb_link': 'https://secure.meetupstatic

Slept 10 after fetching New-York-MongoDB-User-Group page 0 listing!
Saved all_members data to data/New-York-MongoDB-User-Group.json.


[{'ref': 'groupMembers_data-science-panel_all', 'value': {'value': [{'id': '196257809', 'joined': '2020-04-12T00:13:50.000Z', 'name': 'Anshul V.', 'role': '', 'status': 'active', 'intro': '', 'title': ''}, {'id': '277521108', 'joined': '2020-03-17T00:33:46.000Z', 'name': 'Anu K.', 'role': '', 'status': 'active', 'intro': '', 'title': '', 'photo': {'id': '286543558', 'highres_link': 'https://secure.meetupstatic.com/photos/member/a/a/2/6/highres_286543558.jpeg', 'thumb_link': 'https://secure.meetupstatic.com/photos/member/a/a/2/6/thumb_286543558.jpeg'}}, {'id': '301447934', 'joined': '2020-02-27T03:47:41.000Z', 'name': 'James M.', 'role': '', 'status': 'active', 'intro': '', 'title': '', 'photo': {'id': '296899633', 'highres_link': 'https://secure.meetupstatic.com/photos/member/4/c/b/1/highres_296899633.jpeg', 'thumb_link': 'https://secure.meetupstatic.co

In [13]:
len(all_members)
#all_members[132]

8

In [14]:
all_members[3].get('ref')

'groupMembers_apollometeornyc_all'

In [15]:
with open("data/raw_member_list.json","w") as outfile:
    allofthem = []
    for i in range(len(all_members)):
        allofthem.append(all_members[i].get('value').get('value'))
        print(f"\n{allofthem}\n\npass {i}...{len(allofthem[0])}\n\n")
        
    json.dump(allofthem, outfile)
    
print(f"Saved all_members data to {outfile.name}.\n\n")


print(len(allofthem))


with open("data/raw_member_list.json","r") as infile:
    read_data = json.load(infile)
    # print(len(read_data))





[[{'id': '196257809', 'joined': '2020-04-12T00:13:50.000Z', 'name': 'Anshul V.', 'role': '', 'status': 'active', 'intro': '', 'title': ''}, {'id': '277521108', 'joined': '2020-03-17T00:33:46.000Z', 'name': 'Anu K.', 'role': '', 'status': 'active', 'intro': '', 'title': '', 'photo': {'id': '286543558', 'highres_link': 'https://secure.meetupstatic.com/photos/member/a/a/2/6/highres_286543558.jpeg', 'thumb_link': 'https://secure.meetupstatic.com/photos/member/a/a/2/6/thumb_286543558.jpeg'}}, {'id': '301447934', 'joined': '2020-02-27T03:47:41.000Z', 'name': 'James M.', 'role': '', 'status': 'active', 'intro': '', 'title': '', 'photo': {'id': '296899633', 'highres_link': 'https://secure.meetupstatic.com/photos/member/4/c/b/1/highres_296899633.jpeg', 'thumb_link': 'https://secure.meetupstatic.com/photos/member/4/c/b/1/thumb_296899633.jpeg'}}, {'id': '176589072', 'joined': '2020-02-19T03:42:52.000Z', 'name': 'Austin P.', 'role': '', 'status': 'active', 'intro': '', 'title': '', 'photo': {'id'

In [24]:

def clean_listing(meetup_group, total_members):
    fname = "data/{group}.array.json".format(group=meetup_group)
    total_pages = page_grabs(total_members)
    print(total_pages)
    #return
#'''    
    try:
        with open("{fn}".format(fn=fname),"w") as outfile:
            all_the_data = []
            for page in range(total_pages):
                all_the_data.append(all_members[page].get('value').get('value'))
                print(f"Saved {mgroup} member data to {outfile.name}. Parse this file to get what you need")        
            json.dump(all_the_data, outfile)
    except:
        print("Encountered a problem writing while iterating.")

    print(len(all_the_data))
    return all_the_data

#'''
#ttx.get('value')

In [29]:
for i, mgroup in enumerate(mgroups):
    clean_listing(mgroup,total_members[i])
    
sleep(10)

with open("data/Data-Science-Panel.array.json","r") as infile:
    read_data = json.load(infile)
    print("\n\n")
    print(len(read_data))
    


Making 1 fetches

1
Saved Data-Science-Panel member data to data/Data-Science-Panel.array.json. Parse this file to get what you need
1
Making 4 fetches

4
Saved ApolloMeteorNYC member data to data/ApolloMeteorNYC.array.json. Parse this file to get what you need
Saved ApolloMeteorNYC member data to data/ApolloMeteorNYC.array.json. Parse this file to get what you need
Saved ApolloMeteorNYC member data to data/ApolloMeteorNYC.array.json. Parse this file to get what you need
Saved ApolloMeteorNYC member data to data/ApolloMeteorNYC.array.json. Parse this file to get what you need
4
Making 2 fetches

2
Saved sfomug member data to data/sfomug.array.json. Parse this file to get what you need
Saved sfomug member data to data/sfomug.array.json. Parse this file to get what you need
2
Making 1 fetches

1
Saved New-York-MongoDB-User-Group member data to data/New-York-MongoDB-User-Group.array.json. Parse this file to get what you need
1


JSONDecodeError: Extra data: line 1 column 9542 (char 9541)