In [1]:
import calendar
import csv
import datetime
import os
import time
from urllib import parse

import requests
from bs4 import BeautifulSoup


In [2]:
def getHTML(url, timeout=30):
    """Download ``url`` and return its body parsed as a BeautifulSoup tree.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    timeout : float or tuple, optional
        Forwarded to ``requests.get``. Without a timeout a stalled
        connection blocks forever, which stops a long crawl silently.
        Defaults to 30 seconds.

    Returns
    -------
    BeautifulSoup
        The response text parsed with the built-in ``html.parser``.

    Raises
    ------
    requests.exceptions.RequestException
        Propagated from ``requests.get`` (connection errors, timeouts, ...).

    Notes
    -----
    The HTTP status code is not inspected: an error page is parsed and
    returned like any other page.
    """
    # NOTE(review): verify=False disables TLS certificate verification.
    # It is kept to preserve existing behavior, but it means the server's
    # identity is not checked.
    response = requests.get(url, verify=False, timeout=timeout)
    return BeautifulSoup(response.text, 'html.parser')

In [3]:
def getForum(s):
    """Build one record per forum listed on the discussion index page.

    Parameters
    ----------
    s : parsed page exposing ``find_all`` (a BeautifulSoup tree).

    Returns
    -------
    (fields, rows) : tuple
        ``fields`` is the list of column names; ``rows`` holds one list per
        ``<tr class="post">`` element, in the same column order.

    Raises
    ------
    IndexError / KeyError
        When a row lacks one of the expected elements or attributes.
    """
    fields = ['fid', 'f_url','f_heading','f_subheading','num_topics','num_posts','last_post_date','last_post_uid']

    def _query_value(link, key):
        # First value of `key` in the query string of `link`.
        return parse.parse_qs(parse.urlsplit(link).query)[key][0]

    rows = []
    for forum_row in s.find_all('tr', {'class': 'post'}):
        heading_span = forum_row.find_all('span', {'class': 'forumheading'})[0]
        anchor = heading_span.find_all('a')[0]
        f_heading = anchor.contents[0]
        f_url = anchor['href']
        fid = _query_value(f_url, 'f')

        f_subheading = forum_row.find_all('span', {'class': 'subforumheading'})[0].contents[0]

        # Columns 2, 3 and 4 hold topic count, post count and last-post info.
        cells = forum_row.find_all('td')
        num_topics = cells[2].contents[0]
        num_posts = cells[3].contents[0]
        last_post_cell = cells[4]
        last_post_date = last_post_cell.contents[0]

        if last_post_date == '-':
            # A dash is the placeholder for "no last post"; there is no
            # user link to read in that case.
            last_post_uid = '-'
        else:
            last_post_uid = _query_value(last_post_cell.contents[3]['data-remote'], 'uid')

        rows.append([fid, f_url, f_heading, f_subheading,
                     num_topics, num_posts, last_post_date, last_post_uid])

    return fields, rows

def getTopics(s, fid):
    """Build one record per topic listed on a forum page.

    Parameters
    ----------
    s : parsed page exposing ``find_all`` (a BeautifulSoup tree).
    fid : forum id copied unchanged into the first column of every row.

    Returns
    -------
    (fields, rows) : tuple
        ``fields`` is the list of column names; ``rows`` holds one list per
        ``<tr class="post">`` element. ``rows`` is empty when the page
        lists no topics.

    Raises
    ------
    IndexError / KeyError
        When a row lacks one of the expected elements or attributes.
    """
    fields = ['fid','tid','t_url','t_heading','num_replies','num_views','starter_uid','starter_name','last_post_date']
    rows = []
    for topic_row in s.find_all('tr', {'class': 'post'}):
        anchors = topic_row.find_all('a')

        # First link in the row: the topic itself.
        title_link = anchors[0]
        t_heading = title_link.contents[0]
        t_url = title_link['href']
        tid = parse.parse_qs(parse.urlsplit(t_url).query)['t'][0]

        # Second link in the row: the user who started the topic.
        starter_link = anchors[1]
        starter_name = starter_link.contents[0]
        starter_query = parse.urlsplit(starter_link['data-remote']).query
        starter_uid = parse.parse_qs(starter_query)['uid'][0]

        # Columns 3, 4 and 5: replies, views, date of the last post.
        cells = topic_row.find_all('td')
        num_replies = cells[3].contents[0]
        num_views = cells[4].contents[0]
        last_post_date = cells[5].contents[0]

        rows.append([fid, tid, t_url, t_heading, num_replies, num_views,
                     starter_uid, starter_name, last_post_date])
    return fields, rows


In [4]:
def writeToCSV(field, rows, fname):
    """Write a header row followed by data rows to a CSV file.

    Parameters
    ----------
    field : sequence
        Column names, written as the first row.
    rows : iterable of sequences
        Data rows, written in order after the header.
    fname : str or path-like
        Destination file; it is created or overwritten.

    Notes
    -----
    The file is opened with ``newline=''`` as the ``csv`` module requires;
    without it, platforms that translate line endings get an extra blank
    line after every record. UTF-8 is used explicitly so that non-ASCII
    text is written the same way on every platform.
    """
    # The context manager closes the file; no explicit close() is needed.
    with open(fname, 'w', newline='', encoding='utf-8') as f:
        csvwriter = csv.writer(f)
        csvwriter.writerow(field)
        csvwriter.writerows(rows)

In [5]:
def crawlForumns():
    """Scrape the forum index page, save it as ``forum.csv`` and return it.

    Returns
    -------
    (fields, rows) : tuple
        Column names and forum rows exactly as produced by ``getForum``.
    """
    header, records = getForum(getHTML('https://www.alzconnected.org/discussion.aspx'))
    writeToCSV(header, records, 'forum.csv')
    return header, records
#f_fields, f_rows = crawlForumns()

In [6]:
def crawlTopicsbyForum(f_rows):
    """Crawl every page of every forum and write one topics CSV per forum.

    Parameters
    ----------
    f_rows : iterable of sequences
        Forum rows as returned by ``getForum``: index 0 is the forum id,
        index 1 the forum URL path, index 2 the forum heading.

    For each forum, pages are requested starting at 1 until ``getTopics``
    returns no rows; the collected rows are written to
    ``topics_<heading>.csv``.
    """
    for forum in f_rows:
        fid, f_url, f_heading = forum[0], forum[1], forum[2]
        print('getting info from forum: {}...'.format(f_heading))

        collected = []
        page_no = 1
        while True:
            page_url = "https://www.alzconnected.org{}&page={}".format(f_url, page_no)
            fields, page_rows = getTopics(getHTML(page_url), fid)

            # An empty page marks the end of this forum's topic list.
            if len(page_rows) == 0:
                print('Done with forum: {}.'.format(f_heading))
                break

            # Progress line every tenth page.
            if page_no % 10 == 0:
                print('Successfully getting info from {} page {}'.format(f_heading, page_no))
            collected.extend(page_rows)
            page_no += 1

        writeToCSV(fields, collected, 'topics_{}.csv'.format(f_heading))
#crawlTopicsbyForum(f_rows)

In [7]:
def getPostsAndUser(s, tid):
    """Extract the posts of one topic page and the users who wrote them.

    Parameters
    ----------
    s : parsed page exposing ``find_all`` (a BeautifulSoup tree).
    tid : topic id copied unchanged into the first column of every post.

    Returns
    -------
    (post_fields, user_fields, posts, user_dict) : tuple
        ``posts`` has one list per post in page order; ``user_dict`` maps
        uid to ``[uid, username, join_date, num_post]``. When the author
        details of a post cannot be read (``IndexError``), a message is
        printed and uid, username, join date and post count are all
        recorded as ``0``, so such users share the key ``0``.

    Raises
    ------
    AssertionError
        If the page has a different number of ``post`` and ``postheader``
        rows.
    """
    post_fields = ['tid', 'uid','pid','p_content','is_start','p_date']
    user_fields = ['uid','username','join_date','num_post']
    body_rows = s.find_all('tr', {'class': 'post'})
    header_rows = s.find_all('tr', {'class': 'postheader'})

    assert len(body_rows) == len(header_rows)
    posts = []
    user_dict = {}
    final_position = len(header_rows) - 1

    for position, header in enumerate(header_rows):
        body = body_rows[position]
        anchors = header.find_all('a')

        # The first anchor's name attribute carries the post id after a
        # five-character prefix.
        pid = anchors[0]['name'][5:]

        # The author link is the second anchor, except on the last header
        # row of the page where it is the third.
        author_slot = 2 if position == final_position else 1
        try:
            author_link = anchors[author_slot]
            uid = parse.parse_qs(parse.urlsplit(author_link['data-remote']).query)['uid'][0]
            uname = author_link.contents[0]

            # UserBox text looks like "... Joined: <date> Posts: <count> ...".
            user_box = body.find_all('td', {'class': 'UserBox'})[0].text
            joined_and_count = user_box.split('Joined: ', 1)[1].split('Posts: ')
            join_date = joined_and_count[0]
            post_num = [int(tok) for tok in joined_and_count[1].split() if tok.isdigit()][0]
        except IndexError:
            print('post id {} can\'t get user information'.format(pid))
            uid = uname = join_date = post_num = 0

        content = body.find_all('td', {'class': 'message ekMessage'})[0].text
        # The date is the fourth tab-separated piece from the end of the
        # header cell text.
        header_text = header.find_all('td', {'class': 'postheader'})[0].text
        p_date = header_text.split('\t')[-4]

        posts.append([tid, uid, pid, content, position == 0, p_date])
        user_dict[uid] = [uid, uname, join_date, post_num]

    return post_fields, user_fields, posts, user_dict


In [12]:
"""
def getPostsbyForum(tids, forum_posts, user_all):
    try:
        idx = 0
        print("# tids: ",len(tids))
        for idx,tid in enumerate(tids):
            url = 'https://www.alzconnected.org/discussion.aspx?g=posts&t={}'.format(tid)
            soup = getHTML(url)
            post_fields, user_fields, posts, user_dict = getPostsAndUser(soup,tid)
            forum_posts.extend(posts)
            user_all = {**user_all, **user_dict}
            if verbose and idx % 1 == 0:
                print('Done with {} topics. Fetched {} posts and {} users'.format(idx, len(forum_posts),len(user_all)))
    except TimeoutError:
        print('Connection time out at index {}, reconnecting ...'.format(idx))
        time.sleep(15)
        print('Trying to reconnect ...')
        t = list(tids)
        tids = t[index:]
        print('{} posts left'.len(tids))
        getPostsbyForum(tids, forum_posts, user_all)
"""

In [18]:
import pickle

def getPostsbyForum(tids, forum_posts, user_all):
    """Fetch every topic in ``tids`` and pickle its posts and users.

    For the topic at position ``idx`` in ``tids`` the tuple
    ``(post_fields, user_fields, posts, user_dict)`` returned by
    ``getPostsAndUser`` is written to ``tmp/<idx>.pkl``.

    Parameters
    ----------
    tids : sized iterable
        Topic ids to download (``len()`` is called on it).
    forum_posts, user_all :
        Not used by this implementation; kept so existing calls keep
        working.

    Notes
    -----
    File names use the position within ``tids``, not the topic id, so a
    second call with a different batch overwrites the files of the first.
    """
    # Create the output directory up front so a missing folder does not
    # abort the run after the first page has already been downloaded.
    os.makedirs('tmp', exist_ok=True)

    print("# tids: ",len(tids))
    for idx, tid in enumerate(tids):
        url = 'https://www.alzconnected.org/discussion.aspx?g=posts&t={}'.format(tid)
        soup = getHTML(url)
        post_fields, user_fields, posts, user_dict = getPostsAndUser(soup, tid)
        # Store the field list returned for this page. The earlier code
        # referenced `posts_fields`, a different (global) name, by mistake.
        with open("tmp/{}.pkl".format(idx), "wb") as pkl_file:
            pickle.dump((post_fields, user_fields, posts, user_dict), pkl_file)
        print("Done id {}: {} posts, {} users".format(idx,len(posts),len(user_dict)))

In [19]:
import pandas as pd
import time
from requests.packages.urllib3.exceptions import InsecureRequestWarning
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
# forum = pd.read_csv('forum.csv')
# fnames = forum.f_heading
#user_all = {}

# Driver cell: for each forum name, load its topic list from
# topics_<forum>.csv (the file name pattern written by crawlTopicsbyForum)
# and scrape the posts for one slice of its topics.
forum_lst = ['Caregivers Forum']
for forum_name in forum_lst:
    print('Getting post info from {}...'.format(forum_name))
    df = pd.read_csv('topics_{}.csv'.format(forum_name))
    print(len(df.tid))
    # Only positions 5000-9999 of the topic list are processed here.
    # NOTE(review): looks like one chunk of a manually batched crawl - confirm.
    tids = df.tid[5000:10000]
    # The names below are notebook-level globals. forum_posts and user_all
    # are handed to getPostsbyForum; posts_fields and verbose are not used
    # directly in this cell.
    forum_posts = []
    posts_fields = []
    user_all = {}
    verbose = True
    
    getPostsbyForum(tids, forum_posts, user_all)
            
    # writeToCSV(post_fields, forum_posts, 'posts_{}.csv'.format(forum_name))

# user_fields = ['uid','username','join_date','num_post']
# users = list(user_all.values())
# writeToCSV(user_fields, users, 'user_caregiver.csv')


Getting post info from Caregivers Forum...
33259
# tids:  5000


IndexError: tuple index out of range

In [30]:
# def getPostsbyForum(tids, forum_posts, user_all):
#     try:
#         idx = 0
#         for idx,tid in enumerate(tids):
#             i = 1
#             tid = 2147484140
#             while True:
#                 url = 'https://www.alzconnected.org/discussion.aspx?g=posts&t={}&page={}'.format(tid,i)
#                 print(tid,i)
#                 soup = getHTML(url)
#                 post_fields, user_fields, posts, user_dict = getPostsAndUser(soup,tid)
#                 if len(forum_posts) == 0 or posts[-1] == forum_posts[-1]:
#                     break
#                 forum_posts.extend(posts)
#                 user_all = {**user_all, **user_dict}
#                 i+=1
#             if verbose and idx % 100 == 0:
#                 print('Done with {} topics. Fetched {} posts and {} users'.format(idx, len(forum_posts),len(user_all)))
#     except TimeoutError:
#         print('Connection time out at index {}, reconnecting ...'.format(idx))
#         time.sleep(15)
#         print('Trying to reconnect ...')
#         t = list(tids)
#         tids = t[index:]
#         print('{} posts left'.len(tids))
#         getPostsbyForum(tids, forum_posts, user_all)
    

In [18]:
forum_posts

021 8:10 AM'],
 [2147555831,
  '2147764402',
  '148087246',
  '\nThank you all for the kind words and the welcome!\nAs an only child, and especially after my dad passed when I was 17, Mom and I have always been close, even when it was a very rocky closeness. People have told me how wonderful it is that I\'m doing the things I am for her--without going into too much detail I\'ll say this: Every single thing I do, she MORE than deserves. She has gone above and beyond for me throughout my adult life--through some critically bad decisions (divorce, substance abuse, dumb relationships full of drama, financial stupidity galore) as well as some perfectly reasonable decisions she just ferociously disagreed with (my current marriage, buying a house in a neighborhood she didn\'t approve of)--through all of that, she was still there for me. And as I\'ve told her--she spent her whole life caring for everyone else. Her siblings, her husband, me, her parents, me again, and again, and again....It\'s 

--------------- End of Code ----------------

In [156]:
# Resume point: keep the topic ids from tid_cur (inclusive) to the end of
# the topic list. Relies on `df` already being loaded by an earlier cell.
tid_cur = 2147511224
t = list(df.tid)
# list.index raises ValueError if tid_cur is not in the list.
index = t.index(tid_cur)
tids = t[index:]
# Displayed as the cell output: number of topics still to process.
len(tids)


5321

In [157]:

# Continue scraping with the state already in the kernel. The two
# self-assignments change nothing; they only fail with NameError when
# forum_posts / user_all do not exist yet, i.e. this cell cannot run on a
# fresh kernel.
forum_posts = forum_posts
posts_fields = []
user_all = user_all
verbose = True
# Fetch each remaining topic page and accumulate its posts and users.
for idx,tid in enumerate(tids):
    url = 'https://www.alzconnected.org/discussion.aspx?g=posts&t=' + str(tid)
    soup = getHTML(url)
    post_fields, user_fields, posts, user_dict = getPostsAndUser(soup,tid)
    forum_posts.extend(posts)
    # Builds a new dict on every iteration; for a uid present in both, the
    # entry from this page (user_dict) replaces the older one.
    user_all = {**user_all, **user_dict}
    # Progress line every 100 topics.
    if verbose and idx % 100 == 0:
        print('Done with {} topics. Fetched {} posts and {} users'.format(idx, len(forum_posts),len(user_all)))

# post_fields comes from the last loop iteration (undefined if tids was
# empty); forum_name is whatever an earlier cell left in the kernel.
writeToCSV(post_fields, forum_posts, 'posts_{}.csv'.format(forum_name))

# One row per distinct uid collected so far.
user_fields = ['uid','username','join_date','num_post']
users = list(user_all.values())
writeToCSV(user_fields, users, 'user.csv')

Done with 0 topics. Fetched 131452 posts and 5444 users
Done with 100 topics. Fetched 132526 posts and 5448 users
Done with 200 topics. Fetched 133698 posts and 5462 users
post id 147710981 can't get user information
post id 147710994 can't get user information
Done with 300 topics. Fetched 134689 posts and 5467 users
post id 147711039 can't get user information
Done with 400 topics. Fetched 135950 posts and 5472 users
post id 147708297 can't get user information
Done with 500 topics. Fetched 137129 posts and 5487 users
post id 147706632 can't get user information
post id 147705259 can't get user information
Done with 600 topics. Fetched 138255 posts and 5496 users
post id 147703798 can't get user information
post id 147703845 can't get user information
post id 147703102 can't get user information
post id 147703350 can't get user information
Done with 700 topics. Fetched 139416 posts and 5506 users
Done with 800 topics. Fetched 140544 posts and 5515 users
post id 147698840 can't get us

In [143]:
# Binds a second name to the same dict object as user_all (no copy is made).
# If user_all is later rebound to a new dict, as the scraping loop does with
# {**user_all, **user_dict}, this name keeps pointing at the old dict.
LGBT_Trial_user_es_have_lost = user_all

In [158]:
len(user_all)

6367

In [127]:
# Dump the users collected so far to user.csv (overwrites the file). These
# are the same three statements that end the scraping cell.
user_fields = ['uid','username','join_date','num_post']
users = list(user_all.values())
writeToCSV(user_fields, users, 'user.csv')

In [1]:
import pandas as pd

In [2]:
# Load a posts file whose name follows the posts_<forum>.csv pattern used by
# the scraping cell. The path is relative to the current working directory.
d = pd.read_csv('posts_Spouse or Partner Caregiver Forum.csv')

In [3]:
d

Unnamed: 0,tid,uid,pid,p_content,is_start,p_date
0,2147553328,10052,148064200,\nALZConnected Moderator's Guidelines for Part...,True,"Tuesday, July 21, 2020 6:00 PM"
1,2147553333,10052,148064206,\nA message from the Alzheimer’s Association d...,True,"Tuesday, July 21, 2020 6:11 PM"
2,2147555830,2147696050,148087155,\nDH has been moody all day. He's bummed becau...,True,"Thursday, February 25, 2021 4:49 PM"
3,2147555830,2147669054,148087160,"\r\n\t\tLady, I am so sorry you have your hand...",False,"Thursday, February 25, 2021 6:09 PM"
4,2147555830,2147484453,148087168,"\nHi,\n I know how hard it is to take on all t...",False,"Thursday, February 25, 2021 7:49 PM"
...,...,...,...,...,...,...
191131,2147497277,2147497996,147588060,\nI have posted a couple times on some issues ...,True,"Thursday, May 2, 2013 6:15 PM"
191132,2147497276,2147497996,147588058,\nI have posted a couple times on some issues ...,True,"Thursday, May 2, 2013 6:09 PM"
191133,2147497274,2147497996,147588053,\nI posted on a couple things the past few day...,True,"Thursday, May 2, 2013 5:39 PM"
191134,2147494854,2147484236,147565262,\nHi TrynHard - I was reading your recipe for ...,True,"Friday, January 25, 2013 8:00 AM"
