# Data Sourcing

Motivation: Understand how people's approach to dating differs across age groups, based on advice posts in two subreddits that target different age ranges:
* [r/dating](https://www.reddit.com/r/dating)
* [r/datingoverthirty](https://www.reddit.com/r/datingoverthirty)

In [8]:
!pip install --upgrade praw
import praw
import pandas as pd
from prawcore import ResponseException
from credentials import API_KEY, API_SECRET, USERNAME_REDDIT, PASSWORD_REDDIT

Collecting praw
  Downloading praw-7.7.1-py3-none-any.whl (191 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m191.0/191.0 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Installing collected packages: praw
  Attempting uninstall: praw
    Found existing installation: praw 7.7.0
    Uninstalling praw-7.7.0:
      Successfully uninstalled praw-7.7.0
Successfully installed praw-7.7.1


In [16]:
# Build the PRAW client used by every cell below. All secrets come from the
# local `credentials` module imported at the top of the notebook; a username and
# password are supplied alongside the client id / secret.
reddit = praw.Reddit(
    user_agent='Post Description Extraction',
    client_id=API_KEY,
    client_secret=API_SECRET,
    username=USERNAME_REDDIT,
    password=PASSWORD_REDDIT,
)

### Content Extraction

##### 2k Pull for EDA

In [17]:
# Request the 2000 most recent submissions from each subreddit.
# https://praw.readthedocs.io/en/stable/code_overview/reddit_instance.html
# NOTE(review): the value_counts output recorded further down shows 973 / 998
# rows per subreddit, so fewer than 2000 posts per listing appear to come back.
subreddit_1, subreddit_2 = (
    reddit.subreddit(name) for name in ('dating', 'datingoverthirty')
)
posts1, posts2 = (
    sub.new(limit = 2000) for sub in (subreddit_1, subreddit_2)
)

In [19]:
# Create a dataframe of all posts, including selftext and top_comment_text.
#
# Requires `posts1` / `posts2` (the listing cell above) and `get_top_comment`
# (defined in the "Top Comment Extraction" section) to have been run already.
data = []
ids = set()  # submission ids already written to `data`, used to skip repeats

# Both listings go through the same loop so they are de-duplicated and
# extracted in exactly the same way.
for listing in (posts1, posts2):
    for post in listing:
        if post.id in ids:
            continue
        ids.add(post.id)
        # get_top_comment() builds its own submission object and sets the
        # comment sort order itself, so nothing needs to be set on `post` here.
        data.append([post.subreddit, post.id, post.created_utc, post.title, post.selftext, get_top_comment(post.id)])

posts_df = pd.DataFrame(data, columns = ['subreddit', 'id', 'created_utc', 'title', 'selftext', 'top_comment_text'])

KeyboardInterrupt: 

In [147]:
# Number of collected posts per subreddit
posts_df.loc[:, 'subreddit'].value_counts()

datingoverthirty    998
dating              973
Name: subreddit, dtype: int64

In [None]:
# created_utc holds epoch seconds; convert the column to pandas timestamps
posts_df['created_utc'] = posts_df['created_utc'].pipe(pd.to_datetime, unit = 's')

In [149]:
# Preview the first five rows (head() defaults to n=5)
posts_df.head()

Unnamed: 0,subreddit,id,created_utc,title,selftext,top_comment_text
0,dating,1471ube,2023-06-11 18:49:33,Am I Clueless?,So there is this girl I’ve known my whole life...,
1,dating,1471t0w,2023-06-11 18:48:11,Is my Ex talking to the girl I like on purpose?,The title explains it all but here's a bit of ...,
2,dating,1471gsn,2023-06-11 18:35:06,Advice for Dealing with Anxious Attachment Sty...,I suffer from Anxious Attachment Style. It ve...,
3,dating,1471coz,2023-06-11 18:30:35,I want to send a letter to a guy I dated a few...,Ages at the time: me (26) him (29)\n\nA few ye...,
4,dating,14716mt,2023-06-11 18:23:59,A bit mopey,I (22m) have a date tonight (that I can't post...,


***Dates for Iteration***

In [153]:
# Earliest r/dating post captured in the pull
posts_df.loc[posts_df['subreddit'] == 'dating', 'created_utc'].min()

Timestamp('2023-06-06 00:28:02')

In [154]:
# Earliest r/datingoverthirty post captured in the pull
posts_df.loc[posts_df['subreddit'] == 'datingoverthirty', 'created_utc'].min()

Timestamp('2022-11-01 20:50:32')

##### Top Comment Extraction

In [10]:
# Get the top comment for each submission, sorted by upvotes (as seen on the site)

def get_top_comment(submission_id):
    """Return the text of the highest-ranked non-stickied comment of a submission.

    Parameters
    ----------
    submission_id : str
        Id of the submission, passed to ``reddit.submission(id=...)``.

    Returns
    -------
    str
        ``body`` of the first entry among the first five of ``post.comments``
        (sorted by 'top') that is not stickied, or ``''`` when there is none.
    """
    post = reddit.submission(id = submission_id)
    post.comment_sort = 'top' # cite: https://www.reddit.com/r/redditdev/comments/mzloap/extract_only_the_top_comment_for_a_submission/gw1h38f/
    top_comments = post.comments[0:5]

    # Some subreddits include automoderator sticky posts at the top of each
    # thread (keep it civil, see the rules...), so stickied entries are skipped.
    # Entries without a `body` (e.g. PRAW's MoreComments placeholders, which
    # carry neither `body` nor `stickied`) are skipped as well instead of
    # raising AttributeError.
    return next(
        (
            comment.body
            for comment in top_comments
            if hasattr(comment, 'body') and not getattr(comment, 'stickied', False)
        ),
        '',
    )

In [11]:
def combine_data(posts, label):
    """Turn an iterable of submissions into a list of row lists.

    Each row is ``[subreddit, id, created_utc, title, selftext, top comment]``,
    where the last element comes from ``get_top_comment(post.id)``. The number
    of rows is printed, prefixed with the upper-cased ``label``.

    Parameters
    ----------
    posts : iterable
        Submission-like objects exposing the attributes listed above.
    label : str
        Tag used only in the printed summary line.

    Returns
    -------
    list of list
        One row per item in ``posts``, in iteration order.
    """
    rows = [
        [
            entry.subreddit,
            entry.id,
            entry.created_utc,
            entry.title,
            entry.selftext,
            get_top_comment(entry.id),
        ]
        for entry in posts
    ]
    print(f"{label.upper()} POSTS: {len(rows)}")
    return rows

> Note: I was able to pull ~1000 posts per subreddit (title, selftext, and the text of the top-voted comment) before r/datingoverthirty went dark. I have requested access, but have not been able to secure an invite to the private community. Those ~1000 posts per subreddit are used in the investigation and modeling, so as not to introduce unbalanced classes (r/dating is still public).

In [None]:
subreddits = ['dating'
              , 'datingoverthirty'
             ]

# One accumulator per listing type, shared by all subreddits. Rebinding these
# names inside the loop would leave only the last processed subreddit's rows
# for the DataFrame cell that concatenates datanew/datahot/datatop/datacon.
# Because rows are appended per listing, whatever was collected before an API
# error interrupts the cell is still available afterwards.
datanew, datahot, datatop, datacon = [], [], [], []

for subreddit in subreddits:
    source = reddit.subreddit(subreddit)
    postsnew = source.new(limit = 1000)
    postshot = source.hot(limit = 1000)
    poststop = source.top(limit = 1000)
    postscon = source.controversial(limit = 1000)

    datanew.extend(combine_data(postsnew, 'NEW'))
    datahot.extend(combine_data(postshot, 'HOT'))
    datatop.extend(combine_data(poststop, 'TOP'))
    datacon.extend(combine_data(postscon, 'CON'))

TooManyRequests: received 429 HTTP response

In [164]:
# Stack the rows from all four listings and keep the first row seen per submission id
df = pd.DataFrame(
    datanew + datahot + datatop + datacon,
    columns = ['subreddit', 'id', 'created_utc', 'title', 'selftext', 'top_comment_text'],
).drop_duplicates(subset = 'id')

In [165]:
# (rows, columns) of the combined frame `df`
df.shape

(2935, 6)

In [166]:
# created_utc holds epoch seconds; convert the column to pandas timestamps
df['created_utc'] = df['created_utc'].pipe(pd.to_datetime, unit = 's')

In [167]:
# Number of unique posts per subreddit in the combined frame
df.loc[:, 'subreddit'].value_counts()

dating    2935
Name: subreddit, dtype: int64

In [168]:
from datetime import datetime
from pathlib import Path

# Write the raw pull to data/, tagged with the local time of the export.
# The timestamp avoids spaces and ':' (':' is not a valid filename character on
# Windows), and an underscore separates it from the fixed prefix. The prefix
# itself is unchanged, so a lookup by `reddit_posts_raw_all_filters*` still matches.
now = datetime.now().strftime("%Y-%m-%d_%H%M")
output_dir = Path('data')
output_dir.mkdir(parents = True, exist_ok = True)  # do not assume data/ already exists
df.to_csv(output_dir / f'reddit_posts_raw_all_filters_{now}.csv', index = False)