In [None]:
!pip install praw



In [None]:
import praw
import re
import json
import pandas as pd
from collections import Counter
import datetime

In [None]:
# Mount Google Drive at /content/gdrive so the project files can be read and
# written through the paths used below (requires the google.colab module).
from google.colab import drive
drive.mount('/content/gdrive')


Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
# Project data folder on the mounted drive. getPRAW and importLinks use this
# same location, but through their own local copies of the string.
path = "/content/gdrive/My Drive/DSCI511/FinalProject/data/"


## Import API keys for Reddit's PRAW

In [None]:
def getPRAW():
  """Create a Reddit API client from the credentials stored in myKeys.json.

  Reads the JSON key file from the project's data folder on the mounted drive
  and hands its client_id, client_secret, user_agent, username and password
  entries to praw.Reddit as keyword arguments.

  Returns:
    The praw.Reddit instance built from those credentials.

  Raises:
    KeyError: if one of the five expected entries is missing from the file.
  """
  keyFile = "/content/gdrive/My Drive/DSCI511/FinalProject/data/" + 'myKeys.json'
  with open(keyFile) as handle:
    credentials = json.load(handle)

  # Forward exactly these five fields, looked up in this order.
  fields = ('client_id', 'client_secret', 'user_agent', 'username', 'password')
  return praw.Reddit(**{field: credentials[field] for field in fields})

## Getting post links using PRAW and saving them

In [None]:
def writeLinks(label,links,path = "/content/gdrive/My Drive/DSCI511/FinalProject/data-big/"):
  """Save a list of post links as JSON in '<path><label>_big_links.json'.

  Args:
    label: Verdict label used as the file-name prefix (e.g. 'YTA').
    links: JSON-serializable collection of post URLs.
    path: Folder prefix for the output file. It is joined by plain string
      concatenation, so it must end with a path separator. Defaults to the
      project's data-big folder on the mounted drive.

  Returns:
    None. The file is overwritten if it already exists.
  """
  with open(path + label + '_big_links.json','w') as f:
    json.dump(links,f)

In [None]:
# Collect the URLs of up to 1000 r/AmITheAsshole search results flaired
# "Asshole" and save them under the 'YTA' label.
YTAlinks_big = []
access = getPRAW()
for post in access.subreddit("AmITheAsshole").search(query = 'flair:"Asshole"',limit = 1000):
  # Each search result is already a Submission carrying its url, so read it
  # directly instead of requesting the same submission again by id.
  YTAlinks_big.append(post.url)
writeLinks('YTA',YTAlinks_big)

In [None]:
# Collect the URLs of up to 1000 r/AmITheAsshole search results flaired
# "Not the A-hole" and save them under the 'NTA' label.
NTAlinks_big = []
access = getPRAW()
for post in access.subreddit("AmITheAsshole").search(query = 'flair:"Not the A-hole"',limit = 1000):
  # Each search result is already a Submission carrying its url, so read it
  # directly instead of requesting the same submission again by id.
  NTAlinks_big.append(post.url)
writeLinks('NTA',NTAlinks_big)

In [None]:
# Collect the URLs of up to 1000 r/AmITheAsshole search results flaired
# "Everyone Sucks" and save them under the 'ESH' label.
ESHlinks_big = []
access = getPRAW()
for post in access.subreddit("AmITheAsshole").search(query = 'flair:"Everyone Sucks"',limit = 1000):
  # Each search result is already a Submission carrying its url, so read it
  # directly instead of requesting the same submission again by id.
  ESHlinks_big.append(post.url)
writeLinks('ESH',ESHlinks_big)

In [None]:
# Collect the URLs of up to 1000 r/AmITheAsshole search results flaired
# "No A-holes here" and save them under the 'NAH' label.
NAHlinks_big = []
access = getPRAW()
for post in access.subreddit("AmITheAsshole").search(query = 'flair:"No A-holes here"',limit = 1000):
  # Each search result is already a Submission carrying its url, so read it
  # directly instead of requesting the same submission again by id.
  NAHlinks_big.append(post.url)
writeLinks('NAH',NAHlinks_big)

## Reading in these links 

In [None]:
def importLinks(label,path = "/content/gdrive/My Drive/DSCI511/FinalProject/data/"):
  """Load the list of post links stored in '<path><label>_links.json'.

  Args:
    label: Verdict label used as the file-name prefix (e.g. 'YTA').
    path: Folder prefix of the input file. It is joined by plain string
      concatenation, so it must end with a path separator. Defaults to the
      project's data folder on the mounted drive.

  Returns:
    The parsed JSON content of the file.
  """
  with open(path + label + '_links.json') as f:
    links = json.load(f)
  return links

In [None]:
# Load the saved link lists for the four verdict labels, in the order
# YTA, NTA, NAH, ESH.
YTAlinks, NTAlinks, NAHlinks, ESHlinks = (
    importLinks(label) for label in ('YTA', 'NTA', 'NAH', 'ESH')
)

## Create Dataframe 

In [None]:
def makeDataFrame(title,content,numUpvote,upvoteRatio,numComments,awards,links,dates):
  """Assemble per-post fields and award counts into a single DataFrame.

  Args:
    title, content, numUpvote, upvoteRatio, numComments, links, dates:
      Equal-length sequences holding one value per post.
    awards: Mapping from row label to a mapping of award name -> count
      for that post.

  Returns:
    A DataFrame with the columns Link, Title, Content, Date, numComments,
    numUpvotes, upvoteRatio, followed by one column per award name in order
    of first appearance. Award cells are None where a post has no such award.
  """
  baseColumns = (
      ('Link', links),
      ('Title', title),
      ('Content', content),
      ('Date', dates),
      ('numComments', numComments),
      ('numUpvotes', numUpvote),
      ('upvoteRatio', upvoteRatio),
  )
  frame = pd.DataFrame()
  for columnName, values in baseColumns:
    frame[columnName] = values

  # Every award name seen on any post gets its own all-None column;
  # dict.fromkeys keeps first-seen order and drops repeats.
  awardNames = dict.fromkeys(name for postAwards in awards.values() for name in postAwards)
  for awardName in awardNames:
    frame[awardName] = [None]*len(frame)

  # Fill the counts into a copy of the frame built above.
  result = frame.copy()
  for rowLabel, postAwards in awards.items():
    for awardName, count in postAwards.items():
      result.loc[rowLabel, awardName] = count
  return result

## Getting Features for Posts

In [None]:
def countVotes(submission):
  """Tally how many comments on a submission contain each verdict acronym.

  Not called by the rest of the notebook (its call in getPostData is
  commented out) because scanning every comment took too long; it is kept
  to show the attempted approach.

  Each comment counts towards at most one verdict: the first of YTA, NAH,
  NTA, ESH (checked in that order) that appears in its body as a whole word.

  Args:
    submission: Object exposing `comments.replace_more(limit=...)` and
      `comments.list()`, whose items have a `body` string.

  Returns:
    Tuple (YTAcount, NAHcount, NTAcount, ESHcount).
  """
  # Checked in this order; positions match the returned tuple.
  verdictPatterns = (r'\bYTA\b', r'\bNAH\b', r'\bNTA\b', r'\bESH\b')
  tallies = [0, 0, 0, 0]

  # limit = 0 is passed through unchanged before the comment list is read.
  submission.comments.replace_more(limit = 0)
  for comment in submission.comments.list():
    text = comment.body
    for slot, pattern in enumerate(verdictPatterns):
      if re.search(pattern, text):
        tallies[slot] += 1
        break  # first matching verdict wins for this comment
  return tuple(tallies)


In [None]:
def getPostData(links):
  """Fetch the features of each linked post and return them as a DataFrame.

  For every URL the submission is looked up through PRAW and its creation
  time, title, comment count, score, self text, upvote ratio and awards are
  recorded.

  Args:
    links: Sequence of Reddit submission URLs.

  Returns:
    The DataFrame built by makeDataFrame, one row per URL in the given order.
  """
  content = []
  title = []
  awards = {}
  numComments = []
  upvoteRatio = []
  numUpvote = []
  dates = []

  # One client serves the whole batch; building it inside the loop re-read
  # the key file and created a new client for every single URL.
  access = getPRAW()
  for numPost, url in enumerate(links):
    post = access.submission(url = url)
    # NOTE: fromtimestamp() without a tz renders the epoch value in the
    # machine's local timezone.
    dates.append(datetime.datetime.fromtimestamp(int(post.created_utc)).strftime('%Y-%m-%d %H:%M:%S'))

    title.append(post.title)
    numComments.append(post.num_comments)
    numUpvote.append(post.score)
    content.append(post.selftext)
    upvoteRatio.append(post.upvote_ratio)

    # Award name -> count for this post, keyed by the post's row number.
    postAwards = Counter()
    for award in post.all_awardings:
      postAwards[award['name']] = award['count']
    awards[numPost] = postAwards

  # Per-comment verdict tallies (countVotes) are deliberately not collected,
  # so only the eight arguments makeDataFrame accepts are passed. The earlier
  # call also passed four undefined *counts lists and raised NameError.
  return makeDataFrame(title,content,numUpvote,upvoteRatio,numComments,awards,links,dates)

In [None]:
# Output folder for the per-verdict CSV files; this rebinds the notebook-level `path`.
path = "/content/gdrive/My Drive/DSCI511/FinalProject/data-big/"

# Fetch the post features for the NTA links collected earlier in this session
# (NTAlinks_big) and save them as NTA.csv in that folder.
NTA_big = getPostData(NTAlinks_big)
NTA_big.to_csv(path+ "NTA.csv")


In [None]:
# Fetch the post features for the YTA links (YTAlinks_big) and save them as YTA.csv.
# Relies on `path` having been set to the data-big folder by the NTA export cell.
YTA_big = getPostData(YTAlinks_big)
YTA_big.to_csv(path+ "YTA.csv")

In [None]:
# Fetch the post features for the NAH links (NAHlinks_big) and save them as NAH.csv.
# Relies on `path` having been set to the data-big folder by the NTA export cell.
NAH_big = getPostData(NAHlinks_big)
NAH_big.to_csv(path+ "NAH.csv")

In [None]:
# Fetch the post features for the ESH links (ESHlinks_big) and save them as ESH.csv.
# Relies on `path` having been set to the data-big folder by the NTA export cell.
ESH_big = getPostData(ESHlinks_big)
ESH_big.to_csv(path+ "ESH.csv")