### Collect Reddit Data - ANLY 590 Project
#### November 13, 2018

***

#### Prep

In [1]:
# set working directory
import os
import sys
path = '/Users/kgedney/Documents/georgetown/anly590/author-id-project'
os.chdir(path)

In [2]:
# add API info
client_id     = 'my client id'
client_secret = 'my client secret'
user_agent    = 'deeplearning'
username      = 'my user name'
password      = ''

In [3]:
# import packages
import praw
import pandas as pd
import datetime as dt
from tqdm import tqdm

In [4]:
# set reddit credentials
reddit = praw.Reddit(client_id     = client_id, \
                     client_secret = client_secret, \
                     user_agent    = user_agent, \
                     username      = username, \
                     password      = password)

***

####  Scrape Comments

In [13]:
# import usernames
midterms = pd.read_csv('Midterms.csv')
midterms.head()
midterms = midterms.iloc[1:]
print(len(midterms['author'].unique()))

421


In [6]:
# intialize dictionary
usernames     = midterms['author'].unique()
comments_dict = {'author': [], 
                 'body': []}

In [7]:
# run scrape
for username in tqdm(usernames):
    try:
        for comment in reddit.redditor(username).comments.new(limit=None):
            comments_dict['author'].append(username)
            comments_dict['body'].append(comment.body.split('\n', 1))
    except KeyboardInterrupt:
        raise
    except:
        print('error at %s' % username, file=sys.stderr) 

 16%|█▋        | 69/422 [14:49<1:05:55, 11.20s/it]error at nan
100%|██████████| 422/422 [1:28:20<00:00, 12.18s/it]


In [23]:
# create df
df = pd.DataFrame(comments_dict)
df.head()

(353091, 2)


Unnamed: 0,author,body
0,AutoModerator,[Your submission has been removed from /r/sams...
1,AutoModerator,"[, As a reminder, this subreddit [is for civil..."
2,AutoModerator,[Your post have been removed as it appears tha...
3,AutoModerator,[Your post was removed because it contained a ...
4,AutoModerator,[Your submission has been removed. Stories in ...


In [37]:
# remove AutoModerator posts
df = df[df['author'] != 'AutoModerator'].reset_index(drop=True)
print(df.shape)
df.head()

(352096, 2)


Unnamed: 0,author,body
0,BlakeIsBlake,[I just went through this process myself. What...
1,BlakeIsBlake,[Dude. No.]
2,BlakeIsBlake,[I was still skeptical even after reading your...
3,BlakeIsBlake,"[> NYU School of Continuing Education, \nYes. ..."
4,BlakeIsBlake,[Still selling? Interested]


In [25]:
# save as csv
df.to_csv('raw_data.csv')

***

#### Exploratory Analysis

In [134]:
# read in data
df = pd.read_csv('raw_data.csv')

In [9]:
# average of 838 posts per author
df.groupby('author').count().mean()

Unnamed: 0    838.32381
body          838.32381
dtype: float64

In [10]:
# 420 unique authors
len(set(df['author']))

420

In [195]:
import string
exclude = set(string.punctuation)

In [179]:
# combine back lists
df['whole_body'] = df.body.apply(lambda x: ''.join(x))
#df['whole_body'] = df.body.apply(lambda x: ''.join(ch for ch in x if ch not in remove))

In [158]:
# create new columns for word count and character count
df['num_words'] = df.whole_body.apply(lambda x: len(x.split(' ')))
df['num_chars'] = df.whole_body.apply(lambda x: len(x))

In [139]:
# average words (just on a basic tokenization)
df['num_words'].mean()

43.83390041352359

In [140]:
# average characters
df['num_chars'].mean()

257.90508554485143

In [141]:
df['num_chars'].max()

10105

In [142]:
df['num_words'].max()

2048

Check to see if there are any emojis in the comments. Adding a count 1 for every comment that has an emoji, and 0 if there are no emojis. The sum of the list which is equal to zero, we conclude that there are no emojis in the comments.

In [143]:
import emoji
check_emojis = []
for character in df['body']:
    if character in emoji.UNICODE_EMOJI:
        check_emojis.append(1)
    else:
        check_emojis.append(0)
sum(check_emojis)

0

In [144]:
df.describe()

Unnamed: 0.1,Unnamed: 0,num_words,num_chars
count,352096.0,352096.0,352096.0
mean,176047.5,43.8339,257.905086
std,101641.504528,65.563135,397.985741
min,0.0,1.0,4.0
25%,88023.75,11.0,66.0
50%,176047.5,24.0,139.0
75%,264071.25,51.0,294.0
max,352095.0,2048.0,10105.0


In [145]:
df[df['num_words'] == df['num_words'].max()]

Unnamed: 0.1,Unnamed: 0,author,body,whole_body,num_words,num_chars
61955,61955,Bashfluff,['Ajit Pai Ajit Pai Ajit Pai Ajit Pai Ajit Pai...,['Ajit Pai Ajit Pai Ajit Pai Ajit Pai Ajit Pai...,2048,9219


In [146]:
df[:1]

Unnamed: 0.1,Unnamed: 0,author,body,whole_body,num_words,num_chars
0,0,BlakeIsBlake,['I just went through this process myself. Wha...,['I just went through this process myself. Wha...,79,508


In [173]:
df['whole_body'][18]

'\'1. What is your credit score? *765*\', \'2. What cards do you currently have or have you had in the past (including closed cards), along with dates of when you were approved for the cards? Please include month and year for any card approved in the last 3 years.nn* *Discover Card (Categories) 914*n* *Citi DoubleCash 116*n* *Chase Freedom 716*n* *Uber Visa 1117*n* *Chase Sapphire Reserve 518*n* *Amazon Prime Rewards 818*n* *World of Hyatt 1018*nnn3. How much natural spend can you put on a new card(s) in 3 months? *$6k to $9k*nn4. Are you willing to MS, and if so, how much in 3 months? See this page for a primer on MS. Plastiq (for rentmortgageloan payments) and bank account funding are often good options for beginners.n*I\'ve never tried it, but would be open to it.*nn5. Are you open to applying for business cards? If not, why? See this post and this wiki question to learn more.n*Not really. Don\'t have a business, not down to attempt to fabricate one*nn6. How many new cards are you i

***

### Old Stuff

In [None]:
subreddit = (reddit.subreddit('technology'))

In [None]:
for submission in subreddit(limit=None):
    print(submission.title, submission.id)

In [74]:
# print comments
comments_dictionary = {}
i = 0

for comment in tqdm((reddit.subreddit('technology').comments(limit=None))):
    comments_dictionary[i] = (comment.body, comment.author, comment.author_fullname)
    i = i + 1
    #print(comment.body, comment.author, comment.author_fullname)

966it [00:12, 77.59it/s]


In [73]:
len(comments_dictionary)

967

In [66]:
# vars(comment)