In [0]:
#Importing libraries
import requests
import time
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords

### Sub-reddits to scrape (Loans & Credit Cards)


In [0]:
#Link to api for "Loan" related posts
url_ln = 'https://www.reddit.com/r/Loans/.json'
#Link to api for "Credit Card" related posts
url_cc = 'https://www.reddit.com/r/CreditCards/.json'

###Basic methodology used to webscrape reddit

In [0]:
#Creating a user agent
header_ua = {'User-agent': 'reddit-reader-bot-0.2'}
#Using requests to get respones from the websites
res_ln = requests.get(url_ln, headers = header_ua)
res_cc = requests.get(url_cc, headers = header_ua)

In [0]:
#Checking status for posts reponses, 200 indicates everything is wel
print(res_ln.status_code)
print(res_cc.status_code)

200
200


In [0]:
#Loan Posts Data
ln_data = res_ln.json()
#Credit card Data
cc_data = res_cc.json()

###Limitations of this method:

*   The scraping only returns 25 reddit posts in one run
*   More posts are necesscary for building an effective machine learning classification model

###Overcoming these Limitations:

*   Understand the structure of reddits web-page
*   Each page of the sub-reddit generally has 25 reddit posts
*   So to achieve around 1000 posts 40 pages have to be scraped
*   Identify where posts reside within the .json structure


In [0]:
#Collecting posts for Loans

#Empty list which will contain posts
posts_ln = []

#After helps us read-in posts sequentially(learned from the post structure)
after = None

#Staring a loop to try and scrape through 50 pages 
for i in range(50):
    if i % 5 == 0:
        print(i, 'pages scraped')
    if after == None:
        params = {}
    else:
        params = {'after' : after}
    url_ln = 'https://www.reddit.com/r/Loans/.json'
    
    res_ln = requests.get(url_ln, headers = header_ua, params = params)
    
    #Populating posts
    if res_ln.status_code == 200:
        ln_data = res_ln.json()
        posts_ln.extend(ln_data['data']['children'])
        after = ln_data['data']['after']
    else:
        print(res_ln.status_code)
    time.sleep(1)

0 pages scraped
5 pages scraped
10 pages scraped
15 pages scraped
20 pages scraped
25 pages scraped
30 pages scraped
35 pages scraped
40 pages scraped
45 pages scraped


In [0]:
#Similar Methodology followed for Credit card posts

#Empty list which will contain posts
posts_cc = []

#After helps us read-in posts sequentially(learned from the post structure)
after = None

#Staring a loop to try and scrape through 50 pages 
for i in range(50):
    if i % 5 == 0:
        print(i, 'pages scraped')
    if after == None:
        params = {}
    else:
        params = {'after' : after}
    url_cc = 'https://www.reddit.com/r/CreditCards/.json'
    
    res_cc = requests.get(url_cc, headers = header_ua, params = params)
    
    #Populating posts
    if res_cc.status_code == 200:
        cc_data = res_cc.json()
        posts_cc.extend(ln_data['data']['children'])
        after = cc_data['data']['after']
    else:
        print(res_cc.status_code)
    time.sleep(1)

0 pages scraped
5 pages scraped
10 pages scraped
15 pages scraped
20 pages scraped
25 pages scraped
30 pages scraped
35 pages scraped
40 pages scraped
45 pages scraped


In [0]:
#Summary of our data collection
print(f'The number of Loan posts acquired are {len(posts_ln)}')
print(f'The number of Credit Card posts acquired are {len(posts_cc)}')

The number of Loan posts acquired are 1243
The number of Credit Card posts acquired are 1250


In [0]:
#Making a dataframe of Credit Card posts
posts_frame_cc = []
for post in posts_cc:
    dict_cc = {}
    if (post['data']['title'] != '') and (post['data']['selftext'] != ''):
        dict_cc['title'] = post['data']['title']
        dict_cc['text'] = post['data']['selftext']
        dict_cc['class'] = 'cc'
        posts_frame_cc.append(dict_cc)
    else:
        pass

cc_frame = pd.DataFrame(posts_frame_cc, columns = ['title', 'text', 'class'])
cc_frame.head()

Unnamed: 0,title,text,class
0,[REQ] $300 Loan until Oct 15 - will repay $330,I was doored by a taxi and was severely injure...,cc
1,[PAID] triniwarrior - $100 + interest,"Great transaction, no reminder needed!\n\nhttp...",cc
2,[PAID] BestDLine - $50 + interest,"Great transaction as always, no reminder neede...",cc
3,[PAID] bdubble $25 + Interest,http://www.reddit.com/r/Loans/comments/2fd6gg/...,cc
4,[REQ] I need £0.15/$0.26!,So recently I purchased something on eBay that...,cc


In [0]:
#Making a dataframe of Loan posts
posts_frame_ln = []
for post in posts_ln:
    dict_ln = {}
    if (post['data']['title'] != '') and (post['data']['selftext'] != ''):
        dict_ln['title'] = post['data']['title']
        dict_ln['text'] = post['data']['selftext']
        dict_ln['class'] = 'ln'
        posts_frame_ln.append(dict_ln)
    else:
        pass

ln_frame = pd.DataFrame(posts_frame_ln, columns = ['title', 'text', 'class'])
ln_frame.head()

Unnamed: 0,title,text,class
0,[META] Kiva Microloan,"Hello redditors, \n\nMy name is Wally and I la...",ln
1,[REQ] small loan,I know I've never posted here before. I have ...,ln
2,"[REQ] First timer, would appreciate a loan of ...","Just moved, had a minor problem with the serpe...",ln
3,[Paid] /u/penguin_dust payment,Updating the record here.,ln
4,[REQ] 220 to be paid back 250 3/14,Check was short of rent this week and I just n...,ln


In [0]:
#Creating a final dataframe consisting of both frames
final_df = pd.concat([cc_frame, ln_frame], ignore_index = True)
#Importing the data to csv
final_df.to_csv('reddit_post_combined.csv')
cc_frame.to_csv('credit_card_posts.csv')
ln_frame.to_csv('loan_posts.csv')