# Data Extraction
## Natural Language Processing Reddit Project
### Zach Tretter, April 2020
General Assembly, Data Science Immersion Cohort 11, Boston

## Table of Contents
- [Imports](#Imports)
- [Customize The Reddit Pull](#Customize-The-Reddit-Pull)
- [Data Generating Functions](#Data-Generating-Functions)
- [Subreddit Data Pull](#Subreddit-Data-Pull)
- [CSV Export](#CSV-Export)
- [All Possible Submission Fields](#All-Possible-Submission-Fields)

## Imports

In [2]:
import requests
import pandas as pd
import datetime as dt
import time

## Customize The Reddit Pull

In [36]:
# Subreddits to compare (class 0 vs. class 1) and the number of submissions
# requested from each of them
class0_subreddit, class1_subreddit = 'Conservative', 'Libertarian'
how_many = 100_000

In [26]:
# Fixed epoch timestamp (seconds) that the pull pages backwards from, so that
# repeated pulls start at the same point.
# 1587225600 corresponds to 2020-04-18 16:00:00 UTC.
noon_18Apr2020 = 1_587_225_600

In [27]:
# Submission fields kept from every API response
features = [
    'author',
    'created_utc',
    'id',
    'num_comments',
    'score',
    'selftext',
    'spoiler',
    'subreddit',
    'title',
]

In [28]:
# Column layout of the final dataframe. 'class', 'date' and 'datetime' are not
# API fields: they are added by the data generating functions.
column_order = [
    'class',        # label added in generate_master_dataframe
    'subreddit', 'title', 'selftext',
    'date',         # derived from created_utc
    'score', 'num_comments', 'author', 'spoiler', 'id',
    'datetime',     # derived from created_utc
    'created_utc',
]

## Data Generating Functions

In [29]:
# Request one page of submissions from the Pushshift search endpoint
def create_request_submissions(subreddit, before, timeout=30):
    """Send a GET request for one page of a subreddit's submissions.

    Parameters
    ----------
    subreddit : str
        Sent as the ``subreddit`` query parameter.
    before : int
        Epoch timestamp sent as the ``before`` query parameter.
    timeout : float, optional
        Seconds to wait for the server before giving up. Without a timeout
        ``requests`` can block indefinitely on a stalled connection.

    Returns
    -------
    requests.Response
        The successful response.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status, so the failure is
        reported here rather than later while parsing the body.
    """
    url_submissions = "https://api.pushshift.io/reddit/search/submission"
    params = {
        'subreddit': subreddit,
        'size': 1000,
        'before': before,
    }
    response = requests.get(url_submissions, params=params, timeout=timeout)
    response.raise_for_status()
    return response

# Generate Dataframe with specified fields
def create_dataframe(response, fields):
    """Build a dataframe from the ``data`` records of a response.

    Parameters
    ----------
    response
        Object whose ``.json()`` returns a mapping with a ``'data'`` entry
        holding the records.
    fields : list of str
        Columns to keep, in this order.

    Returns
    -------
    pandas.DataFrame
        One row per record and exactly the columns in ``fields``. A field
        that no record contains becomes an all-NaN column, and an empty page
        yields an empty dataframe that still has the requested columns.
    """
    records = response.json()['data']
    # reindex instead of .loc[:, fields]: .loc raises KeyError as soon as one
    # requested field is absent (which includes every field on an empty page).
    return pd.DataFrame(records).reindex(columns=fields)

# Generate new columns to show the Date and Datetime
def create_date_columns(dataframe):
    """Add ``date`` and ``datetime`` columns derived from ``created_utc``.

    ``created_utc`` is read as epoch seconds and interpreted as UTC, so the
    result is the same on every machine. (``date.fromtimestamp`` /
    ``datetime.fromtimestamp`` convert to the local timezone of whichever
    computer runs the code.)

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain a ``created_utc`` column. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same dataframe object, with the two new columns added.
    """
    # One vectorised conversion instead of two element-wise .apply calls
    timestamps = pd.to_datetime(dataframe['created_utc'], unit='s')
    dataframe['date'] = timestamps.dt.date
    dataframe['datetime'] = timestamps
    return dataframe

# Fetch one page of submissions and return it as a dataframe with date columns
def pull_data(subreddit, fields, before):
    """Request one page for ``subreddit`` and turn it into a dataframe.

    Chains the three helpers: the request is made with ``before``, the
    response is reduced to ``fields``, and the date columns are added.
    """
    response = create_request_submissions(subreddit, before)
    page = create_dataframe(response, fields)
    page_with_dates = create_date_columns(page)
    return page_with_dates

# Smallest created_utc value found in a dataframe
def earliest_utc(dataframe):
    """Return the minimum of the dataframe's ``created_utc`` column."""
    utc_values = dataframe['created_utc']
    return utc_values.min()

# Pull submissions from a subreddit and make a dataframe
def mass_pull(subreddit, fields, starting_epoch, volume):
    """Page backwards through a subreddit and stack the pages into one dataframe.

    Makes ``volume // 1000`` requests, so ``volume`` is effectively rounded
    down to a multiple of 1000. The first request asks for submissions from
    before ``starting_epoch``; each later one asks for submissions from
    before the earliest ``created_utc`` of the previous page. Paging stops
    early if a page comes back empty.

    Parameters
    ----------
    subreddit : str
        Subreddit to pull from.
    fields : list of str
        Columns to keep from each page.
    starting_epoch : int
        Epoch timestamp to start paging backwards from.
    volume : int
        Number of submissions wanted (see rounding note above).

    Returns
    -------
    pandas.DataFrame
        All pages concatenated with a fresh integer index; an empty
        dataframe when no page was collected.
    """
    starting_time = time.time()
    from_before = starting_epoch
    loop_length = volume // 1000
    pages = []

    for i in range(loop_length):
        print(f"Pulled {(i+1)*1000} submissions of {volume} from r/{subreddit} from before {dt.date.fromtimestamp(from_before)}")
        data = pull_data(subreddit, fields, from_before)
        if data.empty:
            # Nothing older is left; the minimum of an empty column is NaN,
            # which would be useless as the next ``before`` value.
            break
        pages.append(data)
        from_before = earliest_utc(data)
        time.sleep(1) # Pause so as not to trip the api limit

    # DataFrame.append was removed in pandas 2.0 and copied the accumulated
    # frame on every iteration; collect the pages and concatenate once.
    master_data = pd.concat(pages, ignore_index=True) if pages else pd.DataFrame()
    print(f'This mass pull of {master_data.shape[0]} {subreddit} rows took {time.time()-starting_time} seconds')
    print('\n')
    return master_data
     
    
# Pull submissions from 2 subreddits and make a combined dataframe
def generate_master_dataframe(subreddit1, subreddit2, fields, dataframe_column_order, starting_epoch, volume):
    """Pull two subreddits and return one labelled, column-ordered dataframe.

    Parameters
    ----------
    subreddit1, subreddit2 : str
        Subreddits to pull. Rows from ``subreddit1`` get class 0 and rows
        from ``subreddit2`` get class 1.
    fields : list of str
        Columns requested from each pull.
    dataframe_column_order : list of str
        Columns (and their order) of the returned dataframe.
    starting_epoch : int
        Epoch timestamp both pulls page backwards from.
    volume : int
        Number of submissions requested from each subreddit.

    Returns
    -------
    pandas.DataFrame
        Rows of both pulls with a ``class`` column, restricted to
        ``dataframe_column_order``.
    """
    starting_time = time.time()
    dataframe1 = mass_pull(subreddit1, fields, starting_epoch, volume)
    dataframe2 = mass_pull(subreddit2, fields, starting_epoch, volume)
    # DataFrame.append was removed in pandas 2.0
    output = pd.concat([dataframe1, dataframe2], ignore_index=True)
    # Label from this function's own arguments rather than notebook globals,
    # and compare case-insensitively so that passing 'conservative' still
    # matches rows whose subreddit value is spelled 'Conservative'.
    class_labels = {subreddit1.lower(): 0, subreddit2.lower(): 1}
    output['class'] = output['subreddit'].str.lower().map(class_labels)
    output = output[dataframe_column_order]
    print(f'This function took {time.time()-starting_time} seconds to run')
    return output

## Subreddit Data Pull

In [30]:
# Pull both subreddits and build the combined, labelled dataframe
dataframe = generate_master_dataframe(
    subreddit1=class0_subreddit,
    subreddit2=class1_subreddit,
    fields=features,
    dataframe_column_order=column_order,
    starting_epoch=noon_18Apr2020,
    volume=how_many,
)

Pulled 1000 submissions of 100000 from r/conservative from before 2020-04-18
Pulled 2000 submissions of 100000 from r/conservative from before 2020-04-16
Pulled 3000 submissions of 100000 from r/conservative from before 2020-04-13
Pulled 4000 submissions of 100000 from r/conservative from before 2020-04-11
Pulled 5000 submissions of 100000 from r/conservative from before 2020-04-08
Pulled 6000 submissions of 100000 from r/conservative from before 2020-04-06
Pulled 7000 submissions of 100000 from r/conservative from before 2020-04-03
Pulled 8000 submissions of 100000 from r/conservative from before 2020-04-01
Pulled 9000 submissions of 100000 from r/conservative from before 2020-03-29
Pulled 10000 submissions of 100000 from r/conservative from before 2020-03-27
Pulled 11000 submissions of 100000 from r/conservative from before 2020-03-24
Pulled 12000 submissions of 100000 from r/conservative from before 2020-03-21
Pulled 13000 submissions of 100000 from r/conservative from before 2020-0

Pulled 6000 submissions of 100000 from r/libertarian from before 2020-03-16
Pulled 7000 submissions of 100000 from r/libertarian from before 2020-03-09
Pulled 8000 submissions of 100000 from r/libertarian from before 2020-03-02
Pulled 9000 submissions of 100000 from r/libertarian from before 2020-02-25
Pulled 10000 submissions of 100000 from r/libertarian from before 2020-02-18
Pulled 11000 submissions of 100000 from r/libertarian from before 2020-02-11
Pulled 12000 submissions of 100000 from r/libertarian from before 2020-02-05
Pulled 13000 submissions of 100000 from r/libertarian from before 2020-01-30
Pulled 14000 submissions of 100000 from r/libertarian from before 2020-01-22
Pulled 15000 submissions of 100000 from r/libertarian from before 2020-01-14
Pulled 16000 submissions of 100000 from r/libertarian from before 2020-01-06
Pulled 17000 submissions of 100000 from r/libertarian from before 2019-12-30
Pulled 18000 submissions of 100000 from r/libertarian from before 2019-12-19
Pul

## CSV Export

In [39]:
# Export to CSV
import os

# to_csv does not create missing directories, so make sure DATA/ exists first
os.makedirs("DATA", exist_ok=True)
dataframe.to_csv("DATA/subreddit_data.csv")

In [37]:
# Recompute the class label from the subreddit name:
# class0_subreddit -> 0, class1_subreddit -> 1 (any other value -> NaN)
dataframe['class'] = dataframe['subreddit'].map(
    {class0_subreddit: 0, class1_subreddit: 1}
)

In [1]:
# View your Dataframe: a bare expression on the last line of a notebook cell
# is rendered as that cell's output
dataframe

NameError: name 'dataframe' is not defined

## All Possible Submission Fields

In [9]:

# Reference list of submission field names. Only the uncommented entries are
# part of the list; uncomment a line to include that field. The entries that
# are currently active are the same names as in `features`.
fields_submissions = [
#     'all_awardings',
    'author',
#     'author_flair_css_class',
#     'author_flair_richtext',
#     'author_flair_text',
#     'author_flair_type',
#     'author_fullname',
#     'author_patreon_flair',
#     'can_mod_post',
#     'contest_mode',
    'created_utc',
#     'domain',
#     'full_link',
#     'gilded',
#     'gildings',
    'id',
#     'is_crosspostable',
#     'is_meta',
#     'is_original_content',
#     'is_reddit_media_domain',
#     'is_robot_indexable',
#     'is_self',
#     'is_video',
#     'link_flair_background_color',
#     'link_flair_css_class',
#     'link_flair_richtext',
#     'link_flair_template_id',
#     'link_flair_text',
#     'link_flair_text_color',
#     'link_flair_type',
#     'locked',
#     'media_only',
#     'no_follow',
    'num_comments',
#     'num_crossposts',
#     'over_18',
#     'parent_whitelist_status',
#     'permalink',
#     'pinned',
#     'pwls',
#     'retrieved_on',
    'score',
    'selftext',
#     'send_replies',
    'spoiler',
#     'stickied',
    'subreddit',
#     'subreddit_id',
#     'subreddit_subscribers',
#     'subreddit_type',
#     'thumbnail',
    'title',
#     'total_awards_received',
#     'updated_utc',
#     'url',
#     'whitelist_status',
#     'wls',
#     'author_flair_background_color',
#     'author_flair_text_color',
#     'post_hint',
#     'preview',
#     'thumbnail_height',
#     'thumbnail_width',
#     'brand_safe',
#     'distinguished',
#     'author_created_utc',
#     'media',
#     'media_embed',
#     'secure_media',
#     'secure_media_embed',
#     'suggested_sort',
#     'rte_mode',
#     'approved_at_utc',
#     'banned_at_utc',
#     'author_flair_template_id',
#     'allow_live_comments',
#     'event_end',
#     'event_is_live',
#     'event_start',
#     'edited',
#     'view_count',
#     'author_cakeday',
#     'steward_reports',
#     'collections',
#     'awarders',
#     'og_description',
#     'og_title',
#     'media_metadata',
#     'mod_reports',
#     'user_reports'
]

# Show which fields are currently active
fields_submissions

['author',
 'created_utc',
 'id',
 'num_comments',
 'score',
 'selftext',
 'spoiler',
 'subreddit',
 'title']