In [1]:
import sqlite3
import pandas as pd

In [2]:
# path of the SQLite database file, relative to the notebook's working directory
sqlite_file = "data/database.sqlite"

In [3]:
def create_connection(db):
    """ connect to a sqlite database
    :param db: database file, passed straight to ``sqlite3.connect``
    :return: a sqlite db connection object,
      None if sqlite3 raised an error while connecting (the error is printed)
    """
    try:
        return sqlite3.connect(db)
    # The exception class must be qualified: a bare ``Error`` is not defined
    # in this notebook, so a failed connection would raise NameError instead
    # of reaching this handler.
    except sqlite3.Error as e:
        print(e)

    return None

In [4]:
# connection handle shared by the queries in this notebook
# (create_connection returns None when the connection attempt fails)
db = create_connection(db=sqlite_file)

In [5]:
# column layout of the May2015 table, as reported by the table_info pragma
# field meanings are documented at:
# https://github.com/reddit-archive/reddit/wiki/JSON
display(
    pd.read_sql_query(
        sql="PRAGMA table_info('May2015');",
        con=db,
    )
)

Unnamed: 0,cid,name,type,notnull,dflt_value,pk
0,0,created_utc,INTEGER,0,,0
1,1,ups,INTEGER,0,,0
2,2,subreddit_id,,0,,0
3,3,link_id,,0,,0
4,4,name,,0,,0
5,5,score_hidden,,0,,0
6,6,author_flair_css_class,,0,,0
7,7,author_flair_text,,0,,0
8,8,subreddit,,0,,0
9,9,id,,0,,0


In [6]:
# preview a few records with the columns of interest
# (row order is unspecified without ORDER BY, so this is an arbitrary
# preview rather than a random sample)

# triple-quoted SQL: backslash continuations inside a string literal break
# as soon as anything (even a space) follows the backslash
sample_rows = """
    SELECT
        subreddit,
        author,
        ups,
        downs,
        score,
        body
    FROM
        May2015
    LIMIT 5;
"""

display(pd.read_sql_query(sample_rows, db))

Unnamed: 0,subreddit,author,ups,downs,score,body
0,soccer_jp,rx109,4,0,4,くそ\n読みたいが買ったら負けな気がする\n図書館に出ねーかな
1,nba,WyaOfWade,4,0,4,gg this one's over. off to watch the NFL draft...
2,politics,Wicked_Truth,0,0,0,Are you really implying we return to those tim...
3,AskReddit,jesse9o3,3,0,3,No one has a European accent either because i...
4,AskReddit,beltfedshooter,3,0,3,"That the kid ""..reminds me of Kevin."" so sad..."


In [7]:
# list the five subreddits with the most comments

# grouping and ordering are spelled out instead of using positional
# references (GROUP BY 1 / ORDER BY 2), so the query still reads correctly
# if the select list changes; the result columns are unchanged
top_subreddits = """
    SELECT
        subreddit,
        COUNT(*)
    FROM
        May2015
    GROUP BY subreddit
    ORDER BY COUNT(*) DESC
    LIMIT 5;
"""

display(pd.read_sql_query(top_subreddits, db))

Unnamed: 0,subreddit,COUNT(*)
0,AskReddit,4234970
1,leagueoflegends,1223184
2,nba,756195
3,funny,745916
4,pics,630925


In [8]:
# sample full posts (comment bodies shown without truncation)

body_samples = """
    SELECT
        body
    FROM
        May2015
    LIMIT 5;
"""

# max_colwidth=None disables column truncation (requires pandas >= 1.0);
# the older sentinel -1 has been deprecated since pandas 1.0 and is rejected
# by newer releases.
# option_context restores whatever value was set before, even if the query
# raises, instead of hard-coding the default of 50 afterwards.
with pd.option_context('display.max_colwidth', None):
    display(pd.read_sql_query(body_samples, db))

Unnamed: 0,body
0,くそ\n読みたいが買ったら負けな気がする\n図書館に出ねーかな
1,gg this one's over. off to watch the NFL draft I guess
2,"Are you really implying we return to those times or anywhere near that political environment? If so, you won't have much luck selling the American people on that governance concept without ushering in American Revolution 2.0."
3,No one has a European accent either because it doesn't exist. There are accents from Europe but not a European accent.
4,"That the kid ""..reminds me of Kevin."" so sad :-("


In [9]:
# total number of records in May2015 (no filter is applied);
# num_rec is reused further down to derive the sampling rate

number_of_records = """
    SELECT
        COUNT(*)
    FROM
        May2015;
"""

num_rec = pd.read_sql_query(number_of_records, db)

display(num_rec)

Unnamed: 0,COUNT(*)
0,54504410


In [10]:
# work out which fraction of the table to keep so that a random sample
# comes out at roughly `target_rows` rows (the sample is saved as a csv below)

target_rows = 100000
total_rows = num_rec.iloc[0, 0]  # the single cell of the COUNT(*) result

# probability with which each individual row is kept
sample_threshold = target_rows / total_rows

sample_threshold


0.0018347139249833178

In [11]:
# verify counts: how many rows does a random draw at this threshold select?
# RANDOM() is evaluated afresh for every row of every query, so this count
# comes from its own draw and will differ slightly from any other sample
# taken with the same threshold.

# RANDOM() yields a signed 64-bit integer; dividing its magnitude by 2^63
# (9223372036854775808) maps it onto [0, 1]. The CAST to REAL happens before
# ABS because ABS() of the integer -9223372036854775808 raises an integer
# overflow error in SQLite.
threshold_count = """
    SELECT
        COUNT(*)
    FROM
        May2015
    WHERE
        ABS(CAST(RANDOM() AS REAL)) / 9223372036854775808 < ?;
"""

# the threshold is bound as a query parameter instead of being formatted
# into the SQL text; float() hands sqlite3 a plain Python float in case
# sample_threshold is a NumPy scalar
display(pd.read_sql_query(threshold_count, db, params=(float(sample_threshold),)))

Unnamed: 0,COUNT(*)
0,100519


In [12]:

# draw the random sample and save it as a csv
# NOTE: the draw is not seeded, so every run exports a different sample of
# roughly (not exactly) the targeted size.

# each row is kept when a uniform value in [0, 1], derived from RANDOM(),
# falls below the bound threshold
data_for_csv = """
    SELECT
        subreddit,
        author,
        ups,
        downs,
        score,
        body,
        link_id,
        id,
        parent_id,
        name
    FROM
        May2015
    WHERE
        ABS(CAST(RANDOM() AS REAL)) / 9223372036854775808 < ?;
"""

# read_sql_query (as used by the other cells) with the threshold bound as a
# parameter; float() hands sqlite3 a plain Python float in case
# sample_threshold is a NumPy scalar
data_for_csv_df = pd.read_sql_query(data_for_csv, db, params=(float(sample_threshold),))

data_for_csv_df.to_csv("data/reddit_extracted_data.csv", index=False)

data_for_csv_df.head()

Unnamed: 0,subreddit,author,ups,downs,score,body,link_id,id,parent_id,name
0,ChivalryGame,DrFranknFurter,15,0,15,"I really don't get these posts. It's a game, w...",t3_34ghnt,cqug9da,t3_34ghnt,t1_cqug9da
1,AskReddit,Mbwillow1,3,0,3,Fart.,t3_34gmug,cqug9eq,t3_34gmug,t1_cqug9eq
2,TryingForABaby,Bob_Beran,1,0,1,There was something very shitty about the doct...,t3_34ay6r,cquga5j,t1_cqufibh,t1_cquga5j
3,AskReddit,Letha0al,1,0,1,"""What's in the dark chocolate Champagne truffl...",t3_34ffo5,cqugad6,t3_34ffo5,t1_cqugad6
4,nfl,maverickkk,1,0,1,SKOLLLLLL,t3_34dxa2,cqugaf7,t1_cqtxynn,t1_cqugaf7
