In [1]:
import sqlite3
import pandas as pd

In [2]:
# Relative path of the SQLite database file that create_connection() opens.
sqlite_file = "data/database.sqlite"

In [17]:
def create_connection(db):
    """Open a connection to a SQLite database.

    :param db: path to the database file
    :return: a ``sqlite3.Connection`` object, or ``None`` if the
      connection could not be opened (the error is printed)
    """
    try:
        return sqlite3.connect(db)
    except sqlite3.Error as e:
        # Catch the sqlite3 base exception explicitly: a bare ``Error`` is
        # not defined in this notebook, so a failed connect would surface as
        # a NameError instead of the documented "print and return None".
        print(e)
        return None

In [15]:
# Connection shared by every query cell below; per create_connection's
# docstring this is None when the database could not be opened.
db = create_connection(db=sqlite_file)

In [30]:
# Show the column definitions SQLite reports for the May2015 table.
# What each field means is documented in the Reddit JSON wiki:
# https://github.com/reddit-archive/reddit/wiki/JSON
display(
    pd.read_sql_query(
        sql="PRAGMA table_info('May2015');",
        con=db,
    )
)

Unnamed: 0,cid,name,type,notnull,dflt_value,pk
0,0,created_utc,INTEGER,0,,0
1,1,ups,INTEGER,0,,0
2,2,subreddit_id,,0,,0
3,3,link_id,,0,,0
4,4,name,,0,,0
5,5,score_hidden,,0,,0
6,6,author_flair_css_class,,0,,0
7,7,author_flair_text,,0,,0
8,8,subreddit,,0,,0
9,9,id,,0,,0


In [31]:
# Preview query: five rows of May2015, limited to the subreddit, author,
# vote-count and body columns.
sample_rows = "SELECT \
                 subreddit, \
                 author, \
                 ups, \
                 downs, \
                 score, \
                 body \
               FROM \
                 May2015 \
               LIMIT 5;"

# Run the query on the shared connection and render the result as a table.
display(
    pd.read_sql_query(sql=sample_rows, con=db)
)

Unnamed: 0,subreddit,author,ups,downs,score,body
0,soccer_jp,rx109,4,0,4,くそ\n読みたいが買ったら負けな気がする\n図書館に出ねーかな
1,nba,WyaOfWade,4,0,4,gg this one's over. off to watch the NFL draft...
2,politics,Wicked_Truth,0,0,0,Are you really implying we return to those tim...
3,AskReddit,jesse9o3,3,0,3,No one has a European accent either because i...
4,AskReddit,beltfedshooter,3,0,3,"That the kid ""..reminds me of Kevin."" so sad..."


In [28]:
# Count rows per subreddit and keep the five largest groups.
# GROUP BY 1 / ORDER BY 2 refer to the select-list positions
# (1 = subreddit, 2 = COUNT(*)).
top_subreddits = "SELECT \
                    subreddit, \
                    COUNT(*) \
                  FROM \
                    May2015 \
                  GROUP BY 1 \
                  ORDER BY 2 DESC LIMIT 5;"

# Run the query on the shared connection and render the result as a table.
display(
    pd.read_sql_query(sql=top_subreddits, con=db)
)

Unnamed: 0,subreddit,COUNT(*)
0,AskReddit,4234970
1,leagueoflegends,1223184
2,nba,756195
3,funny,745916
4,pics,630925


In [38]:
# Preview query: every column of the first five rows of May2015.
# NOTE(review): this reassigns `top_subreddits`, which an earlier cell uses
# for the per-subreddit count query; after this cell runs the name no longer
# holds that query. Consider a dedicated name for this one.
top_subreddits = "SELECT \
                    * \
                  FROM \
                    May2015 \
                  LIMIT 5;"

# Run the query on the shared connection and render the result as a table.
display(
    pd.read_sql_query(sql=top_subreddits, con=db)
)

Unnamed: 0,created_utc,ups,subreddit_id,link_id,name,score_hidden,author_flair_css_class,author_flair_text,subreddit,id,...,downs,archived,author,score,retrieved_on,body,distinguished,edited,controversiality,parent_id
0,1430438400,4,t5_378oi,t3_34di91,t1_cqug90g,0,,,soccer_jp,cqug90g,...,0,0,rx109,4,1432703079,くそ\n読みたいが買ったら負けな気がする\n図書館に出ねーかな,,0,0,t3_34di91
1,1430438400,4,t5_2qo4s,t3_34g8mx,t1_cqug90h,0,Heat,Heat,nba,cqug90h,...,0,0,WyaOfWade,4,1432703079,gg this one's over. off to watch the NFL draft...,,0,0,t3_34g8mx
2,1430438400,0,t5_2cneq,t3_34f7mc,t1_cqug90i,0,,,politics,cqug90i,...,0,0,Wicked_Truth,0,1432703079,Are you really implying we return to those tim...,,0,0,t1_cqufim0
3,1430438400,3,t5_2qh1i,t3_34f9rh,t1_cqug90j,0,,,AskReddit,cqug90j,...,0,0,jesse9o3,3,1432703079,No one has a European accent either because i...,,0,0,t1_cqug2sr
4,1430438400,3,t5_2qh1i,t3_34fvry,t1_cqug90k,0,,,AskReddit,cqug90k,...,0,0,beltfedshooter,3,1432703079,"That the kid ""..reminds me of Kevin."" so sad...",,0,0,t3_34fvry


In [37]:
# Total number of rows stored in the May2015 table.
number_of_records = "SELECT \
                       COUNT(*) \
                     FROM \
                       May2015;"
# Run the query on the shared connection and render the single-row result.
display(
    pd.read_sql_query(sql=number_of_records, con=db)
)


Unnamed: 0,COUNT(*)
0,54504410


In [33]:
# Query the body text of the first five rows of May2015.
body_samples = "SELECT \
                  body \
                FROM \
                  May2015 \
                LIMIT 5;"

# Render the bodies without truncating the column.
# - `None` is the supported "no limit" value for display.max_colwidth; the
#   old `-1` spelling is deprecated since pandas 1.0 and rejected by newer
#   releases.
# - option_context restores whatever width was configured before, even if
#   the query raises, instead of forcing a hard-coded 50 afterwards.
# - display() must be called inside the `with` block so the frame is
#   rendered while the option is still in effect.
with pd.option_context('display.max_colwidth', None):
    display(pd.read_sql_query(body_samples, db))

Unnamed: 0,body
0,くそ\n読みたいが買ったら負けな気がする\n図書館に出ねーかな
1,gg this one's over. off to watch the NFL draft I guess
2,"Are you really implying we return to those times or anywhere near that political environment? If so, you won't have much luck selling the American people on that governance concept without ushering in American Revolution 2.0."
3,No one has a European accent either because it doesn't exist. There are accents from Europe but not a European accent.
4,"That the kid ""..reminds me of Kevin."" so sad :-("


In [40]:
# Attempt to save the entire thing as a .csv.
#
# The query selects the identifying, voting and body columns for every
# comment whose body is longer than 25 characters.
# NOTE(review): the export itself is commented out below, so running this
# cell only defines the query string. The COUNT(*) cell earlier in the
# notebook reports 54,504,410 rows in May2015, so reading the filtered result
# into a single DataFrame may need a lot of memory; confirm before
# re-enabling (pd.read_sql accepts a chunksize argument for streaming).

data_for_csv = "SELECT \
                  subreddit, \
                  author, \
                  ups, \
                  downs, \
                  score, \
                  body, \
                  link_id, \
                  id, \
                  parent_id, \
                  name \
                FROM \
                  May2015 \
                WHERE \
                  LENGTH(body) > 25";

#data_for_csv_df = pd.read_sql(data_for_csv, db)

# The closing quote belongs after the file name so that index=False is passed
# as a keyword argument rather than being part of the file name.
#data_for_csv_df.to_csv("extracted_data", index = False)
#data_for_csv_df.head()

Unnamed: 0,subreddit,author,ups,downs,score,body,link_id,id,parent_id,name
0,soccer_jp,rx109,4,0,4,くそ\n読みたいが買ったら負けな気がする\n図書館に出ねーかな,t3_34di91,cqug90g,t3_34di91,t1_cqug90g
1,nba,WyaOfWade,4,0,4,gg this one's over. off to watch the NFL draft...,t3_34g8mx,cqug90h,t3_34g8mx,t1_cqug90h
2,politics,Wicked_Truth,0,0,0,Are you really implying we return to those tim...,t3_34f7mc,cqug90i,t1_cqufim0,t1_cqug90i
3,AskReddit,jesse9o3,3,0,3,No one has a European accent either because i...,t3_34f9rh,cqug90j,t1_cqug2sr,t1_cqug90j
4,AskReddit,beltfedshooter,3,0,3,"That the kid ""..reminds me of Kevin."" so sad...",t3_34fvry,cqug90k,t3_34fvry,t1_cqug90k
