In [1]:
import os
PROJ_ROOT = os.path.abspath(os.path.join(os.pardir))
print(PROJ_ROOT)

/home/bruno/Desktop/coarse-discourse-validation


In [2]:
import pandas as pd
import json

In [3]:
filepath = os.path.join(PROJ_ROOT, 'data', 'raw', 'coarse_discourse_dump_reddit.json')

raw_df = pd.read_json(filepath, lines=True) # 'lines' so pandas reads each line of file as a json object

# Converts 'is_self_post' column to boolean
raw_df['is_self_post'] = raw_df['is_self_post'].fillna(0).astype('bool')

raw_df[raw_df['is_self_post']!= True].head()

raw_df.head()

Unnamed: 0,is_self_post,posts,subreddit,title,url
0,True,"[{'id': 't3_1bx6qw', 'annotations': [{'link_to...",100movies365days,DTX120: #87 - Nashville,https://www.reddit.com/r/100movies365days/comm...
1,True,"[{'id': 't3_omv7p', 'annotations': [{'link_to_...",100sets,"Male, 23 years old. Going for 100 sets!",https://www.reddit.com/r/100sets/comments/omv7...
2,True,"[{'id': 't3_259tbh', 'annotations': [{'link_to...",1200isplenty,122cal black currant cheesecake!,https://www.reddit.com/r/1200isplenty/comments...
3,True,"[{'id': 't3_16h61h', 'annotations': [{'link_to...",1911,Need help finding a Springfield!,https://www.reddit.com/r/1911/comments/16h61h/...
4,True,"[{'id': 't3_35igzp', 'annotations': [{'link_to...",1911,Help with a possible trade?,https://www.reddit.com/r/1911/comments/35igzp/...


# Data preprocessing

To train our models, we want every post as a line in the final dataset. Note that we keep the features (but the url) of the thread so we don't discard any data for now.

In [52]:
thread = raw_df.iloc[45]

thread.posts[0]

{'id': 't3_4491q5',
 'annotations': [{'link_to_post': 'none',
   'main_type': 'question',
   'annotator': 'f8484f2bc9b23a4f9dc9f441a85d5747'},
  {'link_to_post': 'none',
   'main_type': 'question',
   'annotator': '38d643ea0762278cc21958fa2363041d'},
  {'link_to_post': 'none',
   'main_type': 'question',
   'annotator': '887a4849ca16b52391353d094ca5aae2'}],
 'majority_link': 'none',
 'majority_type': 'question',
 'is_first_post': True,
 'body': '[deleted]',
 'url': 'https://www.reddit.com/r/3DS/comments/4491q5/is_there_anyway_to_make_the_3ds_not_new_3ds/'}

In [15]:
for post in thread.posts:
    print(post['id']+' - ', end='')
    try:
        print(post['in_reply_to'])
    except KeyError:
        print('root')

t3_1riref - root
t1_cdo6ovs - t3_1riref
t1_cdo7orh - t1_cdo6ovs
t1_cdo7t4w - t1_cdo7orh
t1_cdo7v6s - t1_cdo7t4w
t1_cdo8mp8 - t3_1riref


In [36]:
import anytree

dposts = pd.DataFrame(thread.posts)

tree = {}

for i in range(0,len(dposts)):
    row = dposts.iloc[i]
    try:
        tree[row['id']] = anytree.Node(row['id'], parent=tree[row['in_reply_to']])
    except KeyError:
        tree[row['id']] = anytree.Node(row['id'])
        root = tree[row['id']]
print(anytree.RenderTree(root))

Node('/t3_1riref')
├── Node('/t3_1riref/t1_cdo6ovs')
│   └── Node('/t3_1riref/t1_cdo6ovs/t1_cdo7orh')
│       └── Node('/t3_1riref/t1_cdo6ovs/t1_cdo7orh/t1_cdo7t4w')
│           └── Node('/t3_1riref/t1_cdo6ovs/t1_cdo7orh/t1_cdo7t4w/t1_cdo7v6s')
└── Node('/t3_1riref/t1_cdo8mp8')


In [54]:
dpproc = pd.DataFrame()

for i in range(0, len(raw_df)):
    thread = raw_df.iloc[i]
    dposts = pd.DataFrame(thread['posts'])
    dauthor = dposts[dposts['is_first_post']==True]
    if(len(dauthor)!=1):
        print("Eita, tem {} first post's".format(len(dauthor)))
    else:
        try:
            dposts['thread_author'] = dauthor.iloc[0]['author']
        except:
            print(dauthor.iloc[0])
            dposts['thread_author'] = '[deleted author]'
    
    dposts['is_self_post'] = thread['is_self_post']
    dposts['subreddit'] = thread['subreddit']
    dposts['thread_title'] = thread['title']
    
    # Feature generation
    dposts['comments_number'] = len(thread['posts'])
        # tree generation for comments branching features
    tree = {}
    for i in range(0,len(dposts)):
        row = dposts.iloc[i]
        try:
            tree[row['id']] = anytree.Node(row['id'], parent=tree[row['in_reply_to']])
        except KeyError:
            tree[row['id']] = anytree.Node(row['id'])
            root = tree[row['id']]
    dposts['branches_number'] = len(root.leaves)
    dposts['average_branch_length'] = sum([leaf.depth for leaf in root.leaves])/len(root.leaves)

    dpproc = pd.concat([dpproc, dposts], ignore_index=True, sort=True)

dpproc.head()

annotations      [{'link_to_post': 'none', 'main_type': 'questi...
id                                                       t3_3as2wh
in_reply_to                                                    NaN
is_first_post                                                 True
majority_link                                                 none
majority_type                                             question
post_depth                                                     NaN
Name: 0, dtype: object
annotations      [{'link_to_post': 'none', 'main_type': 'questi...
body             Recently a friend tagged me in the comments of...
id                                                       t3_4dayqg
in_reply_to                                                    NaN
is_first_post                                                 True
majority_link                                                 none
majority_type                                             question
post_depth                             

annotations      [{'link_to_post': 'none', 'main_type': 'questi...
id                                                       t3_1zem12
in_reply_to                                                    NaN
is_first_post                                                 True
majority_link                                                 none
majority_type                                             question
post_depth                                                     NaN
Name: 0, dtype: object
annotations      [{'link_to_post': 'none', 'main_type': 'questi...
id                                                       t3_26c4be
in_reply_to                                                    NaN
is_first_post                                                 True
majority_link                                                 none
majority_type                                             question
post_depth                                                     NaN
Name: 0, dtype: object
annotations     

annotations      [{'link_to_post': 'none', 'main_type': 'questi...
body               From an atari to some random alienware pc. WOW!
id                                                        t3_ingdd
in_reply_to                                                    NaN
is_first_post                                                 True
majority_link                                                 none
majority_type                                             question
post_depth                                                     NaN
url              https://www.reddit.com/r/gaming/comments/ingdd...
Name: 0, dtype: object
annotations      [{'link_to_post': 'none', 'main_type': 'announ...
id                                                       t3_4ce2jg
in_reply_to                                                    NaN
is_first_post                                                 True
majority_link                                                 none
majority_type                          

Unnamed: 0,annotations,author,average_branch_length,body,branches_number,comments_number,id,in_reply_to,is_first_post,is_self_post,majority_link,majority_type,post_depth,subreddit,thread_author,thread_title,url
0,"[{'link_to_post': 'none', 'main_type': 'announ...",DTX120,3.0,4/7/13 \n\n7/27/12 \n\nhttp://www.imdb.com/t...,1,4,t3_1bx6qw,,True,True,none,announcement,,100movies365days,DTX120,DTX120: #87 - Nashville,https://www.reddit.com/r/100movies365days/comm...
1,"[{'link_to_post': 't3_1bx6qw', 'main_type': 'a...",mcgrewf10,3.0,I've wanted to watch this for a long time. I w...,1,4,t1_c9b2nyd,t3_1bx6qw,,True,t3_1bx6qw,elaboration,1.0,100movies365days,DTX120,DTX120: #87 - Nashville,
2,"[{'link_to_post': 't1_c9b2nyd', 'main_type': '...",DTX120,3.0,You strike me as the type who would appreciate...,1,4,t1_c9b30i1,t1_c9b2nyd,,True,t1_c9b2nyd,elaboration,2.0,100movies365days,DTX120,DTX120: #87 - Nashville,
3,"[{'link_to_post': 't1_c9b30i1', 'main_type': '...",mcgrewf10,3.0,"Yeah, I've always heard that Altman was famous...",1,4,t1_c9b6sj0,t1_c9b30i1,,True,t1_c9b30i1,elaboration,3.0,100movies365days,DTX120,DTX120: #87 - Nashville,
4,"[{'link_to_post': 'none', 'main_type': 'announ...",Keatonus,1.5,"Alright guys, little background about myself. ...",4,7,t3_omv7p,,True,True,none,announcement,,100sets,Keatonus,"Male, 23 years old. Going for 100 sets!",https://www.reddit.com/r/100sets/comments/omv7...


In [27]:
dpproc[dpproc['thread_author']=='[deleted author]'][dpproc['author'].notna()]

  """Entry point for launching an IPython kernel.


Unnamed: 0,annotations,author,body,id,in_reply_to,is_first_post,is_self_post,majority_link,majority_type,post_depth,subreddit,thread_author,thread_title,url


Here we can see that we get the same amount of posts as in 1.1 notebook.

In [18]:
len(dpproc)

936

## Columns cleaning

### is_first_post

Now we verify that `is_first_post` column is useless.

In [8]:
dpproc[dpproc['in_reply_to'].isna()]['is_first_post'].value_counts()

True    9482
Name: is_first_post, dtype: int64

So we drop it.

In [9]:
dpproc.drop(columns=['is_first_post'], inplace=True)

Acknowledging that _NaN_ values at the `in_reply_to` column mean that it is the first post

### post_depth

In [14]:
dpproc['post_depth'].fillna(0).value_counts()

1.0     51262
2.0     25651
3.0     14320
0.0      9482
4.0      7228
5.0      3830
6.0      2086
7.0      1182
8.0       657
9.0       401
10.0      254
Name: post_depth, dtype: int64

In [15]:
dpproc['post_depth'] = dpproc['post_depth'].fillna(0)

### body

If `body` is _NaN_ then the post is useless to us.

In [18]:
dpproc.dropna(subset=['body'], inplace=True)

AttributeError: 'NoneType' object has no attribute 'head'

In [20]:
len(dpproc)

114437

This means a 1,65% loss.

### author

We'll assume that _NaN_ values at this column mean that the user was deleted.

In [23]:
dpproc[dpproc['author'].isna()].head()

Unnamed: 0,annotations,author,body,id,in_reply_to,is_self_post,majority_link,majority_type,post_depth,subreddit,thread_title,url
6,"[{'link_to_post': 't3_omv7p', 'main_type': 'qu...",,More updates!,t1_c3qodom,t3_omv7p,True,t3_omv7p,question,1.0,100sets,"Male, 23 years old. Going for 100 sets!",
8,"[{'link_to_post': 't3_259tbh', 'main_type': 'a...",,>very specific items that aren't available eve...,t1_chrzb0h,t3_259tbh,True,t3_259tbh,appreciation,1.0,1200isplenty,122cal black currant cheesecake!,
11,"[{'link_to_post': 't1_chs0fj3', 'main_type': '...",,"Oh, we have gelatin, but I've never seen it in...",t1_chs51e3,t1_chs0eqa,True,t1_chs0eqa,elaboration,3.0,1200isplenty,122cal black currant cheesecake!,
15,"[{'link_to_post': 'none', 'main_type': 'questi...",,I can't find them anywhere! I just want the mo...,t3_16h61h,,True,none,question,0.0,1911,Need help finding a Springfield!,https://www.reddit.com/r/1911/comments/16h61h/...
21,"[{'link_to_post': 't1_c7w0wls', 'main_type': '...",,"Oh hell no. Sorry, I linked to the wrong pisto...",t1_c7w1x38,t1_c7w0wls,True,t1_c7w0wls,answer,2.0,1911,Need help finding a Springfield!,


In [27]:
dpproc[dpproc['author']=='deleted'].head()

Unnamed: 0,annotations,author,body,id,in_reply_to,is_self_post,majority_link,majority_type,post_depth,subreddit,thread_title,url


In [32]:
dpproc['author'] = dpproc['author'].fillna('deleted')

## Final version

In [1]:
dpproc.head()

NameError: name 'dpproc' is not defined