In [9]:
#importing important packages

import numpy as np
import pandas as pd
import pickle

In [2]:
# Reading in raw data that has not been processed at all
# DO NOT RUN THIS IF YOU'VE ALREADY STARTED PROCESSING
# (re-running this cell rebinds `data` to the raw, uncleaned frame).
# encoding_errors='replace' substitutes undecodable bytes instead of raising.
# NOTE(review): the path is an absolute, machine-specific location.

data = pd.read_csv("C:/Users/Andre/Documents/DBL/Reaching Understanding Study/merged_data_allsubs_wtext.csv", encoding_errors= 'replace')

In [3]:
# Global variables zone:
#     Tweak these to change little details about how the program runs

# Each post will be saved into this folder as an individual .csv named "<post id>_conversations.csv".
FILE_SAVE_LOCATION: str = "C:/Users/Andre/Documents/DBL/Reaching Understanding Study/Conversations/"
# Minimum comment-chain length: cmt_chain_len_filter drops chains shorter than this.
CMT_CHAIN_LEN: int = 4
# Only these columns are kept; the remaining columns are not used by the pipeline.
IMPORTANT_COLUMNS: list = ["cmt_id","submission_title","text","submission_link_id","created_utc","author","author_id","cmt_link_id","cmt_parent_id"]

In [4]:
# Function definition zone
# Just run this

def remove_t(element):
  """Strip a two-character type prefix such as 't1_', 't2_' or 't3_' from a cell value.

  A string is stripped only when its third character is an underscore; the
  first three characters are then removed. Non-string values, and strings
  that do not match (including strings shorter than three characters), are
  returned unchanged. - andre.chiquit.ooo
  """
  # The length guard matters: indexing element[2] on "" or "ab" would raise IndexError.
  if isinstance(element, str) and len(element) > 2 and element[2] == "_":
    return element[3:]
  return element


def pull_post(data: pd.DataFrame, post_id: str) -> pd.DataFrame:
  """Return the rows of `data` whose 'submission_link_id' equals `post_id`."""
  belongs_to_post = data["submission_link_id"] == post_id
  return data.loc[belongs_to_post]


def find_top_level_comments(data: pd.DataFrame) -> pd.DataFrame:
  """Return the rows whose 'cmt_parent_id' equals their 'submission_link_id' (direct replies to the post)."""
  replies_to_post = data["cmt_parent_id"].eq(data["submission_link_id"])
  return data[replies_to_post]


def find_children(data: pd.DataFrame, parent_id: str) -> pd.DataFrame:
  """Return the rows whose 'cmt_parent_id' equals `parent_id` (the direct replies to that comment)."""
  is_direct_reply = data["cmt_parent_id"] == parent_id
  return data.loc[is_direct_reply]


def assemble_children(data: pd.DataFrame, top_comment_id: str, parent_df: pd.DataFrame) -> pd.DataFrame:
  """Append every descendant of `top_comment_id` to `parent_df` and return the result.

  Direct replies are appended first, then each reply's own descendants are
  appended recursively in the order the replies appear in `data`. The
  returned frame has a fresh 0..n-1 index.
  """
  children = find_children(data, top_comment_id)
  # DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent.
  parent_df = pd.concat([parent_df, children], ignore_index=True)
  for child in children["cmt_id"]:
    parent_df = assemble_children(data, child, parent_df)
  return parent_df


def multi_data_frame(data: pd.DataFrame) -> pd.DataFrame:
  """Stack one sub-frame per top-level comment into a single keyed frame.

  Each sub-frame holds a top-level comment followed by all of its descendants
  (see assemble_children). The sub-frames are concatenated with the top-level
  comment ids as the outer index level, named 'top_comment_id'.
  """
  top_cmts = find_top_level_comments(data)
  outer_keys = top_cmts["cmt_id"].rename("top_comment_id")
  threads = [
      assemble_children(data, top_cmt, data[data["cmt_id"] == top_cmt])
      for top_cmt in top_cmts["cmt_id"]
  ]
  return pd.concat(threads, axis=0, keys=outer_keys, ignore_index=False)


def is_not_top(comment: pd.DataFrame) -> bool:
  """Return True unless the comment's 'cmt_parent_id' column equals its 'submission_link_id' column."""
  parent_ids = comment["cmt_parent_id"]
  post_ids = comment["submission_link_id"]
  return not parent_ids.equals(post_ids)


def grab_parent_comment(data: pd.DataFrame, current_comment: pd.DataFrame) -> pd.DataFrame:
  """Return the row(s) of `data` that are the parent of `current_comment`.

  `current_comment` is expected to be a single-row frame. Returns None when
  it is a top-level comment (its parent is the post itself).
  """
  if not is_not_top(current_comment):
    return None
  parent_id = current_comment["cmt_parent_id"].item()
  return data[data["cmt_id"] == parent_id]


def get_parent_cmt_ids(data: pd.DataFrame, current_comment: pd.DataFrame, comment_chain) -> list:
  """Append the comment id of every ancestor of `current_comment` to `comment_chain`.

  Ancestors are appended nearest first, walking upwards until a top-level
  comment is reached. `comment_chain` is modified in place and also returned.
  """
  cursor = current_comment
  while True:
    parent = grab_parent_comment(data, cursor)
    if parent is None:
      return comment_chain
    # .item() requires exactly one matching parent row.
    comment_chain.append(parent["cmt_id"].item())
    cursor = parent


def get_parent_author_ids(data: pd.DataFrame, current_comment: pd.DataFrame, author_chain) -> list:
  """Append the author id of every ancestor of `current_comment` to `author_chain`.

  Ancestors are visited nearest first, walking upwards until a top-level
  comment is reached. `author_chain` is modified in place and also returned.
  """
  cursor = current_comment
  while True:
    parent = grab_parent_comment(data, cursor)
    if parent is None:
      return author_chain
    # .item() requires exactly one matching parent row.
    author_chain.append(parent["author_id"].item())
    cursor = parent


def add_cmt_aut_chains(data: pd.DataFrame) -> pd.DataFrame:
  """Return a copy of `data` with 'cmt_chain' and 'author_chain' list columns added.

  For every comment, 'cmt_chain' is [its own id, parent id, grandparent id, ...]
  up to the top-level comment, and 'author_chain' holds the matching author ids
  in the same order.

  The input frame is not modified: the work happens on a copy, so passing a
  filtered slice (as pull_post returns) no longer writes into that slice.
  """
  final_data = data.copy()
  cmt_chains = []
  author_chains = []
  for comment in final_data["cmt_id"]:
    this_comment = final_data[final_data["cmt_id"] == comment]
    # Each chain starts with the comment itself; the helpers append the ancestors.
    cmt_chains.append(
        get_parent_cmt_ids(final_data, this_comment, [this_comment["cmt_id"].item()]))
    author_chains.append(
        get_parent_author_ids(final_data, this_comment, [this_comment["author_id"].item()]))

  # Assign whole object columns at once instead of writing lists cell by cell.
  final_data["cmt_chain"] = pd.Series(cmt_chains, index=final_data.index, dtype=object)
  final_data["author_chain"] = pd.Series(author_chains, index=final_data.index, dtype=object)
  return final_data


def remove_orphans(data: pd.DataFrame) -> pd.DataFrame:
  """Remove comments whose parent is neither another comment nor the post itself.

  A row is kept when its 'cmt_parent_id' exactly equals some 'cmt_id' or some
  'submission_link_id' in the frame. Removing a row can orphan its replies, so
  the filter is repeated until nothing more is removed.

  Ids are compared for exact equality. The previous `str.contains` check was a
  regex substring match, so a parent id such as 'ab' was wrongly treated as
  present whenever an id like 'abc' existed.
  """
  while True:
    known_ids = set(data["cmt_id"]).union(data["submission_link_id"])
    has_parent = data["cmt_parent_id"].isin(known_ids)
    data = data[has_parent]
    if has_parent.all():
      # Nothing was removed in this pass (also true for an empty frame).
      return data


def add_unique_authors(data: pd.DataFrame) -> pd.DataFrame:
  """Return a re-indexed copy of `data` with a 'unique_authors' list column.

  Each cell holds the distinct author ids from that row's 'author_chain', in
  order of first appearance. The index is reset to 0..n-1.
  """
  data = data.reset_index(drop=True)
  unique_lists = [list(dict.fromkeys(chain)) for chain in data["author_chain"]]
  # Build the object column in one assignment: writing a list cell by cell with
  # .at into a column that does not exist yet is fragile across pandas versions.
  data["unique_authors"] = pd.Series(unique_lists, index=data.index, dtype=object)
  return data


def add_cmt_aut_chain_strings(data: pd.DataFrame) -> pd.DataFrame:
  """Add 'cmt_chain_string' and 'author_chain_string' columns to `data`.

  Each string is the row's 'cmt_chain' / 'author_chain' list joined with '_'.
  The columns are added to `data` in place and the same frame is returned. An
  empty frame is returned untouched.

  The columns are computed in one vectorised pass; the previous per-row lookup
  by 'cmt_id' was quadratic and raised when an id occurred more than once.
  """
  if len(data.index) == 0:
    return data
  data["cmt_chain_string"] = data["cmt_chain"].map("_".join)
  data["author_chain_string"] = data["author_chain"].map("_".join)
  return data

def troubleshooting_add_cmt_aut_chain_strings(data: pd.DataFrame) -> pd.DataFrame:
  """Same as add_cmt_aut_chain_strings, but prints a progress line for every row.

  Adds 'cmt_chain_string' and 'author_chain_string' to `data` in place, one row
  at a time, and returns the same frame.
  """
  for counter, comment in enumerate(data["cmt_id"]):
    print(f"Currently processing [{counter}/{len(data.index)}]")
    this_comment = data[data["cmt_id"] == comment]
    # .item() requires that this comment id identifies exactly one row.
    row_label = data[data["cmt_id"] == this_comment["cmt_id"].item()].index.item()
    data.at[row_label, "cmt_chain_string"] = "_".join(this_comment["cmt_chain"].item())
    data.at[row_label, "author_chain_string"] = "_".join(this_comment["author_chain"].item())
  return data


def is_not_found_later(data: pd.DataFrame, comment_chain_str: str):
  """Return True when no other 'cmt_chain_string' in `data` contains `comment_chain_str`.

  Rows whose chain string equals `comment_chain_str` are ignored; the check on
  the remaining rows is a plain substring test.
  """
  differs = data["cmt_chain_string"] != comment_chain_str
  other_chains = data.loc[differs, "cmt_chain_string"]
  # Evaluate every row (no short-circuit), exactly as the explicit loop did.
  hits = [comment_chain_str in other for other in other_chains]
  return not any(hits)


def remove_duped_comment_chains(data: pd.DataFrame) -> pd.DataFrame:
  """Drop rows whose 'cmt_chain_string' is contained in another row's chain string."""
  keep_rows = [
      bool(is_not_found_later(data, chain))
      for chain in data["cmt_chain_string"]
  ]
  return data[keep_rows]


def pull_all_conversations(data: pd.DataFrame) -> pd.DataFrame:
  """Pull every comment chain that can be read as a two-author conversation.

  Rows with exactly two unique authors are kept as they are. Rows with more
  than two are truncated: 'cmt_chain' and 'author_chain' are cut just before
  the first comment written by someone other than the row's first two unique
  authors. Chain strings are then rebuilt and chains contained in another
  chain are removed.

  Truncated rows now also get 'unique_authors' reduced to those first two
  authors. Previously the column kept the stale, longer list, so the output
  appeared to have "many more than two" authors and dyadic_convo_filter later
  discarded every truncated conversation.
  """
  author_counts = data["unique_authors"].map(len)
  final_convos = data[author_counts == 2]
  # Copy so the truncation below never writes into a slice of `data`.
  messy_convos = data[author_counts > 2].copy()

  if len(messy_convos.index) > 0:
    trimmed_cmt_chains = []
    trimmed_author_chains = []
    leading_pairs = []
    chains = zip(messy_convos["cmt_chain"], messy_convos["author_chain"], messy_convos["unique_authors"])
    for cmt_chain, author_chain, unique_authors in chains:
      leading_pair = unique_authors[0:2]  # the first two authors are all we care about
      cut = 0
      while cut < len(author_chain) and author_chain[cut] in leading_pair:
        cut += 1
      trimmed_cmt_chains.append(cmt_chain[0:cut])
      trimmed_author_chains.append(author_chain[0:cut])
      leading_pairs.append(leading_pair)

    messy_convos["cmt_chain"] = pd.Series(trimmed_cmt_chains, index=messy_convos.index, dtype=object)
    messy_convos["author_chain"] = pd.Series(trimmed_author_chains, index=messy_convos.index, dtype=object)
    messy_convos["unique_authors"] = pd.Series(leading_pairs, index=messy_convos.index, dtype=object)
    # One concat instead of growing the frame row by row.
    final_convos = pd.concat([final_convos, messy_convos], ignore_index=True)

  final_convos = add_cmt_aut_chain_strings(final_convos)

  final_convos = remove_duped_comment_chains(final_convos)

  return final_convos


def add_cmt_chain_len(data: pd.DataFrame) -> pd.DataFrame:
  """Return a re-indexed copy of `data` with 'cmt_chain_len', the length of each row's 'cmt_chain'. - andre.chiquit.ooo"""
  data = data.reset_index(drop=True)
  row_count = len(data.index)
  chain_lengths = [len(data.at[row, "cmt_chain"]) for row in range(row_count)]
  for row, chain_length in enumerate(chain_lengths):
    data.at[row, "cmt_chain_len"] = chain_length
  return data

def unique_cells(data: pd.DataFrame) -> list:
  """Return the distinct values yielded by iterating `data`, in order of first appearance.

  Iterating a Series yields its values, while iterating a DataFrame yields its
  column labels, so call this on a single column. Enables us to avoid using
  groupby. A progress line is printed every 250,000 items. - andre.chiquit.ooo

  Membership is tracked in a set, so the pass is linear rather than rescanning
  the result list for every item. Unhashable values fall back to the list scan.
  """
  unique_values: list = []
  seen: set = set()
  total_rows = len(data)
  for position, element in enumerate(data):
    if position % 250000 == 0:
      # Computed numerically: the old digit slicing was only right for 7-digit row counts.
      print(f"{position / 1_000_000:.2f} million rows out of ~{total_rows / 1_000_000:.1f} million total rows processed.")
    try:
      is_new = element not in seen
      if is_new:
        seen.add(element)
    except TypeError:
      # Unhashable cell (e.g. a list): fall back to comparing against the results so far.
      is_new = element not in unique_values
    if is_new:
      unique_values.append(element)
  return unique_values

def add_convo_metadata(data: pd.DataFrame, convos:pd.DataFrame) -> pd.DataFrame:
  """Copy conversation metadata from `convos` onto the matching comments in `data`.

  For every conversation row and every comment id in its 'cmt_chain', the first
  row of `data` with that 'cmt_id' receives 'convo_id' (the conversation's
  'cmt_chain_string'), 'unique_authors' and 'cmt_chain_len'. When a comment
  belongs to several conversations the last one wins. Comments in no
  conversation are left untouched.

  Both frames have their index reset in place, and `data` is modified in place
  and returned. - andre.chiquit.ooo

  Comments are located through a precomputed id-to-row lookup. The earlier
  version scanned every row per comment and repeated each chain len(chain) times.
  """
  data.reset_index(inplace = True, drop = True)
  convos.reset_index(inplace = True, drop = True)

  # First row position of every comment id (matches the old first-match scan).
  first_row_of = {}
  comment_ids = data["cmt_id"] if len(data.index) > 0 else []
  for row, cmt_id in enumerate(comment_ids):
    first_row_of.setdefault(cmt_id, row)

  for i in range(len(convos.index)):
    for comment in convos.at[i, "cmt_chain"]:
      row = first_row_of.get(comment)
      if row is None:
        continue
      data.at[row, "convo_id"] = convos.at[i, "cmt_chain_string"]
      data.at[row, "unique_authors"] = convos.at[i, "unique_authors"]
      data.at[row, "cmt_chain_len"] = convos.at[i, "cmt_chain_len"]
  return data

def cmt_chain_len_filter(data: pd.DataFrame, convo_len: int) -> pd.DataFrame:
  """Keep only rows whose 'cmt_chain_len' is not below `convo_len`. - andre.chiquit.ooo

  Rows are examined by position, so the frame does not need a 0..n-1 index.
  The previous `data.at[i, ...]` lookup raised KeyError once an upstream filter
  had removed rows. An empty frame is returned empty with its columns intact.
  """
  if len(data.index) == 0:
    return data.iloc[0:0]
  keep = [not (chain_len < convo_len) for chain_len in data["cmt_chain_len"]]
  return data[keep]

def dyadic_convo_filter(data: pd.DataFrame) -> pd.DataFrame:
  """Keep only rows whose 'unique_authors' list has exactly two entries. - andre.chiquit.ooo

  Rows are examined by position, so the frame does not need a 0..n-1 index.
  The previous `data.at[i, ...]` lookup raised KeyError on a filtered frame.
  An empty frame is returned empty with its columns intact.
  """
  if len(data.index) == 0:
    return data.iloc[0:0]
  keep = [len(authors) == 2 for authors in data["unique_authors"]]
  return data[keep]

def filter_out_comments(data: pd.DataFrame) -> pd.DataFrame:
  """Keep only comments that belong to a conversation, sorted by conversation and creation time.

  A comment belongs to a conversation when its 'convo_id' cell is a str. The
  index of `data` is reset in place first; the kept rows are returned sorted by
  'convo_id' and then 'created_utc'. - andre.chiquit.ooo
  """
  data.reset_index(inplace = True, drop = True)
  in_conversation = [type(convo_id) == str for convo_id in data["convo_id"]]
  kept = data[in_conversation]
  return kept.sort_values(by =['convo_id', 'created_utc'])

def processing_for_coding(data: pd.DataFrame) -> None:
  """Clean a raw dataset and write one conversations .csv per post.

  Steps: keep IMPORTANT_COLUMNS, drop rows with missing values, strip the
  't1_'/'t2_'/'t3_' prefixes, list the distinct post ids, then hand over to
  processing_for_coding_after_list for the per-post work. Does not make use of
  groupby. Intended to be run start to finish on an entire dataset, may take a
  loooong time.
  """
  print("Filtering out for important columns only.")
  # Copy so the column assignments below never write into a view of the caller's frame.
  data = data[IMPORTANT_COLUMNS].dropna().copy()
  print("Removing 't1_', 't2_', and 't3_' from the cells in which they appear")
  # Per-column Series.map works on every pandas version; DataFrame.applymap is deprecated.
  for column in ['author_id','cmt_link_id','cmt_parent_id']:
    data[column] = data[column].map(remove_t)
  print("Making a list of all the post IDs included in the data")
  post_ids = unique_cells(data["submission_link_id"])
  # The per-post loop is shared with the resume entry point instead of being duplicated here.
  processing_for_coding_after_list(data, post_ids)
  return None

def processing_for_coding_after_list(data: pd.DataFrame, post_ids: list) -> None:
  """Process every post in `post_ids` and write its conversations to a .csv.

  Uses the list of post ids rather than groupby, so it can pick up where you
  left off once the data has been cleaned and the post id list generated.
  Posts with no comments, only orphaned comments, or no qualifying
  conversation produce no file. A progress line is printed every 100 posts.
  """
  posts = len(post_ids)
  print("Time to process each post... Hang on!")
  for post_index, post in enumerate(post_ids, start=1):
    if (post_index % 100) == 0:
      print(f"Currently processing post_id {post} ({post_index}/{posts}). {posts - post_index} posts left . . .")

    data1 = pull_post(data, post)
    if len(data1.index) == 0:
      continue
    data1 = remove_orphans(data1)
    if len(data1.index) == 0:
      continue

    data1 = add_cmt_aut_chains(data1)
    data1 = add_unique_authors(data1)
    data1 = add_cmt_aut_chain_strings(data1)

    data_convos = pull_all_conversations(data1)
    data_convos = add_cmt_chain_len(data_convos)
    data_convos = dyadic_convo_filter(data_convos)
    data_convos = cmt_chain_len_filter(data_convos, CMT_CHAIN_LEN)
    if len(data_convos.index) == 0:
      continue

    data1 = add_convo_metadata(data1, data_convos)
    data1.reset_index(drop = True, inplace = True)
    data1 = filter_out_comments(data1)
    data1.to_csv(FILE_SAVE_LOCATION + f"{post}_conversations.csv")
  return None

def processing_for_coding_troubleshooting(data: pd.DataFrame, post_id: str) -> None:
  """Run the per-post pipeline for one `post_id`, printing each stage as it starts.

  Follows the same steps as processing_for_coding_after_list for a single post,
  for use when troubleshooting a post that makes the full run fail.
  """
  data1 = pull_post(data, post_id)
  print(f"Currently processing post: {post_id}")

  # `keep_going` turns False as soon as a stage leaves nothing to work on.
  keep_going = len(data1.index) > 0
  if keep_going:
    print("Removing orphans!")
    data1 = remove_orphans(data1)
    keep_going = len(data1.index) > 0

  if keep_going:
    print("Adding comment/author chains!")
    data1 = add_cmt_aut_chains(data1)
    print("Adding unique authors!")
    data1 = add_unique_authors(data1)
    print("Adding comment/author chain strings!")
    data1 = add_cmt_aut_chain_strings(data1)
    print("Pulling all conversations!")
    data_convos = pull_all_conversations(data1)
    print("Adding comment chain length!")
    data_convos = add_cmt_chain_len(data_convos)
    print("Filtering out non-dyadic conversations!")
    data_convos = dyadic_convo_filter(data_convos)
    print("Filtering for length of comment chain!")
    data_convos = cmt_chain_len_filter(data_convos, CMT_CHAIN_LEN)
    keep_going = len(data_convos.index) > 0

  if keep_going:
    print("Adding conversation metadata!")
    data1 = add_convo_metadata(data1, data_convos)
    data1.reset_index(drop = True, inplace = True)
    data1 = filter_out_comments(data1)
    print("Saving the file!")
    data1.to_csv(FILE_SAVE_LOCATION + f"{post_id}_conversations.csv")

  print("Made it to the end!")
  return None

def processing_posts(data: pd.DataFrame) -> None:
  """Process the comments of a single post and write its conversations .csv.

  Meant to be called once per post from a groupby. WORK IN PROGRESS.
  `data` is expected to hold the comments of exactly one post. Nothing is
  written when no comments survive orphan removal or no conversation qualifies.
  """
  print("Made it all the way to processing_posts function!")
  print(data)
  data = remove_orphans(data)
  print("I guess it removed the orphans man")
  if len(data.index) == 0:
    # Same guard as the list-based pipelines: the later steps need at least one comment.
    return None
  data = add_cmt_aut_chains(data)
  data = add_unique_authors(data)
  data = add_cmt_aut_chain_strings(data)
  data_convos = pull_all_conversations(data)
  data_convos = add_cmt_chain_len(data_convos)
  data_convos = dyadic_convo_filter(data_convos)
  data_convos = cmt_chain_len_filter(data_convos, CMT_CHAIN_LEN)
  if len(data_convos.index) > 0:
    data = add_convo_metadata(data, data_convos)
    data.reset_index(drop = True, inplace = True)
    data = filter_out_comments(data)
    if len(data.index) > 0:
      # Take the post id by position: after filtering and sorting, label 0 may be gone,
      # so data.at[0, ...] could raise KeyError.
      post_id = data["submission_link_id"].iloc[0]
      data.to_csv(FILE_SAVE_LOCATION + f"{post_id}_conversations.csv")
  return None


def processing_for_coding_groupby(data: pd.DataFrame) -> None:
  """Clean a raw dataset and process it post by post using groupby. WORK IN PROGRESS.

  Keeps IMPORTANT_COLUMNS, drops rows with missing values, strips the
  't1_'/'t2_'/'t3_' prefixes, then calls processing_posts once per
  'submission_link_id' group, in order of first appearance.
  """
  print("Filtering out for important columns only.")
  # Copy so the column assignments below never write into a view of the caller's frame.
  data = data[IMPORTANT_COLUMNS].dropna().copy()
  print("Removing 't1_', 't2_', and 't3_' from the cells in which they appear")
  # Per-column Series.map works on every pandas version; DataFrame.applymap is deprecated.
  for column in ['author_id','cmt_link_id','cmt_parent_id']:
    data[column] = data[column].map(remove_t)
  # Iterate the groups explicitly. groupby.transform may hand the function individual
  # column Series, but processing_posts needs the whole per-post DataFrame.
  for _, post_comments in data.groupby('submission_link_id', sort = False):
    processing_posts(post_comments)
  return None

The cell below is an attempt to make the groupby-based pipeline work (work in progress).

In [None]:
# Runs the work-in-progress groupby pipeline on `data` (cleaning included).
processing_for_coding_groupby(data)

The code below will work, but it will take a while. A groupby-based version would be more efficient; the work-in-progress function above (`processing_for_coding_groupby`) is an attempt at that.

In [None]:
# Filters out the data into important columns only
# Drops rows with any NA values
# Remove 't1_' etc prefixes
# Creates a list of all the post_ids in the dataset
# NOTE: this cell rebinds `data`; re-run the raw-data cell first if you need the original frame.

print("Filtering out for important columns only.")
data = data[IMPORTANT_COLUMNS]
data = data.dropna()
print("Removing 't1_', 't2_', and 't3_' from the cells in which they appear")
# Per-column Series.map works on every pandas version; DataFrame.applymap is deprecated.
for column in ['author_id','cmt_link_id','cmt_parent_id']:
    data[column] = data[column].map(remove_t)
print("Making a list of all the post IDs included in the data")
post_ids = unique_cells(data["submission_link_id"])

In [8]:
# Saves the modified dataset and list of post_ids so that you don't have to run the lengthy code above again
# NOTE(review): to_csv also writes the row index by default, so the reloaded file
# gains an extra leading column; pass index=False if that is unwanted.

data.to_csv("C:/Users/Andre/Documents/DBL/Reaching Understanding Study/no_t_data.csv")

# The post id list is stored with pickle, next to the cleaned data.
with open("C:/Users/Andre/Documents/DBL/Reaching Understanding Study/post_id_list", "wb") as post_id_list:
    pickle.dump(post_ids, post_id_list)

In [None]:
# Running through all the data and cutting short the post_id list so that I can start where I left off last time. Hopefully this won't be necessary after debugging.
# 14500 is the number of posts to skip from the start of the list; change it to
# resume from a different point. Requires `post_ids` to be defined already.

post_ids_progress = post_ids[14500:]

In [12]:
# Run this to load the pre-modified dataset and the post_id list

data = pd.read_csv("C:/Users/Andre/Documents/DBL/Reaching Understanding Study/no_t_data.csv", encoding_errors= 'replace')

# NOTE: pickle.load can execute arbitrary code; only load a file this notebook wrote itself.
with open("C:/Users/Andre/Documents/DBL/Reaching Understanding Study/post_id_list", "rb") as save_file:
    test_post_ids = pickle.load(save_file)

# Also bind the name the resume cell reads (`post_ids_progress = post_ids[...]`), so that
# resuming works on a fresh kernel without re-running the lengthy cleaning cell.
post_ids = test_post_ids

113193


In [13]:
# Processing through the data. If you're picking up where you left off, make sure to modify the post_ids_progress assignment block two blocks up
# Requires `data` (the cleaned frame) and `post_ids_progress` to be defined by the cells above.

processing_for_coding_after_list(data, post_ids_progress)

Time to process each post... Hang on!
Currently processing post_id orogzi (50/98693). 98643 posts left . . .
Currently processing post_id otdm51 (100/98693). 98593 posts left . . .
Currently processing post_id oukx4a (150/98693). 98543 posts left . . .
Currently processing post_id ovtr2i (200/98693). 98493 posts left . . .
Currently processing post_id ox4ttb (250/98693). 98443 posts left . . .
Currently processing post_id oymghx (300/98693). 98393 posts left . . .
Currently processing post_id ozs30q (350/98693). 98343 posts left . . .
Currently processing post_id p15k5e (400/98693). 98293 posts left . . .
Currently processing post_id p2cmfx (450/98693). 98243 posts left . . .
Currently processing post_id p3c97l (500/98693). 98193 posts left . . .
Currently processing post_id p4g5du (550/98693). 98143 posts left . . .
Currently processing post_id p63fah (600/98693). 98093 posts left . . .
Currently processing post_id p760ao (650/98693). 98043 posts left . . .
Currently processing post_i

In [None]:
# Experimenting with the groupby functions

data = pd.read_csv("C:/Users/Andre/Documents/DBL/Reaching Understanding Study/no_t_data.csv", encoding_errors= 'replace')
groups = data.groupby(['submission_link_id'], sort = False)
# Iterate the groups explicitly. groupby.transform may hand the function individual
# column Series, but processing_posts needs the whole per-post DataFrame.
for _, post_comments in groups:
    processing_posts(post_comments)

In [None]:
# Troubleshooting posts that cause the code to error out
# Replace the post id below with the one that failed; each stage prints as it starts.

processing_for_coding_troubleshooting(data, "k87ecd")

+ Next PyClub after Thanksgiving
    + Thanks Connor!!
    + What do we want to hand-label (covid data?)
    + Make a codebook for how we code things
        + Identify disagreement and agreement
        + Label our actual predictors/outcomes
        + We'll probably use both NLP and hand-labeling 
            + Hand label the outcome (do they reach agreement?)
            + Use NLP to measure predictors
    + We have example data for what we'll be coding
        + Develop a codebook - what constructs do we want to measure manually and what do we want to measure with automated text analysis
        + We have lots of conversations on covid, do we want any other issues as well?

+ Code Book Ideas (measure perceived understanding)
    + Label for agree/disagree based on the first comment
        + Strength of disagreement/agreement; 5-point scale
    + Do they seem to accurately understand the other person's views?
    + How well do they seem to understand what the other person believes?
    + How well do they seem to understand the other person's values?
    + How well do they seem to understand what the other person is saying?
    + Measure based on only what they are saying, not what you think they understand deep down