In [19]:
#import statements
import json
import numpy as np
import random
import re
import spacy

from nltk.stem import PorterStemmer
from textblob import TextBlob

In [20]:
# get training dataset
# (encoding is pinned to UTF-8 so the load does not depend on the platform's
# default text encoding)
with open("train-data-prepared.json", "r", encoding="utf-8") as f:
    train_data = json.load(f)
# get validation dataset
with open("val-data-prepared.json", "r", encoding="utf-8") as f:
    val_data = json.load(f)
# get testing dataset
# FIXME(review): this reads the validation file again, so test_data has the
# same contents as val_data and everything derived from it is keyed by
# validation ids. Point this at the real test split once its file name is
# confirmed.
with open("val-data-prepared.json", "r", encoding="utf-8") as f:
    test_data = json.load(f)


In [21]:
# Porter stemmer instance shared by the stemming helper
stemmer = PorterStemmer()
# small English spaCy pipeline, used for tokenizing comment text
nlp_english = spacy.load("en_core_web_sm")

In [34]:
# remove urls, punctuation and whitespace from text; lower-case what is left
def clean_text(text):
    """Tokenize ``text`` into lower-cased alphanumeric tokens.

    Steps:
      1. Drop URLs (``http://``, ``https://`` or ``www.`` prefixes).
      2. Replace every run of characters outside ``A-Za-z0-9`` with a space.
      3. Tokenize with the module-level ``nlp_english`` pipeline and keep the
         lower-cased text of each token that is not punctuation, whitespace
         or URL-like.

    Args:
        text: raw comment text (str).

    Returns:
        list[str]: the surviving tokens, lower-cased, in document order.
    """
    # URLs have to be removed *before* the alphanumeric filter: once the
    # punctuation is gone a URL is just a handful of ordinary-looking words
    # ("http", "example", "com"), so the like_url check below cannot see it.
    text = re.sub(r'(?:https?://|www\.)\S+', ' ', text, flags=re.IGNORECASE)
    text = re.sub('[^A-Za-z0-9]+', ' ', text)

    tokens = []  # named `tokens` so the function name is not shadowed
    for token in nlp_english(text):
        if token.is_punct or token.is_space or token.like_url:
            continue
        tokens.append(token.text.lower())
    return tokens

def stem_text(text):
    """Clean ``text`` with ``clean_text`` and return the stem of every token.

    Stemming is done by the module-level ``stemmer``; the result is a list
    with one stem per cleaned token, in the same order.
    """
    stems = []
    for word in clean_text(text):
        stems.append(stemmer.stem(word))
    return stems

# smoke check: show the tokens clean_text produces for one sample comment
print(clean_text("> a) right, because women are non-sexual creatures who would never use prostitutes themselves\n\ni think you vastly overestimate the number of women that pay for sex..."))

['a', 'right', 'because', 'women', 'are', 'non', 'sexual', 'creatures', 'who', 'would', 'never', 'use', 'prostitutes', 'themselves', 'i', 'think', 'you', 'vastly', 'overestimate', 'the', 'number', 'of', 'women', 'that', 'pay', 'for', 'sex']


In [35]:
# flags whether the author addresses the other person directly
def check_you_text(text):
    """Return 1 if any item of ``text`` is a second-person keyword, else 0.

    Args:
        text: iterable of word tokens.

    Returns:
        int: 1 when at least one token is 'you', 'your' or "you're",
        otherwise 0.
    """
    second_person = ('you', 'your', "you're")
    return int(any(word in second_person for word in text))
        
# smoke check: the sample comment contains "you", so this should print 1
print(check_you_text(clean_text("> a) right, because women are non-sexual creatures who would never use prostitutes themselves\n\ni think you vastly overestimate the number of women that pay for sex...")))

1


In [43]:
def gather_data(thread):
    """Build the feature dictionary for one thread.

    For every comment in ``thread["preceding_posts"]`` the body is cleaned and
    stemmed, and a per-comment record is built with:

      * ``OP``           - the comment's ``author_name``
      * ``text``         - the stemmed tokens
      * ``length``       - total number of characters in the cleaned tokens
      * ``types_you``    - 1 if the stemmed tokens contain a second-person
                           keyword (see ``check_you_text``), else 0
      * ``sentiment``    - TextBlob polarity of the space-joined stems
      * ``subjectivity`` - TextBlob subjectivity of the space-joined stems

    Args:
        thread: dict with keys ``id``, ``label`` and ``preceding_posts``
            (a list of dicts, each with ``author_name`` and ``body``).

    Returns:
        dict with the thread ``id`` and ``label``, the number of comments,
        their combined length, mean / standard deviation of sentiment and
        subjectivity, ``positivity`` and the per-comment records under
        ``comment_vectors``. For a thread without preceding posts the
        statistics and ``positivity`` are all 0 instead of raising.
    """
    posts = thread["preceding_posts"]

    # to store individual posts from user
    comments_analysis = []
    # used later to calculate avg sentiment of the argument
    sentiment_array = []
    subjective_array = []
    length_array = []

    for comment in posts:
        # Clean once and derive both the stems and the length from the same
        # tokens; going through stem_text() *and* clean_text() would run the
        # spaCy pipeline twice per comment. The stems below are computed the
        # same way stem_text() computes them.
        tokens = clean_text(comment["body"])
        stems = [stemmer.stem(word) for word in tokens]
        # NOTE(review): sentiment is scored on the stemmed text, as before;
        # stems such as "becaus" may not be recognised by TextBlob - confirm
        # this is intended.
        sentiment = TextBlob(' '.join(stems)).sentiment

        comment_data = {
            "OP": comment["author_name"],
            "text": stems,
            "length": len("".join(tokens)),
            "types_you": check_you_text(stems),
            "sentiment": sentiment.polarity,
            "subjectivity": sentiment.subjectivity,
        }

        length_array.append(comment_data["length"])
        sentiment_array.append(comment_data["sentiment"])
        subjective_array.append(comment_data["subjectivity"])
        comments_analysis.append(comment_data)

    if comments_analysis:
        avg_sentiment = np.mean(np.array(sentiment_array))
        stddev_sentiment = np.std(np.array(sentiment_array))
        avg_subjectivity = np.mean(np.array(subjective_array))
        stddev_subjectivity = np.std(np.array(subjective_array))
        # floor of (last polarity - first polarity): any drop in polarity,
        # however small, floors to a negative integer.
        positivity = int(np.floor(sentiment_array[-1] - sentiment_array[0]))
    else:
        # No comments: np.mean/np.std would yield NaN and the [-1] / [0]
        # lookups would raise IndexError, so fall back to neutral values.
        avg_sentiment = stddev_sentiment = 0.0
        avg_subjectivity = stddev_subjectivity = 0.0
        positivity = 0

    return {
        'id': thread["id"],
        'no_of_arguments': len(posts),
        'combined_length': int(sum(length_array)),
        'avg_sentiment': avg_sentiment,
        'stddev_sentiment': stddev_sentiment,
        'avg_subjectivity': avg_subjectivity,
        'stddev_subjectivity': stddev_subjectivity,
        'positivity': positivity,
        'label': thread["label"],
        'comment_vectors': comments_analysis
    }

In [44]:
# pretty-print the extracted features for the first 7 training threads
for features in map(gather_data, train_data[:7]):
    print(json.dumps(features, indent=2))

{
  "id": "t1_dggp3q9",
  "no_of_arguments": 3,
  "combined_length": 550,
  "avg_sentiment": 0.19547258297258296,
  "stddev_sentiment": 0.16038895517928353,
  "avg_subjectivity": 0.34530423280423284,
  "stddev_subjectivity": 0.2794887937300273,
  "positivity": -1,
  "label": 1,
  "comment_vectors": [
    {
      "OP": "qwertx0815",
      "text": [
        "a",
        "right",
        "becaus",
        "women",
        "are",
        "non",
        "sexual",
        "creatur",
        "who",
        "would",
        "never",
        "use",
        "prostitut",
        "themselv",
        "i",
        "think",
        "you",
        "vastli",
        "overestim",
        "the",
        "number",
        "of",
        "women",
        "that",
        "pay",
        "for",
        "sex"
      ],
      "length": 132,
      "types_you": 1,
      "sentiment": 0.39285714285714285,
      "subjectivity": 0.6845238095238095
    },
    {
      "OP": "bunchanumbersandshit",
      "text": [
       

{
  "id": "t1_cpzy2ya",
  "no_of_arguments": 2,
  "combined_length": 5357,
  "avg_sentiment": -0.020713722041847044,
  "stddev_sentiment": 0.005387468434343436,
  "avg_subjectivity": 0.3795238095238095,
  "stddev_subjectivity": 0.03979166666666664,
  "positivity": 0,
  "label": 0,
  "comment_vectors": [
    {
      "OP": "DBDude",
      "text": [
        "real",
        "properti",
        "ha",
        "a",
        "logic",
        "absolut",
        "necess",
        "to",
        "be",
        "regist",
        "under",
        "what",
        "logic",
        "in",
        "an",
        "environ",
        "where",
        "properti",
        "ownership",
        "exist",
        "we",
        "need",
        "to",
        "know",
        "what",
        "properti",
        "belong",
        "to",
        "what",
        "person",
        "i",
        "need",
        "to",
        "know",
        "whether",
        "i",
        "m",
        "build",
        "my",
        "hous",
   

{
  "id": "t1_cimh75z",
  "no_of_arguments": 2,
  "combined_length": 1243,
  "avg_sentiment": 0.2340178571428571,
  "stddev_sentiment": 0.022410714285714284,
  "avg_subjectivity": 0.4665178571428571,
  "stddev_subjectivity": 0.020089285714285726,
  "positivity": -1,
  "label": 0,
  "comment_vectors": [
    {
      "OP": "",
      "text": [
        "those",
        "are",
        "not",
        "posit",
        "right",
        "for",
        "one",
        "you",
        "are",
        "conflat",
        "law",
        "and",
        "right",
        "law",
        "are",
        "a",
        "valid",
        "manifest",
        "of",
        "right",
        "or",
        "at",
        "least",
        "the",
        "best",
        "real",
        "world",
        "exampl",
        "of",
        "their",
        "exist",
        "second",
        "those",
        "are",
        "thing",
        "that",
        "simpli",
        "prevent",
        "you",
        "from",
        "harm"

In [32]:
# random classification for baseline score

# A private, seeded generator keeps the baseline reproducible across kernel
# restarts without touching the global `random` state.
BASELINE_SEED = 42
baseline_rng = random.Random(BASELINE_SEED)

val_ids = [thread["id"] for thread in val_data]
test_ids = [thread["id"] for thread in test_data]

# one random 0/1 prediction per thread id
random_val = {t_id: baseline_rng.randint(0, 1) for t_id in val_ids}
random_test = {t_id: baseline_rng.randint(0, 1) for t_id in test_ids}


In [33]:
# persist the baseline predictions: validation first, then test
for out_path, predictions in (
    ("val-output.json", random_val),
    ("test-output.json", random_test),
):
    with open(out_path, "w") as f:
        json.dump(predictions, f)
