In [41]:
#import statements
import json
import numpy as np
import random
import spacy
from textblob import TextBlob

In [42]:
# Load the three prepared dataset splits. Each file is parsed with json.load;
# downstream cells slice and iterate the results as lists of thread dicts.
# JSON is read explicitly as UTF-8 so the result does not depend on the
# platform's default encoding.

# get training dataset
with open("train-data-prepared.json", "r", encoding="utf-8") as f:
    train_data = json.load(f)
# get validation dataset
with open("val-data-prepared.json", "r", encoding="utf-8") as f:
    val_data = json.load(f)
# get testing dataset
# NOTE: this previously re-opened "val-data-prepared.json", so the "test"
# predictions were silently produced for the validation threads.
with open("test-data-prepared.json", "r", encoding="utf-8") as f:
    test_data = json.load(f)


In [43]:
#create spacy object
# The "en_core_web_sm" pipeline is loaded once here and reused by clean_text()
# for every text it tokenizes.
nlp_english = spacy.load("en_core_web_sm")

In [44]:
# remove punctuation, space, urls from text
def clean_text(text):
    """Tokenize ``text`` with ``nlp_english`` and return the kept tokens, lower-cased.

    A token is dropped when spaCy flags it as punctuation, whitespace or
    URL-like, or when its text is exactly '>'. The remaining tokens are
    returned as a list of lower-cased strings in their original order.
    """
    def _is_noise(tok):
        # Same four conditions as a single "or" chain, split for readability.
        if tok.is_punct or tok.is_space or tok.like_url:
            return True
        return tok.text == '>'

    return [tok.text.lower() for tok in nlp_english(text) if not _is_noise(tok)]

# Quick check of clean_text() on one sample comment body.
print(clean_text("> a) right, because women are non-sexual creatures who would never use prostitutes themselves\n\ni think you vastly overestimate the number of women that pay for sex..."))

['a', 'right', 'because', 'women', 'are', 'non', 'sexual', 'creatures', 'who', 'would', 'never', 'use', 'prostitutes', 'themselves', 'i', 'think', 'you', 'vastly', 'overestimate', 'the', 'number', 'of', 'women', 'that', 'pay', 'for', 'sex']


In [45]:
# checks if the OP has addressed the other person in some way
def check_you_text(text):
    """Return 1 if ``text`` contains a second-person keyword, otherwise 0.

    ``text`` is iterated item by item (e.g. a list of tokens) and each item
    is compared for exact, case-sensitive equality against the keywords
    'you', 'your' and "you're".
    """
    second_person = ('you', 'your', 'you\'re')
    return int(any(token in second_person for token in text))
        
# Quick check of check_you_text() on the cleaned tokens of one sample comment body.
print(check_you_text(clean_text("> a) right, because women are non-sexual creatures who would never use prostitutes themselves\n\ni think you vastly overestimate the number of women that pay for sex...")))

1


In [36]:
def gather_data(thread):
    """Build the feature dictionary for one thread.

    Parameters
    ----------
    thread : dict
        Must provide the keys "id", "label" and "preceding_posts". Every
        entry of "preceding_posts" must provide "author_name" and "body".

    Returns
    -------
    dict
        - 'id', 'label': copied from ``thread``.
        - 'no_of_arguments': number of preceding posts.
        - 'combined_length': sum of ``len(body)`` over all preceding posts.
        - 'avg_sentiment', 'stddev_sentiment': mean and (population)
          standard deviation of the per-post TextBlob polarity, as plain
          floats. Both are 0.0 when there are no preceding posts.
        - 'comment_vectors': one dict per post with the keys "OP", "text",
          "length", "types_you" and "sentiment".
    """
    # to store the per-post feature dicts
    comments_analysis = []
    # used later to calculate avg sentiment of the argument
    sentiment_array = []
    length_array = []
    for comment in thread["preceding_posts"]:
        tokens = clean_text(comment["body"])
        comment_data = {
            "OP": comment["author_name"],
            "text": tokens,
            # length is taken from the raw body, not from the cleaned tokens
            "length": len(comment["body"]),
            "types_you": check_you_text(tokens),
            # sentiment is computed on the cleaned, re-joined tokens
            "sentiment": TextBlob(' '.join(tokens)).sentiment.polarity,
        }

        length_array.append(comment_data["length"])
        sentiment_array.append(comment_data["sentiment"])
        comments_analysis.append(comment_data)

    if sentiment_array:
        # float() yields plain Python floats, matching the int() cast that
        # was already applied to the combined length.
        avg_sentiment = float(np.mean(sentiment_array))
        stddev_sentiment = float(np.std(sentiment_array))
    else:
        # np.mean/np.std of an empty array warn and return NaN, which
        # json.dumps would emit as the non-standard token NaN.
        avg_sentiment = 0.0
        stddev_sentiment = 0.0

    return {
        'id': thread["id"],
        'no_of_arguments': len(thread["preceding_posts"]),
        'combined_length': int(np.sum(np.array(length_array))),
        'avg_sentiment': avg_sentiment,
        'stddev_sentiment': stddev_sentiment,
        'label': thread["label"],
        'comment_vectors': comments_analysis
    }

In [46]:
# print the extracted features for the first 3 training threads
for thread in train_data[:3]:
    print(json.dumps(gather_data(thread), indent=4))

{
    "id": "t1_dggp3q9",
    "no_of_arguments": 3,
    "combined_length": 708,
    "avg_sentiment": -0.047871572871572864,
    "stddev_sentiment": 0.32692792804230514,
    "label": 1,
    "comment_vectors": [
        {
            "OP": "qwertx0815",
            "text": [
                "a",
                "right",
                "because",
                "women",
                "are",
                "non",
                "sexual",
                "creatures",
                "who",
                "would",
                "never",
                "use",
                "prostitutes",
                "themselves",
                "i",
                "think",
                "you",
                "vastly",
                "overestimate",
                "the",
                "number",
                "of",
                "women",
                "that",
                "pay",
                "for",
                "sex"
            ],
            "length": 166,
            "

In [47]:
# random classification for baseline score

# A dedicated, seeded generator makes the baseline reproducible on
# "Restart & Run All" without touching the global random state.
RANDOM_SEED = 42
baseline_rng = random.Random(RANDOM_SEED)

val_ids = [thread["id"] for thread in val_data]
test_ids = [thread["id"] for thread in test_data]

# map every thread id to a uniformly random label in {0, 1}
random_val = {t_id: baseline_rng.randint(0, 1) for t_id in val_ids}
random_test = {t_id: baseline_rng.randint(0, 1) for t_id in test_ids}


In [49]:
# write the random baseline predictions ({thread id: 0 or 1}) for the validation split
with open("val-output.json", "w") as f:
    json.dump(random_val, f)
# write the random baseline predictions for the test split
with open("test-output.json", "w") as f:
    json.dump(random_test, f)
