In [1]:
#import statements
import argparse
import json
import numpy as np
import re
import spacy

from better_profanity import profanity
from nltk.stem import PorterStemmer
from numpy import dot
from numpy.linalg import norm
from sklearn import metrics
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from textblob import TextBlob
from tqdm import tqdm

In [2]:
# get training dataset 
with open("train-data-prepared.json", "r") as f:
    train_data = json.load(f)
# get training dataset 
with open("val-data-prepared.json", "r") as f:
    val_data = json.load(f)
# get testing dataset 
with open("val-data-prepared.json", "r") as f:
    test_data = json.load(f)


In [3]:
#create spacy object
nlp_english = spacy.load("en_core_web_sm")
#create Stemmer object
stemmer = PorterStemmer()

In [4]:
entire_dataset = {
    'train_ids': [thread["id"] for thread in train_data],
    'train_posts': [thread["preceding_posts"] for thread in train_data],
    'train_label': [thread["label"] for thread in train_data],
    
    'val_ids': [thread["id"] for thread in val_data],
    'val_posts': [thread["preceding_posts"] for thread in val_data],
    'val_label': [thread["label"] for thread in val_data],
    
    'test_ids': [thread["id"] for thread in test_data],
    'test_posts': [thread["preceding_posts"] for thread in test_data],
    'test_label': [thread["label"] for thread in test_data],
}

print(entire_dataset.keys())

dict_keys(['train_ids', 'train_posts', 'train_label', 'val_ids', 'val_posts', 'val_label', 'test_ids', 'test_posts', 'test_label'])


In [5]:
#for thread in val_data:
#    authors = []
#    for comment in thread["preceding_posts"]:
#        if comment["author_name"] not in authors:
#            authors.append(comment["author_name"])
#    
#    if len(authors) >= 2:
#        print(thread["id"])

In [6]:
# remove punctuation, space, urls from text
def clean_text(text):
    parsed_text = nlp_english(text)
    clean_text = []
    for token in parsed_text:
        stop_flag = (token.is_punct or token.is_space or  
                 token.like_url)
        if (not stop_flag):
            clean_text.append(re.sub('[^A-Za-z0-9]+', ' ',token.text.lower()))
            
    return clean_text

def stem_text(text):
    return [stemmer.stem(word) for word in clean_text(text)]

print(stem_text("ass idiot fuck shit racist "))
print(stem_text("...because it's illegal in our reality, vs. the proposed reality that me, OP, and everyone else on this thread are commenting about.\n\nKeep up, qwert"))

['ass', 'idiot', 'fuck', 'shit', 'racist']
['becaus', 'it', ' s', 'illeg', 'in', 'our', 'realiti', 'vs ', 'the', 'propos', 'realiti', 'that', 'me', 'op', 'and', 'everyon', 'els', 'on', 'thi', 'thread', 'are', 'comment', 'about', 'keep', 'up', 'qwert']


In [7]:
# checks if the OP has addressed the other person in some way
def count_insults(text):
    insult_words = ["ass", "idiot", "fuck", "shit"]
    counter = 0
    for word in text:
        if word in insult_words:
            counter = counter + 1
            
    return counter
        
#print(check_insults(clean_text("> a) right, because women are non-sexual creatures who would never use prostitutes themselves\n\ni think you vastly overestimate the number of women that pay for sex...")))

In [8]:
#def check_author_name(text, name):
#    flag = 0
#    for word in text:
#        if name in word or word in name:
#            flag = flag + 1
#    return flag

#print(check_author_name(['becaus', 'it', ' s', 'illeg', 'in', 'our', 'realiti', 'vs ', 'the', 'propos', 'realiti', 'that', 'me', 'op', 'and', 'everyon', 'els', 'on', 'thi', 'thread', 'are', 'comment', 'about', 'keep', 'up', 'qwert'],"qwertx0815"))

In [9]:
# should return one feature vector for one string
# features -> Author's turn, length of argument, insults, sentiment
def gather_data(thread):
    returnObj = {}
    for i in range(len(thread["preceding_posts"])):
        comment_data = {}
        comment = thread["preceding_posts"][i]
        # clean text
        comment_data["text"] = stem_text(comment["body"])
        # Length just in case of Godwin's Law
        comment_data["char_length_vec"] = [len("".join(comment_data["text"]))]
        # refer's to other user/author
        #comment_data["refer_author"] = [check_author_name(comment_data["text"], thread["preceding_posts"][1-i]["author_name"])]
        # check for some common insults
        comment_data["insults_vec"] = [count_insults(comment_data["text"])]
        # get sentiment
        sentiment = TextBlob(' '.join(comment_data["text"])).sentiment
        comment_data["sentiment"] =  [sentiment.polarity, sentiment.subjectivity]
        feature_vec = comment_data["char_length_vec"] + comment_data["insults_vec"] + comment_data["sentiment"]
        
        returnObj[" ".join(comment_data["text"])] = feature_vec
        
    return returnObj

In [10]:
# print features for some 10 tuples
for thread in train_data[:5]:
    print(gather_data(thread))
    print()

{'becaus it  s illeg in our realiti vs  the propos realiti that me op and everyon els on thi thread are comment about keep up qwert': [104, 0, 0.0, 0.0], 'i live in a nation were it is complet legal to pay for sex surround by other nation where it is also mostli legal guess what no woman pay for sex and you  ll notic that it be illeg in the us doe n t stop men from go to prostitut keep up bunchanumb edit for context it  s a first world nation with consider more liber attitud regard sex then the us': [276, 0, 0.19356060606060607, 0.3513888888888889]}

{'becaus make prostitut legal make it veri much more difficult to polic thi as ha been found repeatedli in countri around the world the women who are brought into the countri ca n t be question becaus there is no legal justif to do so as the job they are do are legal thi is what  s been found everywher from nevada to germani to australia': [273, 0, 0.059999999999999984, 0.42000000000000004], 'i  d be interest in read up on thi do you have 

In [11]:
entire_dataset["train_prep"] = [gather_data(thread) for thread in tqdm(train_data)]

entire_dataset["val_prep"] = [gather_data(thread) for thread in tqdm(val_data)]

entire_dataset["test_prep"] = [gather_data(thread) for thread in tqdm(test_data)]

100%|██████████████████████████████████████████████████████████████████████████████| 1936/1936 [02:02<00:00, 15.83it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 258/258 [00:15<00:00, 16.73it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 258/258 [00:15<00:00, 16.87it/s]


In [12]:
#print(len(train_bow.toarray().tolist()[1]))

In [13]:
def get_delta_in_sentences(array, index, increasing_flag = False):
    values = [x[index] for x in array]
    ret_answer = values[0] < values[1]    
    if increasing_flag:
        return int(ret_answer)
    else:
        return int(not ret_answer)
    

In [61]:
def combine_vectors(ddict):
    feature_vectors = list(ddict.values())
    
    dot_product = dot(feature_vectors[0], feature_vectors[-1])
    norms_product = (norm(feature_vectors[0])*norm(feature_vectors[-1]))
    if norms_product == 0:
        cos_sim = 1
    else:
        cos_sim = dot_product/norms_product
    
    #cos_diff = 1 - cos_sim
    avg_insults = np.average([x[1] for x in feature_vectors])
    increasing_insults = get_delta_in_sentences(feature_vectors, 1, True)
    avg_polarity = np.average([x[-2] for x in feature_vectors])
    stddev_polarity = np.std([x[-2] for x in feature_vectors])
    avg_subjectivity = np.average([x[-1] for x in feature_vectors])
    stddev_subjectivity = np.std([x[-1] for x in feature_vectors])
    is_decreasing_polarity = get_delta_in_sentences(feature_vectors, -2)
    is_increasing_polarity = get_delta_in_sentences(feature_vectors, -2, True)
    is_decreasing_subjectivity = get_delta_in_sentences(feature_vectors, -1)
    is_increasing_subjectivity = get_delta_in_sentences(feature_vectors, -1, True)
    #kendall_correlation, _ = kendalltau(feature_vectors[0], feature_vectors[1])
    #print(ret_obj)
    return [avg_insults, increasing_insults, avg_polarity, stddev_polarity, avg_subjectivity, 
        stddev_subjectivity, is_decreasing_polarity, is_increasing_polarity, 
        is_decreasing_subjectivity, is_increasing_subjectivity, cos_sim]
    
#combine_vectors(entire_dataset['train_prep'])

In [62]:
'''
A = stem_text("At this point it seems clear that we have very different definitions of bigotry, but I won't get into that because semantics are rarely a fruitful exercise.\n\nYou mean you won't get into it because then you would have to admit that you are defending the right of bigots to engage in prejudicial treatment of others, and that is morally indefensible.\n\n>It's not that I think life should be fair to the \"bigot,\" it's that I will resist any attempt to introduce government coercion in the free market. People should be allowed to run businesses as they please, and if those businesses are run by shitty people you can take your money elsewhere. If a business with horrible practices is able to thrive, that is more indicative of a problem with society, not just the business owner.\n\nThis is a disgusting attitude that glorifies those with power over those who have none.  This is the heart of fascism: power to the strong, fuck the weak.\n\nI sincerely hope that you are made the victim of prejudice and bigotry, so that you can one day understand how disgusting and malignant your beliefs are.")
B = stem_text("his is a disgusting attitude that glorifies those with power over those who have none. This is the heart of fascism\n\nOkay, so now we're getting into ad hominems. Got it. I find it interesting how I'm the one denouncing creeping authoritarianism and you're the one calling that fascist. I'm not convinced you know what fascism actually means.\n\n>You mean you won't get into it because then you would have to admit that you are defending the right of bigots to engage in prejudicial treatment of others, and that is morally indefensible.\n\nBigotry (noun): intolerance to those who hold different opinions from oneself.\n\n>I sincerely hope that you are made the victim of prejudice and bigotry, so that you can one day understand how disgusting and malignant your beliefs are.\n\nHmmm... Wishing ill will upon those who are different from you? Sounds a lot like how you described those bakers refusing to bake a cake for gay couples. Also seems to fit nicely into that definition of bigotry I listed above.\n\nBigotry comes in many forms. Thinking that anyone who isn't a neo-progressive liberal such as yourself is a bigot is one of those forms. I sincerely hope you take some time to reflect on how you view others with whom you disagree, because I can't see how anyone with so much animosity could possibly be happy.\n\nEnjoy your weekend! The weather's gorgeous here, I hope it's nice where you are too.")

print(TextBlob(" ".join(A)).sentiment)
print(TextBlob(" ".join(B)).sentiment)

print(TextBlob("fuck").tags)
print(TextBlob("should").tags)
print(TextBlob("are").tags)
print(TextBlob("idiot").tags)
print(TextBlob("stupid").tags)
print(TextBlob("can").tags)
print(TextBlob("you").tags)
print(TextBlob("your").tags)
print(TextBlob("you're").tags)
'''

'\nA = stem_text("At this point it seems clear that we have very different definitions of bigotry, but I won\'t get into that because semantics are rarely a fruitful exercise.\n\nYou mean you won\'t get into it because then you would have to admit that you are defending the right of bigots to engage in prejudicial treatment of others, and that is morally indefensible.\n\n>It\'s not that I think life should be fair to the "bigot," it\'s that I will resist any attempt to introduce government coercion in the free market. People should be allowed to run businesses as they please, and if those businesses are run by shitty people you can take your money elsewhere. If a business with horrible practices is able to thrive, that is more indicative of a problem with society, not just the business owner.\n\nThis is a disgusting attitude that glorifies those with power over those who have none.  This is the heart of fascism: power to the strong, fuck the weak.\n\nI sincerely hope that you are made 

In [63]:
x_train = [combine_vectors(thread) for thread in tqdm(entire_dataset['train_prep'])]
x_val = [combine_vectors(thread) for thread in tqdm(entire_dataset['val_prep'])]
x_test = [combine_vectors(thread) for thread in tqdm(entire_dataset['test_prep'])]

100%|████████████████████████████████████████████████████████████████████████████| 1936/1936 [00:00<00:00, 7520.02it/s]
100%|██████████████████████████████████████████████████████████████████████████████| 258/258 [00:00<00:00, 7588.20it/s]
100%|██████████████████████████████████████████████████████████████████████████████| 258/258 [00:00<00:00, 7840.96it/s]


In [64]:
y_train = entire_dataset["train_label"]
y_val = entire_dataset["val_label"]
y_test = entire_dataset["test_label"]

In [65]:
clf = SVC()
clf.fit(x_train,y_train)

SVC()

In [66]:
val_pred = clf.predict(x_val)
test_pred = clf.predict(x_test)

In [67]:
print("Accuracy for val data:",metrics.accuracy_score(y_val, val_pred))
print("Accuracy for test data:",metrics.accuracy_score(y_test, test_pred))

print("Precision val:",metrics.precision_score(y_val, val_pred))
print("Precision test:",metrics.precision_score(y_test, test_pred))

print("Recall val:",metrics.recall_score(y_val, val_pred))
print("Recall test:",metrics.recall_score(y_test, test_pred))

print("F1 score val:",metrics.f1_score(y_val, val_pred))
print("F1 score test:",metrics.f1_score(y_test, test_pred))

Accuracy for val data: 0.5930232558139535
Accuracy for test data: 0.5930232558139535
Precision val: 0.5833333333333334
Precision test: 0.5833333333333334
Recall val: 0.6511627906976745
Recall test: 0.6511627906976745
F1 score val: 0.6153846153846155
F1 score test: 0.6153846153846155


In [20]:

# random classification for baseline score

#random_val = {t_id: random.randint(0,1) for t_id in val_ids}
#random_test = {t_id: random.randint(0,1) for t_id in test_ids}


In [21]:
#with open("val-random-output.json", "w") as f:
#    json.dump(random_val, f)
# get testing dataset 
#with open("test-random-output.json", "w") as f:
#    json.dump(random_test, f)
