In [1]:
#import statements
import argparse
import json
import numpy as np
import re
import spacy

from better_profanity import profanity
from nltk.stem import PorterStemmer
from numpy import dot
from numpy.linalg import norm
from sklearn import metrics

from sklearn.feature_extraction.text import CountVectorizer

from sklearn.svm import SVC
from textblob import TextBlob
from tqdm import tqdm

In [2]:
# get training dataset 
with open("train-data-prepared.json", "r") as f:
    train_data = json.load(f)
# get training dataset 
with open("val-data-prepared.json", "r") as f:
    val_data = json.load(f)
# get testing dataset 
with open("val-data-prepared.json", "r") as f:
    test_data = json.load(f)


In [3]:
#create spacy object
nlp_english = spacy.load("en_core_web_sm")
#create Stemmer object
stemmer = PorterStemmer()

In [4]:
entire_dataset = {
    'train_ids': [thread["id"] for thread in train_data],
    'train_posts': [thread["preceding_posts"] for thread in train_data],
    'train_label': [thread["label"] for thread in train_data],
    
    'val_ids': [thread["id"] for thread in val_data],
    'val_posts': [thread["preceding_posts"] for thread in val_data],
    'val_label': [thread["label"] for thread in val_data],
    
    'test_ids': [thread["id"] for thread in test_data],
    'test_posts': [thread["preceding_posts"] for thread in test_data],
    'test_label': [thread["label"] for thread in test_data],
}

print(entire_dataset.keys())

dict_keys(['train_ids', 'train_posts', 'train_label', 'val_ids', 'val_posts', 'val_label', 'test_ids', 'test_posts', 'test_label'])


In [5]:
#for thread in val_data:
#    authors = []
#    for comment in thread["preceding_posts"]:
#        if comment["author_name"] not in authors:
#            authors.append(comment["author_name"])
#    
#    if len(authors) >= 2:
#        print(thread["id"])

In [6]:
# remove punctuation, space, urls from text
def clean_text(text):
    parsed_text = nlp_english(text)
    clean_text = []
    for token in parsed_text:
        stop_flag = (token.is_punct or token.is_space or  
                 token.like_url)
        if (not stop_flag):
            clean_text.append(re.sub('[^A-Za-z0-9]+', ' ',token.text.lower()))
            
    return clean_text

def stem_text(text):
    return [stemmer.stem(word) for word in clean_text(text)]

#print(stem_text("ass idiot fuck shit racist "))
#print(stem_text("...because it's illegal in our reality, vs. the proposed reality that me, OP, and everyone else on this thread are commenting about.\n\nKeep up, qwert"))

In [7]:
# checks if the OP has addressed the other person in some way
def count_insults(text):
    insult_words = ["ass", "idiot", "fuck", "shit"]
    counter = 0
    for word in text:
        if word in insult_words:
            counter = counter + 1
            
    return counter
        
#print(check_insults(clean_text("> a) right, because women are non-sexual creatures who would never use prostitutes themselves\n\ni think you vastly overestimate the number of women that pay for sex...")))

In [8]:
#def check_author_name(text, name):
#    flag = 0
#    for word in text:
#        if name in word or word in name:
#            flag = flag + 1
#    return flag

#print(check_author_name(['becaus', 'it', ' s', 'illeg', 'in', 'our', 'realiti', 'vs ', 'the', 'propos', 'realiti', 'that', 'me', 'op', 'and', 'everyon', 'els', 'on', 'thi', 'thread', 'are', 'comment', 'about', 'keep', 'up', 'qwert'],"qwertx0815"))

In [9]:
def get_number(list_of_tuple, key):
    counter = 0
    for (word,tag) in list_of_tuple:
        if key in tag:
            counter = counter + 1
    return counter

#B = stem_text("this is a disgusting attitude that glorifies those with power over those who have none. This is the heart of fascism\n\nOkay, so now we're getting into ad hominems. Got it. I find it interesting how I'm the one denouncing creeping authoritarianism and you're the one calling that fascist. I'm not convinced you know what fascism actually means.\n\n>You mean you won't get into it because then you would have to admit that you are defending the right of bigots to engage in prejudicial treatment of others, and that is morally indefensible.\n\nBigotry (noun): intolerance to those who hold different opinions from oneself.\n\n>I sincerely hope that you are made the victim of prejudice and bigotry, so that you can one day understand how disgusting and malignant your beliefs are.\n\nHmmm... Wishing ill will upon those who are different from you? Sounds a lot like how you described those bakers refusing to bake a cake for gay couples. Also seems to fit nicely into that definition of bigotry I listed above.\n\nBigotry comes in many forms. Thinking that anyone who isn't a neo-progressive liberal such as yourself is a bigot is one of those forms. I sincerely hope you take some time to reflect on how you view others with whom you disagree, because I can't see how anyone with so much animosity could possibly be happy.\n\nEnjoy your weekend! The weather's gorgeous here, I hope it's nice where you are too.")

#print(TextBlob(" ".join(B)).tags)
#print(get_number(TextBlob(" ".join(B)).tags, 'NN'))
#print(get_number(TextBlob(" ".join(B)).tags, 'VBP'))
#print(get_number(TextBlob(" ".join(B)).tags, 'MD'))
#print(get_number(TextBlob(" ".join(B)).tags, 'PRP'))

In [10]:
# should return one feature vector for one string
# features -> Author's turn, length of argument, insults, sentiment
def gather_data(thread):
    returnObj = {}
    for i in range(len(thread["preceding_posts"])):
        comment_data = {}
        comment = thread["preceding_posts"][i]
        # clean text
        comment_data["text"] = stem_text(comment["body"])
        # length just in case of Godwin's Law
        comment_data["char_length_vec"] = [len("".join(comment_data["text"]))]
        # refer's to other user/author
        #comment_data["refer_author"] = [check_author_name(comment_data["text"], thread["preceding_posts"][1-i]["author_name"])]
        # check for some common insults
        comment_data["insults_vec"] = [count_insults(comment_data["text"])]
        # calculate number of POS tags
        sentence_tags = TextBlob(comment["body"]).tags
        comment_data["count_pos"] = [get_number(sentence_tags, 'NN'), get_number(sentence_tags, 'VBP'), 
                                     get_number(sentence_tags, 'MD'), get_number(sentence_tags, 'PRP')]
        # get sentiment
        sentiment = TextBlob(' '.join(comment_data["text"])).sentiment
        comment_data["sentiment"] =  [sentiment.polarity, sentiment.subjectivity]
        feature_vec = comment_data["char_length_vec"] + comment_data["insults_vec"] + comment_data["sentiment"] + comment_data["count_pos"]
        
        returnObj[" ".join(comment_data["text"])] = feature_vec
        
    return returnObj

In [11]:
# print features for some 10 tuples
#for thread in train_data[:5]:
#    print(gather_data(thread))
#    print()

In [12]:
entire_dataset["train_prep"] = [gather_data(thread) for thread in tqdm(train_data)]

entire_dataset["val_prep"] = [gather_data(thread) for thread in tqdm(val_data)]

entire_dataset["test_prep"] = [gather_data(thread) for thread in tqdm(test_data)]

100%|██████████████████████████████████████████████████████████████████████████████| 1936/1936 [05:19<00:00,  6.07it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 258/258 [00:38<00:00,  6.67it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 258/258 [00:37<00:00,  6.80it/s]


In [13]:
#print(len(train_bow.toarray().tolist()[1]))

In [14]:
def concatAllStringForBoW(listOfDict):
    return_obj = []
    for d_dict in listOfDict:
        return_obj.append(" ".join(list(d_dict.keys())))
        
    return return_obj  

In [15]:
train_bow_input = concatAllStringForBoW(entire_dataset["train_prep"])
val_bow_input = concatAllStringForBoW(entire_dataset["val_prep"])
test_bow_input = concatAllStringForBoW(entire_dataset["test_prep"])

#print(len(train_bow_input))

In [16]:
vectorizer = CountVectorizer()
train_bow = vectorizer.fit_transform(train_bow_input).toarray().tolist()
val_bow = vectorizer.transform(val_bow_input).toarray().tolist()
test_bow = vectorizer.transform(test_bow_input).toarray().tolist()

#print(len(train_bow[0])) #12565
#print(len(train_bow)) #1936

In [18]:
def is_increased(array, index):
    values = [x[index] for x in array]
    ret_answer = values[0] < values[1]
    return int(ret_answer)
    

In [19]:
def combine_vectors(ddict):
    feature_vectors = list(ddict.values())
    
    (insults_index, polar_index) = (1,2)
    
    dot_product = dot(feature_vectors[0], feature_vectors[-1])
    norms_product = (norm(feature_vectors[0])*norm(feature_vectors[-1]))
    if norms_product == 0:
        cos_sim = 1
    else:
        cos_sim = dot_product/norms_product
    
    #cos_diff = 1 - cos_sim
    avg_insults = np.average([x[insults_index] for x in feature_vectors])
    is_increasing_insults = is_increased(feature_vectors, insults_index)
    
    avg_polarity = np.average([x[polar_index] for x in feature_vectors])
    is_increasing_polarity = is_increased(feature_vectors, polar_index)
    
    return [avg_insults, is_increasing_insults, avg_polarity, is_increasing_polarity,  cos_sim]
    
#combine_vectors(entire_dataset['train_prep'])

In [20]:
x_train = [combine_vectors(thread) for thread in tqdm(entire_dataset['train_prep'])]
x_val = [combine_vectors(thread) for thread in tqdm(entire_dataset['val_prep'])]
x_test = [combine_vectors(thread) for thread in tqdm(entire_dataset['test_prep'])]

100%|████████████████████████████████████████████████████████████████████████████| 1936/1936 [00:00<00:00, 9052.50it/s]
100%|██████████████████████████████████████████████████████████████████████████████| 258/258 [00:00<00:00, 6448.78it/s]
100%|█████████████████████████████████████████████████████████████████████████████| 258/258 [00:00<00:00, 10753.66it/s]


In [22]:
print(x_train[1])
print()

[0.0, 0, 0.029999999999999992, 0, 0.9995706912786162]



In [24]:
[x_train[i].extend(train_bow[i]) for i in tqdm(range(len(x_train))) ]
[x_val[i].extend(val_bow[i]) for i in tqdm(range(len(x_val))) ]
[x_test[i].extend(test_bow[i]) for i in tqdm(range(len(x_test))) ]

100%|████████████████████████████████████████████████████████████████████████████| 1936/1936 [00:00<00:00, 8161.23it/s]
100%|██████████████████████████████████████████████████████████████████████████████| 258/258 [00:00<00:00, 6993.49it/s]
100%|█████████████████████████████████████████████████████████████████████████████| 258/258 [00:00<00:00, 10746.30it/s]


[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,

In [None]:
#print(len(x_train_temp[1])) 12570
#print(len(x_train_temp)) 1936

In [25]:
y_train = entire_dataset["train_label"]
y_val = entire_dataset["val_label"]
y_test = entire_dataset["test_label"]

In [26]:
clf = SVC()
clf.fit(x_train,y_train)

SVC()

In [None]:
val_pred = clf.predict(x_val)
test_pred = clf.predict(x_test)

In [None]:
print("Accuracy for val data:",metrics.accuracy_score(y_val, val_pred))
print("Accuracy for test data:",metrics.accuracy_score(y_test, test_pred))

print("Precision val:",metrics.precision_score(y_val, val_pred))
print("Precision test:",metrics.precision_score(y_test, test_pred))

print("Recall val:",metrics.recall_score(y_val, val_pred))
print("Recall test:",metrics.recall_score(y_test, test_pred))

print("F1 score val:",metrics.f1_score(y_val, val_pred))
print("F1 score test:",metrics.f1_score(y_test, test_pred))

In [None]:

# random classification for baseline score

#random_val = {t_id: random.randint(0,1) for t_id in val_ids}
#random_test = {t_id: random.randint(0,1) for t_id in test_ids}


In [None]:
#with open("val-random-output.json", "w") as f:
#    json.dump(random_val, f)
# get testing dataset 
#with open("test-random-output.json", "w") as f:
#    json.dump(random_test, f)
