In [1]:

#import statements
import json
import keras
import numpy as np
import random
import re
import spacy

from nltk.stem import PorterStemmer
from numpy import dot
from numpy.linalg import norm
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import SVC
from textblob import TextBlob
from tqdm import tqdm

In [2]:
# get training dataset 
with open("train-data-prepared.json", "r") as f:
    train_data = json.load(f)
# get training dataset 
with open("val-data-prepared.json", "r") as f:
    val_data = json.load(f)
# get testing dataset 
with open("val-data-prepared.json", "r") as f:
    test_data = json.load(f)


In [3]:
#create spacy object
nlp_english = spacy.load("en_core_web_sm")
#create Stemmer object
stemmer = PorterStemmer()

In [4]:
entire_dataset = {
    'train_ids': [thread["id"] for thread in train_data],
    'train_posts': [thread["preceding_posts"] for thread in train_data],
    'train_label': [thread["label"] for thread in train_data],
    
    'val_ids': [thread["id"] for thread in val_data],
    'val_posts': [thread["preceding_posts"] for thread in val_data],
    'val_label': [thread["label"] for thread in val_data],
    
    'test_ids': [thread["id"] for thread in test_data],
    'test_posts': [thread["preceding_posts"] for thread in test_data],
    'test_label': [thread["label"] for thread in test_data],
}

print(entire_dataset.keys())

dict_keys(['train_ids', 'train_posts', 'train_label', 'val_ids', 'val_posts', 'val_label', 'test_ids', 'test_posts', 'test_label'])


In [5]:
#for thread in val_data:
#    authors = []
#    for comment in thread["preceding_posts"]:
#        if comment["author_name"] not in authors:
#            authors.append(comment["author_name"])
#    
#    if len(authors) >= 2:
#        print(thread["id"])

In [6]:
# remove punctuation, space, urls from text
def clean_text(text):
    parsed_text = nlp_english(text)
    clean_text = []
    for token in parsed_text:
        stop_flag = (token.is_punct or token.is_space or  
                 token.like_url or token.is_stop)
        if (not stop_flag):
            clean_text.append(re.sub('[^A-Za-z0-9]+', ' ',token.text.lower()))
            
    return clean_text

def stem_text(text):
    return [stemmer.stem(word) for word in clean_text(text)]

print(stem_text("ass idiot fuck shit racist "))
print(clean_text("> a) right, because women are non-sexual creatures who would never use prostitutes themselves\n\ni think you vastly overestimate the number of women that pay for sex..."))

['ass', 'idiot', 'fuck', 'shit', 'racist']
[' ', 'right', 'women', 'non', 'sexual', 'creatures', 'use', 'prostitutes', 'think', 'vastly', 'overestimate', 'number', 'women', 'pay', 'sex']


In [7]:
# checks if the OP has addressed the other person in some way
def count_insults(text):
    insult_words = ["ass", "idiot", "fuck", "shit", "racist"]
    counter = 0
    for word in text:
        if word in insult_words:
            counter = counter + 1
            
    return counter
        
#print(check_insults(clean_text("> a) right, because women are non-sexual creatures who would never use prostitutes themselves\n\ni think you vastly overestimate the number of women that pay for sex...")))

In [8]:
# should return one feature vector for one string
# features -> Author's turn, length of argument, insults, sentiment
def gather_data(thread):
    returnObj = {}
    for i in range(len(thread["preceding_posts"])):
        comment_data = {}
        comment = thread["preceding_posts"][i]
        # clean text
        comment_data["text"] = stem_text(comment["body"])
        # author's identity: 
        # Assuming 3 dialogues, the authors will have indices 0,1,2
        # 0,2 would be the OP
        # 1 would be the reply
        #comment_data["author_turn_vec"] = [i % 2]
        # Length just in case of Godwin's Law
        comment_data["char_length_vec"] = [len("".join(clean_text(comment["body"])))]
        # check for some common insults
        comment_data["insults_vec"] = [count_insults(comment_data["text"])]
        # get sentiment
        sentiment = TextBlob(' '.join(comment_data["text"])).sentiment
        comment_data["sentiment"] =  [sentiment.polarity, sentiment.subjectivity]
        feature_vec = comment_data["char_length_vec"] + comment_data["insults_vec"] + comment_data["sentiment"]
        
        returnObj[" ".join(comment_data["text"])] = feature_vec
        
    return returnObj

In [9]:
# print features for some 10 tuples
for thread in train_data[:5]:
    print(gather_data(thread))
    print()

{'  right women non sexual creatur use prostitut think vastli overestim number women pay sex': [83, 0, 0.39285714285714285, 0.6845238095238095], 'illeg realiti vs  propos realiti op thread comment qwert': [55, 0, 0.0, 0.0], 'live nation complet legal pay sex surround nation legal guess woman pay sex notic illeg stop men go prostitut bunchanumb edit context world nation consider liber attitud sex': [171, 0, 0.1787878787878788, 0.3]}

{'make prostitut legal make difficult polic found repeatedli countri world women brought countri question legal justif job legal found nevada germani australia': [155, 0, 0.02500000000000001, 0.39999999999999997], 'interest read sourc exemplifi refer cop question ask immigr statu warrant voluntari': [91, 0, 0.0, 0.0]}

{'specif elect presid focu discuss unfortun find book read subject have move call earli decad earli year read american histori class colleg come today cite passag deal issu   justifi use elector colleg hamilton focus argument deal colleg oppo

In [10]:
entire_dataset["train_prep"] = [gather_data(thread) for thread in tqdm(train_data)]

entire_dataset["val_prep"] = [gather_data(thread) for thread in tqdm(val_data)]

entire_dataset["test_prep"] = [gather_data(thread) for thread in tqdm(test_data)]

100%|██████████████████████████████████████████████████████████████████████████████| 1936/1936 [08:39<00:00,  3.72it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 258/258 [01:01<00:00,  4.18it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 258/258 [01:01<00:00,  4.20it/s]


In [11]:
'''
def concatAllStringForBoW(listOfDict):
    return_obj = []
    for d_dict in listOfDict:
        return_obj.append(" ".join(list(d_dict.keys())))
        
    return return_obj      

print(len(concatAllStringForBoW(entire_dataset["train_prep"])))
'''

'\ndef concatAllStringForBoW(listOfDict):\n    return_obj = []\n    for d_dict in listOfDict:\n        return_obj.append(" ".join(list(d_dict.keys())))\n        \n    return return_obj      \n\nprint(len(concatAllStringForBoW(entire_dataset["train_prep"])))\n'

In [12]:
'''
train_bow_input = concatAllStringForBoW(entire_dataset["train_prep"])
val_bow_input = concatAllStringForBoW(entire_dataset["val_prep"])
test_bow_input = concatAllStringForBoW(entire_dataset["test_prep"])

print(len(train_bow_input))
'''

'\ntrain_bow_input = concatAllStringForBoW(entire_dataset["train_prep"])\nval_bow_input = concatAllStringForBoW(entire_dataset["val_prep"])\ntest_bow_input = concatAllStringForBoW(entire_dataset["test_prep"])\n\nprint(len(train_bow_input))\n'

In [13]:
'''
vectorizer = CountVectorizer()

train_bow = vectorizer.fit_transform(train_bow_input)
val_bow = vectorizer.transform(val_bow_input)
test_bow = vectorizer.transform(test_bow_input)
'''

'\nvectorizer = CountVectorizer()\n\ntrain_bow = vectorizer.fit_transform(train_bow_input)\nval_bow = vectorizer.transform(val_bow_input)\ntest_bow = vectorizer.transform(test_bow_input)\n'

In [14]:
#print(len(train_bow.toarray().tolist()[1]))

In [15]:
def get_delta_in_sentences(array, index, increasing_flag = False):
    values = [x[index] for x in array]
    if len(values) == 2:
        ret_answer = values[0] < values[1]
    else:
        ret_answer1 = values[0] < values[1]
        ret_answer2 = values[1] < values[2]
        ret_answer = ret_answer1 & ret_answer2
        
    if increasing_flag:
        return int(ret_answer)
    else:
        return int(not ret_answer)
    

In [16]:
print(get_delta_in_sentences([[1,2,3,5],[10,20,2,3],[7,22,1,4]],-1,True))

0


In [17]:
def combine_vectors(ddict):
    feature_vectors = list(ddict.values())
    dot_product = dot(feature_vectors[0], feature_vectors[-1])
    norms_product = (norm(feature_vectors[0])*norm(feature_vectors[-1]))
    
    if norms_product == 0:
        cos_sim = 1
    else:
        cos_sim = dot_product/norms_product
    
    cos_diff = 1 - cos_sim
    is_author_turn_next = 1 - (len(feature_vectors)%2)
    count_dialogues = len(feature_vectors)
    avg_length = np.average([x[1] for x in feature_vectors])
    insults_count = np.sum([x[1] for x in feature_vectors])
    increasing_insults = get_delta_in_sentences(feature_vectors, 1, True)
    avg_polarity = np.average([x[-2] for x in feature_vectors])
    stddev_polarity = np.std([x[-2] for x in feature_vectors])
    avg_subjectivity = np.average([x[-1] for x in feature_vectors])
    stddev_subjectivity = np.std([x[-1] for x in feature_vectors])
    is_decreasing_polarity = get_delta_in_sentences(feature_vectors, -2)
    is_increasing_polarity = get_delta_in_sentences(feature_vectors, -2, True)
    is_decreasing_subjectivity = get_delta_in_sentences(feature_vectors, -1)
    is_increasing_subjectivity = get_delta_in_sentences(feature_vectors, -1, True)
    #print(ret_obj)
    return [is_author_turn_next, count_dialogues, avg_length, 
        avg_polarity, stddev_polarity, avg_subjectivity, 
        stddev_subjectivity, is_decreasing_polarity, is_increasing_polarity, 
        is_decreasing_subjectivity, is_increasing_subjectivity, cos_diff, cos_sim]
    
#combine_vectors(entire_dataset['train_prep'])

In [18]:
x_train = [combine_vectors(thread) for thread in tqdm(entire_dataset['train_prep'])]
x_val = [combine_vectors(thread) for thread in tqdm(entire_dataset['val_prep'])]
x_test = [combine_vectors(thread) for thread in tqdm(entire_dataset['test_prep'])]

100%|████████████████████████████████████████████████████████████████████████████| 1936/1936 [00:00<00:00, 2517.74it/s]
100%|██████████████████████████████████████████████████████████████████████████████| 258/258 [00:00<00:00, 3785.02it/s]
100%|██████████████████████████████████████████████████████████████████████████████| 258/258 [00:00<00:00, 5911.12it/s]


In [19]:
print(len(x_train))

1936


In [20]:
#x_train2 = [x_train[i] + train_bow.toarray().tolist()[i] for i in tqdm(range(len(x_train)))]
#x_val2 = [x_val[i] + val_bow.toarray().tolist()[i]  for i in tqdm(range(len(x_val)))]
#x_test2 = [x_test[i] + test_bow.toarray().tolist()[i] for i in tqdm(range(len(x_test)))]

#print(x_train2)

In [21]:
y_train = entire_dataset["train_label"]
y_val = entire_dataset["val_label"]
y_test = entire_dataset["test_label"]

In [22]:
print(len(x_train[1]))
print(len(x_train))

13
1936


In [23]:
clf = SVC(verbose = True)
clf.fit(x_train,y_train)

[LibSVM]

SVC(verbose=True)

In [24]:
val_pred = clf.predict(x_val)
test_pred = clf.predict(x_test)

In [25]:
print("Accuracy for val data:",metrics.accuracy_score(y_val, val_pred))
print("Accuracy for test data:",metrics.accuracy_score(y_test, test_pred))

print("Precision val:",metrics.precision_score(y_val, val_pred))
print("Precision test:",metrics.precision_score(y_test, test_pred))

print("Recall val:",metrics.recall_score(y_val, val_pred))
print("Recall test:",metrics.recall_score(y_test, test_pred))

print("F1 score val:",metrics.f1_score(y_val, val_pred))
print("F1 score test:",metrics.f1_score(y_test, test_pred))

Accuracy for val data: 0.9922480620155039
Accuracy for test data: 0.9922480620155039
Precision val: 1.0
Precision test: 1.0
Recall val: 0.9844961240310077
Recall test: 0.9844961240310077
F1 score val: 0.9921875
F1 score test: 0.9921875


In [26]:

# random classification for baseline score

#random_val = {t_id: random.randint(0,1) for t_id in val_ids}
#random_test = {t_id: random.randint(0,1) for t_id in test_ids}


In [27]:
#with open("val-random-output.json", "w") as f:
#    json.dump(random_val, f)
# get testing dataset 
#with open("test-random-output.json", "w") as f:
#    json.dump(random_test, f)
