In [7]:
import re
import numpy as np
import pandas as pd
from gensim import utils
from markdown import markdown
from bs4 import BeautifulSoup
from sklearn.svm import LinearSVC
from sklearn import preprocessing
from sklearn.model_selection import KFold
import gensim.parsing.preprocessing as gsp
from sklearn.feature_extraction import text
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import precision_recall_fscore_support
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.metrics import (
    make_scorer,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
)
import random
import torch
import transformers as ppb
import warnings
import scipy.sparse
from scipy.sparse import hstack
warnings.filterwarnings('ignore')


Randomly picks data from the files. Used for testing purposes as full data can't be run on machine. 

In [74]:
p = 0.35
r = 0.1
q = 0.001
s = 0.001



train_df = pd.read_csv("../data/filtered_typology_final.csv", skiprows=lambda i: i>0 and random.random() > p)
wow_unlabeled = pd.read_csv("../data/wow_uncoded_rules_codifying.csv", skiprows=lambda i: i>0 and random.random() > r)
minecraft_unlabeled = pd.read_csv("../data/minecraft_uncoded_rules_codifying.csv", skiprows=lambda i: i>0 and random.random() > q)
reddit_unlabeled = pd.read_csv("../data/reddit_uncoded_rules_codifying.csv", skiprows=lambda i: i>0 and random.random() > s)

unlabeled_list = ["text", "communityID", "domain"]
wow_unlabeled, minecraft_unlabeled, reddit_unlabeled = (
    wow_unlabeled[unlabeled_list],
    minecraft_unlabeled[unlabeled_list],
    reddit_unlabeled[unlabeled_list],
)

Picks out texts if the length of that text is less than 80, and places back into dataframe.

In [75]:

train_df = train_df[train_df['text'].apply(lambda x: len(x) <= 80)]
wow_unlabeled = wow_unlabeled[wow_unlabeled['text'].apply(lambda x: len(x) <= 80)]
minecraft_unlabeled = minecraft_unlabeled[minecraft_unlabeled['text'].apply(lambda x: len(x) <= 80)]
reddit_unlabeled = reddit_unlabeled[reddit_unlabeled['text'].apply(lambda x: len(x) <= 80)]



In [76]:
print("Shape of training data: ", train_df.shape)
print("Shape of wow_unlabeled: ", wow_unlabeled.shape)
print("Shape of minecraft_unlabeled: ", minecraft_unlabeled.shape)
print("shape of reddit_unlabeled: ", reddit_unlabeled.shape)


Shape of training data:  (4349, 24)
Shape of wow_unlabeled:  (539, 3)
Shape of minecraft_unlabeled:  (296, 3)
shape of reddit_unlabeled:  (547, 3)


## Data preprocessing
- remove useless characters, whitespace, stopwords  
- lowercasing 
- stemming 

In [47]:
def strip_html_markdown(s):
    if type(s) not in [int, float] and s is not None:
        return (
            " ".join(
                re.split(
                    "[ _<>,.!|:#*\n\[\]\?]+",
                    " ".join(
                        BeautifulSoup(markdown(s), "html.parser").findAll(text=True)
                    ),
                )
            )
            .lower()
            .strip()
        )


def whitespace_removal(df):
    df.rule_norm_strategy = df.rule_norm_strategy.apply(lambda x: x.strip())
    df.reg_const = df.reg_const.apply(lambda x: x.strip())
    df.domain = df.domain.apply(lambda x: x.strip())
    return df


filters = [
    gsp.strip_tags,
    gsp.strip_punctuation,
    gsp.strip_multiple_whitespaces,
    gsp.strip_numeric,
    gsp.remove_stopwords,
    gsp.strip_short,
    gsp.stem_text,
]


def clean_text(s):
    if type(s) not in [int, float] and s is not None:
        s = s.lower()
        s = utils.to_unicode(s)
        for f in filters:
            s = f(s)
        return s


def randomShuffle(df):
    return df.sample(frac=1).reset_index(drop=True)



def corpusGen(df):
    return (
        df.text.apply(strip_html_markdown)
        .apply(lambda x: clean_text(x))
        .astype(str)
        .tolist()
    )


## Data transformation
- generate corpus 
- transform the corpus to a normalized tf-idf representation

In [48]:
def corpusGen(df):
    return (
        df.text.apply(strip_html_markdown)
        .apply(lambda x: clean_text(x))
        .astype(str)
        .tolist()
    )


train_df = randomShuffle(whitespace_removal(train_df))
IS_corpus = corpusGen(train_df)
wow_corpus = corpusGen(wow_unlabeled)
minecraft_corpus = corpusGen(minecraft_unlabeled)
reddit_corpus = corpusGen(reddit_unlabeled)

vectorizer = TfidfVectorizer(ngram_range=(1, 3), stop_words=None)
X_IS = vectorizer.fit_transform(IS_corpus)

In [49]:
def get_precision_recall_f1(l, category, accuracy):
    precision_recall_df = pd.DataFrame(
        l, columns=["Precision", "Recall", "F1 Score", "Support"]
    )
    precision_recall_df.drop("Support", axis=1, inplace=True)
    precision_recall_df.insert(0, "Type", category)
    precision_recall_df.insert(1, "Accuracy", accuracy)
    return precision_recall_df

In [50]:
model_class, tokenizer_class, pretrained_weights = (ppb.DistilBertModel, ppb.DistilBertTokenizer, 'distilbert-base-uncased')
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
model = model_class.from_pretrained(pretrained_weights)

Helper functions for BERT implmentation:
- padded_token_Generator : tokenizes and pads values to the same length
- BERT_feature_Gen : Generates BERT features and returns features
- Tfidf_BERT_Combine : Combines tfidf features with bert features, returns sparse matrix.

In [51]:
def returnDataframe(l):
    temp = pd.DataFrame(l)
    temp = temp.rename(columns =  {0:'text'})
    return temp

def padded_token_Generator(df):
    token = df['text'].apply((lambda x: tokenizer.encode(x, add_special_tokens=True)))
    max_len = 0
    for i in token.values:
        if len(i) > max_len:
            max_len = len(i)
            padded = np.array([i + [0]*(max_len-len(i)) for i in token.values])
    return padded 

def BERT_feature_Gen(padded_tokens):
    attention_mask = np.where(padded_tokens != 0, 1, 0)
    input_ids = torch.tensor(np.array(padded_tokens)).long()
    attention_mask = torch.tensor(attention_mask)
    with torch.no_grad():
        last_hidden_states = model(input_ids)
    features = last_hidden_states[0][:,0,:].numpy()
    return features

def Tfidf_BERT_Combine(tfidf_features,bert_features):
    features_Matrix = scipy.sparse.csr_matrix(bert_features)
    combined = hstack([tfidf_features,features_Matrix])
    return combined



# BERT Implementaion

In [52]:
temp_X_IS = returnDataframe(IS_corpus)
temp_wow_corpus = returnDataframe(wow_corpus)
temp_minecraft_corpus = returnDataframe(minecraft_corpus)
temp_reddit_corpus = returnDataframe(reddit_corpus)

Tokenizing Data for BERT feature generation

In [53]:
tokenized = padded_token_Generator(temp_X_IS)
tokenized_wow = padded_token_Generator(temp_wow_corpus)
tokenized_mine = padded_token_Generator(temp_minecraft_corpus)
tokenized_reddit = padded_token_Generator(temp_reddit_corpus)


In [54]:
X_IS_BERT = BERT_feature_Gen(tokenized)
X_IS_Combined = Tfidf_BERT_Combine(X_IS,X_IS_BERT)

In [55]:
print("BERT features shape: ", X_IS_BERT.shape)
print("TF-IDF features shape: ", X_IS.shape)
print("Combined features shape: ", X_IS_Combined.shape)

BERT features shape:  (21, 768)
TF-IDF features shape:  (21, 159)
Combined features shape:  (21, 927)


In [56]:
l_IS = []
acc_IS = []
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
y_IS = train_df.IS.values.tolist()
clf_IS = OneVsRestClassifier(LinearSVC())
accuracy = cross_val_score(clf_IS, X_IS_Combined, y_IS, cv=kfold).mean()
clf_IS.fit(X_IS_Combined, y_IS)
y_IS_pred = cross_val_predict(clf_IS, X_IS_Combined, y_IS, cv=kfold)
acc_IS.append(accuracy)
l_IS.append(precision_recall_fscore_support(y_IS, y_IS_pred, average="weighted"))
IS_score = get_precision_recall_f1(l_IS, "IS", acc_IS)


In [57]:
wow_BERT = BERT_feature_Gen(tokenized_wow)
mine_BERT = BERT_feature_Gen(tokenized_mine)
reddit_BERT = BERT_feature_Gen(tokenized_reddit)

In [58]:
IS_X_wow, IS_X_minecraft, IS_X_reddit = (
    vectorizer.transform(wow_corpus),
    vectorizer.transform(minecraft_corpus),
    vectorizer.transform(reddit_corpus),
)

WOW_X_IS = Tfidf_BERT_Combine(IS_X_wow,wow_BERT)
Minecraft_X_IS = Tfidf_BERT_Combine(IS_X_minecraft,mine_BERT)
Reddit_IS = Tfidf_BERT_Combine(IS_X_reddit,reddit_BERT)


In [59]:
print("BERT features shape: ", wow_BERT.shape)
print("TF-IDF features shape: ", IS_X_wow.shape)
print("Comined features shape: ", WOW_X_IS.shape)

print("BERT features shape: ", mine_BERT.shape)
print("TF-IDF features shape: ", IS_X_minecraft.shape)
print("Comined features shape: ", Minecraft_X_IS.shape)

print("BERT features shape: ", reddit_BERT.shape)
print("TF-IDF features shape: ", IS_X_reddit.shape)
print("Comined features shape: ", Reddit_IS.shape)


BERT features shape:  (47, 768)
TF-IDF features shape:  (47, 159)
Comined features shape:  (47, 927)
BERT features shape:  (24, 768)
TF-IDF features shape:  (24, 159)
Comined features shape:  (24, 927)
BERT features shape:  (53, 768)
TF-IDF features shape:  (53, 159)
Comined features shape:  (53, 927)


Predicting on wow, minecraft, and reddit data using BERT and TFIDF Features

In [60]:
wow_unlabeled["IS"] = clf_IS.predict(WOW_X_IS)
minecraft_unlabeled["IS"] = clf_IS.predict(Minecraft_X_IS)
reddit_unlabeled["IS"] = clf_IS.predict(Reddit_IS)

coded_df = train_df[unlabeled_list].copy()
coded_df["IS"] = clf_IS.predict(X_IS_Combined)

In [61]:
train_df = train_df[
    (train_df["IS"] == 1)
    & (train_df["domain"] == "reddit")
    & (train_df["reg_const"] != "none")
]

wow_unlabeled = wow_unlabeled[wow_unlabeled["IS"] == 1]
minecraft_unlabeled = minecraft_unlabeled[minecraft_unlabeled["IS"] == 1]
reddit_unlabeled = reddit_unlabeled[reddit_unlabeled["IS"] == 1]
coded_df = coded_df[coded_df["IS"] == 1]

In [62]:
rules_wow_corpus = corpusGen(wow_unlabeled)
rules_minecraft_corpus = corpusGen(minecraft_unlabeled)
rules_reddit_corpus = corpusGen(reddit_unlabeled)

rules_corpus = corpusGen(train_df)
rules_coded_corpus = corpusGen(coded_df)


In [63]:
X_rules = vectorizer.fit_transform(rules_corpus)
X_coded = vectorizer.transform(rules_coded_corpus)
X_wow = vectorizer.transform(rules_wow_corpus)
X_minecraft = vectorizer.transform(rules_minecraft_corpus)
X_reddit = vectorizer.transform(rules_reddit_corpus)


Tokenizing rules to prep for BERT Feature generation

In [64]:
temp_X_rules = returnDataframe(rules_corpus)
temp_X_coded = returnDataframe(rules_coded_corpus)
temp_X_wow = returnDataframe(rules_wow_corpus)
temp_X_minecraft = returnDataframe(rules_minecraft_corpus)
temp_X_reddit = returnDataframe(rules_reddit_corpus)


tokenized_X_rules = padded_token_Generator(temp_X_rules)
tokenized_X_coded = padded_token_Generator(temp_X_coded)
tokenized_X_wow = padded_token_Generator(temp_X_wow)
tokenized_X_minecraft = padded_token_Generator(temp_X_minecraft)
tokenized_X_reddit = padded_token_Generator(temp_X_reddit)


Generating features for BERT and combining with TF-IDF

In [65]:

X_rules_BERT = BERT_feature_Gen(tokenized_X_rules)
X_coded_BERT = BERT_feature_Gen(tokenized_X_coded)
X_wow_BERT = BERT_feature_Gen(tokenized_X_wow)
X_minecraft_BERT = BERT_feature_Gen(tokenized_X_minecraft)
X_reddit_BERT = BERT_feature_Gen(tokenized_X_reddit)


X_rules_Combined = Tfidf_BERT_Combine(X_rules,X_rules_BERT)
X_coded_Combined = Tfidf_BERT_Combine(X_coded,X_coded_BERT)
X_wow_Combined = Tfidf_BERT_Combine(X_wow,X_wow_BERT)
X_minecraft_Combined = Tfidf_BERT_Combine(X_minecraft,X_minecraft_BERT)
X_reddit_Combined = Tfidf_BERT_Combine(X_reddit,X_reddit_BERT)




In [66]:
l_rules = []
acc_rules = []
categories = [
    "reg_const",
    "rule_norm_strategy",
    "position_type",
    "boundary_type",
    "aggregation_type",
    "payoff_type",
    "information_type",
    "communication_type",
    "choice_type",
    "scope_type",
]

clf_rules = OneVsRestClassifier(LinearSVC())

for c in categories:
    y_rules = train_df[c].values.tolist()
    accuracy_rules = cross_val_score(clf_rules, X_rules_Combined, y_rules, cv=kfold).mean()
    y_rules_pred = cross_val_predict(clf_rules, X_rules_Combined, y_rules, cv=kfold)
    acc_rules.append(accuracy_rules)
    l_rules.append(
        precision_recall_fscore_support(y_rules, y_rules_pred, average="weighted")
    )
    clf_rules.fit(X_rules_Combined, y_rules)

    wow_unlabeled[c] = clf_rules.predict(X_wow_Combined)
    minecraft_unlabeled[c] = clf_rules.predict(X_minecraft_Combined)
    reddit_unlabeled[c] = clf_rules.predict(X_reddit_Combined)
    coded_df[c] = clf_rules.predict(X_coded_Combined)

rules_scores = get_precision_recall_f1(l_rules, categories, acc_rules)
pd.concat([IS_score, rules_scores])

Unnamed: 0,Type,Accuracy,Precision,Recall,F1 Score
0,IS,0.75,0.721805,0.761905,0.741313
0,reg_const,0.8,0.714286,0.714286,0.714286
1,rule_norm_strategy,0.2,0.111111,0.142857,0.125
2,position_type,0.95,0.862245,0.928571,0.89418
3,boundary_type,1.0,1.0,1.0,1.0
4,aggregation_type,0.9,0.862245,0.928571,0.89418
5,payoff_type,1.0,1.0,1.0,1.0
6,information_type,0.9,0.857143,0.857143,0.857143
7,communication_type,0.95,0.862245,0.928571,0.89418
8,choice_type,0.5,0.375,0.5,0.428571


In [67]:
wow_unlabeled.to_csv("../output/wow_labeled.csv", index=False)
minecraft_unlabeled.to_csv("../output/minecraft_labeled.csv", index=False)
reddit_unlabeled.to_csv("../output/reddit_labeled.csv", index=False)
coded_df.to_csv("../output/coded_labeled.csv", index=False)

FileNotFoundError: [Errno 2] No such file or directory: '../output/wow_labeled.csv'