In [None]:
import gensim
import codecs
import json

from math import ceil

from gensim.models.keyedvectors import Word2VecKeyedVectors
from gensim.models import KeyedVectors
from gensim.models import FastText
from gensim.scripts.glove2word2vec import glove2word2vec

import collections
import copy
import itertools
import random
import sklearn

import re
import tqdm

import warnings
warnings.filterwarnings("ignore")

import importlib
import pickle
from collections import defaultdict, Counter
from typing import List, Dict

import torch
from torch import utils
from torch.utils.data import Dataset, DataLoader

import pandas as pd
import numpy as np

import sklearn
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.model_selection import train_test_split

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer 

import ref 

# Presumably a training step budget; it is not referenced in the cells visible here — TODO confirm usage.
EPOCHSTEPS = 1000000
# English stop words from NLTK, held in a set for fast membership checks during tokenisation.
STOPWORDS = set(stopwords.words('english'))

In [None]:
#get embeddings 

def load_word_vectors(fname, limit=200000, binary=False):
    """Load pretrained word vectors stored in the word2vec file format.

    Args:
        fname: path of the embedding file to read.
        limit: maximum number of vectors read from the start of the file
            (``None`` reads the whole file).
        binary: ``True`` if the file is in the binary word2vec format,
            ``False`` for the text format.

    Returns:
        A ``(model, vecs, words)`` tuple: the loaded ``KeyedVectors`` object,
        its embedding matrix, and the list of words in its vocabulary.
    """
    model = KeyedVectors.load_word2vec_format(fname, limit=limit, binary=binary)
    vecs = model.vectors
    # gensim >= 4 exposes the vocabulary as `key_to_index`; gensim 3.x only has
    # `vocab` (and accessing `vocab` raises on gensim >= 4).
    vocab = getattr(model, 'key_to_index', None)
    if vocab is None:
        vocab = model.vocab
    words = list(vocab.keys())
    return model, vecs, words

# Load the word vectors once; `word2vec` is the lookup model used by the embedding transform below.
word2vec, vecs, words = load_word_vectors("crawl-300d-2M.vec")

In [None]:
def sent_proc(txt, stopwords=[], lem=None):
    """Turn raw text into a set of normalised word tokens.

    Each whitespace-separated chunk is stripped of non-word characters and
    lower-cased; chunks that end up empty or that are stop words are dropped.

    Args:
        txt: the text to tokenise.
        stopwords: container of lower-case words to exclude (only read, never
            modified).
        lem: optional object with a ``lemmatize(word)`` method applied to every
            surviving token.

    Returns:
        Set of unique processed tokens.
    """
    words = set()
    # Split on any whitespace so words separated by a newline or tab are not
    # fused together once the non-word characters are stripped.
    for chunk in txt.split():
        token = re.sub(r'\W+', '', chunk).lower()
        # Punctuation-only chunks collapse to '' and are not words.
        if token and token not in stopwords:
            words.add(token)
    if lem is not None:
        words = {lem.lemmatize(w) for w in words}
    return words

# Data Cleaning

In [None]:
class ToxicityDataset(Dataset):
    """Dataset of comments and their toxicity labels backed by a DataFrame."""

    def __init__(self, data, transform=None):
        """
        Args:
            data (pd dataframe): The pd dataframe with (uid, tox_label, text);
                the 'comment_text' and 'toxicity' columns are read.
            transform (callable, optional): Optional transform to be applied
                on a sample.
        """
        self.dataset = data
        self.transform = transform

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        """Return ``{'x': ..., 'y': ...}`` for a position or a slice of positions.

        Indexing is positional (``iloc``), so it works whatever index labels
        the frame carries (e.g. after shuffling or filtering).

        Args:
            idx: an integer position, a slice, or a list of positions.

        Returns:
            dict with 'x' (the text, or its transform) and 'y' (labels with
            shape (n, 1) for a slice/list, shape (1,) for a single position).
        """
        sample = self.dataset['comment_text'].iloc[idx]
        if self.transform:
            sample = self.transform(sample)

        target = self.dataset['toxicity'].iloc[idx]
        if isinstance(target, pd.Series):
            # several rows: column vector of labels
            target = np.expand_dims(target.values, axis=1)
        else:
            # single row: pandas hands back a scalar, wrap it in a 1-element array
            target = np.asarray([target])

        return {'x': sample, 'y': target}
    
class GetEmbedding(object):
    """Given a sentence of text, generate a sentence embedding.

    The sentence embedding is the sum of the word vectors of all words that
    are not stop words; words missing from the model contribute a zero vector.

    Args:
        model: word embedding model, mapping of words -> embeddings. It must
            support ``word in model`` and ``model[word]``.
        stopwords: container of lower-case words to skip (only read).
    """

    def __init__(self, model, stopwords=[]):
        self.model = model
        self.stopwords = stopwords
        # Use the model's own dimensionality when it advertises one; fall back
        # to 300 for plain dict-like models.
        self.dim = getattr(model, 'vector_size', 300)
        self.unknown_embed = np.zeros(self.dim)

    def _embed(self, txt):
        """Sum the word vectors of one sentence into a single (dim,) vector."""
        words = [w for w in txt.split(" ") if w.lower() not in self.stopwords]
        vectors = [self.model[w] if w in self.model else self.unknown_embed for w in words]
        if not vectors:
            # np.sum over an empty list would return the scalar 0.0
            return np.zeros(self.dim)
        return np.sum(vectors, axis=0)

    def __call__(self, sample):
        """Embed one sentence or a Series of sentences.

        :param sample: str or pd.Series of str
        :return: array of shape (dim,) for a str, (len(sample), dim) for a Series
        :raises TypeError: if sample is neither a str nor a pd.Series
        """
        if isinstance(sample, str):
            return self._embed(sample)
        if isinstance(sample, pd.Series):
            sent_embedding = np.zeros((len(sample), self.dim))
            for i, txt in enumerate(sample):
                sent_embedding[i, :] = self._embed(txt)
            return sent_embedding
        raise TypeError(
            'GetEmbedding expects a str or pd.Series, got {}'.format(type(sample).__name__))
    
class GetBOW(object):
    """Given a sentence of text, generate its binary bag-of-words representation.

    Args:
        vocab: dictionary, word --> index in array (assumed contiguous)
        lem: optional lemmatizer forwarded to ``sent_proc``
        stopwords: container of words to ignore, forwarded to ``sent_proc``
    """

    def __init__(self, vocab, lem=None, stopwords=[]):
        self.vocab = vocab
        self.stopwords = stopwords
        self.lem = lem

    def _get_rep(self, txt):
        """Return a 0/1 vector of length len(vocab) marking the words present in txt."""
        rep = np.zeros(len(self.vocab))
        for w in sent_proc(txt, stopwords=self.stopwords, lem=self.lem):
            # out-of-vocabulary words are ignored
            if w in self.vocab:
                rep[self.vocab[w]] = 1
        return rep

    def __call__(self, sample):
        """Encode one sentence or a Series of sentences.

        :param sample: str or pd.Series
        :return: array of shape (len(vocab),) for a str,
            (len(sample), len(vocab)) for a Series
        :raises TypeError: if sample is neither a str nor a pd.Series
        """
        if isinstance(sample, str):
            return self._get_rep(sample)
        if isinstance(sample, pd.Series):
            bow_embed = np.zeros((len(sample), len(self.vocab)))
            for i, txt in enumerate(sample):
                bow_embed[i, :] = self._get_rep(txt)
            return bow_embed
        raise TypeError(
            'GetBOW expects a str or pd.Series, got {}'.format(type(sample).__name__))

#Data Functions
def generate_dataset(d, elg, t=None):
    """Build a ToxicityDataset from the eligible rows of a dataframe.

    Args:
        d: dataframe containing at least the 'id', 'toxicity' and
            'comment_text' columns.
        elg: row selector applied to the reduced frame (e.g. a boolean mask).
        t: optional transform passed to the dataset.

    Returns:
        ToxicityDataset over the selected rows.
    """
    full = d[['id', 'toxicity', 'comment_text']]
    # A dataset is addressed by 0..len-1, so drop the index labels inherited
    # from the unfiltered frame.
    full = full[elg].reset_index(drop=True)

    #convert to pytorch formatting
    return ToxicityDataset(full, transform=t)

def generate_dataloader(d, elg, nbatch=1, t=None):
    """Wrap the eligible rows of `d` in an unshuffled DataLoader.

    The batch size is chosen so that the data is covered in `nbatch` batches
    (ceil(len(dataset) / nbatch) rows per batch).
    """
    dataset = generate_dataset(d, elg, t=t)
    rows_per_batch = ceil(len(dataset) / nbatch)
    return DataLoader(dataset, rows_per_batch, shuffle=False)

In [None]:
# Read the full comment table; later cells use its 'id', 'toxicity', 'comment_text'
# and sexual-orientation identity columns.
full_data = pd.read_csv('all_data.csv')
# Quick look at the available columns.
print(full_data.columns)

# Baseline

In [None]:
thresh = 0.2
#Data Cleaning
# Score a comment by the largest of its sexual-orientation identity columns.
full_data['LGTBQ'] = full_data[['homosexual_gay_or_lesbian', 'bisexual', 'other_sexual_orientation']].max(axis=1)

full_partition = full_data[(full_data['LGTBQ'] > 0)]

# Split by the toxicity threshold. The masks are built from full_partition itself
# so that mask and frame share the same index, then each side is shuffled.
toxic = full_partition[full_partition['toxicity'] >= thresh].sample(frac=1).reset_index(drop=True)
non_toxic = full_partition[full_partition['toxicity'] < thresh].sample(frac=1).reset_index(drop=True)

# Binarise the labels consistently with the split above: every row in `toxic`
# (including those exactly at the threshold) is 1, every row in `non_toxic` is 0.
toxic['toxicity'] = 1
non_toxic['toxicity'] = 0

totals = {'nt': len(non_toxic), 't': len(toxic)}
# One row per environment: (share of non-toxic, share of toxic) in that environment.
env_splits = np.array([[.1, 0.9], [0.2, 0.8], [0.9, 0.1]])
# Overall class mix needed so that every environment's split can be realised.
weights = {'nt': env_splits.mean(axis=0)[0], 't': env_splits.mean(axis=0)[1]}

#Adjust so that desired env splits possible

if float(totals['t']/(totals['t'] + totals['nt'])) >= weights['t']:  #see who has the bigger proportion
    # Too many toxic rows: keep just enough for non-toxic to make up weights['nt'] of the data.
    ns = int(totals['nt']/weights['nt'] - totals['nt'])
    toxic = toxic.sample(n=ns)
else:
    # Too many non-toxic rows: keep just enough for toxic to make up weights['t'] of the data.
    ns = int(totals['t']/weights['t'] - totals['t'])
    non_toxic = non_toxic.sample(n=ns)

In [None]:
#partition env splits
nenvs = env_splits.shape[0]
e_props = env_splits/env_splits.sum(axis=0) #proportion of each class (column) assigned to each env (row)

# Cumulative row boundaries per class. Rounding (instead of truncating) the float
# cumulative shares and pinning the final boundary to the class size guarantees
# that floating-point error cannot drop rows from an environment.
nt_bounds = np.rint(np.concatenate([[0.0], np.cumsum(e_props[:, 0])]) * len(non_toxic)).astype(int)
t_bounds = np.rint(np.concatenate([[0.0], np.cumsum(e_props[:, 1])]) * len(toxic)).astype(int)
nt_bounds[-1] = len(non_toxic)
t_bounds[-1] = len(toxic)

env_partitions = []  #Note - last env is the test env
for i in range(nenvs):
    #Get both components of the env
    env_non_toxic = non_toxic.iloc[nt_bounds[i]:nt_bounds[i + 1]]
    env_toxic = toxic.iloc[t_bounds[i]:t_bounds[i + 1]]

    #Make full env: combine, shuffle, and give the rows a clean 0..n-1 index
    env_partitions.append(
        pd.concat([env_non_toxic, env_toxic], ignore_index=True).sample(frac=1).reset_index(drop=True))


## Baseline Logistic Regression

In [None]:
# Sentence-embedding transform shared by the train and test datasets.
t = GetEmbedding(word2vec, stopwords=STOPWORDS)

# Training data: every environment except the last one, stacked into one frame.
train_frame = pd.concat(env_partitions[:-1], ignore_index=True)
train_frame = train_frame[['id', 'toxicity', 'comment_text']]
train_partition = ToxicityDataset(train_frame, transform=t)[:]

# Test data: the last environment.
test_frame = env_partitions[-1][['id', 'toxicity', 'comment_text']]
test_partition = ToxicityDataset(test_frame, transform=t)[:]

print(train_partition['x'].shape, test_partition['x'].shape)

In [None]:
# The dataset returns labels as an (n, 1) column; scikit-learn expects 1-D targets,
# so flatten them before fitting and scoring.
y_train = train_partition['y'].ravel()
y_test = test_partition['y'].ravel()

model = LogisticRegression(fit_intercept = True, penalty = 'l2').fit(train_partition['x'], y_train)
print('train score: {}'.format(model.score(train_partition['x'], y_train)))
print('test score: {}'.format(model.score(test_partition['x'], y_test)))

## IRM Logistic Regression

In [None]:
import ref 
# Sentence-embedding transform shared by all environments.
t = GetEmbedding(word2vec, stopwords=STOPWORDS)

# One embedded dataset per training environment (all but the last partition).
train_envs = []
for env_frame in env_partitions[:-1]:
    env_dataset = ToxicityDataset(env_frame[['id', 'toxicity', 'comment_text']], transform=t)
    train_envs.append(env_dataset[:])

# The last partition is held out as the test environment.
test_dataset = ToxicityDataset(env_partitions[-1][['id', 'toxicity', 'comment_text']], transform=t)
test_partition = test_dataset[:]

print(train_envs[0]['x'].shape, test_partition['x'].shape)

In [None]:
import pickle 
# Persist the embedded training environments; the context manager makes sure the
# file is flushed and closed even if pickling fails.
with open('tenvs.pkl', 'wb') as env_file:
    pickle.dump(train_envs, env_file)

In [None]:
import importlib
# Pick up any edits made to the local `ref` module since it was first imported.
importlib.reload(ref)

# Hyper-parameters handed to the trainer; their exact meaning is defined in `ref`.
args = dict(
    lr=0.0001,
    n_iterations=70000,
    penalty_anneal_iters=1,
    l2_reg=1.0,
    pen_wgt=1000,
    hid_layers=1,
    verbose=False,
)

base = ref.LinearInvariantRiskMinimization('cls')
# NOTE(review): the meaning of the positional 1000 is defined by ref's train() — confirm.
model, errors, penalties, losses = base.train(train_envs, 1000, args)

In [None]:
# Stack all training environments into one feature matrix / label array.
train_features = np.concatenate([env['x'] for env in train_envs])
train_labels = np.concatenate([env['y'] for env in train_envs])
train_logits = base.predict(train_features, model)

test_labels = test_partition['y']
test_logits = base.predict(test_partition['x'], model)

# Predictions are given a trailing axis before being compared with the labels.
train_acc = ref.compute_loss(np.expand_dims(train_logits, axis=1), train_labels, ltype='ACC')
test_acc = ref.compute_loss(np.expand_dims(test_logits, axis=1), test_labels, ltype='ACC')

print('train score: {}'.format(train_acc))
print('test score: {}'.format(test_acc))


## Other

In [None]:
import pickle
# Persist the trained model; the context manager makes sure the file is flushed
# and closed even if pickling fails.
with open('none.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)