In [59]:
import csv
import os
import pickle
import random
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision.models as models
import torchvision.transforms as transforms
from datasets import load_dataset
from PIL import Image
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score, accuracy_score
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader

# Star import kept last so its names shadow earlier ones, as in the original ordering.
from toxic_comment_collection import *

# Select the compute device: GPU if available, else CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)

# CUDA device information (sizes reported in GiB, rounded to one decimal).
if device.type == 'cuda':
    print('GPU Name:', torch.cuda.get_device_name(0))
    print('Memory Stats:')
    print('  - Allocated:', round(torch.cuda.memory_allocated(0)/1024**3,1), 'GB')
    print('  - Cached:   ', round(torch.cuda.memory_reserved(0)/1024**3,1), 'GB')
    print('  - Total:    ', round(torch.cuda.get_device_properties(0).total_memory/1024**3,1), 'GB')

# For reproducible results: seed every RNG in use with one shared constant.
SEED = 464
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
np.random.seed(SEED)
random.seed(SEED)
if device.type == 'cuda':
    torch.backends.cudnn.deterministic = True

# Every corpus this notebook reads from disk, keyed by dataset name, with the
# CSV path the later cells open. Previously only 'basile2019' was fetched, so
# a fresh checkout failed on the other three reads.
# NOTE(review): the three added dataset names are inferred from the ./files/<name>/
# directory layout -- confirm they are valid get_dataset() identifiers.
DATASET_FILES = {
    'basile2019': "./files/basile2019/basile2019en.csv",
    'gibert2018': "./files/gibert2018/gibert2018en.csv",
    'chung2019': "./files/chung2019/chung2019.csv",
    'qian2019': "./files/qian2019/qian2019en_reddit.csv",
}
for dataset_name, csv_path in DATASET_FILES.items():
    # Skip corpora whose CSV is already on disk so re-running the cell is cheap.
    if not os.path.exists(csv_path):
        get_dataset(dataset_name)


Using device: cuda
GPU Name: NVIDIA GeForce GTX 1060
Memory Stats:
  - Allocated: 0.0 GB
  - Cached:    0.0 GB
  - Total:     6.0 GB


In [60]:
# Notebook-wide pandas option: silence the chained-assignment warning.
# Kept because it is a global side effect other cells may rely on.
pd.options.mode.chained_assignment = None

# Load the tab-separated corpora from ./files/<dataset>/.
basile_ = pd.read_csv("./files/basile2019/basile2019en.csv", sep="\t")
gibert_ = pd.read_csv("./files/gibert2018/gibert2018en.csv", sep="\t")
chung_ = pd.read_csv("./files/chung2019/chung2019.csv", sep="\t")

# Binarise basile labels: the empty label list "[]" becomes 0, anything else 1.
# This is done as ONE comparison against the original text. The previous
# two-step in-place version first wrote 0 into the "[]" rows and then tested
# `!= b'[]'` again, which was true for those freshly written zeros as well, so
# every row ended up labelled 1. It also used chained assignment, which does
# nothing under pandas Copy-on-Write.
basile_['labels'] = (basile_['labels'].astype(str) != '[]').astype(int)

basile_data = basile_['text'].to_list()
basile_labels = basile_['labels'].to_list()
print(len(basile_data))
print(len(basile_labels))

10000
10000


In [61]:
def _encode_labels(frame, mapping, drop=()):
    """Return a filtered copy of ``frame`` with textual labels replaced by codes.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain a ``labels`` column; it is not modified.
    mapping : dict
        Maps the string form of a label (e.g. ``"['hate']"``) to its code.
    drop : iterable of str
        String forms of labels whose rows are removed entirely.

    Returns
    -------
    pandas.DataFrame
        A new frame. Labels that appear in neither ``mapping`` nor ``drop`` are
        left as they were, so running this on an already-encoded frame changes
        nothing.
    """
    raw = frame['labels'].astype(str)
    keep = ~raw.isin(list(drop))
    encoded = frame.loc[keep].copy()
    raw = raw.loc[keep]
    # Work on a standalone object-dtype Series so integer codes can sit next to
    # any unmapped labels, and so no chained assignment into the frame is needed.
    labels = encoded['labels'].astype(object)
    for label_text, code in mapping.items():
        labels[raw == label_text] = code
    encoded['labels'] = labels
    return encoded


# gibert2018: discard 'idk/skip' and 'relation' rows, then none -> 0, hate -> 1.
gibert_ = _encode_labels(
    gibert_,
    {"['none']": 0, "['hate']": 1},
    drop=("['idk/skip']", "['relation']"),
)
gibert_data = gibert_['text'].to_list()
gibert_labels = gibert_['labels'].to_list()
print(len(gibert_labels))

# chung2019: only the "['hate', 'Islamophobia']" label is mapped (to 1).
chung_ = _encode_labels(chung_, {"['hate', 'Islamophobia']": 1})
chung_data = chung_['text'].to_list()
chung_labels = chung_['labels'].to_list()

# qian2019 (reddit): rows with an empty label list are dropped, hate -> 1.
qian_ = pd.read_csv("./files/qian2019/qian2019en_reddit.csv", sep="\t")
qian_ = _encode_labels(qian_, {"['hate']": 1}, drop=("[]",))
qian_data = qian_['text'].to_list()
qian_labels = qian_['labels'].to_list()
print(len(qian_labels))


10703
5256


In [64]:
# Concatenate texts and labels in the SAME corpus order so that
# res_data[i] and res_labels[i] describe the same example.
res_data = basile_data + chung_data + gibert_data + qian_data
print(len(res_data))
res_labels = basile_labels + chung_labels + gibert_labels + qian_labels
print(len(res_labels))

# Persist both lists; `pickle` comes from the notebook's import cell.
with open("hate_dataset", "wb") as fp:   # Pickling the texts
    pickle.dump(res_data, fp)

with open("hate_labels", "wb") as fp:   # Pickling the labels
    # Fixed: this previously dumped res_data, so the labels file held texts.
    pickle.dump(res_labels, fp)

26815
26815


In [None]:
# Sanity check: reload the pickled text list and report its length.
# Pickle files are binary, so the file must be opened with "rb"; the default
# text mode makes pickle.load() fail.
# NOTE(review): the pickling cell writes "hate_dataset" into the working
# directory, while this reads "dataset/hate_dataset" -- confirm the file is
# moved there, or align the two paths.
# Only unpickle files produced by this notebook; pickle.load can execute
# arbitrary code from untrusted input.
with open("dataset/hate_dataset", "rb") as fp:
    a = pickle.load(fp)
print(len(a))