In [1]:
from pathlib import Path
import pandas as pd
import os
from datetime import datetime
import json


In [2]:
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', -1)
pd.set_option('max_colwidth', None)


In [3]:
data_dir = Path('../data')

In [4]:
!ls $data_dir

data_2020-12-27_19-36.json     offenseval-gr-labela-v1.csv
data_2021-01-02_15-42.json     offenseval-gr-test-v1.tsv
data.json		       offenseval-gr-training-v1.tsv
labels-levela.csv	       offenseval-tr-labela-v1.tsv
offenseval-ar-labela-v1.csv    offenseval-tr-testset-v1.tsv
offenseval-ar-test-v1.tsv      offenseval-tr-training-v1.tsv
offenseval-ar-training-v1.tsv  olid-training-v1.0.tsv
offenseval-da-test-v1.tsv      original_zips
offenseval-da-training-v1.tsv  testset-levela.tsv


## English

In [15]:
eng = pd.read_csv(data_dir/"olid-training-v1.0.tsv", sep='\t')
eng = eng.drop(['subtask_b', 'subtask_c'], axis=1).rename(columns={'subtask_a': 'label'})

In [16]:
eng_t_ex = pd.read_csv(data_dir/'testset-levela.tsv', sep='\t').set_index('id')

eng_t_labels = pd.read_csv(data_dir/'labels-levela.csv', header=None, names=['id', 'label']).set_index('id')

eng_test = eng_t_ex.join(eng_t_labels,)

In [17]:
len(eng)+ len(eng_test)

14100

## other langs

In [18]:
LANGS = list(set([str(i).split('-')[1] for i in data_dir.glob('offens*')]))

In [33]:
data = {}
for lang in LANGS:
    data.setdefault(lang, {})
    lang_files = [i for i in data_dir.glob(f'*{lang}*')]
    
    train_file = [i for i in lang_files if 'train' in str(i)][0]
    assert str(train_file).endswith('.tsv')
    train = pd.read_csv(train_file, sep='\t').rename(columns={'subtask_a': 'label'})
    
    if lang == 'da':
        train =train.iloc[:-1]
        data[lang]['train'] = train

        test_file = [i for i in lang_files if 'test' in str(i)][0]
        test_data = pd.read_csv(test_file, sep='\t').rename(columns={'subtask_a': 'label'})
        data[lang]['test'] = test_data
        continue
        
    data[lang]['train'] = train

    test_label_file = [i for i in lang_files if 'labela' in str(i)][0]
    
    if str(test_label_file).endswith('.tsv') and lang != 'tr':
        test_labels = pd.read_csv(test_label_file, sep='\t', header=None, names=['id', 'label'])
    else:
        test_labels = pd.read_csv(test_label_file, header=None, names=['id', 'label'])
        
    test_labels = test_labels.set_index('id')

    test_examples_file = [i for i in lang_files if'test' in str(i)][0]
    assert str(test_examples_file).endswith('.tsv') 
    
    test_examples = pd.read_csv(test_examples_file, sep='\t').set_index('id')

    test_data = test_examples.join(test_labels,)
    print(len(test_examples), len(test_labels), len(test_data))
    
    data[lang]['test'] = test_data.reset_index()

    
data.setdefault('en', {})
data['en']['train'] = eng
data['en']['test'] = eng_test.reset_index()  

3515 3528 3515
1827 2000 1827
1544 1544 1544


In [55]:


def dump_data(data_dict, filename=f"data_{datetime.today().strftime('%Y-%m-%d_%H-%M')}"):
    print(filename)
    json.dump({i: {'train': v['train'].to_json(),
                   'test': v['test'].to_json(),}
                    for i,v in data.items()
              },
              open(data_dir/f'{filename}.json', 'w+')) 
              
def read_data(json_path):
    data = json.load(open(json_path, 'r'))
    for key, lang_dict in data.items():
        for k, v in lang_dict.items():
            data[key][k] = pd.read_json(v)
    return data

In [19]:
data = read_data(data_dir/"data_2020-12-27_19-36.json")

In [56]:
dump_data(data)

data_2021-02-02_23-55


## preprocessing


In [48]:
# from wordsegment import load, segment
import emoji
import re


In [52]:

def eng_segment_hashtags(s):
    load()
    hashtags = set(part for part in s.split() if part.startswith('#'))
    for hashtag in hashtags:
        segmented = " ".join(segment(hashtag))
        s = s.replace(hashtag, segmented)
    return s

def preprocess_tweet(tweet):
    tweet = tweet.replace(':', ' ')
    tweet = ' '.join(tweet.split()) # remove multiple spaces
    
    tweet = re.sub(r'(@USER ){2,}', r'\1', tweet) # replace multiple User mentions with one only
    tweet = re.sub(r'(#(\S\w*))',r'\2', tweet) # remove hashtags signs
    tweet = re.sub(r'(_){2,}', r'\1', tweet).replace('_', ' ')  # replace underscores with spaces
    return tweet

In [53]:
for lang, lang_dict in data.items():
    print(lang)
    for type_, df in lang_dict.items():
        print(f"____{type_}")
        df = df.dropna().reset_index(drop=True)
#         df['tweet'] = df['tweet'].apply(lambda x: emoji.demojize(x))
        df['tweet'] = df['tweet'].apply(lambda x: preprocess_tweet(x))
        data[lang][type_] = df


tr
____train
____test
da
____train
____test
ar
____train
____test
gr
____train
____test
en
____train
____test


# Not used

In [34]:
from nltk.tokenize.stanford_segmenter import StanfordSegmenter

In [55]:
os.environ["STANFORD_MODELS"] = "/home/zeina/Documents/nlp_project/dependencies/stanford-segmenter-4.2.0/stanford-segmenter-2020-11-17/data/"

'/home/zeina/documents/nlp_project/dependencies/stanford-segmenter-4.2.0/stanford-segmenter-2020-11-17/data/'

In [68]:
seg = StanfordSegmenter(path_to_jar='dependencies/stanford-segmenter-4.2.0/stanford-segmenter-2020-11-17/stanford-segmenter-4.2.0.jar')
seg.default_config('ar')
sent = "<LF> <LF>#بالملليميتر_يا_حبيبي URL	"
print(seg.segment(sent.split()))


The StanfordTokenizer will be deprecated in version 3.2.5.
Please use [91mnltk.parse.corenlp.CoreNLPTokenizer[0m instead.'
  """Entry point for launching an IPython kernel.


['<', ' ', 'L', 'F', ' ', '>', ' ', '<', ' ', 'L', 'F', ' ', '>', '#', ' ', 'ب', ' ', 'ا', 'ل', 'م', 'ل', 'ل', 'ي', 'م', 'ي', 'ت', 'ر', 'ي', 'ا', 'ح', 'ب', 'ي', 'ب', 'ي', ' ', 'U', 'R', 'L', '\n']
