In [1]:
# coding: utf-8
import gc
import multiprocessing
import sys
import time
import numpy as np
import pandas as pd
import textblob
from functools import partial
from tqdm import tqdm, tqdm_pandas

In [2]:
# Total number of shards the training set is divided into.
MAX_WORKER_NUMBER = 10
# Shard handled by this run; edit per run, valid values are 1..10.
WORKER_NUMBER = 5

In [3]:
# --- translation settings ---
text_column = 'comment_text'  # column whose contents get translated
from_lang = 'en'              # source language code
to = 'de'                     # target language code
n_workers = 4                 # size of the multiprocessing pool

# --- file locations ---
INPUT_TRAIN_PATH = '../input/train.csv'
INPUT_TEST_PATH = '../input/test.csv'
OUTPUT_TRAIN_PATH = 'train_de.pkl'
OUTPUT_TEST_PATH = 'test_de.pkl'

In [4]:
def translate(x, from_lang='en', to='de'):
    """Translate ``x`` with TextBlob, falling back to the input on failure.

    Parameters
    ----------
    x : str
        Text to translate.
    from_lang : str, default 'en'
        Source language code forwarded to ``TextBlob.translate``.
    to : str, default 'de'
        Target language code forwarded to ``TextBlob.translate``.

    Returns
    -------
    str or object
        The translated text joined into a plain ``str``. If translation
        raises any ``Exception``, ``x`` is returned unchanged.
    """
    try:
        return ''.join(textblob.TextBlob(x).translate(from_lang=from_lang, to=to))
    except Exception:
        # Best-effort fallback: keep the original value when translation fails.
        # `Exception` (not a bare `except`) lets KeyboardInterrupt and
        # SystemExit propagate, so a long-running job can still be stopped.
        return x
    
def map_translate(df):
    """Translate every element of a pandas object, showing a progress bar.

    Used as the per-worker function for ``multiprocessing.Pool.map``. The
    language pair is read from the module-level ``from_lang`` and ``to``.

    Parameters
    ----------
    df : pandas.Series
        Chunk of texts to translate. Despite the name, the caller passes
        pieces of a single text column.

    Returns
    -------
    The result of ``df.progress_map``: one translated value per input
    element, or the original value where translation failed.
    """
    # Register the progress_* methods on pandas objects. Called without
    # positional arguments: the legacy `tqdm.pandas(tqdm())` form opens an
    # extra empty bar, and current tqdm releases take keyword arguments only.
    tqdm.pandas()
    return df.progress_map(lambda x: translate(x, from_lang=from_lang, to=to))

In [5]:
# Read the training set only; the test-set lines stay disabled.
train = pd.read_csv(INPUT_TRAIN_PATH)
# test = pd.read_csv(INPUT_TEST_PATH)
# train = pd.concat([train, test], sort=True)

# Row count of the full training set, used later to size the worker shards.
len_train = train.shape[0]
print(train.shape)
train.head(3)

(1804874, 45)


Unnamed: 0,id,target,comment_text,severe_toxicity,obscene,identity_attack,insult,threat,asian,atheist,bisexual,black,buddhist,christian,female,heterosexual,hindu,homosexual_gay_or_lesbian,intellectual_or_learning_disability,jewish,latino,male,muslim,other_disability,other_gender,other_race_or_ethnicity,other_religion,other_sexual_orientation,physical_disability,psychiatric_or_mental_illness,transgender,white,created_date,publication_id,parent_id,article_id,rating,funny,wow,sad,likes,disagree,sexual_explicit,identity_annotator_count,toxicity_annotator_count
0,59848,0.0,"This is so cool. It's like, 'would you want yo...",0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,2015-09-29 10:50:41.987077+00,2,,2006,rejected,0,0,0,0,0,0.0,0,4
1,59849,0.0,Thank you!! This would make my life a lot less...,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,2015-09-29 10:50:42.870083+00,2,,2006,rejected,0,0,0,0,0,0.0,0,4
2,59852,0.0,This is such an urgent design problem; kudos t...,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,2015-09-29 10:50:45.222647+00,2,,2006,rejected,0,0,0,0,0,0.0,0,4


In [6]:
# split by worker number
# An out-of-range worker number would silently select an empty shard, so fail fast.
if not 1 <= WORKER_NUMBER <= MAX_WORKER_NUMBER:
    raise ValueError(
        'WORKER_NUMBER must be between 1 and {}, got {}'.format(
            MAX_WORKER_NUMBER, WORKER_NUMBER
        )
    )

# Rows per shard; the +1 ensures MAX_WORKER_NUMBER shards cover every row.
n_samples = (len_train // MAX_WORKER_NUMBER) + 1
print(n_samples)

shard_start = (WORKER_NUMBER - 1) * n_samples
shard_stop = min(WORKER_NUMBER * n_samples, len_train)
# .copy() makes the shard an independent frame, so assigning the translated
# column later does not write into a slice of the full dataset
# (which triggers pandas' SettingWithCopyWarning).
train = train.iloc[shard_start:shard_stop].copy()
train.head(3)

180488


Unnamed: 0,id,target,comment_text,severe_toxicity,obscene,identity_attack,insult,threat,asian,atheist,bisexual,black,buddhist,christian,female,heterosexual,hindu,homosexual_gay_or_lesbian,intellectual_or_learning_disability,jewish,latino,male,muslim,other_disability,other_gender,other_race_or_ethnicity,other_religion,other_sexual_orientation,physical_disability,psychiatric_or_mental_illness,transgender,white,created_date,publication_id,parent_id,article_id,rating,funny,wow,sad,likes,disagree,sexual_explicit,identity_annotator_count,toxicity_annotator_count
721952,5005389,0.0,A girl's gotta make a living that's why she to...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2017-03-15 00:43:41.390866+00,22,5004437.0,319233,approved,1,1,0,1,0,0.0,4,4
721953,5005390,0.0,Christy Clark will fight for reform... When th...,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,2017-03-15 00:44:13.740713+00,54,,319388,approved,0,0,0,5,0,0.0,0,4
721954,5005391,0.0,Defend our borders and close the Safe Third Co...,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,2017-03-15 00:43:55.485906+00,54,,319416,approved,1,0,0,20,1,0.0,0,4


In [7]:
# translate
# Split the text column into n_workers contiguous chunks by position.
# np.array_split is applied to plain integer positions rather than to the
# Series itself: splitting a pandas object goes through `swapaxes`, which
# pandas has deprecated. Chunk boundaries are the same as before.
text = train[text_column]
position_chunks = np.array_split(np.arange(len(text)), n_workers)
split_dfs = [text.iloc[positions] for positions in position_chunks]

# One chunk per worker process; Pool.map returns results in chunk order.
with multiprocessing.Pool(processes=n_workers) as p:
    pool_results = p.map(map_translate, split_dfs)

# Each chunk keeps its original index labels, so the assignment aligns rows by label.
train[text_column] = pd.concat(pool_results, sort=True)

0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
  0%|          | 0/45122 [00:00<?, ?it/s][A
  0%|          | 2/45122 [00:00<3:08:42,  3.98it/s][A
  0%|          | 2/45122 [00:00<3:48:06,  3.30it/s][A
  0%|          | 3/45122 [00:00<3:17:40,  3.80it/s][A
  0%|          | 2/45122 [00:01<6:48:46,  1.84it/s][A
  0%|          | 3/45122 [00:01<5:36:07,  2.24it/s][A
  0%|          | 4/45122 [00:01<4:05:07,  3.07it/s][A
  0%|          | 3/45122 [00:01<5:07:31,  2.45it/s][A
  0%|          | 2/45122 [00:01<7:52:57,  1.59it/s][A
  0%|          | 5/45122 [00:01<4:17:19,  2.92it/s][A
  0%|          | 4/45122 [00:01<5:36:52,  2.23it/s][A
  0%|          | 4/45122 [00:02<6:31:26,  1.92it/s][A
  0%|          | 6/45122 [00:02<6:11:56,  2.02it/s][A
  0%|          | 5/45122 [00:02<6:37:38,  1.89it/s][A
  0%|          | 3/45122 [00:02<10:13:34,  1.23it/s][A
  0%|          | 4/45122 [00:02<7:36:03,  1.65it/s] [A
  0%|          | 5/45122 [00:02<8:12:54,  1.53it/s]

In [8]:
# Preview the first three rows; the text column now holds the translated text.
train.head(3)

Unnamed: 0,id,target,comment_text,severe_toxicity,obscene,identity_attack,insult,threat,asian,atheist,bisexual,black,buddhist,christian,female,heterosexual,hindu,homosexual_gay_or_lesbian,intellectual_or_learning_disability,jewish,latino,male,muslim,other_disability,other_gender,other_race_or_ethnicity,other_religion,other_sexual_orientation,physical_disability,psychiatric_or_mental_illness,transgender,white,created_date,publication_id,parent_id,article_id,rating,funny,wow,sad,likes,disagree,sexual_explicit,identity_annotator_count,toxicity_annotator_count
721952,5005389,0.0,Ein Mädchen muss seinen Lebensunterhalt verdie...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2017-03-15 00:43:41.390866+00,22,5004437.0,319233,approved,1,1,0,1,0,0.0,4,4
721953,5005390,0.0,Christy Clark wird für Reformen kämpfen ... We...,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,2017-03-15 00:44:13.740713+00,54,,319388,approved,0,0,0,5,0,0.0,0,4
721954,5005391,0.0,Verteidigen Sie unsere Grenzen und schließen S...,0.0,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,,,,,,,,2017-03-15 00:43:55.485906+00,54,,319416,approved,1,0,0,20,1,0.0,0,4


In [9]:
# save
# Persist this worker's translated shard of the training data as a pickle.
# NOTE(review): OUTPUT_TRAIN_PATH does not include WORKER_NUMBER, so runs for
# different workers that share an output directory would overwrite each
# other's file — confirm how the shards are collected.
# The test-set lines stay disabled, matching the load cell.
# test = train.iloc[len_train:].reset_index(drop=True)
# train = train.iloc[:len_train].reset_index(drop=True)
train.to_pickle(OUTPUT_TRAIN_PATH)
# test.to_pickle(OUTPUT_TEST_PATH)