In [17]:
#### READING ONLY ORIGINAL CATEGORIES

f = open('../data/original_categories.txt','r')
mixed_text = f.read()


## Read in the Data and PreProcess

In [5]:
import os
os.environ["MODIN_ENGINE"] = "ray"  # Modin will use Dask
import pandas as pd
import re
import icu
from tqdm import tqdm
from tqdm._tqdm_notebook import tqdm_notebook

In [32]:
# Data Loading and PreProcessing
import modin.pandas as mpd
import modin.pandas.utils as ut

data_path = os.path.abspath(
    os.path.join(
        os.path.pardir,
        'data',
        'prods.tar.gz'
    )
)

def exclude_unnamed(df):
    matches_to_exclude = re.findall("^'Unnamed[^,]*",str(df.columns.tolist()).strip("[]"))
    m = [str(x).replace('"', '').replace("'","") for x in matches_to_exclude]
    return m

def do_category_pre_processing(source_category="original_categories",\
                               data_path=data_path,\
                              random_n=200):
    from polyglot.detect import Detector
    df = pd.read_csv(data_path)
    if random_n != -1:
        df = df.sample(random_n)
    exclude_these = exclude_unnamed(df)
    df = df.drop(exclude_these,axis=1)
    #df.translation = df.original_categories.apply(return_single_translation)
    tqdm.pandas(desc='PROGRESS>>>') 
    df.loc[:,source_category] = df.loc[:,source_category].astype(str)
    df['poly_obj'] = df.loc[:,source_category].progress_apply(lambda x: Detector(x, quiet=True))
    df['Text-lang'] = df['poly_obj'].progress_apply(lambda x: icu.Locale.getDisplayName(x.language.locale))
    df['confidence'] = df['poly_obj'].progress_apply( lambda x: x.language.confidence)
    df = df.drop(['ean','summary','url','provider','price','brand','./._prods.csv'],axis=1)
    #df = df.loc[:,["name","description","original_categories","internal_tree"]]
    return df

data = do_category_pre_processing()
# data_modin = ut.from_pandas(data) cannot serialise the data right now



PROGRESS>>>: 100%|██████████| 200/200 [00:00<00:00, 5091.04it/s]


PROGRESS>>>: 100%|██████████| 200/200 [00:00<00:00, 23142.90it/s]


PROGRESS>>>: 100%|██████████| 200/200 [00:00<00:00, 69516.93it/s]


### Then Make the Translation work on Pandas Dataframe

In [33]:
data.head()

Unnamed: 0,name,description,original_categories,internal_tree,brand,poly_obj,Text-lang,confidence
691466,Leggings Leggingsit Musta Lindex,Lindex.,Kids|Leggings,Children | Vaatteet | Housut,Lindex,Prediction is reliable: True\nLanguage 1: name...,English,93.0
652860,Polo Shirts Lyhythihainen Poolopaita Harmaa Es...,Esprit Casual Polo Shirts,Men|Polos,Fashion | Vaatteet | Miehet | Paidat | Poolot,Esprit Casual,Prediction is reliable: False\nLanguage 1: nam...,Galician,90.0
51738,Womens Ultra Flex,Skechers Womens Ultra Flex,Women|Shoes,Fashion | Kengät | Naiset,Skechers,Prediction is reliable: True\nLanguage 1: name...,English,92.0
475617,Iron Gym® Exercise & Yoga Mat With Carry Strap...,Iron Gym® Exercise & Yoga Mat with Carry Strap...,Varusteet & Tarvikkeet|Välineet|Joogamatot,Fitness | Treenivarusteet | Jooga ja pilates,Iron Gym,Prediction is reliable: True\nLanguage 1: name...,Finnish,97.0
592884,Happy Holly Ida Suede Look Skirt Beige 40/42,HAPPY HOLLY tyylikäs kynähame mokkajäljitelmää...,Vaatteet|Hameet|Midihameet,Fashion | Vaatteet | Naiset | Hameet | Midihameet,Happy Holly,Prediction is reliable: True\nLanguage 1: name...,Finnish,96.0


In [9]:
#!jupyter nbextension enable --py widgetsnbextension
#!pip install modin[ray] --user

In [51]:

def return_single_translation(series,source_lang,translator_client):
    return translator_client.translate_text(Text=series,
            SourceLanguageCode="en",
            TargetLanguageCode="fi").get("TranslatedText")
    
def row_translator(row,**kwargs):
    """ Translates the original_categories feature if it is not Finnish.
    Univariate gives ENG to FIN translation, otherwise the most confident translation direction is used"""
    new_name = kwargs.get("new_name")
    translator_client = kwargs.get("translator_client")
    translation_mode = kwargs.get("translation_mode")
    old_name = kwargs.get("old_name")
    if row.loc["Text-lang"]=="Finnish":
        row.loc[new_name]  = row.loc[old_name]
    else:
        if translation_mode == "univariate":
            row.loc[new_name] = return_single_translation(series=row.original_categories,source_lang='en',translator_client=translator_client)
        else :
            row.loc[new_name] = return_single_translation(series=row.original_categories,source_lang='auto',translator_client=translator_client)
    return row

def translation_function(data,old_name="original_categories",new_prefix="trans"):
    import boto3
    from polyglot.detect import Detector
    new_name = f"{new_prefix}_{old_name}"
    translator_client = boto3.client("translate",region_name="us-east-1")
    kwargs = {"translator_client":translator_client,"new_name":new_name,\
             "translation_mode":"univariate","old_name":old_name}
    print("REACHED PROGRESS APPLY")
    tqdm.pandas(desc='PROGRESS>>>') 
    translated = data.progress_apply(row_translator,axis=1,**kwargs)
    translated = translated.drop(["poly_obj","brand"],axis=1)
    return translated,new_name

translated_,new_name = translation_function(data)








PROGRESS>>>:   0%|          | 0/200 [00:00<?, ?it/s][A[A[A[A[A[A[A

REACHED PROGRESS APPLY









PROGRESS>>>:   1%|          | 2/200 [00:01<02:12,  1.49it/s][A[A[A[A[A[A[A






PROGRESS>>>:   2%|▏         | 3/200 [00:01<01:50,  1.79it/s][A[A[A[A[A[A[A






PROGRESS>>>:   2%|▏         | 4/200 [00:01<01:36,  2.03it/s][A[A[A[A[A[A[A






PROGRESS>>>:   2%|▎         | 5/200 [00:02<01:27,  2.24it/s][A[A[A[A[A[A[A






PROGRESS>>>:   4%|▍         | 8/200 [00:02<01:06,  2.88it/s][A[A[A[A[A[A[A






PROGRESS>>>:   6%|▌         | 12/200 [00:02<00:50,  3.74it/s][A[A[A[A[A[A[A






PROGRESS>>>:   6%|▋         | 13/200 [00:03<00:53,  3.48it/s][A[A[A[A[A[A[A






PROGRESS>>>:   7%|▋         | 14/200 [00:03<00:54,  3.44it/s][A[A[A[A[A[A[A






PROGRESS>>>:   8%|▊         | 15/200 [00:03<00:51,  3.57it/s][A[A[A[A[A[A[A






PROGRESS>>>:   9%|▉         | 18/200 [00:04<00:41,  4.39it/s][A[A[A[A[A[A[A






PROGRESS>>>:  10%|█         | 20/200 [00:04<00:37,  4.83it/s][A[A[A[A[A[A[A






PROGRESS>>>:  11%|█

PROGRESS>>>:  84%|████████▍ | 169/200 [00:32<00:06,  4.72it/s][A[A[A[A[A[A[A






PROGRESS>>>:  86%|████████▋ | 173/200 [00:33<00:04,  5.60it/s][A[A[A[A[A[A[A






PROGRESS>>>:  87%|████████▋ | 174/200 [00:33<00:08,  3.24it/s][A[A[A[A[A[A[A






PROGRESS>>>:  88%|████████▊ | 176/200 [00:34<00:06,  3.64it/s][A[A[A[A[A[A[A






PROGRESS>>>:  89%|████████▉ | 178/200 [00:34<00:05,  3.72it/s][A[A[A[A[A[A[A






PROGRESS>>>:  90%|████████▉ | 179/200 [00:34<00:05,  3.61it/s][A[A[A[A[A[A[A






PROGRESS>>>:  91%|█████████ | 182/200 [00:35<00:04,  4.15it/s][A[A[A[A[A[A[A






PROGRESS>>>:  92%|█████████▏| 183/200 [00:35<00:04,  3.82it/s][A[A[A[A[A[A[A






PROGRESS>>>:  92%|█████████▏| 184/200 [00:36<00:05,  3.13it/s][A[A[A[A[A[A[A






PROGRESS>>>:  92%|█████████▎| 185/200 [00:36<00:06,  2.41it/s][A[A[A[A[A[A[A






PROGRESS>>>:  93%|█████████▎| 186/200 [00:37<00:05,  2.43it/s][A[A[A[A[A[A[A






PROGRESS>>

In [52]:
translated_.head()

Unnamed: 0,name,description,original_categories,internal_tree,brand,poly_obj,Text-lang,confidence,trans_original_categories
691466,Leggings Leggingsit Musta Lindex,Lindex.,Kids|Leggings,Children | Vaatteet | Housut,Lindex,Prediction is reliable: True\nLanguage 1: name...,English,93.0,Lapset|Leggingsit
652860,Polo Shirts Lyhythihainen Poolopaita Harmaa Es...,Esprit Casual Polo Shirts,Men|Polos,Fashion | Vaatteet | Miehet | Paidat | Poolot,Esprit Casual,Prediction is reliable: False\nLanguage 1: nam...,Galician,90.0,Miehet|Polos
51738,Womens Ultra Flex,Skechers Womens Ultra Flex,Women|Shoes,Fashion | Kengät | Naiset,Skechers,Prediction is reliable: True\nLanguage 1: name...,English,92.0,Naiset|Kengät
475617,Iron Gym® Exercise & Yoga Mat With Carry Strap...,Iron Gym® Exercise & Yoga Mat with Carry Strap...,Varusteet & Tarvikkeet|Välineet|Joogamatot,Fitness | Treenivarusteet | Jooga ja pilates,Iron Gym,Prediction is reliable: True\nLanguage 1: name...,Finnish,97.0,Varusteet & Tarvikkeet|Välineet|Joogamatot
592884,Happy Holly Ida Suede Look Skirt Beige 40/42,HAPPY HOLLY tyylikäs kynähame mokkajäljitelmää...,Vaatteet|Hameet|Midihameet,Fashion | Vaatteet | Naiset | Hameet | Midihameet,Happy Holly,Prediction is reliable: True\nLanguage 1: name...,Finnish,96.0,Vaatteet|Hameet|Midihameet


In [97]:
import re
""" Run the translations only on unique categories and later you can use that dictionary to look up translations for any data"""
def post_process_features(data,name,mode="translationcheck"):
    """ Add regex substitutions here"""
    #data = data.reset_index(drop=True)
    data.trans_original_categories = data.trans_original_categories.\
    replace("Male","Miehet",regex=False).\
    replace("Female","Naiset",regex=False)
    
    
    if mode == "normal":
        sel = data.loc[:,["trans_original_categories", "name","description","internal_tree"]]
       
    elif mode == "translationcheck":
        sel = data.loc[:,["trans_original_categories","original_categories"]]
    filename = os.path.join("..","data",name + ".csv")
    sel.to_csv(filename)
    return sel
             
#d = post_process_features(data=translated_,name="translations_post")

#re.findall("[Ff]emale",str(d.trans_original_categories.values))

['Female', 'Female']

In [96]:
#filename = os.path.join("..","data","translationcheck" + ".csv")
d.trans_original_categories.values
#!cat 

array(['Lapset|Leggingsit', 'Miehet|Polos', 'Naiset|Kengät',
       'Varusteet & Tarvikkeet|Välineet|Joogamatot',
       'Vaatteet|Hameet|Midihameet', 'Naiset|Kengät',
       'Naisten|Kengät|Sandaalit', 'Vaatteet',
       'Huonekalut|Huonekalujen Tarvikkeet|Huonekalujen Verhoilu',
       'Naiset|T-paidat & Topit', 'Naiset|Kauneus Naiset', 'Naiset|Mekot',
       'Miehet|T-paidat', 'Kauneudenhoito', 'Kellot|Naisten Kellot',
       'Ihonhoito', 'Miesten|Kengät|Juoksukengät',
       'Lapset|Päällysvaatteet', 'Naisten|Kengät|Matalavartiset Tennarit',
       'Naiset|Housut', 'Kellot|Miesten Kellot', 'Kellot|Miesten Kellot',
       'Vaatteet', 'Vaatteet|Takit|Takkeja', 'Naiset|Kauneus Naiset',
       'Tuotemerkit/Baci|Asut/Rooli- ja naamiaisasut', 'Naiset|Hameet',
       'Naisten Wear|Sisäpohjat|Shortsit',
       'Miesten|Kengät|Matalavartiset Tennarit', 'Kengät|Lenkkarit',
       'Ale Ale Shortsit', 'Nainen|Kengät|Flats|Sandaalit',
       'Nainen|Kauneus|Meikit|Primers', 'Vaatteet', 'Kauneud

In [93]:

%automagic 1
#%cat ../data/translationcheck.csv


Automagic is ON, % prefix IS NOT needed for line magics.
cat: ../data/translationcheck.csv: No such file or directory


In [233]:
#df.Text = df.original_categories.astype(str)
# basically we need to apply this function to every row in the dataframe and later speed it up
# translation_function is used in the next code cell, so make sure that you use the
# apply + iterrows combination in the right way!
# translations = []
# or row in df.head().itertuples():
#    #if row.confidence > 90:
#    out = translator_client.translate_text(Text=row.original_categories,
#            SourceLanguageCode='en',
#            TargetLanguageCode='fi').get("TranslatedText")
#    translations.append(out)
#    #tuples = pd.concat([tuples,pd.DataFrame.from_dict(out.get("TranslatedText"))],axis=0)


import icu


def row_translator(row,**kwargs):
    print()
    translator_client = kwargs.get("translator_client")
    if row.loc['Text-LangConfidence'] > 90:
        row.loc['translated'] = translator_client.translate_text(Text=row.loc['original_categories'],
                SourceLanguageCode='en',
                TargetLanguageCode='fi').get("TranslatedText")
    else : 
        row.loc['translated'] = translator_client.translate_text(Text=row.loc['original_categories'],
                SourceLanguageCode='auto',
                TargetLanguageCode='fi').get("TranslatedText")
    return row

def translation_function(data,old_name="original_categories"):
    import boto3
    from polyglot.detect import Detector
    translator_client = boto3.client('translate',region_name='us-east-1')
    kwargs = {'translator_client':translator_client,'new_name':f"trans_{old_name}"}
    translated = data.head().apply(row_translator,axis=1,**kwargs)
    return translated
    
        

translated = translation_function(df)









In [234]:
translated.head()

Unnamed: 0,original_categories,internal_tree,poly_obj,Text-lang,Text-LangConfidence,confidence,translated
0,Tuotemerkit/Jimmyjane|Terveystuotteet/Kuukuppi,Health | Intiimituotteet,Prediction is reliable: True\nLanguage 1: name...,Finnish,97.0,97.0,Tuotetiedot/Jimmyjane|Terveystuotteet/Kuuppi
1,Tuotemerkit/Jimmyjane|Terveystuotteet/Kuukuppi,Health | Intiimituotteet,Prediction is reliable: True\nLanguage 1: name...,Finnish,97.0,97.0,Tuotetiedot/Jimmyjane|Terveystuotteet/Kuuppi
2,Kondomit|Tuotemerkit/Durex,Health | Intiimituotteet | Kondomit,Prediction is reliable: True\nLanguage 1: name...,Finnish,96.0,96.0,Kondomit|Tuotemerki/Durex
3,Kondomit|Tuotemerkit/Durex,Health | Intiimituotteet | Kondomit,Prediction is reliable: True\nLanguage 1: name...,Finnish,96.0,96.0,Kondomit|Tuotemerki/Durex
4,Kondomit|Tuotemerkit/Durex,Health | Intiimituotteet | Kondomit,Prediction is reliable: True\nLanguage 1: name...,Finnish,96.0,96.0,Kondomit|Tuotemerki/Durex


In [167]:
for index,row in df.iterrows():
    row.loc['translated'] = translator_client.translate_text(Text=row.loc['original_categories'],
                    SourceLanguageCode='en',
                    TargetLanguageCode='fi').get("TranslatedText")


In [169]:
df.columns

Index(['original_categories', 'internal_tree', 'poly_obj', 'Text-lang',
       'Text-LangConfidence'],
      dtype='object')

In [122]:

import icu
df.Text = df.Text.astype(str)
df['poly_obj'] = df.Text.apply(lambda x: Detector(x, quiet=True))
df['Text-lang'] = df['poly_obj'].apply(lambda x: icu.Locale.getDisplayName(x.language.locale))
df['Text-LangConfidence'] = df['poly_obj'].apply( lambda x: x.language.confidence)

def translation_function(data):
    import boto3
    from polyglot.detect import Detector
    translator_client = boto3.client('translate',region_name='us-east-1')
    if data['Text-LangConfidence'] > 90:
        response = translator_client.translate_text(Text=data['original_categories'],
                SourceLanguageCode='en',
                TargetLanguageCode='fi')
    else : 
        response = translator_client.translate_text(Text=data['original_categories'],
                SourceLanguageCode='auto',
                TargetLanguageCode='fi')
    return response.get("TranslatedText")

def check_confidence_and_translate(df,col_to_translate):
    flag = 0
    selected_series = df.loc[:,col_to_translate]
    df['poly_obj'] = selected_series.apply(lambda x: Detector(x, quiet=True))
    df['Text-LangConfidence'] = df['poly_obj'].apply( lambda x: x.language.confidence)
    new_col_name = f"translated_{col_to_translate}""
    df[new_col_name] = df.apply(translation_function)
    if language.confidence > 90.0 and language.code == 'en':
                flag = 1
                response = translator_client.translate_text(Text=a,
                SourceLanguageCode='en',
                TargetLanguageCode='fi')
                print(response.get("TranslatedText"))
                break


In [123]:

flag = 0
for language in Detector(a).languages:
        if language.confidence > 90.0 and language.code == 'en':
            flag = 1
            response = translator_client.translate_text(Text=a,
            SourceLanguageCode='en',
            TargetLanguageCode='fi')
            print(response.get("TranslatedText"))
            break
            
if flag == 0:
    response = translator_client.translate_text(Text=a,
    SourceLanguageCode='auto',
    TargetLanguageCode='fi')
    print("English not detected with sufficient confidence, reverting to auto-translate")
    print(response.get("TranslatedText"))

Unnamed: 0,original_categories,internal_tree,poly_obj,Text-lang,Text-LangConfidence
0,Tuotemerkit/Jimmyjane|Terveystuotteet/Kuukuppi,Health | Intiimituotteet,Prediction is reliable: True\nLanguage 1: name...,Finnish,97.0
1,Tuotemerkit/Jimmyjane|Terveystuotteet/Kuukuppi,Health | Intiimituotteet,Prediction is reliable: True\nLanguage 1: name...,Finnish,97.0
2,Kondomit|Tuotemerkit/Durex,Health | Intiimituotteet | Kondomit,Prediction is reliable: True\nLanguage 1: name...,Finnish,96.0
3,Kondomit|Tuotemerkit/Durex,Health | Intiimituotteet | Kondomit,Prediction is reliable: True\nLanguage 1: name...,Finnish,96.0
4,Kondomit|Tuotemerkit/Durex,Health | Intiimituotteet | Kondomit,Prediction is reliable: True\nLanguage 1: name...,Finnish,96.0


In [24]:
detected_languages = Detector(a).languages
# you can access detected_languages as Pandas dataFrame api



AttributeError: 'list' object has no attribute 'get'