# Extraits de tweets pour les traducteurs

In [1]:
import os
from os.path import join
import re 

import pandas as pd

from utils import query_to_words
from utils import score_language

In [2]:
# Notebook display settings: never truncate cell content (tweets are shown in
# full) and show up to 30 columns when a DataFrame is rendered.
pd.set_option("display.max_colwidth", None)
pd.set_option("display.max_columns", 30)

In [3]:
def format_df(df, query, sort_by_relevance=True):
    """Clean the tweets of ``df`` and score them against ``query``.

    The input frame is not modified; the function works on a copy.

    Processing steps:
      1. Build ``clean_tweet`` from ``tweet``: every ``http...`` token is
         replaced by ``[URL_LINK]`` and the HTML entities listed in the
         module-level ``HTML_CODE`` are decoded.
      2. Drop rows whose ``clean_tweet`` is a duplicate (first one is kept).
      3. Compute ``relevance_score``: the number of matches, in
         ``clean_tweet``, of the words returned by ``query_to_words(query)``.
         The higher the score, the more relevant the tweet.

    Args:
        df: DataFrame with at least the columns ``created_at``, ``username``,
            ``name`` and ``tweet``.
        query: Search query, passed to ``query_to_words`` to get the words
            used for scoring.
        sort_by_relevance: When true, rows are sorted by decreasing
            ``relevance_score``.

    Returns:
        A DataFrame with the columns ``created_at``, ``username``, ``name``,
        ``clean_tweet``, ``relevance_score`` and ``tweet``.
    """
    df = df.copy()

    # Replace URLs by a placeholder. ``regex=True`` must be explicit: since
    # pandas 2.0 ``Series.str.replace`` treats the pattern as a literal string
    # by default, which would leave every URL untouched. ``\S+`` consumes all
    # non-whitespace characters following "http".
    df["clean_tweet"] = df["tweet"].str.replace(
        r"http\S+", "[URL_LINK]", regex=True
    )

    # Decode HTML entities with plain (non-regex) substitutions, in the order
    # given by HTML_CODE.
    for html_code, replace_string in HTML_CODE:
        df["clean_tweet"] = df["clean_tweet"].str.replace(
            html_code, replace_string, regex=False
        )
    df = df.drop_duplicates(subset="clean_tweet")

    # Relevance score: number of occurrences of any query word in the tweet.
    # The words are joined into a single regex alternation; this assumes they
    # contain no regex metacharacters -- TODO confirm against query_to_words.
    words = query_to_words(query)
    df["relevance_score"] = df["clean_tweet"].str.count(pat="|".join(words))

    if sort_by_relevance:
        df = df.sort_values("relevance_score", ascending=False)

    return df[
        ["created_at", "username", "name", "clean_tweet", "relevance_score", "tweet"]
    ]
  
def generate_output_for_trad(df, n_sample=100, random_state=None):
    """Build the review sheet handed to the translators.

    A random sample of cleaned tweets is placed in a ``Tweet`` column, next to
    three empty columns (``Non conforme``, ``Mots-clés relevés (si non
    conforme)``, ``Remarques``) to be filled in by the reviewer.

    Args:
        df: DataFrame with a ``clean_tweet`` column.
        n_sample: Maximum number of tweets to sample. When ``df`` has fewer
            rows, all of them are returned (in random order) instead of
            raising ``ValueError``.
        random_state: Optional seed forwarded to ``Series.sample`` to make the
            extract reproducible. ``None`` keeps the sampling non-deterministic.

    Returns:
        A new DataFrame with one row per sampled tweet.
    """
    # Sampling without replacement cannot draw more rows than are available.
    n_rows = min(n_sample, len(df))
    tweets = df["clean_tweet"].sample(n_rows, random_state=random_state).tolist()

    # The scalar ``None`` values are broadcast to the length of ``tweets``.
    return pd.DataFrame({
        "Non conforme": None,
        "Mots-clés relevés (si non conforme)": None,
        "Remarques": None,
        "Tweet": tweets,
    })

### CONSTANTS

In [4]:
# (entity, replacement) pairs applied in order to decode HTML entities found
# in tweets.
HTML_CODE = [("&gt;", ">"), ("&lt;", "<"), ("&amp;", "&")]

# Input directory holding the query result CSV files, and output directory
# for the extracts sent to the translators.
QUERY_DATA_PATH = "/home/cash/data/queries"
OUTPUT_TRAD_PATH = "/home/cash/output/traduction"

# Keyword arguments shared by every DataFrame.to_csv call of this notebook.
CSV_KWARGS = dict(sep=";", encoding="UTF-8", index=True)

In [5]:
# Create the output directory (and any missing parents); exist_ok=True makes
# re-running this cell harmless when the directory already exists.
os.makedirs(OUTPUT_TRAD_PATH, exist_ok=True)

### SCRIPT

In [7]:
# For each language: the boolean query string passed to format_df for
# relevance scoring, and the paths of the two CSV files read by the script
# below ("cash" and "cb" -- presumably cash vs. bank-card tweets; confirm).

# English
QUERY_ENGLISH = '(cash OR banknotes OR banknote OR " note " OR " coin " OR " coins " OR money OR currency OR withdrawals OR withdrawal OR ATM OR ATMs OR cash machines OR branch OR counter OR card OR cards OR "contactless" OR " visa " OR mastercard OR "cash back" OR " NFC " OR "Google Pay" OR "ApplePay" OR "Paylib" OR "Lydia" OR "Lyf Pay" OR "Alipay" OR "Samsung Pay" OR "Stocard Pay" OR "mobile payments" OR "cheques") AND (payment OR payments OR " pay " OR " pays " OR settle OR " buy " OR " buys " OR purchases OR purchase OR withdraw OR spend OR expenses OR spending OR expenditure)'
path_cash_en = join(QUERY_DATA_PATH, "query_en_cash.csv")
path_cb_en = join(QUERY_DATA_PATH, "query_en_cb.csv")

# Spanish
# NOTE(review): the first parenthesised group lists the same terms twice
# (efectivo ... " cajas "), and unlike the English/German queries it contains
# no card / mobile-payment terms -- looks like a copy-paste; confirm.
# NOTE(review): the cash file is named "part1"; check whether other parts
# exist and should be read as well.
QUERY_SP = '(efectivo OR metálico OR billetes OR billete OR moneda OR monedas OR dinero OR efectivo OR líquido OR retiradas OR retirada OR cajero OR cajeros OR " cajero automático " OR " caja " OR " cajas " OR efectivo OR metálico OR billetes OR billete OR moneda OR monedas OR dinero OR efectivo OR líquido OR retiradas OR retirada OR cajero OR cajeros OR " cajero automático " OR " caja " OR " cajas ") AND (" pago " OR pagos OR pagar OR abonos OR abono OR abonar OR compra OR compras OR comprar OR retirar OR gastar OR gasto OR gastos)'
path_cash_sp = join(QUERY_DATA_PATH, "query_sp_cash_part1.csv")
path_cb_sp = join(QUERY_DATA_PATH, "query_sp_cb.csv")

# German
QUERY_GE = '(bargeld OR banknoten OR banknote OR münze OR münzen OR kleingeld OR " bar " OR abhebungen OR abhebung OR geldautomat OR geldautomaten OR " GAA " OR schalter OR karte OR karten OR " CB " OR kontaktlos OR visa OR mastercard OR cashback OR " NFC " OR "Google Pay" OR ApplePay OR Paylib OR Lydia OR "Lyf Pay" OR  Alipay OR "Samsung Pay" OR "Stocard Pay" OR "mobiles bezahlen" OR schecks) AND (zahlung OR zahlungen OR zahlen OR bezahlungen OR bezahlung OR bezahlen OR kauf OR käufe OR kaufen OR abheben OR ausgeben OR ausgabe OR ausgaben)'
path_cash_ge = join(QUERY_DATA_PATH, "query_ge_cash.csv")
path_cb_ge = join(QUERY_DATA_PATH, "query_ge_cb.csv")

### English query

In [7]:
# English "cash" tweets: load, clean and score, sample, then export the
# extract for the translators.
df_cash_en = format_df(pd.read_csv(path_cash_en), QUERY_ENGLISH)
v1_en_cash_output_for_trad = generate_output_for_trad(df_cash_en)
v1_en_cash_output_for_trad.to_csv(
    join(OUTPUT_TRAD_PATH, "v1_en_cash_output_for_trad.csv"),
    **CSV_KWARGS,
)

In [8]:
# English "cb" tweets: only the first 10,000 rows of the file are loaded
# before cleaning, scoring, sampling and exporting.
df_cb_en = format_df(pd.read_csv(path_cb_en, nrows=10_000), QUERY_ENGLISH)
v1_en_cb_output_for_trad = generate_output_for_trad(df_cb_en)
v1_en_cb_output_for_trad.to_csv(
    join(OUTPUT_TRAD_PATH, "v1_en_cb_output_for_trad.csv"),
    **CSV_KWARGS,
)

### Spanish query 

In [11]:
# Spanish "cash" tweets: load, clean and score, sample, then export the
# extract for the translators.
df_cash_sp = format_df(pd.read_csv(path_cash_sp), QUERY_SP)
v1_sp_cash_output_for_trad = generate_output_for_trad(df_cash_sp)
v1_sp_cash_output_for_trad.to_csv(
    join(OUTPUT_TRAD_PATH, "v1_sp_cash_output_for_trad.csv"),
    **CSV_KWARGS,
)

In [12]:
# Spanish "cb" tweets: load, clean and score, sample, then export the
# extract for the translators.
df_cb_sp = format_df(pd.read_csv(path_cb_sp), QUERY_SP)
v1_sp_cb_output_for_trad = generate_output_for_trad(df_cb_sp)
v1_sp_cb_output_for_trad.to_csv(
    join(OUTPUT_TRAD_PATH, "v1_sp_cb_output_for_trad.csv"),
    **CSV_KWARGS,
)

### German query

In [8]:
# German "cash" tweets: load, clean and score, sample, then export the
# extract for the translators.
df_cash_ge = format_df(pd.read_csv(path_cash_ge), QUERY_GE)
v1_ge_cash_output_for_trad = generate_output_for_trad(df_cash_ge)
v1_ge_cash_output_for_trad.to_csv(
    join(OUTPUT_TRAD_PATH, "v1_ge_cash_output_for_trad.csv"),
    **CSV_KWARGS,
)

In [10]:
# German "cb" tweets: load, clean and score, sample, then export the
# extract for the translators.
df_cb_ge = format_df(pd.read_csv(path_cb_ge), QUERY_GE)
v1_ge_cb_output_for_trad = generate_output_for_trad(df_cb_ge)
v1_ge_cb_output_for_trad.to_csv(
    join(OUTPUT_TRAD_PATH, "v1_ge_cb_output_for_trad.csv"),
    **CSV_KWARGS,
)