In [9]:
import os
import pandas as pd 

In [10]:
# Collect the input CSV files from the current working directory.
# os.listdir() returns names in arbitrary order, so dropping "the first entry"
# with [1:] does not reliably exclude any particular file. Select by extension
# instead, and leave out "Nancy_clean.csv" (the export this notebook writes
# later) so that a re-run does not read its own output back in as input.
list_files = sorted(
    name
    for name in os.listdir()
    if name.lower().endswith(".csv") and name != "Nancy_clean.csv"
)

In [42]:
# Start from an empty frame; the loading step that follows appends every input file to it.
nancy_df = pd.DataFrame()

In [43]:
# Read every input file first, then concatenate once: calling pd.concat inside
# the loop re-copies all previously loaded rows on each iteration.
frames = [pd.read_csv(data) for data in list_files]
# nancy_df (the frame built so far) stays first so the row order is unchanged;
# each file keeps its own row labels, exactly as with the incremental concat.
nancy_df = pd.concat([nancy_df, *frames])

In [44]:
# Row labels of the combined frame (shown to check how the rows are indexed after loading).
nancy_df.index

Index([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9,
       ...
       41, 42, 43, 44, 45, 46, 47, 48, 49, 50],
      dtype='int64', length=567)

In [45]:
# (rows, columns) of the combined frame.
nancy_df.shape

(567, 20)

In [46]:
# Column names present in the combined frame.
nancy_df.columns

Index(['Pl', 'Numéros réponse', 'Position', 'Réponse (French)',
       'Answer (English)', 'Enquête (French)', 'Inquiry (English)',
       'Localisation', 'Déterminant', 'Contenu', '4e colonne', 'LOC', 'DET',
       'CON', 'QUAL', 'Unnamed: 11', 'Unnamed: 7', 'Unnamed: 8', 'Unnamed: 9',
       'Unnamed: 10'],
      dtype='object')

We can see that Localisation, Déterminant and Contenu each appear twice, under two different column names (`Localisation`/`LOC`, `Déterminant`/`DET`, `Contenu`/`CON`).

In [16]:
# Number of rows that actually carry a value in the "QUAL" column.
qual_filled = nancy_df["QUAL"].notna()
qual_filled.sum()

84

In [17]:
# nancy_df['Réponse (French)']

In [47]:
# Names of the columns that the following step removes from nancy_df.
cols_drop = [
    'Pl',
    'Numéros réponse',
    'Position',
    'Enquête (French)',
    'Inquiry (English)',
    'Localisation',
    '4e colonne',
    'LOC',
    'QUAL',
    'Unnamed: 11',
    'Unnamed: 7',
    'Unnamed: 8',
    'Unnamed: 9',
    'Unnamed: 10',
]

In [48]:
# Remove the columns listed in cols_drop, rebinding nancy_df to the reduced frame.
nancy_df = nancy_df.drop(columns=cols_drop)

In [49]:
# Each label exists under a long and a short column name. For every row, keep
# the long-named value when present and otherwise fall back to the short-named
# one (bfill along the row pulls the fallback into the first position).
for target, fallback in (("Déterminant", "DET"), ("Contenu", "CON")):
    nancy_df[target] = nancy_df[[target, fallback]].bfill(axis=1).iloc[:, 0]

In [50]:
# The short-named duplicates have been merged into their long-named columns; remove them.
nancy_df = nancy_df.drop(columns=["DET", "CON"])

Several responses have a French text but no English translation:

In [26]:
# Show the rows whose French response is present but whose English answer is missing.
missing_english = nancy_df["Answer (English)"].isna()
has_french = nancy_df["Réponse (French)"].notna()
nancy_df[missing_english & has_french]

Unnamed: 0,Réponse (French),Answer (English),Déterminant,Contenu
51,"Argh en plus on peut\n même faire un éléphant,...",,F-,A
52,Ça peut être des chevaux\n aussi,,F-,A
53,"Mais un peu sur la même\n logique, en fait, je...",,F-,(A)
54,Et là dans le sens-là (me\n montre la planche ...,,FC’E+,H / Vêt
55,"Mais en fait, sur le\n principe ça me fait plu...",,kobC’E-,Frag / Art
56,"Et aussi un éléphant, avec\n les yeux, la trom...",,F+,A
57,"Et alors comme ça, ça\n fait vachement penser ...",,kob -,Elem
58,T’as une petite île où t’as\n plein de fumée q...,,kobE-,Pays/ Elem
59,"Et maintenant que j’ai vu\n ça, de l’autre côt...",,kob-,Elem
60,"Ça me fait penser à un\n voilier aussi, mais a...",,FC’-,Obj


In [32]:
# Rows that have a French response but no English answer yet.
needs_translation = nancy_df["Answer (English)"].isna() & nancy_df["Réponse (French)"].notna()
# .copy() gives to_translate its own data, so writing the translated text into
# its "Answer (English)" column afterwards does not raise SettingWithCopyWarning.
to_translate = nancy_df[needs_translation].copy()

In [23]:
# Load the DeepL API key from a text file in the sibling DEEPL_KEY directory.
# os.path.join builds the path with the platform's own separator; the
# hard-coded backslashes only resolve on Windows.
with open(os.path.join("..", "DEEPL_KEY", "DEEPL_API_KEY.txt"), "r") as f:
    # strip() also removes a trailing "\r" or stray spaces, which
    # strip("\n") would leave inside the key.
    API_KEY = f.read().strip()

In [35]:
import deepl

def translate_text(text):
    """Translate ``text`` into American English through the DeepL client.

    A ``deepl.Translator`` is created from the module-level ``API_KEY`` on
    every call, and whatever its ``translate_text`` method returns is passed
    back unchanged.

    Parameters
    ----------
    text
        Text to translate; forwarded as-is to DeepL.

    Returns
    -------
    The value produced by ``deepl.Translator.translate_text`` for
    ``target_lang="EN-US"`` with ``preserve_formatting=True``.
    NOTE(review): presumably a result object rather than a plain ``str`` --
    confirm against the deepl documentation before storing it in a column.
    """
    return deepl.Translator(API_KEY).translate_text(
        text, target_lang="EN-US", preserve_formatting=True
    )

In [36]:
# to_translate["Answer (English)"] = to_translate["Réponse (French)"].apply(translate_text)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  to_translate["Answer (English)"] = to_translate["Réponse (French)"].apply(translate_text)


In [51]:
# Append the rows selected for translation to the main frame.
# ignore_index=True renumbers the combined rows 0..n-1. Resetting each frame
# separately before concatenating left duplicate row labels, because both
# frames then started again at 0.
nancy_df = pd.concat([nancy_df, to_translate], ignore_index=True)

In [109]:
# Keep only the rows that have no missing value in any remaining column.
nancy_df = nancy_df.dropna()

In [110]:
# Save the cleaned data. index=False keeps the row labels out of the file;
# otherwise they are written as an unnamed first column that comes back as a
# spurious "Unnamed: 0" column when the file is read again.
nancy_df.to_csv("Nancy_clean.csv", index=False)

Separating the labels from determinants and contents

In [111]:
import pandas as pd 
import re

# Reload the cleaned data from the CSV file written at the end of the cleaning section.
nancy_df = pd.read_csv("Nancy_clean.csv")

In [112]:
# Two working frames: the English answer paired with its content labels and
# with its determinant labels respectively.
# .copy() makes each one independent of nancy_df, so columns can be reassigned
# afterwards without triggering SettingWithCopyWarning.
nancy_contents = nancy_df[["Answer (English)", "Contenu"]].copy()
nancy_determinants = nancy_df[["Answer (English)", "Déterminant"]].copy()

### Content labels

In [113]:
# Frequency of each raw "Contenu" value, before the label strings are split.
nancy_contents["Contenu"].value_counts()

Contenu
A                   85
Hd                  32
Ad                  29
Obj                 23
Anat                14
                    ..
H/Obj/Vet            1
(Hd)/Symb            1
A/Frag/sg            1
H/Hd                 1
H / Nat\n /Scène     1
Name: count, Length: 109, dtype: int64

In [114]:
# Remove line feeds embedded in the label strings (e.g. "H / Nat\n /Scène").
# regex=False states explicitly that "\n" is a literal character, not a pattern.
labels = nancy_contents["Contenu"].str.replace("\n", "", regex=False)
# Write the cleaned column into an explicit copy: nancy_contents was selected
# out of nancy_df, and assigning into that selection directly raises
# SettingWithCopyWarning.
nancy_contents = nancy_contents.copy()
nancy_contents["Contenu"] = labels
# nancy_contents

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  nancy_contents["Contenu"] = labels


In [121]:
def split_labels(text, aliases=None):
    """Split a "/"-separated label string into a list of normalised labels.

    The input is converted to ``str``, stripped of all whitespace (spaces,
    tabs, carriage returns, line feeds), upper-cased and split on ``"/"``.

    Parameters
    ----------
    text
        Raw cell value, e.g. ``"H / Nat /Scène"``. ``None`` and float NaN are
        treated as "no labels".
    aliases : dict, optional
        Maps a normalised (upper-case, whitespace-free) label to the spelling
        that should be returned instead, so that spelling variants of the same
        label can be merged. Labels without an entry are returned unchanged.

    Returns
    -------
    list of str
        The labels in their original order. Empty segments (from a leading,
        trailing or doubled ``"/"``) are dropped.
    """
    # Missing values would otherwise be stringified into a bogus "NAN"/"NONE" label.
    if text is None or (isinstance(text, float) and text != text):
        return []
    # str.split() without arguments splits on any run of whitespace, so joining
    # the pieces back together removes every whitespace character.
    compact = "".join(str(text).split()).upper()
    labels = [part for part in compact.split("/") if part]
    if aliases:
        labels = [aliases.get(label, label) for label in labels]
    return labels

In [122]:
# Turn every raw "Contenu" string into its list of normalised labels.
contenu_column = nancy_contents["Contenu"]
labels_contents = contenu_column.apply(split_labels)

In [123]:
# Preview the per-row label lists.
labels_contents

0                         [(A)]
1                           [H]
2                         [OBJ]
3                           [A]
4                          [HD]
                 ...           
375                      [ALIM]
376                   [A, ALIM]
377    [PAYS, ARCH, ELEM, FRAG]
378                         [A]
379             [H, NAT, SCÈNE]
Name: Contenu, Length: 380, dtype: object

In [124]:
# Build one 0/1 indicator column per distinct content label.
# The label lists are rebuilt from nancy_contents rather than read from
# labels_contents: this cell rebinds labels_contents to the indicator frame,
# so using it as the input would make the cell fail when it is run twice.
labels_contents = (
    nancy_contents["Contenu"]
    .apply(split_labels)
    .str.join("|")
    .str.get_dummies(sep="|")
)

In [126]:
# (rows, indicator columns) of the label frame.
labels_contents.shape

(380, 35)

In [125]:
# The distinct content labels found, one column each.
labels_contents.columns

Index(['(A)', '(AD)', '(H)', '(HD)', 'A', 'ABS', 'AD', 'AENF', 'ALIM', 'ANAT',
       'ARCH', 'ART', 'BOT', 'ELEM', 'EXPLOSION', 'FRAG', 'GEO', 'GÉO', 'H',
       'HBARRÉ', 'HD', 'HENF', 'MQ', 'NAT', 'OBJ', 'PAYS', 'RADIO', 'SCEN',
       'SCÈNE', 'SEX', 'SG', 'SQUEL', 'SYMB', 'VET', 'VÊT'],
      dtype='object')

Some labels are spelled in more than one way (e.g. `GEO`/`GÉO`, `SCEN`/`SCÈNE`, `VET`/`VÊT`), so we will need a translation dictionary of some sort, applied together with the splitting function, to merge these variants.

### Determinant labels

In [107]:
# Frequency of each raw "Déterminant" value.
nancy_determinants["Déterminant"].value_counts()

Déterminant
F-                       87
F+                       57
FC’-                     15
FC-                      14
F+/-                      9
                         ..
EKob\r\nclob\r\nC’-\r     1
Kob-                      1
KobE C'-                  1
E KanC-                   1
kob +- C                  1
Name: count, Length: 114, dtype: int64