# Installation

In [1]:
%pip install spacy simplemma  datasets zeugma

Collecting simplemma
  Downloading simplemma-0.9.1-py3-none-any.whl (75.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.5/75.5 MB[0m [31m11.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-2.15.0-py3-none-any.whl (521 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m521.2/521.2 kB[0m [31m54.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting zeugma
  Downloading zeugma-0.49.tar.gz (9.9 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pyarrow-hotfix (from datasets)
  Downloading pyarrow_hotfix-0.6-py3-none-any.whl (7.9 kB)
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m11.4 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

 # Preprocessing

##                     import packages

In [2]:
from datasets import load_dataset
import nltk
import pickle
from nltk.stem.snowball import FrenchStemmer
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer

In [3]:
translation_dataset = load_dataset('Nicolas-BZRD/Parallel_Global_Voices_English_French',
split='train').to_pandas()

translation_dataset.head()
# do padding

Downloading readme:   0%|          | 0.00/1.72k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/57.7M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/342060 [00:00<?, ? examples/s]

Unnamed: 0,en,fr
0,Jamaica: “I am HIV”,Jamaïque : J’ai le VIH
1,"It's widely acknowledged, in the Caribbean and...","Il est largement reconnu, dans les Caraïbes et..."
2,"For this woman, however, photographed in the s...","Pour cette femme, cependant, photographiée dan..."
3,As Bacon writes on her blog:,Comme Bacon écrit sur son blog:
4,"“When I asked to take her picture, I suggested...",“Quand je lui ai demandé de la prendre en phot...


In [4]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

## Punctuation Removal

In [5]:
# PUNCTUATION REMOVAL
import string
string.punctuation

#defining the function to remove punctuation
def remove_punctuation(text):
    punctuationfree="".join([i for i in text if i not in string.punctuation])
    return punctuationfree

#storing the puntuation free text
translation_dataset['en'] = translation_dataset['en'].apply(lambda x:remove_punctuation(x))
translation_dataset['fr'] = translation_dataset['fr'].apply(lambda x:remove_punctuation(x))
translation_dataset.head()


Unnamed: 0,en,fr
0,Jamaica “I am HIV”,Jamaïque J’ai le VIH
1,Its widely acknowledged in the Caribbean and e...,Il est largement reconnu dans les Caraïbes et ...
2,For this woman however photographed in the str...,Pour cette femme cependant photographiée dans ...
3,As Bacon writes on her blog,Comme Bacon écrit sur son blog
4,“When I asked to take her picture I suggested ...,“Quand je lui ai demandé de la prendre en phot...


## Lower Case / Tokenization

In [6]:
# change text to lower case
translation_dataset['en'] = translation_dataset['en'].apply(lambda x: x.lower())
translation_dataset['fr'] = translation_dataset['fr'].apply(lambda x: x.lower())

# tokenize text
translation_dataset['tokenized_en'] = translation_dataset.apply(lambda row: nltk.word_tokenize(row['en']), axis=1)
translation_dataset['tokenized_fr'] = translation_dataset.apply(lambda row: nltk.word_tokenize(row['fr']), axis=1)


In [7]:
translation_dataset.head()


Unnamed: 0,en,fr,tokenized_en,tokenized_fr
0,jamaica “i am hiv”,jamaïque j’ai le vih,"[jamaica, “, i, am, hiv, ”]","[jamaïque, j, ’, ai, le, vih]"
1,its widely acknowledged in the caribbean and e...,il est largement reconnu dans les caraïbes et ...,"[its, widely, acknowledged, in, the, caribbean...","[il, est, largement, reconnu, dans, les, caraï..."
2,for this woman however photographed in the str...,pour cette femme cependant photographiée dans ...,"[for, this, woman, however, photographed, in, ...","[pour, cette, femme, cependant, photographiée,..."
3,as bacon writes on her blog,comme bacon écrit sur son blog,"[as, bacon, writes, on, her, blog]","[comme, bacon, écrit, sur, son, blog]"
4,“when i asked to take her picture i suggested ...,“quand je lui ai demandé de la prendre en phot...,"[“, when, i, asked, to, take, her, picture, i,...","[“, quand, je, lui, ai, demandé, de, la, prend..."


## Sequences

In [8]:
from tensorflow.keras.preprocessing.text import Tokenizer


def get_word_index(column):
  """
  Args: colun - column to get word index on
  """
  tokenizer = Tokenizer(num_words=100, oov_token='<OOV>')
  tokenizer.fit_on_texts(column.tolist())
  word_index = tokenizer.word_index
  sequences = tokenizer.texts_to_sequences(column.tolist())
  print(f'Word index: {word_index}')
  print(f'\nSequences: {sequences}')
  return word_index, sequences


# english
print(f'ENGLISH')
en_word_idx, en_sequences = get_word_index(translation_dataset['en'])


# french
print(f'FRENCH')
fr_word_idx, fr_sequences = get_word_index(translation_dataset['fr'])


ENGLISH


IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



## Padding

In [13]:
from tensorflow.keras.preprocessing.sequence import pad_sequences


def add_padding(column, sequences):
  '''
  Args: column - column to be padded
  '''
  lengths_of_rows = [len(row) for row in column.tolist()]
  max_len = max(lengths_of_rows)
  padded_seqs = pad_sequences(sequences, maxlen=max_len, padding='post',)
  print(f'Padded sequences: {padded_seqs}')
  return padded_seqs


# english
print(f'ENGLISH')
en_padded_seqs = add_padding(column=translation_dataset['tokenized_en'], sequences=en_sequences)

# french
print(f'FRENCH')
fr_padded_seqs = add_padding(column=translation_dataset['tokenized_fr'], sequences=fr_sequences)


ENGLISH
Padded sequences: [[ 1  1  1 ...  0  0  0]
 [40  1  1 ...  0  0  0]
 [11 17  1 ...  0  0  0]
 ...
 [58 21  1 ...  0  0  0]
 [ 1  1 11 ...  0  0  0]
 [32 80  1 ...  0  0  0]]
FRENCH
Padded sequences: [[ 1  1  4 ...  0  0  0]
 [25 19  1 ...  0  0  0]
 [14 38  1 ...  0  0  0]
 ...
 [ 1 76  1 ...  0  0  0]
 [65 41  7 ...  0  0  0]
 [ 1  1 65 ...  0  0  0]]


In [10]:

# sequences=tokenizer.texts_to_sequences(sentences)
# padded=pad_sequences(sequences,padding="post",truncating=”post”,maxlen=8)

## Vectorization

### Count Vectorization

### N-grams




### TF-IDF

### Word2vec

### Bag of words

# References



*   https://medium.com/@aiandbibliophile/machine-translation-english-to-french-translation-using-recurrent-neural-networks-90f8b9e1635e
*  NLP with tensorflow: https://www.youtube.com/watch?v=qw7rkwsk0oc
* https://www.kaggle.com/code/kelde9/tutorial-preprocessing-nlp-english-french-text



In [11]:
# # ngrams
# from nltk.util import ngrams
# import collections

# trigrams = ngrams(word_tokenize(df['feedback_clean2'].sum()), 3)
# trigrams_freq = collections.Counter(trigrams)
# trigrams_freq.most_common(10)

In [12]:
def clean_title(text):
   text = "".join([word.lower() for word in text if word not in            string.punctuation])
   title = re.split('\W+', text)
   text = [ps.stem(word) for word in title if word not in nltk.corpus.stopwords.words('english')]
   return text
count_vectorize = CountVectorizer(analyzer=clean_title)
vectorized = count_vectorize.fit_transform(news['title'])

NameError: ignored

In [None]:
%pip install zeugma

In [None]:
# word_embedding
from zeugma import TextsToSequences

sequencer = TextsToSequences()
embedded_sequ = sequencer.fit_transform(finished_translation_df)
embedded_sequ[0]

In [None]:
embedded_sequ

In [None]:
# normalize data
from keras.preprocessing import sequence
max_len = 40

pad_sequ = sequence.pad_sequences(embedded_sequ, maxlen=max_len)

In [None]:
# couche embeding
longueur_dict = max(list(map(lambda x: max(x), pad_sequ)))+1
from keras.models import Sequential
from keras.layers import Embedding, Flatten, Dense

model = Sequential()
model.add(Embedding(longueur_dict, 8, input_length = max_len))
model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))

In [None]:
# word2 vec embedding

tokenize_sentences = []

for i in range(len(translation_dataset)):
    tokenize_sentences.append(nltk.tokenize.word_tokenize(translation_dataset[i]))

from gensim.test.utils import common_texts
from gensim.models import Word2Vec

def word2vecer(text):
    model_W2V = Word2Vec(sentences=text, size=100, window=5, min_count=1, workers=4)
    model_W2V.train(tokenize_sentences, total_examples=len(tokenize_sentences), epochs=50)
    return

model_W2V.similar_by_word(tokenize_sentences[0][1])[:5]

In [None]:
# visualization
from sklearn.manifold import TSNE
from matplotlib import pyplot as plt

def display_closestwords_tsnescatterplot_perso(model, word):
    arr = np.empty((0,100), dtype='f')
    word_labels = [word]

    numb_sim_words = 5

    # get close words
    close_words = model.similar_by_word(word)[:numb_sim_words]

    # add the vector for each of the closest words to the array
    arr = np.append(arr, np.array([model[word]]), axis=0)
    for wrd_score in close_words:
        wrd_vector = model[wrd_score[0]]
        word_labels.append(wrd_score[0])
        arr = np.append(arr, np.array([wrd_vector]), axis=0)

    # find tsne coords for 2 dimensions
    tsne = TSNE(n_components=2, random_state=0)
    np.set_printoptions(suppress=True)
    Y = tsne.fit_transform(arr)

    x_coords = Y[:, 0]
    y_coords = Y[:, 1]

    # color for words
    color = ['red']
    for i in range(numb_sim_words):
        color.append('blue')

    # display scatter plot
    plt.scatter(x_coords, y_coords, c = color)

    for label, x, y in zip(word_labels, x_coords, y_coords):
        plt.annotate(label, xy=(x, y), xytext=(1, 5), textcoords='offset points')
    plt.xlim(min(x_coords)-100, max(x_coords)+100)
    plt.ylim(min(y_coords)-100, max(y_coords)+100)
    plt.show()

    print("Word most similar to : "+word)
    print([sim_word[0] for sim_word in close_words])