<a href="https://colab.research.google.com/github/YvYh/FluxWeb/blob/main/Model_Controverses2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **BQ**: Get/Load Data

In [1]:
from google.cloud import bigquery
json_path = 'poc-bigdata.json'
bigquery_client = bigquery.Client.from_service_account_json(json_path)

In [2]:
def get_bq_data(query):
  query_job = bigquery_client.query(query)
  rows = query_job.result()
  data = rows.to_dataframe()
  return data

In [248]:
def bq_load_df(name, df):
    dataset_ref = bigquery_client.dataset('FluxWeb_Prediction')
    table_ref = dataset_ref.table(name)
    
    
    job_config = bigquery.LoadJobConfig()
    job_config.write_disposition = bigquery.WriteDisposition.WRITE_APPEND
    if name == 'Controverses_bert':
      job_config.schema = get_bigquery_schema()
    else:
      job_config.autodetect=True
 
    load_job = bigquery_client.load_table_from_dataframe(
        df,
        table_ref,
        job_config=job_config)
 
    assert load_job.job_type == 'load'
 
    load_job.result()  # Waits for table load to complete.
 
    assert load_job.state == 'DONE'
    print('table {} load {} data.'.format(name, len(df)))

# **TensorFlow**:  TextVectorization

### Get Input Data

In [3]:
import numpy as np
import pandas as pd

In [4]:
 """SELECT DISTINCT NumControverse as label, Titre as Text
FROM `poc-bigdata.FluxWeb_Prediction.Controverses_prd`
UNION ALL 
SELECT DISTINCT NumControverse as label, Commentaire as Text
FROM `poc-bigdata.FluxWeb_Prediction.Controverses_prd`
WHERE Commentaire is not null and length(Commentaire)>5
UNION ALL 
SELECT DISTINCT NumControverse as label, Informations as Text
FROM `poc-bigdata.FluxWeb_Prediction.Controverses_prd`
WHERE Informations is not null
"""
q="SELECT label, Text FROM `poc-bigdata.FluxWeb_Prediction.Controverse_dataset`"
controverses = get_bq_data(q)

In [None]:
#controverses['Text_clean']=controverses.Text.apply(text_preprossing)
controverses = controverses[['label','Text']]

In [5]:
print(len(controverses))
print(len(controverses.label.unique()))
controverses.head()

11537
1408


Unnamed: 0,label,Text
0,2866,Révélation d'un scandale comptable survenu en ...
1,389,EPR de Flamanville: malfaçons
2,2543,Condamnation suite à des déversements dans des...
3,3506,Poursuites aux Etats-Unis en lien avec les émi...
4,3016,Incendie dans une usine de Lubrizol à Rouen


In [196]:
controverses.label.sort_values().unique()

array([  23,   57,   64, ..., 3613, 3614, 3615])

In [200]:
label_index = {k: v for v, k in enumerate(controverses.label.sort_values().unique())}
controverses.label = controverses.label.map(lambda x: label_index.get(x))

In [201]:
from sklearn.model_selection import train_test_split
train, test = train_test_split(controverses, test_size=0.04)
train, val = train_test_split(train, test_size=0.1)
print(len(train), 'train examples')
print(len(val), 'validation examples')
print(len(test), 'test examples')

9967 train examples
1108 validation examples
462 test examples


In [9]:
train.head()

Unnamed: 0,label,Text
7622,3571,Tennet a répondu qu'il est impossible de fixer...
2581,2777,La Cour Suprême en Ukraine a rejeté l'appel de...
3469,2508,Ce n'est pas seulement Amazon qui pose problèm...
9925,3049,Les cinq principaux prêteurs identifiés dans l...
3366,2545,Ils ont indiqué qu'ils aimeraient résoudre ce ...


In [10]:
def df_to_dataset(dataframe, shuffle=True, batch_size=32):
  #dataframe = dataframe.copy()
  labels = dataframe.pop('label')
  ds = tf.data.Dataset.from_tensor_slices((dataframe.values, labels.values))
  #ds = tf.data.Dataset.from_tensor_slices((dict(dataframe), labels))
  if shuffle:
    ds = ds.shuffle(buffer_size=len(dataframe))
  ds = ds.batch(batch_size)
  return ds

In [202]:
BUFFER_SIZE = 10000
BATCH_SIZE = 32
train_ds = df_to_dataset(train)
test_ds = df_to_dataset(test)
val_ds= df_to_dataset(val)
train_ds = train_ds.prefetch(tf.data.AUTOTUNE)
test_ds = test_ds.prefetch(tf.data.AUTOTUNE)
val_ds = val_ds.prefetch(tf.data.AUTOTUNE)

In [203]:
for example, label in train_ds.take(1):
  print('texts: ', example.numpy()[:3])
  print()
  print('labels: ', label.numpy()[:3])

texts:  [[b"Un porte-parole d'Hindustan Unilever a admis avoir achet\xc3\xa9 du th\xc3\xa9 en 2018 dans deux plantations non accr\xc3\xa9dit\xc3\xa9es par la Rainforest Alliance dans l'\xc3\x89tat indien d'Assam"]
 [b"De plus en plus de donn\xc3\xa9es (26 421 plaintes de patients) indiquent que les pompes \xc3\xa0 insuline Medtronic MiniMed 600 Series auraient eu des dysfonctionnements techniques majeurs, blessant plus de 2000 personnes et causant la mort d'un patient"]
 [b"N\xc3\xa9anmoins pour l'instant, le groupe a r\xc3\xa9agi et ces probl\xc3\xa8mes semblent limit\xc3\xa9s."]]

labels:  [ 624 1082  392]


### Spacy

In [None]:
!pip install -U spacy
!python -m spacy download fr_core_news_lg

In [25]:
import spacy
nlp = spacy.load("fr_core_news_lg")

In [None]:
def text_preprossing(text):
    tokens = nlp(text)
    clean = []
    for token in tokens:
        if not any([token.is_digit, token.is_punct, token.is_currency,
               token.like_url, token.like_num, token.like_email,
               token.is_stop]):
            clean += [token.lemma_]
    return tf.strings.join(clean, separator=' ')
    #return ' '.join(clean)

-------  
### Text preprocessing

In [None]:
import tensorflow as tf
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

In [None]:
text_list = train.Text.values.tolist()

In [None]:
vectorize_layer.adapt(text_list)

In [None]:
vectorize_layer.get_vocabulary()

In [None]:
from tensorflow.keras.layers import Dense, Embedding, GlobalAveragePooling1D
embedding_layer = tf.keras.layers.Embedding(1000, 5)

In [None]:

embedding_dim=1000

model = Sequential([
  vectorize_layer,
  Embedding(vocab_size, embedding_dim, name="embedding"),
  GlobalAveragePooling1D(),
  Dense(16, activation='relu'),
  Dense(1)
])

In [None]:
model.compile(optimizer='adam',
              loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              metrics=['accuracy'])

In [None]:
model.fit(train_ds, 
          epochs=40, 
          validation_data=val_ds)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f5a0eef3910>

In [None]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
text_vectorization_5 (TextVe (None, 10000)             0         
_________________________________________________________________
embedding (Embedding)        (None, 10000, 16)         160000    
_________________________________________________________________
global_average_pooling1d (Gl (None, 16)                0         
_________________________________________________________________
dense (Dense)                (None, 16)                272       
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 17        
Total params: 160,289
Trainable params: 160,289
Non-trainable params: 0
_________________________________________________________________


In [None]:
loss, accuracy = model.evaluate(test_ds)
print("Accuracy", accuracy)

Accuracy 0.0


#### tfidf model

In [None]:
def text_preprossing2(text):
    tokens = nlp(text)
    clean = []
    for token in tokens:
        if not any([token.is_digit, token.is_punct, token.is_currency,
               token.like_url, token.like_num, token.like_email,
               token.is_stop]):
            clean += [token.lemma_]
    return tf.strings.join(clean, separator=' ')
    #return ' '.join(clean)

In [None]:
vectorize_layer2 = TextVectorization(
    #standardize= text_preprossing,
    standardize='lower_and_strip_punctuation',
    max_tokens=vocab_size,
    output_mode='tf-idf',
    #output_sequence_length=sequence_length
    )

In [None]:
vectorize_layer2.adapt(data.Text.values.tolist())

In [None]:
tfidf = tf.keras.models.Sequential()
tfidf.add(tf.keras.Input(shape=(1,),dtype=tf.string))
tfidf.add(vectorize_layer2)



In [None]:
tfidf.compile(optimizer='adam',
              loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              metrics=['accuracy'])
tfidf.fit(train.Text.values.tolist(), 
          epochs=40, 
          validation_data=val.Text.values.tolist())

In [None]:
print(test.Text.values[0])
tfidf.predict([test.Text.values[0]])

US Equal Employment Opportunity commission EEOC intenter action justice tribunal fédéral NY contre Walmart enfreindre loi fédéral laisser employé masculin harceler sexuellement collègue travail prendre mesure cesse lieu mettre fin harcèlement Walmart demander employé harceler défendre contraindre salarié démissionner agence fédéral affirme walmart recevoir plainte sujet comportement harcelant homme partir entreprise prendre mesure efficace faire cesser harcèlement eeoc réclame arriéré salaire dommage intérêt compensatoire dommage intérêt punitif employé concerner mesure redressement prévenir potentiel cas harcèlement sexuel


array([[6.586464, 0.      , 0.      , ..., 0.      , 0.      , 0.      ]],
      dtype=float32)

In [None]:
tfidf.save('tf_idf_model')
#reloaded_model = tf.keras.models.load_model('my_pet_classifier')

INFO:tensorflow:Assets written to: tf_idf_model/assets


In [None]:
news =''

input_dict = {'Text': tf.convert_to_tensor([news])}
predictions = reloaded_model.predict(input_dict)
prob = tf.nn.sigmoid(predictions[0])

print(
    "This particular pet had a %.1f percent probability "
    "of getting adopted." % (100 * prob)
)

### Text Classification

In [145]:
import tensorflow as tf

from tensorflow.keras import layers
from tensorflow.keras import losses
from tensorflow.keras import preprocessing
from tensorflow.keras import utils
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

In [156]:
train_text = train_ds.map(lambda text,label: text)
train_text

<MapDataset shapes: (None, 1), types: tf.string>

In [None]:
for text_batch, label_batch in train_ds.take(1):
  for i in range(10):
    print("Text: ", text_batch.numpy()[i])
    print("Label:", label_batch.numpy()[i])

In [None]:
!pip install -U spacy
!python -m spacy download fr_core_news_lg

In [151]:
import spacy
nlp = spacy.load("fr_core_news_lg")

In [178]:
def get_text(t: tf.Tensor):
  return t.numpy().decode('utf-8')
def text_preprossing(text: np.ndarray):
  result = []
  for t in text:
    tokens = nlp(t.decode('utf-8'))
    clean = []
    for token in tokens:
        if not any([token.is_digit, token.is_punct, token.is_currency,
              token.like_url, token.like_num, token.like_email,
              token.is_stop]):
            clean += [token.lemma_]
    #result = result+ [[tf.strings.join(clean, separator=' ')]]
    result = result+ [[' '.join(clean)]]
    #return ' '.join(clean)
    print(result)
  return tf.constant(result)
def clean_text(t: tf.Tensor):
  return tf.py_function(func=text_preprossing, inp=[t], Tout=tf.string)

In [129]:
tf.config.run_functions_eagerly(False)

In [181]:
test_text = test_ds.map(lambda text,label: text)
test = next(iter(test_text))
print(test)
clean_text(test)

tf.Tensor(
[[b"Ces enqu\xc3\xaates portaient sur l'embauche d'enfants de personnalit\xc3\xa9s chinoises et visaient plusieurs banques, soup\xc3\xa7onn\xc3\xa9es de les avoir recrut\xc3\xa9s, dans l'espoir d'obtenir des contrats"]
 [b"Apr\xc3\xa8s des \xc3\xa9meutes en 2016, d'autres violations ont eu lieu dans ce centre mais dans d'autres \xc3\xa9galement"]
 [b'Quatre groupes indig\xc3\xa8nes ont demand\xc3\xa9 \xc3\xa0 Frontera de nettoyer les dommages environnementaux li\xc3\xa9s aux d\xc3\xa9versements de p\xc3\xa9trole du Lot 192, le plus grand puits de p\xc3\xa9trole du P\xc3\xa9rou']
 [b"Edison a vers\xc3\xa9 500 $ par habitant et une enqu\xc3\xaate est en cours pour d\xc3\xa9terminer les causes de l'accident"]
 [b"La CFTC (US Commodity Futures Trading Commission) avait d\xc3\xa9pos\xc3\xa9 plainte contre Kraft Foods et Mondelez en Illinois en avril 2015, all\xc3\xa9guant que les soci\xc3\xa9t\xc3\xa9s avaient manipul\xc3\xa9 les prix du bl\xc3\xa9 (options) et du bl\xc3\xa9 (fut

UnknownError: ignored

In [163]:
vocab_size = 10000
sequence_length = 600

# Use the text vectorization layer to normalize, split, and map strings to
# integers. Set output_sequence_length length to pad all samples to same length.
vectorize_layer = TextVectorization(
    standardize= clean_text,
    #standardize='lower_and_strip_punctuation',
    max_tokens=vocab_size,
    output_mode='tf-idf',
    #output_sequence_length=sequence_length
    )

In [164]:
vectorize_layer.adapt(train_text)

TypeError: ignored

### Binary classification

In [182]:
VOCAB_SIZE = 10000

binary_vectorize_layer = TextVectorization(
    max_tokens=VOCAB_SIZE,
    output_mode='binary')
train_text = train_ds.map(lambda text, labels: text)
binary_vectorize_layer.adapt(train_text)

In [188]:
def binary_vectorize_text(text, label):
  #text = tf.expand_dims(text, -1)
  return binary_vectorize_layer(text), label

In [184]:
# Retrieve a batch (of 32 reviews and labels) from the dataset
text_batch, label_batch = next(iter(train_ds))
first_text, first_label = text_batch[0], label_batch[0]
print("Text", first_text)
print("Label", first_label)

Text tf.Tensor([b'D\xc3\xa8s lors le groupe devra payer 50000? par emploi non cr\xc3\xa9\xc3\xa9.'], shape=(1,), dtype=string)
Label tf.Tensor(1623, shape=(), dtype=int64)


In [185]:
print("'binary' vectorized question:", 
      binary_vectorize_text(first_text, first_label)[0])

'binary' vectorized question: tf.Tensor([[0. 0. 0. ... 0. 0. 0.]], shape=(1, 10000), dtype=float32)


In [204]:
binary_train_ds = train_ds.map(binary_vectorize_text)
binary_val_ds = val_ds.map(binary_vectorize_text)
binary_test_ds = test_ds.map(binary_vectorize_text)
AUTOTUNE = tf.data.AUTOTUNE

In [205]:
def configure_dataset(dataset):
  return dataset.cache().prefetch(buffer_size=AUTOTUNE)
binary_train_ds = configure_dataset(binary_train_ds)
binary_val_ds = configure_dataset(binary_val_ds)
binary_test_ds = configure_dataset(binary_test_ds)
binary_model = tf.keras.Sequential([layers.Dense(1408)])
binary_model.compile(
    loss=losses.SparseCategoricalCrossentropy(from_logits=True),
    optimizer='adam',
    metrics=['accuracy'])
history = binary_model.fit(
    binary_train_ds, validation_data=binary_val_ds, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [206]:
print("Linear model on binary vectorized data:")
print(binary_model.summary())

Linear model on binary vectorized data:
Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_5 (Dense)              (None, 1408)              14081408  
Total params: 14,081,408
Trainable params: 14,081,408
Non-trainable params: 0
_________________________________________________________________
None


In [207]:
binary_loss, binary_accuracy = binary_model.evaluate(binary_test_ds)
print("Binary model accuracy: {:2.2%}".format(binary_accuracy))

Binary model accuracy: 15.37%


In [208]:
export_model = tf.keras.Sequential(
    [binary_vectorize_layer, binary_model,
     layers.Activation('sigmoid')])

export_model.compile(
    loss=losses.SparseCategoricalCrossentropy(from_logits=False),
    optimizer='adam',
    metrics=['accuracy'])

# Test it with `raw_test_ds`, which yields raw strings
loss, accuracy = export_model.evaluate(test_ds)
print("Accuracy: {:2.2%}".format(binary_accuracy))

Accuracy: 15.37%


In [229]:
def get_string_labels(predicted_scores_batch):
  predicted_ints = tf.argmax(predicted_scores_batch, axis=1).numpy()
  return [[k for k,v in label_index.items() if v == i][0] for i in predicted_ints]

In [228]:
predicted_int_labels.numpy()

array([16])

In [227]:
[k for k,v in label_index.items() if v == 16][0]

389

In [232]:
inputs=["EPR de Flamanville: malfaçons",
        "Redressement fiscal des GAFAM en France"]
predicted_scores = export_model.predict(inputs)
predicted_labels = get_string_labels(predicted_scores)
for input, label in zip(inputs, predicted_labels):
  print("Text: ", input)
  print("Predicted label: ", label)

Text:  EPR de Flamanville: malfaçons
Predicted label:  389
Text:  Redressement fiscal des GAFAM en France
Predicted label:  2019


In [233]:
export_model.summary()

Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
text_vectorization_16 (TextV (None, 10000)             0         
_________________________________________________________________
sequential_4 (Sequential)    (None, 1408)              14081408  
_________________________________________________________________
activation (Activation)      (None, 1408)              0         
Total params: 14,081,408
Trainable params: 14,081,408
Non-trainable params: 0
_________________________________________________________________


In [234]:
export_model.save('binaryClassif')

INFO:tensorflow:Assets written to: binaryClassif/assets


In [235]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [246]:
pd.DataFrame({'label':label_index.values(), 'numControverse':label_index.keys()})

Unnamed: 0,label,numControverse
0,0,23
1,1,57
2,2,64
3,3,69
4,4,77
...,...,...
1403,1403,3611
1404,1404,3612
1405,1405,3613
1406,1406,3614


In [249]:
bq_load_df("LabelControv", pd.DataFrame({'label':label_index.values(), 'numControverse':label_index.keys()}))

table LabelControv load 1408 data.
