<a href="https://colab.research.google.com/github/YvYh/FluxWeb/blob/main/Model_Controverses.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **BQ**: Get/Load Data

In [1]:
from google.cloud import bigquery
json_path = 'poc-bigdata.json'
bigquery_client = bigquery.Client.from_service_account_json(json_path)

In [2]:
def get_bq_data(query):
  query_job = bigquery_client.query(query)
  rows = query_job.result()
  data = rows.to_dataframe()
  return data

In [3]:
def bq_load_df(name, df):
    dataset_ref = bigquery_client.dataset('FluxWeb_Prediction')
    table_ref = dataset_ref.table(name)
    
    
    job_config = bigquery.LoadJobConfig()
    job_config.write_disposition = bigquery.WriteDisposition.WRITE_APPEND
    if name == 'Controverses_bert':
      job_config.schema = get_bigquery_schema()
    else:
      job_config.autodetect=True
 
    load_job = bigquery_client.load_table_from_dataframe(
        df,
        table_ref,
        job_config=job_config)
 
    assert load_job.job_type == 'load'
 
    load_job.result()  # Waits for table load to complete.
 
    assert load_job.state == 'DONE'
    print('table {} load {} data.'.format(name, len(df)))

# **TensorFlow**:  TextVectorization

### Get Input Data

In [4]:
import numpy as np
import pandas as pd

In [5]:
q = """SELECT DISTINCT NumControverse as label, Titre as Text
FROM `poc-bigdata.FluxWeb_Prediction.Controverses_prd`
UNION ALL 
SELECT DISTINCT NumControverse as label, Commentaire as Text
FROM `poc-bigdata.FluxWeb_Prediction.Controverses_prd`
WHERE Commentaire is not null and length(Commentaire)>5
UNION ALL 
SELECT DISTINCT NumControverse as label, Informations as Text
FROM `poc-bigdata.FluxWeb_Prediction.Controverses_prd`
WHERE Informations is not null
"""
controverses = get_bq_data(q)

In [6]:
#controverses['Text_clean']=controverses.Text.apply(text_preprossing)
controverses = controverses[['label','Text']]

In [7]:
print(len(controverses))
print(len(controverses.label.unique()))
controverses.head()

3751
1408


Unnamed: 0,label,Text
0,389,Le directeur du chantier de Flamanville pour E...
1,1423,Après les deux premières plaintes déposées à L...
2,1695,"Le fisc français réclamerait 1,6 Md? d'arriéré..."
3,3561,Un ancien directeur des comptes nationaux de P...
4,2912,Suite article du Wall Street Journal du 16/05/...


### Text classification with an RNN

In [29]:
import numpy as np

import tensorflow_datasets as tfds
from tensorflow.keras import Sequential
import tensorflow as tf
tfds.disable_progress_bar()

In [10]:
import matplotlib.pyplot as plt


def plot_graphs(history, metric):
  plt.plot(history.history[metric])
  plt.plot(history.history['val_'+metric], '')
  plt.xlabel("Epochs")
  plt.ylabel(metric)
  plt.legend([metric, 'val_'+metric])

In [22]:
from sklearn.model_selection import train_test_split
train, test = train_test_split(controverses, test_size=0.2)
train, val = train_test_split(train, test_size=0.2)
print(len(train), 'train examples')
print(len(val), 'validation examples')
print(len(test), 'test examples')

2400 train examples
600 validation examples
751 test examples


In [23]:
train.head()

Unnamed: 0,label,Text
1934,2445,L'UFC-Que Choisir vient d'annoncer le dépôt de...
3372,2843,L'autorité suédoise de surveillance financière...
166,2673,Détournements de fonds de plusieurs filiales s...
85,3250,"Un tribunal de la province de Hebei, dans le n..."
3016,2836,Les enquêteurs ont remis au bureau du procureu...


In [24]:
def df_to_dataset(dataframe, shuffle=True, batch_size=32):
  #dataframe = dataframe.copy()
  labels = dataframe.pop('label')
  ds = tf.data.Dataset.from_tensor_slices((dataframe.values, labels.values))
  #ds = tf.data.Dataset.from_tensor_slices((dict(dataframe), labels))
  if shuffle:
    ds = ds.shuffle(buffer_size=len(dataframe))
  ds = ds.batch(batch_size)
  return ds

In [25]:
BUFFER_SIZE = 10000
BATCH_SIZE = 64
train_dataset = df_to_dataset(train)
test_dataset = df_to_dataset(test)
train_dataset = train_dataset.prefetch(tf.data.AUTOTUNE)
test_dataset = test_dataset.prefetch(tf.data.AUTOTUNE)

In [26]:
for example, label in train_dataset.take(1):
  print('texts: ', example.numpy()[:3])
  print()
  print('labels: ', label.numpy()[:3])

texts:  [[b"Les Amis de la Terre d\xc3\xa9noncent la Soci\xc3\xa9t\xc3\xa9 G\xc3\xa9n\xc3\xa9rale pour son soutien aux projets d'exportation de gaz de schiste liqu\xc3\xa9fi\xc3\xa9 Rio Grande LNG et de double gazoduc Rio Bravo Pipeline au Texas. La banque servirait en effet de conseiller financier \xc3\xa0 NextDecade, soci\xc3\xa9t\xc3\xa9 portant le projet. L'ONG d\xc3\xa9nonce ainsi les impacts n\xc3\xa9fastes de ces projets sur l'environnement et les communaut\xc3\xa9s locales, qui iraient l'encontre des Principes Equateur dont la banque est signataire. Le projet Rio Grande serait notamment localis\xc3\xa9 au coeur d'une r\xc3\xa9serve naturelle, et les processus de consultation des communaut\xc3\xa9s locales seraient d\xc3\xa9ficients. Rappelons que les engagements pris par la banque sur la question des \xc3\xa9nergies non conventionnelles avaient \xc3\xa9t\xc3\xa9 jug\xc3\xa9s l\xc3\xa9gers par l'ONG, en comparaison notamment \xc3\xa0 son homologue BNP Paribas."]
 [b'BMW a d\xc3\

In [27]:
VOCAB_SIZE = 5000
encoder = tf.keras.layers.experimental.preprocessing.TextVectorization(
    max_tokens=VOCAB_SIZE)
encoder.adapt(train_dataset.map(lambda text, label: text))

In [28]:
vocab = np.array(encoder.get_vocabulary())
vocab[:20]

array(['', '[UNK]', 'de', 'la', 'des', 'et', 'le', 'les', 'à', 'en', 'a',
       'pour', 'du', 'dans', 'une', 'par', 'un', 'sur', 'que', 'au'],
      dtype='<U20')

In [30]:
model = tf.keras.Sequential([
    encoder,
    tf.keras.layers.Embedding(
        input_dim=len(encoder.get_vocabulary()),
        output_dim=64,
        # Use masking to handle the variable sequence lengths
        mask_zero=True),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(1)
])

In [31]:
print([layer.supports_masking for layer in model.layers])

[False, True, True, True, True]


In [32]:
sample_text = ('The movie was cool. The animation and the graphics '
               'were out of this world. I would recommend this movie.')
predictions = model.predict(np.array([sample_text]))
print(predictions[0])

[0.01629866]


In [33]:
padding = "the " * 2000
predictions = model.predict(np.array([sample_text, padding]))
print(predictions[0])

[0.01629867]


In [34]:
model.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              optimizer=tf.keras.optimizers.Adam(1e-4),
              metrics=['accuracy'])

In [35]:
val_dataset = df_to_dataset(val)
history = model.fit(train_dataset, epochs=10,
                    validation_data=test_dataset,
                    validation_steps=30)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


### SpaCy

In [None]:
!pip install -U spacy
!python -m spacy download fr_core_news_lg

In [None]:
import spacy
nlp = spacy.load("fr_core_news_lg")

In [None]:
def text_preprossing(text):
    tokens = nlp(text)
    clean = []
    for token in tokens:
        if not any([token.is_digit, token.is_punct, token.is_currency,
               token.like_url, token.like_num, token.like_email,
               token.is_stop]):
            clean += [token.lemma_]
    #return tf.strings.join(clean, separator=' ')
    return ' '.join(clean)

In [None]:
data = pd.DataFrame()
data['label'] = controverses.label
data['Text'] = controverses.Text.apply(text_preprossing)
data.head()

Unnamed: 0,label,Text
0,2866,révélation scandale comptable survenir impliqu...
1,389,epr Flamanville malfaçon
2,2543,condamnation suite déversement rivière
3,3506,poursuite Etats-Unis lien émission no niveau c...
4,3016,incendie usine Lubrizol Rouen


In [None]:
from sklearn.model_selection import train_test_split
train, test = train_test_split(data, test_size=0.2)
train, val = train_test_split(train, test_size=0.2)
print(len(train), 'train examples')
print(len(val), 'validation examples')
print(len(test), 'test examples')

2400 train examples
600 validation examples
751 test examples


In [None]:
val.head()

Unnamed: 0,label,Text
1212,69,procédure cours cadre fait avérer corruption
3551,3178,ancien avocat filiale conglomérat Danaher Corp...
1040,3370,reprise opération Carnival Etats-Unis
2858,2604,syndicat approuver octobre accord proposer pdg...
69,1407,rapport critique institution financier finance...


In [None]:
# A utility method to create a tf.data dataset from a Pandas Dataframe
def df_to_dataset(dataframe, shuffle=True, batch_size=32):
  #dataframe = dataframe.copy()
  #labels = dataframe.pop('label')
  ds = tf.data.Dataset.from_tensor_slices((dataframe.Text.values, dataframe.label.values))
  #ds = tf.data.Dataset.from_tensor_slices((dict(dataframe), labels))
  if shuffle:
    ds = ds.shuffle(buffer_size=len(dataframe))
  ds = ds.batch(batch_size)
  return ds

In [None]:
batch_size = 32 # A small batch sized is used for demonstration purposes
train_ds = df_to_dataset(train, batch_size=batch_size)
val_ds = df_to_dataset(val, shuffle=False, batch_size=batch_size)
test_ds = df_to_dataset(test, shuffle=False, batch_size=batch_size)
train_ds

<BatchDataset shapes: ((None,), (None,)), types: (tf.string, tf.int64)>

In [None]:
for text_batch, label_batch in train_ds.take(1):
  for i in range(5):
    print(label_batch[i].numpy(), text_batch.numpy()[i])

3319 b'enqu\xc3\xaate conduire ICIJ m\xc3\xa9dia montre grand banque poreux blanchiment argent base suspiciou activity report SAR transmettre banque autorit\xc3\xa9 lutte antiblanchiment am\xc3\xa9ricain fincen rapport ultraconfidentiels repr\xc3\xa9senter total mds$ transaction suspect r\xc3\xa9aliser montrer banque circuler passivement travers compte bancaire personne soci\xc3\xa9t\xc3\xa9 identifier argent susceptible relever blanchiment aucun banque solliciter souhaiter r\xc3\xa9agir fincen proposer septembre vaste r\xc3\xa9forme cens\xc3\xa9 am\xc3\xa9liorer efficacit\xc3\xa9 lutte anti-blanchiment renforcer obligation d\xc3\xa9claration banque   '
3460 b'critique politique \xc3\xa9conomique chinois fondateur cons\xc3\xa9quence'
1330 b'ong n\xc3\xa9erlandais milieudefensie enqu\xc3\xaater fournisseur huile palme Neste neste acheter mt distillat acide gras palme pfad d\xc3\xa9chet production huile palme principalement malaisie indon\xc3\xa9sie ong accuse Neste destruction for\xc3\x

In [None]:
AUTOTUNE = tf.data.AUTOTUNE

train_ds = train_ds.cache().prefetch(buffer_size=AUTOTUNE)
val_ds = val_ds.cache().prefetch(buffer_size=AUTOTUNE)

-------  
### Text preprocessing

In [None]:
import tensorflow as tf
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

In [None]:
vocab_size = 10000
sequence_length = 600

# Use the text vectorization layer to normalize, split, and map strings to
# integers. Set output_sequence_length length to pad all samples to same length.
vectorize_layer = TextVectorization(
    #standardize= text_preprossing,
    standardize='lower_and_strip_punctuation',
    max_tokens=vocab_size,
    output_mode='tf-idf',
    #output_sequence_length=sequence_length
    )

In [None]:
text_list = train.Text.values.tolist()

In [None]:
vectorize_layer.adapt(text_list)

In [None]:
vectorize_layer.get_vocabulary()

In [None]:
from tensorflow.keras.layers import Dense, Embedding, GlobalAveragePooling1D
embedding_layer = tf.keras.layers.Embedding(1000, 5)

In [None]:

embedding_dim=1000

model = Sequential([
  vectorize_layer,
  Embedding(vocab_size, embedding_dim, name="embedding"),
  GlobalAveragePooling1D(),
  Dense(16, activation='relu'),
  Dense(1)
])

In [None]:
model.compile(optimizer='adam',
              loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              metrics=['accuracy'])

In [None]:
model.fit(train_ds, 
          epochs=40, 
          validation_data=val_ds)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f5a0eef3910>

In [None]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
text_vectorization_5 (TextVe (None, 10000)             0         
_________________________________________________________________
embedding (Embedding)        (None, 10000, 16)         160000    
_________________________________________________________________
global_average_pooling1d (Gl (None, 16)                0         
_________________________________________________________________
dense (Dense)                (None, 16)                272       
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 17        
Total params: 160,289
Trainable params: 160,289
Non-trainable params: 0
_________________________________________________________________


In [None]:
loss, accuracy = model.evaluate(test_ds)
print("Accuracy", accuracy)

Accuracy 0.0


#### tfidf model

In [None]:
def text_preprossing2(text):
    tokens = nlp(text)
    clean = []
    for token in tokens:
        if not any([token.is_digit, token.is_punct, token.is_currency,
               token.like_url, token.like_num, token.like_email,
               token.is_stop]):
            clean += [token.lemma_]
    return tf.strings.join(clean, separator=' ')
    #return ' '.join(clean)

In [None]:
vectorize_layer2 = TextVectorization(
    #standardize= text_preprossing,
    standardize='lower_and_strip_punctuation',
    max_tokens=vocab_size,
    output_mode='tf-idf',
    #output_sequence_length=sequence_length
    )

In [None]:
vectorize_layer2.adapt(data.Text.values.tolist())

In [None]:
tfidf = tf.keras.models.Sequential()
tfidf.add(tf.keras.Input(shape=(1,),dtype=tf.string))
tfidf.add(vectorize_layer2)



In [None]:
tfidf.compile(optimizer='adam',
              loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              metrics=['accuracy'])
tfidf.fit(train.Text.values.tolist(), 
          epochs=40, 
          validation_data=val.Text.values.tolist())

Epoch 1/40


AttributeError: ignored

In [None]:
print(test.Text.values[0])
tfidf.predict([test.Text.values[0]])

US Equal Employment Opportunity commission EEOC intenter action justice tribunal fédéral NY contre Walmart enfreindre loi fédéral laisser employé masculin harceler sexuellement collègue travail prendre mesure cesse lieu mettre fin harcèlement Walmart demander employé harceler défendre contraindre salarié démissionner agence fédéral affirme walmart recevoir plainte sujet comportement harcelant homme partir entreprise prendre mesure efficace faire cesser harcèlement eeoc réclame arriéré salaire dommage intérêt compensatoire dommage intérêt punitif employé concerner mesure redressement prévenir potentiel cas harcèlement sexuel


array([[6.586464, 0.      , 0.      , ..., 0.      , 0.      , 0.      ]],
      dtype=float32)

In [None]:
tfidf.save('tf_idf_model')
#reloaded_model = tf.keras.models.load_model('my_pet_classifier')

INFO:tensorflow:Assets written to: tf_idf_model/assets


In [None]:
news =''

input_dict = {'Text': tf.convert_to_tensor([news])}
predictions = reloaded_model.predict(input_dict)
prob = tf.nn.sigmoid(predictions[0])

print(
    "This particular pet had a %.1f percent probability "
    "of getting adopted." % (100 * prob)
)