Imports

In [None]:
import tensorflow as tf
import pathlib
import pandas as pd
import numpy as np
import os
import io
import warnings
warnings.filterwarnings('ignore')

from collections import Counter
import re



from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Embedding, GlobalAveragePooling1D, SimpleRNN, GRU, LSTM
from tensorflow.keras.losses import MeanSquaredError, BinaryCrossentropy, CategoricalCrossentropy, SparseCategoricalCrossentropy
from tensorflow.keras.metrics import SparseCategoricalCrossentropy, MeanSquaredError
from tensorflow.keras.optimizers import Adam


#Importation du dataset provenant d'un fichier csv


In [None]:
# Je donne accès à colab à mon drive

from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


création d'un data frame pandas avec le dataset


In [None]:
df = pd.read_csv('/content/drive/MyDrive/Projets_Jedha/Data/spam.csv', encoding = 'latin1')
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


# EDA

In [None]:
#on vérifie de la distribution da la variable cible
df['v1'].value_counts()

Unnamed: 0_level_0,count
v1,Unnamed: 1_level_1
ham,4825
spam,747


La variable target n'est pas équilibrée.

On remplace les 2 valeurs target par 0 et 1 pour pouvoir ensuite le traiter comme des variables numériques.

Il s'agit d'un problème de classification binaire.

In [None]:
df['v1'] = df['v1'].map({'ham': 0, 'spam': 1})
df.head()


Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,0,"Go until jurong point, crazy.. Available only ...",,,
1,0,Ok lar... Joking wif u oni...,,,
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,0,U dun say so early hor... U c already then say...,,,
4,0,"Nah I don't think he goes to usf, he lives aro...",,,


In [None]:
#Checking missing values per column
#Pas de valeurs manquantes dans la targe (v1) et dans la colonne v2
#Moins d'1% des lignes dans les colonnes 'Unnamed 2','Unnamed 3','Unnamed 4' contiennent des valeurs
print("DF's shape: ")
print(df.shape)
print()
print("DF's count: ")
print(df.count())
print()
print("Missing values: ")
print(df.isnull().any())
print()
print("Count of missing values: ")
display(100 * df.isnull().sum())
print()
print("Percentage of missing values: ")
display(100 * df.isnull().sum() / df.shape[0])

DF's shape: 
(5572, 5)

DF's count: 
v1               0
v2            5572
Unnamed: 2      50
Unnamed: 3      12
Unnamed: 4       6
dtype: int64

Missing values: 
v1             True
v2            False
Unnamed: 2     True
Unnamed: 3     True
Unnamed: 4     True
dtype: bool

Count of missing values: 


Unnamed: 0,0
v1,557200
v2,0
Unnamed: 2,552200
Unnamed: 3,556000
Unnamed: 4,556600



Percentage of missing values: 


Unnamed: 0,0
v1,100.0
v2,0.0
Unnamed: 2,99.102656
Unnamed: 3,99.784637
Unnamed: 4,99.892319


# Pre-processing


Pas de valeurs manquantes dans la target (v1) et dans la colonne v2.

Moins d'1% des lignes des colonnes 'Unnamed 2', 'Unnamed 3', 'Unnamed 4' contiennent des valeurs.

In [None]:
dataset = df.iloc[:,:2]
dataset.head()

Unnamed: 0,v1,v2
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [None]:
# Remove all non alphanumeric characters except whitespaces
dataset['messages_clean'] = dataset['v2'].apply(lambda x:''.join(ch for ch in x if ch.isalnum() or ch==" "))
# remove double spaces and spaces at the beginning and end of strings
dataset['messages_clean'] = dataset['messages_clean'].apply(lambda x: x.replace(" +"," ").strip())
dataset.head()


Unnamed: 0,v1,v2,messages_clean
0,0,"Go until jurong point, crazy.. Available only ...",Go until jurong point crazy Available only in ...
1,0,Ok lar... Joking wif u oni...,Ok lar Joking wif u oni
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...,U dun say so early hor U c already then say
4,0,"Nah I don't think he goes to usf, he lives aro...",Nah I dont think he goes to usf he lives aroun...


Tokenization - Spacy

We will keep stop words in this dataset since we are dealing mostly with short messages that may be spams. Important information may be contained in the stop words. Also special characters like "!!" may be important for detecting spams.


In [None]:
# Import English using en_core_web_sm.load()
import en_core_web_sm
# Import english stop words
#from spacy.lang.en.stop_words import STOP_WORDS
nlp = en_core_web_sm.load()


dataset['messages_tokenized'] = dataset['messages_clean'].apply(lambda x: nlp(x))

dataset['messages_tokenized'] = dataset['messages_tokenized'].apply(lambda x:[token.lemma_ for token in x])

dataset["clean_token"] = [" ".join(x) for x in dataset['messages_tokenized']]


dataset.head()

Unnamed: 0,v1,v2,messages_clean,messages_tokenized,clean_token
0,0,"Go until jurong point, crazy.. Available only ...",Go until jurong point crazy Available only in ...,"[go, until, jurong, point, crazy, available, o...",go until jurong point crazy available only in ...
1,0,Ok lar... Joking wif u oni...,Ok lar Joking wif u oni,"[ok, lar, Joking, wif, u, oni]",ok lar Joking wif u oni
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,Free entry in 2 a wkly comp to win FA Cup fina...,"[free, entry, in, 2, a, wkly, comp, to, win, F...",free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...,U dun say so early hor U c already then say,"[u, dun, say, so, early, hor, u, c, already, t...",u dun say so early hor u c already then say
4,0,"Nah I don't think he goes to usf, he lives aro...",Nah I dont think he goes to usf he lives aroun...,"[nah, I, do, not, think, he, go, to, usf, he, ...",nah I do not think he go to usf he live around...


In [None]:
# Function to tokenize and clean messages
def clean_and_tokenize(text):
    # Lowercase and remove non-alphabetic characters
    words = re.findall(r'\b\w+\b', text.lower())
    return words

# Flatten the list of tokenized words
all_words = [word for message in dataset['clean_token'] for word in clean_and_tokenize(message)]

# Count the frequency of each word
word_counts = Counter(all_words)

# Get the number of unique words
num_unique_words = len(word_counts)
num_unique_words

8563

Tokenizer

In [None]:
vocab_size = 8563

tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=vocab_size) # instanciate the tokenizer

tokenizer.fit_on_texts(dataset["clean_token"])

In [None]:
dataset["messages_encoded"] = tokenizer.texts_to_sequences(dataset.clean_token)

In [None]:
dataset.head()

Unnamed: 0,v1,v2,messages_clean,messages_tokenized,clean_token,messages_encoded
0,0,"Go until jurong point, crazy.. Available only ...",Go until jurong point crazy Available only in ...,"[go, until, jurong, point, crazy, available, o...",go until jurong point crazy available only in ...,"[21, 416, 3878, 383, 589, 609, 69, 11, 1117, 9..."
1,0,Ok lar... Joking wif u oni...,Ok lar Joking wif u oni,"[ok, lar, Joking, wif, u, oni]",ok lar Joking wif u oni,"[50, 304, 3880, 417, 7, 1655]"
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,Free entry in 2 a wkly comp to win FA Cup fina...,"[free, entry, in, 2, a, wkly, comp, to, win, F...",free entry in 2 a wkly comp to win FA Cup fina...,"[52, 429, 11, 24, 5, 675, 821, 3, 105, 1656, 1..."
3,0,U dun say so early hor... U c already then say...,U dun say so early hor U c already then say,"[u, dun, say, so, early, hor, u, c, already, t...",u dun say so early hor u c already then say,"[7, 229, 65, 27, 257, 2600, 7, 161, 144, 61, 65]"
4,0,"Nah I don't think he goes to usf, he lives aro...",Nah I dont think he goes to usf he lives aroun...,"[nah, I, do, not, think, he, go, to, usf, he, ...",nah I do not think he go to usf he live around...,"[866, 1, 9, 8, 72, 40, 21, 3, 822, 40, 233, 20..."


In [None]:
dataset['token_count'] = dataset['messages_encoded'].apply(lambda x: len(str(x).split(' ')))
dataset.head()

Unnamed: 0,v1,v2,messages_clean,messages_tokenized,clean_token,messages_encoded,token_count
0,0,"Go until jurong point, crazy.. Available only ...",Go until jurong point crazy Available only in ...,"[go, until, jurong, point, crazy, available, o...",go until jurong point crazy available only in ...,"[21, 416, 3878, 383, 589, 609, 69, 11, 1117, 9...",20
1,0,Ok lar... Joking wif u oni...,Ok lar Joking wif u oni,"[ok, lar, Joking, wif, u, oni]",ok lar Joking wif u oni,"[50, 304, 3880, 417, 7, 1655]",6
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,Free entry in 2 a wkly comp to win FA Cup fina...,"[free, entry, in, 2, a, wkly, comp, to, win, F...",free entry in 2 a wkly comp to win FA Cup fina...,"[52, 429, 11, 24, 5, 675, 821, 3, 105, 1656, 1...",28
3,0,U dun say so early hor... U c already then say...,U dun say so early hor U c already then say,"[u, dun, say, so, early, hor, u, c, already, t...",u dun say so early hor u c already then say,"[7, 229, 65, 27, 257, 2600, 7, 161, 144, 61, 65]",11
4,0,"Nah I don't think he goes to usf, he lives aro...",Nah I dont think he goes to usf he lives aroun...,"[nah, I, do, not, think, he, go, to, usf, he, ...",nah I do not think he go to usf he live around...,"[866, 1, 9, 8, 72, 40, 21, 3, 822, 40, 233, 20...",14


In [None]:
# Flatten the lists in 'messages_encoded' column
flattened_list = [index for sublist in dataset['messages_encoded'] for index in sublist]

# Convert to a set to get unique values and count them
unique_count = len(set(flattened_list))

print("Number of unique values:", unique_count)

Number of unique values: 8562


On vérifie la distribution du nombre de tokens par observation avec .describe()


In [None]:
#On vérifie la distribution du nombre de tokens par observation avec .describe()

dataset['token_count'].describe()

Unnamed: 0,token_count
count,5572.0
mean,15.583632
std,11.289074
min,1.0
25%,7.0
50%,12.0
75%,23.0
max,173.0


In [None]:
dataset['token_count'].nlargest(10)

Unnamed: 0,token_count
1862,173
1084,172
2157,116
2847,116
2433,107
2379,100
2369,96
5104,96
2009,95
4903,94


75% of the dataset contains 23 or less token per observation. We are dealing with short messages that may be spams.

Padding

In [None]:

#on choisit la taille du padding en fonction de la distribution du nombre de tokens par observation avec maxlen

messages_pad = tf.keras.preprocessing.sequence.pad_sequences(dataset.messages_encoded, padding="post", maxlen=173)
messages_pad

array([[  21,  416, 3878, ...,    0,    0,    0],
       [  50,  304, 3880, ...,    0,    0,    0],
       [  52,  429,   11, ...,    0,    0,    0],
       ...,
       [8561,    2,   11, ...,    0,    0,    0],
       [   6,  207,    9, ...,    0,    0,    0],
       [2305,   41,  454, ...,    0,    0,    0]], dtype=int32)

We define x and y through tf.data.Dataset.from_tensor_slices

v1 is the target variable (Y)
messages_pad is the feature (X)

In [None]:
dataset_model = tf.data.Dataset.from_tensor_slices((messages_pad, dataset.v1))


In [None]:
dataset.shape

(5572, 7)

### Train - test split

In [None]:
take_size = int(0.7*dataset.shape[0])


dataset_train = dataset_model.take(take_size).shuffle(take_size, seed=42)
dataset_train = dataset_train.batch(64)

In [None]:
#test_set

dataset_test = dataset_model.skip(take_size)
dataset_test = dataset_model.batch(64)


In [None]:
#On jete un oeil sur notre data
for message, label in dataset_train.take(1):
    print(message, label)

tf.Tensor(
[[ 188 1529  583 ...    0    0    0]
 [  41  149   16 ...    0    0    0]
 [5574  312 5575 ...    0    0    0]
 ...
 [ 112  116   21 ...    0    0    0]
 [  23    4    9 ...    0    0    0]
 [   1   73  559 ...    0    0    0]], shape=(64, 173), dtype=int32) tf.Tensor(
[0 0 1 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0], shape=(64,), dtype=int64)


### Model 1: dense layers

In [None]:
vocab_size = max([max(sequence) for sequence in messages_pad]) + 1
vocab_size

8563

In [None]:
model1 = Sequential([
  Embedding(vocab_size, 32, name="embedding"), # Couche d'embedding
  GlobalAveragePooling1D(), # Couche nécessaire pour pouvoir ensuite introduire les couches Dense # this will pick the average for every word in the sentence
  # along each dimension of the representation space.
  Dense(8, activation='relu'), # Couche dense
  Dense(1, activation="sigmoid") # Couche dense de prédiction avec une fonction d'activation sigmoide car il s'agit d'un classification binaire (0 ou 1 / non spam ou spam)
])

In [None]:
model1.compile(optimizer='adam',
              loss=tf.keras.losses.BinaryCrossentropy(),
              metrics=['accuracy'])

Model 1 training

In [None]:
# Model training
model1.fit(dataset_train,
          epochs=30,
          validation_data=dataset_test)

Epoch 1/30
[1m61/61[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 17ms/step - accuracy: 0.8132 - loss: 0.4421 - val_accuracy: 0.8659 - val_loss: 0.3796
Epoch 2/30
[1m61/61[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 15ms/step - accuracy: 0.8591 - loss: 0.3920 - val_accuracy: 0.8659 - val_loss: 0.3766
Epoch 3/30
[1m61/61[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 13ms/step - accuracy: 0.8788 - loss: 0.3521 - val_accuracy: 0.8659 - val_loss: 0.3745
Epoch 4/30
[1m61/61[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 13ms/step - accuracy: 0.8710 - loss: 0.3641 - val_accuracy: 0.8659 - val_loss: 0.3712
Epoch 5/30
[1m61/61[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - accuracy: 0.8621 - loss: 0.3756 - val_accuracy: 0.8659 - val_loss: 0.3657
Epoch 6/30
[1m61/61[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 13ms/step - accuracy: 0.8750 - loss: 0.3491 - val_accuracy: 0.8659 - val_loss: 0.3603
Epoch 7/30
[1m61/61[0m [32m━━━━━

<keras.src.callbacks.history.History at 0x7c09797e2380>

Model 1 loss across epochs visualization:

In [None]:
import plotly.graph_objects as go
fig = go.Figure()
fig.add_trace(go.Scatter(y=model1.history.history["loss"],
                    mode='lines',
                    name='loss'))
fig.add_trace(go.Scatter(y=model1.history.history["val_loss"],
                    mode='lines',
                    name='val_loss'))
fig.show()

Model 1 accuracy across epochs visualization:

In [None]:
import plotly.graph_objects as go
fig = go.Figure()
fig.add_trace(go.Scatter(y=model1.history.history["accuracy"],
                    mode='lines',
                    name='accuracy'))
fig.add_trace(go.Scatter(y=model1.history.history["val_accuracy"],
                    mode='lines',
                    name='val_accuracy'))
fig.show()

In [None]:
from sklearn.metrics import classification_report


### Predictions on the train and test sets and evaluation (classification report)

In [None]:
# Step 1: Get predictions on the test set
predictions_prob_train = model1.predict(dataset_train)  # model1 is your trained model

predictions_prob_test = model1.predict(dataset_test)  # model1 your trained model

# Step 2: Convert probabilities to class labels
# For binary classification (sigmoid output):
predictions_train = (predictions_prob_train > 0.5).astype(int)
predictions_test = (predictions_prob_test > 0.5).astype(int)


# Step 3: Extract the true labels from the test set
# Assuming your dataset_test is in the form (features, labels), so you get true labels like this:
true_labels_train = np.concatenate([y for x, y in dataset_train], axis=0)


true_labels_test = np.concatenate([y for x, y in dataset_test], axis=0)

# Step 4: Generate the classification report
report_dense_train = classification_report(true_labels_train, predictions_train, target_names=['Class 0', 'Class 1'])
print(report_dense_train)

report_dense_test = classification_report(true_labels_test, predictions_test, target_names=['Class 0', 'Class 1'])
print(report_dense_train)

[1m61/61[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step
[1m88/88[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
              precision    recall  f1-score   support

     Class 0       0.87      0.88      0.87      3381
     Class 1       0.13      0.12      0.12       519

    accuracy                           0.78      3900
   macro avg       0.50      0.50      0.50      3900
weighted avg       0.77      0.78      0.77      3900

              precision    recall  f1-score   support

     Class 0       0.87      0.88      0.87      3381
     Class 1       0.13      0.12      0.12       519

    accuracy                           0.78      3900
   macro avg       0.50      0.50      0.50      3900
weighted avg       0.77      0.78      0.77      3900



### RNN Model

In [None]:
#Simple RNN model


model_rnn = tf.keras.Sequential([
    Embedding(vocab_size, 32, name="embedding"),  # Embedding layer
    SimpleRNN(units=8, return_sequences=False),  # RNN, only returning the last output
    Dense(4, activation='relu'),  # Dense layer
    Dense(1, activation="sigmoid")  # Output layer for binary classification
])

In [None]:
model_rnn.summary()

In [None]:
model_rnn.compile(optimizer='adam',
              loss=tf.keras.losses.BinaryCrossentropy(),
              metrics=['accuracy'])

In [None]:
# Model training
history = model_rnn.fit(dataset_train,
          epochs=30,
          validation_data=dataset_test)

Epoch 1/30
[1m61/61[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 68ms/step - accuracy: 0.8542 - loss: 0.5920 - val_accuracy: 0.8659 - val_loss: 0.3749
Epoch 2/30
[1m61/61[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 90ms/step - accuracy: 0.8698 - loss: 0.3320 - val_accuracy: 0.8659 - val_loss: 0.2621
Epoch 3/30
[1m61/61[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 116ms/step - accuracy: 0.8637 - loss: 0.2331 - val_accuracy: 0.8677 - val_loss: 0.2132
Epoch 4/30
[1m61/61[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 73ms/step - accuracy: 0.8638 - loss: 0.1854 - val_accuracy: 0.9008 - val_loss: 0.1821
Epoch 5/30
[1m61/61[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 56ms/step - accuracy: 0.9058 - loss: 0.1446 - val_accuracy: 0.9149 - val_loss: 0.1595
Epoch 6/30
[1m61/61[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 56ms/step - accuracy: 0.9276 - loss: 0.1214 - val_accuracy: 0.9397 - val_loss: 0.1463
Epoch 7/30
[1m61/61[0m [32m━━

<keras.src.callbacks.history.History at 0x7c098160a050>

### Predictions on the train and test sets and evaluation (classification report)

In [None]:

predictions_prob_train = model_rnn.predict(dataset_train)  # model1 is your trained model

predictions_prob_test = model_rnn.predict(dataset_test)  # model1 your trained model

predictions_train = (predictions_prob_train > 0.5).astype(int)
predictions_test = (predictions_prob_test > 0.5).astype(int)


true_labels_train = np.concatenate([y for x, y in dataset_train], axis=0)


true_labels_test = np.concatenate([y for x, y in dataset_test], axis=0)

report_rnn_train = classification_report(true_labels_train, predictions_train, target_names=['Class 0', 'Class 1'])
print(report_rnn_train)

report_rnn_test = classification_report(true_labels_test, predictions_test, target_names=['Class 0', 'Class 1'])
print(report_rnn_test)

[1m61/61[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 19ms/step
[1m88/88[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 18ms/step
              precision    recall  f1-score   support

     Class 0       0.87      0.87      0.87      3381
     Class 1       0.14      0.14      0.14       519

    accuracy                           0.77      3900
   macro avg       0.51      0.51      0.51      3900
weighted avg       0.77      0.77      0.77      3900

              precision    recall  f1-score   support

     Class 0       0.98      0.99      0.99      4825
     Class 1       0.95      0.88      0.91       747

    accuracy                           0.98      5572
   macro avg       0.96      0.94      0.95      5572
weighted avg       0.98      0.98      0.98      5572



Loss and Accuracy  visualisation RNN model


In [None]:

fig = go.Figure()
fig.add_trace(go.Scatter(y=model_rnn.history.history["loss"],
                    mode='lines',
                    name='loss'))
fig.add_trace(go.Scatter(y=model_rnn.history.history["val_loss"],
                    mode='lines',
                    name='val_loss'))
fig.show()

In [None]:

fig = go.Figure()
fig.add_trace(go.Scatter(y=model_rnn.history.history["accuracy"],
                    mode='lines',
                    name='accuracy'))
fig.add_trace(go.Scatter(y=model_rnn.history.history["val_accuracy"],
                    mode='lines',
                    name='val_accuracy'))
fig.show()

### GRU model

In [None]:
#On teste un modèle GRU
model_gru = tf.keras.Sequential([
                Embedding(vocab_size, 32, name="embedding"),
                GRU(units=8, return_sequences=False), # returns the last output
                Dense(4, activation='relu'), # a dense layer
                Dense(1, activation="sigmoid")#prediction layer
])

In [None]:
model_gru.compile(optimizer='adam',
              loss=tf.keras.losses.BinaryCrossentropy(),
              metrics=['accuracy'])

In [None]:
# Model training
history = model_gru.fit(dataset_train,
          epochs=30,
          validation_data=dataset_test)

Epoch 1/30
[1m61/61[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 115ms/step - accuracy: 0.8758 - loss: 0.3755 - val_accuracy: 0.8659 - val_loss: 0.3941
Epoch 2/30
[1m61/61[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 115ms/step - accuracy: 0.8677 - loss: 0.3908 - val_accuracy: 0.8659 - val_loss: 0.3941
Epoch 3/30
[1m61/61[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 145ms/step - accuracy: 0.8671 - loss: 0.3920 - val_accuracy: 0.8659 - val_loss: 0.3941
Epoch 4/30
[1m61/61[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 114ms/step - accuracy: 0.8683 - loss: 0.3897 - val_accuracy: 0.8659 - val_loss: 0.3940
Epoch 5/30
[1m61/61[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 143ms/step - accuracy: 0.8619 - loss: 0.4016 - val_accuracy: 0.8659 - val_loss: 0.3940
Epoch 6/30
[1m61/61[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 134ms/step - accuracy: 0.8623 - loss: 0.4008 - val_accuracy: 0.8659 - val_loss: 0.3940
Epoch 7/30
[1m61/61[0m 

### Predictions on the train and test sets and evaluation (classification report)

In [None]:

predictions_prob_train = model_gru.predict(dataset_train)

predictions_prob_test = model_gru.predict(dataset_test)

predictions_train = (predictions_prob_train > 0.5).astype(int)
predictions_test = (predictions_prob_test > 0.5).astype(int)

true_labels_train = np.concatenate([y for x, y in dataset_train], axis=0)


true_labels_test = np.concatenate([y for x, y in dataset_test], axis=0)

report_gru_train = classification_report(true_labels_train, predictions_train, target_names=['Class 0', 'Class 1'])
print(report_gru_train)

report_gru_test = classification_report(true_labels_test, predictions_test, target_names=['Class 0', 'Class 1'])
print(report_gru_test)

[1m61/61[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 22ms/step
[1m88/88[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 19ms/step
              precision    recall  f1-score   support

     Class 0       0.87      1.00      0.93      3381
     Class 1       0.00      0.00      0.00       519

    accuracy                           0.87      3900
   macro avg       0.43      0.50      0.46      3900
weighted avg       0.75      0.87      0.81      3900

              precision    recall  f1-score   support

     Class 0       0.87      1.00      0.93      4825
     Class 1       0.00      0.00      0.00       747

    accuracy                           0.87      5572
   macro avg       0.43      0.50      0.46      5572
weighted avg       0.75      0.87      0.80      5572



#### Loss and accuracy visualisation GRU model


In [None]:

fig = go.Figure()
fig.add_trace(go.Scatter(y=model_gru.history.history["loss"],
                    mode='lines',
                    name='loss'))
fig.add_trace(go.Scatter(y=model_gru.history.history["val_loss"],
                    mode='lines',
                    name='val_loss'))
fig.show()

In [None]:

fig = go.Figure()
fig.add_trace(go.Scatter(y=model_gru.history.history["accuracy"],
                    mode='lines',
                    name='accuracy'))
fig.add_trace(go.Scatter(y=model_gru.history.history["val_accuracy"],
                    mode='lines',
                    name='val_accuracy'))
fig.show()

### LSTM model

In [None]:
model_lstm = tf.keras.Sequential([
                #Embedding(vocab_size, embedding_dim, name="embedding")
                Embedding(vocab_size, 32, name="embedding"),
                LSTM(units=8, return_sequences=False), #
                Dense(4, activation='relu'), # a dense layer
                Dense(1, activation="sigmoid")#prediction layer
])

In [None]:
model_lstm.compile(optimizer='adam',
              loss=tf.keras.losses.BinaryCrossentropy(),
              metrics=['accuracy'])

In [None]:
# Model training
history = model_lstm.fit(dataset_train,
          epochs=30,
          validation_data=dataset_test)

Epoch 1/30
[1m61/61[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 93ms/step - accuracy: 0.8666 - loss: 0.6662 - val_accuracy: 0.8659 - val_loss: 0.6511
Epoch 2/30
[1m61/61[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 109ms/step - accuracy: 0.8752 - loss: 0.5290 - val_accuracy: 0.8659 - val_loss: 0.3942
Epoch 3/30
[1m61/61[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 92ms/step - accuracy: 0.8732 - loss: 0.3811 - val_accuracy: 0.8659 - val_loss: 0.3941
Epoch 4/30
[1m61/61[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 119ms/step - accuracy: 0.8655 - loss: 0.3954 - val_accuracy: 0.8659 - val_loss: 0.3941
Epoch 5/30
[1m61/61[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 90ms/step - accuracy: 0.8689 - loss: 0.3890 - val_accuracy: 0.8659 - val_loss: 0.3941
Epoch 6/30
[1m61/61[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 90ms/step - accuracy: 0.8661 - loss: 0.3940 - val_accuracy: 0.8659 - val_loss: 0.3941
Epoch 7/30
[1m61/61[0m [32m━

### Predictions on the train and test sets and evaluation (classification report)

In [None]:
predictions_prob_train = model_lstm.predict(dataset_train)

predictions_prob_test = model_lstm.predict(dataset_test)

predictions_train = (predictions_prob_train > 0.5).astype(int)
predictions_test = (predictions_prob_test > 0.5).astype(int)

true_labels_train = np.concatenate([y for x, y in dataset_train], axis=0)


true_labels_test = np.concatenate([y for x, y in dataset_test], axis=0)

report_lstm_train = classification_report(true_labels_train, predictions_train, target_names=['Class 0', 'Class 1'])
print(report_lstm_train)

report_lstm_test = classification_report(true_labels_test, predictions_test, target_names=['Class 0', 'Class 1'])
print(report_lstm_test)

[1m61/61[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 20ms/step
[1m88/88[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 17ms/step
              precision    recall  f1-score   support

     Class 0       0.87      1.00      0.93      3381
     Class 1       0.00      0.00      0.00       519

    accuracy                           0.87      3900
   macro avg       0.43      0.50      0.46      3900
weighted avg       0.75      0.87      0.81      3900

              precision    recall  f1-score   support

     Class 0       0.87      1.00      0.93      4825
     Class 1       0.00      0.00      0.00       747

    accuracy                           0.87      5572
   macro avg       0.43      0.50      0.46      5572
weighted avg       0.75      0.87      0.80      5572



#### Loss and accuracy visualisation LSTM model


In [None]:

# Check if the 'loss' key exists
if 'loss' in history.history:
    fig = go.Figure()

    # Plot training loss
    fig.add_trace(go.Scatter(y=history.history['loss'],
                              mode='lines',
                              name='Training Loss'))

    # Plot validation loss if available
    if 'val_loss' in history.history:
        fig.add_trace(go.Scatter(y=history.history['val_loss'],
                                  mode='lines',
                                  name='Validation Loss'))

    fig.update_layout(title='Model Loss',
                      xaxis_title='Epochs',
                      yaxis_title='Loss')
    fig.show()
else:
    print("Loss key is missing from history.")


In [None]:
if 'accuracy' in history.history:
    # Plot training accuracy
    fig.add_trace(go.Scatter(y=history.history['accuracy'],
                              mode='lines',
                              name='Training Accuracy'))

    # Plot validation accuracy if available
    if 'val_accuracy' in history.history:
        fig.add_trace(go.Scatter(y=history.history['val_accuracy'],
                                  mode='lines',
                                  name='Validation Accuracy'))

# Update layout with titles and labels
fig.update_layout(title='Model Loss and Accuracy',
                  xaxis_title='Epochs',
                  yaxis_title='Value',
                  legend_title='Metrics')

# Show the figure
fig.show()


# Conclusions simple dense layer model, RNN, GRU, and LSTM traininings

### Since the best perfomances are from the simple dense layers deep learning model, we will try to obtain better results from this model.

### RNN model's results were sligtly worse than first dense layer model, GRU's and LSTM's were even worse. This might be due to the fact that dataset consists of short emails that could be spam. The messages are mostly short (75% of them have 23 or fewer tokens). Therefore, the sequential information from each messagge may not be important, as it is for RNN, GRU or LSTM models. That is why, in this case, they are likely to overfit much more than a simple dense layer model. This is clear in the GRU and LSTM models' results.


# We will train a new model on a embedded text with fastText and not directly with keras Tokenizer



## We will create an embedding column with fastText:

### In order to improve the first model's results, we will apply word embedding from Fast Text model (trained with much larger Dataset on English language), so that we can obtain more precise information the messages. Untill now the embedding in this project has been calculated only from the tokens within this dataset through keras Tokeniser.


In [None]:
pip install fasttext

Collecting fasttext
  Downloading fasttext-0.9.3.tar.gz (73 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/73.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m73.4/73.4 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting pybind11>=2.2 (from fasttext)
  Using cached pybind11-2.13.6-py3-none-any.whl.metadata (9.5 kB)
Using cached pybind11-2.13.6-py3-none-any.whl (243 kB)
Building wheels for collected packages: fasttext
  Building wheel for fasttext (pyproject.toml) ... [?25l[?25hdone
  Created wheel for fasttext: filename=fasttext-0.9.3-cp310-cp310-linux_x86_64.whl size=4296186 sha256=a0f3f4547c0262f575d725ba8e320754d3582f1920d4dd5770210b7e74e645c9
  Stored in directory: /root/.cache/pip/wheels/0d/a2/00/81db54d3e6a8199b829d58

In [None]:
pip install unidecode

Collecting unidecode
  Downloading Unidecode-1.3.8-py3-none-any.whl.metadata (13 kB)
Downloading Unidecode-1.3.8-py3-none-any.whl (235 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m235.5/235.5 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: unidecode
Successfully installed unidecode-1.3.8


In [None]:
import re
from unidecode import unidecode
import numpy as np
import pandas as pd

### Embedding

In [None]:
!wget https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.bin.gz

--2024-10-12 11:30:05--  https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.bin.gz
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 13.35.7.38, 13.35.7.82, 13.35.7.128, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|13.35.7.38|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4503593528 (4.2G) [application/octet-stream]
Saving to: ‘cc.en.300.bin.gz’


2024-10-12 11:30:46 (107 MB/s) - ‘cc.en.300.bin.gz’ saved [4503593528/4503593528]



In [None]:
!gunzip cc.en.300.bin.gz

In [None]:
import fasttext

model_path = 'cc.en.300.bin'
fasttext_model = fasttext.load_model(model_path)

#### Embedding function

In [None]:
def get_embeddings(text):
    return fasttext_model.get_sentence_vector(text)  # Transform text to embeddings

data = dataset
data.head()

Unnamed: 0,v1,v2,messages_clean,messages_tokenized,clean_token
0,0,"Go until jurong point, crazy.. Available only ...",Go until jurong point crazy Available only in ...,"[go, until, jurong, point, crazy, available, o...",go until jurong point crazy available only in ...
1,0,Ok lar... Joking wif u oni...,Ok lar Joking wif u oni,"[ok, lar, Joking, wif, u, oni]",ok lar Joking wif u oni
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,Free entry in 2 a wkly comp to win FA Cup fina...,"[free, entry, in, 2, a, wkly, comp, to, win, F...",free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...,U dun say so early hor U c already then say,"[u, dun, say, so, early, hor, u, c, already, t...",u dun say so early hor u c already then say
4,0,"Nah I don't think he goes to usf, he lives aro...",Nah I dont think he goes to usf he lives aroun...,"[nah, I, do, not, think, he, go, to, usf, he, ...",nah I do not think he go to usf he live around...


In [None]:
data['embeddings'] = data['clean_token'].apply(get_embeddings)

In [None]:
data.embeddings.head()

Unnamed: 0,embeddings
0,"[-0.0020820126, -0.019084563, -0.033937097, 0...."
1,"[0.01780101, -0.038492013, -0.054694664, 0.031..."
2,"[0.0032815218, 0.01743237, -0.024007099, 0.017..."
3,"[0.009102057, -0.023820514, -0.022306414, 0.03..."
4,"[-0.00010253755, -0.00674738, -0.01990201, 0.0..."


### Train - test split

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
X = np.array(data['embeddings'].tolist())  # Convert to 2D array
y = np.array(data['v1'])

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


### Modèle

In [None]:
#Pas besoin d'une couche embedding
model_fast_text = Sequential([
    Dense(6, activation='relu', input_shape=(X.shape[1],)),  # Specify input shape
    Dense(1, activation='sigmoid')                             # Output layer for binary classification
])

# Compile the model
model_fast_text.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Summary of the model
model_fast_text.summary()

In [None]:
model_fast_text = Sequential([
  #GlobalAveragePooling1D(), # Couche nécessaire pour pouvoir ensuite introduire les couches Dense # this will pick the average for every word in the sentence
  # along each dimension of the representation space.
  Dense(8, activation='relu'), # Couche dense
  Dense(1, activation="sigmoid") # Couche dense de prédiction avec une fonction d'activation sigmoide car il s'agit d'un classification binaire (0 ou 1 / non spam ou spam)
])

In [None]:
model_fast_text.compile(optimizer='adam',
              loss=tf.keras.losses.BinaryCrossentropy(),
              metrics=['accuracy'])

In [None]:
# Model training
# Fit the model
history = model_fast_text.fit(X_train, y_train, epochs=30, batch_size=32, validation_data=(X_test, y_test))


Epoch 1/30
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.8403 - loss: 0.5610 - val_accuracy: 0.8655 - val_loss: 0.3330
Epoch 2/30
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.8695 - loss: 0.3103 - val_accuracy: 0.8664 - val_loss: 0.2702
Epoch 3/30
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.8613 - loss: 0.2667 - val_accuracy: 0.8870 - val_loss: 0.2228
Epoch 4/30
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.8931 - loss: 0.2169 - val_accuracy: 0.9274 - val_loss: 0.1836
Epoch 5/30
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.9294 - loss: 0.1773 - val_accuracy: 0.9417 - val_loss: 0.1553
Epoch 6/30
[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.9488 - loss: 0.1503 - val_accuracy: 0.9525 - val_loss: 0.1389
Epoch 7/30
[1m140/140[0m 

Model's loss and accuracy across epochs:

In [None]:
import plotly.graph_objects as go
fig = go.Figure()
fig.add_trace(go.Scatter(y=model_fast_text.history.history["loss"],
                    mode='lines',
                    name='loss'))
fig.add_trace(go.Scatter(y=model_fast_text.history.history["val_loss"],
                    mode='lines',
                    name='val_loss'))
fig.show()

In [None]:
import plotly.graph_objects as go
fig = go.Figure()
fig.add_trace(go.Scatter(y=model_fast_text.history.history["accuracy"],
                    mode='lines',
                    name='accuracy'))
fig.add_trace(go.Scatter(y=model_fast_text.history.history["val_accuracy"],
                    mode='lines',
                    name='val_accuracy'))
fig.show()

In [None]:
import plotly.graph_objects as go
fig = go.Figure()
fig.add_trace(go.Scatter(y=model1.history.history["loss"],
                    mode='lines',
                    name='loss'))
fig.add_trace(go.Scatter(y=model1.history.history["val_loss"],
                    mode='lines',
                    name='val_loss'))
fig.show()

# Final results and conclusions

### Classification reports from all the models trained in this notebook are listed below

### The comparison between the classification reports and the loss and accuracy scores visualisations from each model training show that the best results were obtained from the last tentative consisting in:

### 1) Obtaining  an embedding column from the original messages columns from a more sophisticated model like fastText.

### 2) with this embedding from fastText model, convering the whole English language, we train a simple 2 dense layer model.

### Classication reports from each model

Classification reports from each model are provided for comparison. The model with two dense layers, trained on the word embeddings obtained with fastText, achieved the best results.

In [None]:
# Make predictions on the training data
y_train_pred = (model_fast_text.predict(X_train) > 0.5).astype("int32")

# Make predictions on the test data
y_test_pred = (model_fast_text.predict(X_test) > 0.5).astype("int32")



print("Classification Report for Training Set:")
classification_report_fasttext_train = classification_report(y_train, y_train_pred)
print(classification_report_fasttext_train)

# Classification report for the testing set
print("Classification Report for Testing Set:")
classification_report_fasttext_test = classification_report(y_test, y_test_pred)
print(classification_report_fasttext_test)

[1m140/140[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
[1m35/35[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
Classification Report for Training Set:
              precision    recall  f1-score   support

           0       0.98      0.99      0.99      3860
           1       0.95      0.88      0.91       597

    accuracy                           0.98      4457
   macro avg       0.97      0.94      0.95      4457
weighted avg       0.98      0.98      0.98      4457

Classification Report for Testing Set:
              precision    recall  f1-score   support

           0       0.99      0.99      0.99       965
           1       0.93      0.91      0.92       150

    accuracy                           0.98      1115
   macro avg       0.96      0.95      0.95      1115
weighted avg       0.98      0.98      0.98      1115



In [None]:
print("Dense layers model with fasttext embedding")
print("Classification Report for Training Set:")
print(classification_report_fasttext_train)

# Classification report for the testing set
print("Classification Report for Testing Set:")
print(classification_report_fasttext_test)

Dense layers model with fasttext embedding
Classification Report for Training Set:
              precision    recall  f1-score   support

           0       0.98      0.99      0.99      3860
           1       0.95      0.88      0.91       597

    accuracy                           0.98      4457
   macro avg       0.97      0.94      0.95      4457
weighted avg       0.98      0.98      0.98      4457

Classification Report for Testing Set:
              precision    recall  f1-score   support

           0       0.99      0.99      0.99       965
           1       0.93      0.91      0.92       150

    accuracy                           0.98      1115
   macro avg       0.96      0.95      0.95      1115
weighted avg       0.98      0.98      0.98      1115



In [None]:
print("Dense layers model without fasttext embedding")
print("Classification Report for Training Set:")
print(report_dense_train)
print("Classification Report for Testing Set:")

print(report_dense_test)


Dense layers model
Classification Report for Training Set:
              precision    recall  f1-score   support

     Class 0       0.87      0.88      0.87      3381
     Class 1       0.13      0.12      0.12       519

    accuracy                           0.78      3900
   macro avg       0.50      0.50      0.50      3900
weighted avg       0.77      0.78      0.77      3900

Classification Report for Testing Set:
              precision    recall  f1-score   support

     Class 0       0.98      1.00      0.99      4825
     Class 1       1.00      0.85      0.92       747

    accuracy                           0.98      5572
   macro avg       0.99      0.93      0.95      5572
weighted avg       0.98      0.98      0.98      5572



In [None]:
print(" RNN model")
print("Classification Report for Training Set:")
print(report_rnn_train)
print("Classification Report for Testing Set:")
print(report_rnn_test)

 RNN model
Classification Report for Training Set:
              precision    recall  f1-score   support

     Class 0       0.87      0.87      0.87      3381
     Class 1       0.14      0.14      0.14       519

    accuracy                           0.77      3900
   macro avg       0.51      0.51      0.51      3900
weighted avg       0.77      0.77      0.77      3900

Classification Report for Testing Set:
              precision    recall  f1-score   support

     Class 0       0.98      0.99      0.99      4825
     Class 1       0.95      0.88      0.91       747

    accuracy                           0.98      5572
   macro avg       0.96      0.94      0.95      5572
weighted avg       0.98      0.98      0.98      5572



In [None]:
print("GRU model")
print("Classification Report for Training Set:")
print(report_gru_train)
print("Classification Report for Testing Set:")
print(report_gru_test)

GRU model
Classification Report for Training Set:
              precision    recall  f1-score   support

     Class 0       0.87      1.00      0.93      3381
     Class 1       0.00      0.00      0.00       519

    accuracy                           0.87      3900
   macro avg       0.43      0.50      0.46      3900
weighted avg       0.75      0.87      0.81      3900

Classification Report for Testing Set:
              precision    recall  f1-score   support

     Class 0       0.87      1.00      0.93      4825
     Class 1       0.00      0.00      0.00       747

    accuracy                           0.87      5572
   macro avg       0.43      0.50      0.46      5572
weighted avg       0.75      0.87      0.80      5572



In [None]:
print("LSTM model")
print("Classification Report for Training Set:")
print(report_lstm_train)
print("Classification Report for Testing Set:")
print(report_lstm_test)

LSTM model
Classification Report for Training Set:
              precision    recall  f1-score   support

     Class 0       0.87      1.00      0.93      3381
     Class 1       0.00      0.00      0.00       519

    accuracy                           0.87      3900
   macro avg       0.43      0.50      0.46      3900
weighted avg       0.75      0.87      0.81      3900

Classification Report for Testing Set:
              precision    recall  f1-score   support

     Class 0       0.87      1.00      0.93      4825
     Class 1       0.00      0.00      0.00       747

    accuracy                           0.87      5572
   macro avg       0.43      0.50      0.46      5572
weighted avg       0.75      0.87      0.80      5572

