# MLOPS

First project of the mlops cours. It needed to make a deep learning model that expres the feeling of text data and train it in a mlflow serve

## Data analysis

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from spacy.lang.fr.stop_words import STOP_WORDS as fr_stop

In [2]:
df_train = pd.read_csv('../data/train.csv', index_col=0)
df_train = df_train.sample(frac=1).reset_index(drop=True).iloc[:20000]
df_test = pd.read_csv('../data/test.csv', index_col=0, nrows=5000)
df_validation = pd.read_csv('../data/valid.csv', index_col=0,nrows=5000)

df_train["polarity"].value_counts()

polarity
1    10046
0     9954
Name: count, dtype: int64

In [3]:
df_train.shape

(20000, 3)

So the data is Film review with 2 colones usable. The review and the polarity(label

In [4]:
df_train.isnull().sum()

film-url    0
review      0
polarity    0
dtype: int64

In [5]:
l = list(df_train["review"])
lenths = [len(i.split()) for i in l]
print("review is between", np.min(lenths), "and", np.max(lenths), "words long\nand have an average of", np.mean(lenths), "words")
del l, lenths

review is between 1 and 373 words long
and have an average of 91.80115 words


In [6]:
import re

# Nettoyage du texte - suppression des caractères spéciaudf_train["review"] = df_train["review"].apply(lambda x: re.sub(r'[^éèêa-zA-Z\s]', '', x)dftrin["review"] = dftrain["review"].apply(lambda x: re.sub(r'[^éèêa-zA-Z\s]', '', x))
# df_train["review"] = df_train["review"].apply(lambda x: re.sub(r'[^éèêa-zA-Z\s]', '', x))
# df_test["review"] = df_test["review"].apply(lambda x: re.sub(r'[^éèêa-zA-Z\s]', '', x))
# df_validation["review"] = df_validation["review"].apply(lambda x: re.sub(r'[^éèêa-zA-Z\s]', '', x))

In [7]:
df_train["review"]

0        4/5 Un bon film qui en étonnera plus de 1 je p...
1        Vu 2 fois en 3 jours : jeu époustouflant, mise...
2        J'ai grandi avec ce film pour référence en ce ...
3        Ce film est d'une telle beauté que maintenant,...
4        ....Je viens de visionner et j'étais par momen...
                               ...                        
19995    excellent divertissement émouvant,dramatique e...
19996    tres beau documentaire - un homme se fait taba...
19997    Quand on n'a pas lu le bouquin, et qu'on voit ...
19998    Une tache dans la carrière de Patrice Leconte,...
19999    Une faute de gout, une comédie italienne resse...
Name: review, Length: 20000, dtype: object

## vectorizing

In [8]:
vectorize = TfidfVectorizer(max_features=5000, strip_accents='ascii', lowercase=True, stop_words=list(fr_stop)) 
vectorize.fit(df_train["review"]) 
X_train = vectorize.transform(df_train["review"]) 
X_test = vectorize.transform(df_test["review"]) 
X_validation = vectorize.transform(df_validation["review"])



In [9]:
df_X_train = pd.DataFrame(X_train.toarray())
df_X_test = pd.DataFrame(X_test.toarray())
df_X_validation = pd.DataFrame(X_validation.toarray())

In [10]:
df_X_train["polarity"]=df_train["polarity"]
df_X_test["polarity"]=df_test["polarity"]
df_X_validation["polarity"]=df_validation["polarity"]

In [11]:
df_X_train.to_csv('../data/exp_train.csv',index=False)
df_X_test.to_csv('../data/exp_test.csv',index=False)
df_X_validation.to_csv('../data/exp_valid.csv',index=False)

In [12]:
print("First document TF-IDF features:")
print(X_train.shape)

First document TF-IDF features:
(20000, 5000)


In [13]:
del X_test, X_train, X_validation, df_X_test, df_X_train, df_X_validation, vectorize, df_validation, df_train, df_test

## modele

In [23]:
import torch
import torch.nn as nn
import torch.optim as opt
from torch.utils.data import TensorDataset, DataLoader
from datetime import datetime
import random
import string
import os
from tqdm import tqdm

class PolarityNN(nn.Module):
    """
        this model is for film review to predict if the review is good or bad

    """
    def __init__(self, input_size=5000, hidden_size=128):
        super(PolarityNN, self).__init__()

        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        rand_suffix = ''.join(random.choices(string.ascii_lowercase + string.digits, k=4))
        self.model_name = f"PolarityNN_{timestamp}_{rand_suffix}" 
        
        # neural layer
        self.model = nn.Sequential(
            # layer: 5000 -> 128
            nn.Linear(input_size, hidden_size),
            nn.ReLU(),
            nn.Dropout(0.3),
            
            # layer: 128 -> 64
            nn.Linear(hidden_size, 64),
            nn.ReLU(),
            nn.Dropout(0.2),
            
            # output layer: 64 -> 1 (classification binaire)
            nn.Linear(64, 1),
            nn.Sigmoid()  
        )
        
    def forward(self, x):
        # x => (batch_size, 5000)
        return self.model(x)


    def predict(self, x):
        with torch.no_grad():
            probs = self.forward(x)
            return (probs > 0.5).float()
            
    def train_polarityNN(self, train_loader, val_loader, num_epochs=10, learning_rate=0.001):
        criterion = nn.BCELoss()
        optimizer = torch.optim.Adam(self.parameters(), lr=learning_rate)
        
        train_losses = []
        val_losses = []
        
        progress =tqdm(range(num_epochs*len(train_loader)))
        for epoch in range(num_epochs):
            # torch training mode
            self.train()
            total_train_loss = 0
            
            for batch_x, batch_y in train_loader:
                # Forward pass
                outputs = self(batch_x)
                loss = criterion(outputs, batch_y.float().view(-1, 1))
                
                # Backward pass and optim
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
                
                total_train_loss += loss.item()
                progress.update(1)
                progress.refresh()
            avg_train_loss = total_train_loss / len(train_loader)
            train_losses.append(avg_train_loss)
            
            # torch eval
            self.eval()
            total_val_loss = 0
            correct = 0
            total = 0
            
            with torch.no_grad():
                for batch_x, batch_y in val_loader:
                    outputs = self(batch_x)
                    loss = criterion(outputs, batch_y.float().view(-1, 1))
                    total_val_loss += loss.item()

                    predicted = (outputs > 0.5).float()
                    total += batch_y.size(0)
                    correct += (predicted.view(-1) == batch_y).sum().item()
            
            avg_val_loss = total_val_loss / len(val_loader)
            val_losses.append(avg_val_loss)
            accuracy = 100 * correct / total
            
            tqdm.write(f'Epoch [{epoch+1}/{num_epochs}]')
            tqdm.write(f'Train Loss: {avg_train_loss:.4f}')
            tqdm.write(f'Val Loss: {avg_val_loss:.4f}')
            tqdm.write(f'Val Accuracy: {accuracy:.2f}%\n')
        
        return train_losses, val_losses
    
    def save(self, folder_name="trained"):
        base_dir = os.path.dirname(os.path.abspath(__file__))
        save_dir = os.path.join(base_dir, folder_name)
        
        os.makedirs(save_dir, exist_ok=True)
        path = os.path.join(save_dir, f"{self.model_name}.pt")
        
        torch.save(self.state_dict(), path)
        print(f"Model saved at: {path}")
        return path

In [15]:
df_train = pd.read_csv('../data/exp_train.csv')
df_test = pd.read_csv('../data/exp_test.csv', nrows=2000)
df_val = pd.read_csv('../data/exp_valid.csv')
df_train.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4991,4992,4993,4994,4995,4996,4997,4998,4999,polarity
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


In [16]:
train_tensor = TensorDataset(torch.tensor(df_train.drop(columns="polarity").to_numpy(),dtype=torch.float),torch.tensor(df_train["polarity"].to_numpy(), dtype=torch.float))
test_tensor = TensorDataset(torch.tensor(df_test.drop(columns="polarity").to_numpy(),dtype=torch.float),torch.tensor(df_test["polarity"].to_numpy(), dtype=torch.float))
val_tensor = TensorDataset(torch.tensor(df_val.drop(columns="polarity").to_numpy(),dtype=torch.float),torch.tensor(df_val["polarity"].to_numpy(), dtype=torch.float))


In [17]:
train_tensor = TensorDataset(torch.tensor(df_train.drop(columns="polarity").to_numpy(),dtype=torch.float),torch.tensor(df_train["polarity"].to_numpy(), dtype=torch.float))
test_tensor = TensorDataset(torch.tensor(df_test.drop(columns="polarity").to_numpy(),dtype=torch.float),torch.tensor(df_test["polarity"].to_numpy(), dtype=torch.float))
val_tensor = TensorDataset(torch.tensor(df_val.drop(columns="polarity").to_numpy(),dtype=torch.float),torch.tensor(df_val["polarity"].to_numpy(), dtype=torch.float))


In [18]:
train_loader = DataLoader(train_tensor, batch_size=30)
test_loader = DataLoader(test_tensor, batch_size=30)
val_loader = DataLoader(val_tensor, batch_size=30)

In [22]:
model = PolarityNN()
model.train_polarityNN(train_loader,val_loader, num_epochs=25)

  0%|          | 0/16675 [00:00<?, ?it/s]

  4%|▍         | 688/16675 [00:07<06:48, 39.16it/s] 

Epoch [1/25]
Train Loss: 0.3260
Val Loss: 0.2394
Val Accuracy: 89.95%



  8%|▊         | 1357/16675 [00:13<03:17, 77.56it/s] 

Epoch [2/25]
Train Loss: 0.1834
Val Loss: 0.2508
Val Accuracy: 89.85%



 12%|█▏        | 2024/16675 [00:18<02:55, 83.43it/s] 

Epoch [3/25]
Train Loss: 0.1274
Val Loss: 0.3026
Val Accuracy: 89.70%



 16%|█▌        | 2690/16675 [00:25<05:19, 43.82it/s] 

Epoch [4/25]
Train Loss: 0.0788
Val Loss: 0.2710
Val Accuracy: 89.20%



 20%|██        | 3357/16675 [00:31<02:49, 78.69it/s] 

Epoch [5/25]
Train Loss: 0.0606
Val Loss: 0.4323
Val Accuracy: 89.05%



 24%|██▍       | 4022/16675 [00:37<02:44, 76.76it/s] 

Epoch [6/25]
Train Loss: 0.0360
Val Loss: 0.6649
Val Accuracy: 88.75%



 28%|██▊       | 4692/16675 [00:43<02:28, 80.74it/s] 

Epoch [7/25]
Train Loss: 0.0251
Val Loss: 0.7987
Val Accuracy: 88.60%



 32%|███▏      | 5359/16675 [00:48<02:20, 80.40it/s] 

Epoch [8/25]
Train Loss: 0.0169
Val Loss: 1.0829
Val Accuracy: 88.80%



 36%|███▌      | 6026/16675 [00:54<02:31, 70.46it/s] 

Epoch [9/25]
Train Loss: 0.0131
Val Loss: 1.4712
Val Accuracy: 88.75%



 40%|████      | 6688/16675 [01:00<02:01, 82.45it/s] 

Epoch [10/25]
Train Loss: 0.0116
Val Loss: 1.2878
Val Accuracy: 88.60%



 44%|████▍     | 7359/16675 [01:05<01:56, 80.23it/s] 

Epoch [11/25]
Train Loss: 0.0122
Val Loss: 1.4869
Val Accuracy: 88.65%



 48%|████▊     | 8023/16675 [01:11<01:51, 77.93it/s] 

Epoch [12/25]
Train Loss: 0.0108
Val Loss: 1.4403
Val Accuracy: 88.55%



 52%|█████▏    | 8692/16675 [01:19<02:09, 61.71it/s] 

Epoch [13/25]
Train Loss: 0.0108
Val Loss: 1.6960
Val Accuracy: 88.60%



 56%|█████▌    | 9357/16675 [01:26<02:10, 56.06it/s] 

Epoch [14/25]
Train Loss: 0.0083
Val Loss: 1.8904
Val Accuracy: 88.50%



 60%|██████    | 10025/16675 [01:33<01:43, 64.18it/s] 

Epoch [15/25]
Train Loss: 0.0080
Val Loss: 1.9906
Val Accuracy: 88.60%



 64%|██████▍   | 10690/16675 [01:39<01:38, 60.57it/s] 

Epoch [16/25]
Train Loss: 0.0080
Val Loss: 1.9933
Val Accuracy: 89.00%



 68%|██████▊   | 11359/16675 [01:46<01:19, 66.50it/s] 

Epoch [17/25]
Train Loss: 0.0075
Val Loss: 2.0418
Val Accuracy: 88.60%



 72%|███████▏  | 12024/16675 [01:53<01:33, 49.61it/s] 

Epoch [18/25]
Train Loss: 0.0075
Val Loss: 2.2308
Val Accuracy: 88.90%



 76%|███████▌  | 12691/16675 [02:00<01:14, 53.30it/s] 

Epoch [19/25]
Train Loss: 0.0064
Val Loss: 2.2635
Val Accuracy: 88.60%



 80%|████████  | 13358/16675 [02:07<01:00, 54.90it/s] 

Epoch [20/25]
Train Loss: 0.0075
Val Loss: 2.5595
Val Accuracy: 88.60%



 84%|████████▍ | 14026/16675 [02:14<00:39, 67.11it/s] 

Epoch [21/25]
Train Loss: 0.0084
Val Loss: 2.1494
Val Accuracy: 88.75%



 88%|████████▊ | 14692/16675 [02:23<00:33, 59.02it/s] 

Epoch [22/25]
Train Loss: 0.0073
Val Loss: 2.2052
Val Accuracy: 88.80%



 92%|█████████▏| 15360/16675 [02:30<00:21, 61.83it/s] 

Epoch [23/25]
Train Loss: 0.0066
Val Loss: 2.5103
Val Accuracy: 88.75%



 96%|█████████▌| 16027/16675 [02:38<00:10, 61.62it/s] 

Epoch [24/25]
Train Loss: 0.0072
Val Loss: 2.3007
Val Accuracy: 88.80%



100%|██████████| 16675/16675 [02:45<00:00, 100.80it/s]

Epoch [25/25]
Train Loss: 0.0067
Val Loss: 2.2797
Val Accuracy: 88.85%






([0.32595861615470684,
  0.18344305086506718,
  0.12744797857386106,
  0.07875471576703419,
  0.06058747746030112,
  0.03599515944682051,
  0.025129523374388278,
  0.01694756668839709,
  0.013070438302824148,
  0.011556760380098768,
  0.01219957363811215,
  0.010804984652338718,
  0.010814896335359984,
  0.008265953300408725,
  0.0080289415914923,
  0.008030035133953077,
  0.007479197127874644,
  0.007480609319812733,
  0.006400956667810289,
  0.0074880968213448884,
  0.00841179771974228,
  0.007303248721319285,
  0.006554405912623364,
  0.0072043465310689695,
  0.006709857508571664],
 [0.23936297010574767,
  0.2508171899105186,
  0.30263835444712817,
  0.2710432862835144,
  0.4323046344763307,
  0.6648916329966107,
  0.7987228052537722,
  1.0828683291207444,
  1.4711851890801813,
  1.287840352073979,
  1.4869231340881965,
  1.4402871107110935,
  1.6959943778103967,
  1.8903514984882186,
  1.9906377260453318,
  1.99333435788563,
  2.0417609663042517,
  2.230826181762699,
  2.2635283768