# MLOPS

First project of the mlops cours. It needed to make a deep learning model that expres the feeling of text data and train it in a mlflow serve

## vectorizing

In [2]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from spacy.lang.fr.stop_words import STOP_WORDS as fr_stop

In [4]:
df_train = pd.read_csv('../data/train/train.csv', index_col=0)
df_train = df_train.sample(frac=1).reset_index(drop=True).iloc[:20000]
df_test = pd.read_csv('../data/test/test.csv', index_col=0, nrows=5000)
df_validation = pd.read_csv('../data/valid/valid.csv', index_col=0,nrows=5000)

In [5]:
vectorize = TfidfVectorizer(max_features=5000, strip_accents='ascii', lowercase=True, stop_words=list(fr_stop)) 
vectorize.fit(df_train["review"]) 
X_train = vectorize.transform(df_train["review"]) 
X_test = vectorize.transform(df_test["review"]) 
X_validation = vectorize.transform(df_validation["review"])



In [6]:
df_X_train = pd.DataFrame(X_train.toarray())
df_X_test = pd.DataFrame(X_test.toarray())
df_X_validation = pd.DataFrame(X_validation.toarray())

In [7]:
df_X_train["polarity"]=df_train["polarity"]
df_X_test["polarity"]=df_test["polarity"]
df_X_validation["polarity"]=df_validation["polarity"]

In [8]:
df_X_train.to_csv('../data/train/exp_train.csv',index=False)
df_X_test.to_csv('../data/test/exp_test.csv',index=False)
df_X_validation.to_csv('../data/valid/exp_valid.csv',index=False)

In [9]:
print("First document TF-IDF features:")
print(X_train.shape)

First document TF-IDF features:
(20000, 5000)


In [10]:
del X_test, X_train, X_validation, df_X_test, df_X_train, df_X_validation, vectorize, df_validation, df_train, df_test

## modele

### MLP

In [24]:
import torch
import torch.nn as nn
import torch.optim as opt
from torch.utils.data import TensorDataset, DataLoader
from datetime import datetime
import random
import string
import os
from tqdm import tqdm

class PolarityNN(nn.Module):
    """
        this model is for film review to predict if the review is good or bad

    """
    def __init__(self, input_size=5000, hidden_size=128):
        super(PolarityNN, self).__init__()

        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        rand_suffix = ''.join(random.choices(string.ascii_lowercase + string.digits, k=4))
        self.model_name = f"PolarityNN_{timestamp}_{rand_suffix}" 
        
        # neural layer
        self.model = nn.Sequential(
            # layer: 5000 -> 128
            nn.Linear(input_size, hidden_size),
            nn.ReLU(),
            nn.Dropout(0.3),
            
            # layer: 128 -> 64
            nn.Linear(hidden_size, 64),
            nn.ReLU(),
            nn.Dropout(0.2),
            
            # output layer: 64 -> 1 (classification binaire)
            nn.Linear(64, 1),
            nn.Sigmoid()
        )
        
    def forward(self, x):
        # x => (batch_size, 5000)
        return self.model(x)


    def predict(self, x):
        with torch.no_grad():
            probs = self.forward(x)
            return (probs > 0.5).float()
        
    def predict_batch(self, data_loader):
        self.eval()
        with torch.no_grad():
            predicted = []
            true = []
            
            for batch_x, batch_y in data_loader:
                probs = self(batch_x)              # sorties sigmoïdes entre 0 et 1
                preds = (probs > 0.5).float()      # arrondi à 0/1

                predicted.append(preds)
                true.append(batch_y)

            # concatène toutes les prédictions
            predicted = torch.cat(predicted, dim=0)
            true = torch.cat(true, dim=0)

            return predicted, true

            
    def train_polarityNN(self, train_loader, val_loader, num_epochs=10, learning_rate=0.001):
        criterion = nn.BCELoss()
        optimizer = torch.optim.Adam(self.parameters(), lr=learning_rate)
        
        train_losses = []
        val_losses = []
        
        progress =tqdm(range(num_epochs*len(train_loader)))
        for epoch in range(num_epochs):
            # torch training mode
            self.train()
            total_train_loss = 0
            
            for batch_x, batch_y in train_loader:
                # Forward pass
                outputs = self(batch_x)
                loss = criterion(outputs, batch_y.float().view(-1, 1))
                
                # Backward pass and optim
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
                
                total_train_loss += loss.item()
                progress.update(1)
                progress.refresh()
            avg_train_loss = total_train_loss / len(train_loader)
            train_losses.append(avg_train_loss)
            
            # torch eval
            self.eval()
            total_val_loss = 0
            correct = 0
            total = 0
            
            with torch.no_grad():
                for batch_x, batch_y in val_loader:
                    outputs = self(batch_x)
                    loss = criterion(outputs, batch_y.float().view(-1, 1))
                    total_val_loss += loss.item()

                    predicted = (outputs > 0.5).float()
                    total += batch_y.size(0)
                    correct += (predicted.view(-1) == batch_y).sum().item()
            
            avg_val_loss = total_val_loss / len(val_loader)
            val_losses.append(avg_val_loss)
            accuracy = 100 * correct / total
            
            tqdm.write(f'Epoch [{epoch+1}/{num_epochs}]')
            tqdm.write(f'Train Loss: {avg_train_loss:.4f}')
            tqdm.write(f'Val Loss: {avg_val_loss:.4f}')
            tqdm.write(f'Val Accuracy: {accuracy:.2f}%\n')
        
        return train_losses, val_losses
    
    def save(self, folder_name="trained"):
        base_dir = os.path.dirname(os.path.abspath(__file__))
        save_dir = os.path.join(base_dir, folder_name)
        
        os.makedirs(save_dir, exist_ok=True)
        path = os.path.join(save_dir, f"{self.model_name}.pt")
        
        torch.save(self.state_dict(), path)
        print(f"Model saved at: {path}")
        return path

In [4]:
df_train = pd.read_csv('../data/train/exp_train.csv')
df_test = pd.read_csv('../data/test/exp_test.csv', nrows=2000)
df_val = pd.read_csv('../data/valid/exp_valid.csv')
df_train.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4991,4992,4993,4994,4995,4996,4997,4998,4999,polarity
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


In [13]:
train_tensor = TensorDataset(torch.tensor(df_train.drop(columns="polarity").to_numpy(),dtype=torch.float),torch.tensor(df_train["polarity"].to_numpy(), dtype=torch.float))
test_tensor = TensorDataset(torch.tensor(df_test.drop(columns="polarity").to_numpy(),dtype=torch.float),torch.tensor(df_test["polarity"].to_numpy(), dtype=torch.float))
val_tensor = TensorDataset(torch.tensor(df_val.drop(columns="polarity").to_numpy(),dtype=torch.float),torch.tensor(df_val["polarity"].to_numpy(), dtype=torch.float))


In [14]:
train_tensor = TensorDataset(torch.tensor(df_train.drop(columns="polarity").to_numpy(),dtype=torch.float),torch.tensor(df_train["polarity"].to_numpy(), dtype=torch.float))
test_tensor = TensorDataset(torch.tensor(df_test.drop(columns="polarity").to_numpy(),dtype=torch.float),torch.tensor(df_test["polarity"].to_numpy(), dtype=torch.float))
val_tensor = TensorDataset(torch.tensor(df_val.drop(columns="polarity").to_numpy(),dtype=torch.float),torch.tensor(df_val["polarity"].to_numpy(), dtype=torch.float))


In [15]:
train_loader = DataLoader(train_tensor, batch_size=30)
test_loader = DataLoader(test_tensor, batch_size=30)
val_loader = DataLoader(val_tensor, batch_size=30)

In [26]:
model = PolarityNN()
model.train_polarityNN(train_loader,val_loader, num_epochs=25)

  4%|▍         | 687/16675 [00:08<04:50, 54.95it/s] 

Epoch [1/25]
Train Loss: 0.3282
Val Loss: 0.2486
Val Accuracy: 89.46%



  8%|▊         | 1354/16675 [00:14<03:33, 71.60it/s] 

Epoch [2/25]
Train Loss: 0.1854
Val Loss: 0.2687
Val Accuracy: 89.34%



 12%|█▏        | 2024/16675 [00:20<03:18, 73.73it/s] 

Epoch [3/25]
Train Loss: 0.1339
Val Loss: 0.3141
Val Accuracy: 89.08%



 16%|█▌        | 2691/16675 [00:25<03:09, 73.95it/s] 

Epoch [4/25]
Train Loss: 0.0835
Val Loss: 0.3781
Val Accuracy: 88.66%



 20%|██        | 3357/16675 [00:31<03:06, 71.40it/s] 

Epoch [5/25]
Train Loss: 0.0501
Val Loss: 0.4783
Val Accuracy: 88.12%



 24%|██▍       | 4018/16675 [00:38<04:28, 47.12it/s] 

Epoch [6/25]
Train Loss: 0.0296
Val Loss: 0.6344
Val Accuracy: 88.40%



 28%|██▊       | 4692/16675 [00:44<02:44, 72.97it/s] 

Epoch [7/25]
Train Loss: 0.0179
Val Loss: 0.8252
Val Accuracy: 88.66%



 32%|███▏      | 5360/16675 [00:50<02:50, 66.18it/s] 

Epoch [8/25]
Train Loss: 0.0119
Val Loss: 0.9720
Val Accuracy: 88.54%



 36%|███▌      | 6021/16675 [00:56<02:37, 67.58it/s] 

Epoch [9/25]
Train Loss: 0.0107
Val Loss: 1.1693
Val Accuracy: 88.48%



 40%|████      | 6692/16675 [01:01<02:18, 71.91it/s] 

Epoch [10/25]
Train Loss: 0.0079
Val Loss: 1.4912
Val Accuracy: 88.40%



 44%|████▍     | 7360/16675 [01:07<02:16, 68.15it/s] 

Epoch [11/25]
Train Loss: 0.0059
Val Loss: 1.6770
Val Accuracy: 88.24%



 48%|████▊     | 8026/16675 [01:13<02:29, 58.01it/s] 

Epoch [12/25]
Train Loss: 0.0046
Val Loss: 1.9331
Val Accuracy: 88.30%



 52%|█████▏    | 8693/16675 [01:19<02:10, 61.29it/s] 

Epoch [13/25]
Train Loss: 0.0045
Val Loss: 2.0011
Val Accuracy: 88.30%



 56%|█████▌    | 9350/16675 [01:25<03:18, 36.85it/s] 

Epoch [14/25]
Train Loss: 0.0047
Val Loss: 2.1717
Val Accuracy: 88.46%



 60%|██████    | 10027/16675 [01:32<01:45, 63.12it/s] 

Epoch [15/25]
Train Loss: 0.0041
Val Loss: 1.8856
Val Accuracy: 88.50%



 64%|██████▍   | 10696/16675 [01:37<01:26, 69.50it/s] 

Epoch [16/25]
Train Loss: 0.0022
Val Loss: 2.1443
Val Accuracy: 88.56%



 68%|██████▊   | 11362/16675 [01:43<01:13, 72.38it/s] 

Epoch [17/25]
Train Loss: 0.0035
Val Loss: 1.8224
Val Accuracy: 88.40%



 72%|███████▏  | 12028/16675 [01:49<01:18, 59.54it/s] 

Epoch [18/25]
Train Loss: 0.0023
Val Loss: 2.8144
Val Accuracy: 87.68%



 76%|███████▌  | 12695/16675 [01:55<00:56, 70.92it/s] 

Epoch [19/25]
Train Loss: 0.0024
Val Loss: 2.5530
Val Accuracy: 88.62%



 80%|████████  | 13357/16675 [02:02<01:01, 54.01it/s] 

Epoch [20/25]
Train Loss: 0.0022
Val Loss: 2.6246
Val Accuracy: 88.36%



 84%|████████▍ | 14015/16675 [02:08<00:21, 122.70it/s]

Epoch [21/25]
Train Loss: 0.0029
Val Loss: 2.4604
Val Accuracy: 88.66%



 88%|████████▊ | 14695/16675 [02:14<00:31, 62.51it/s] 

Epoch [22/25]
Train Loss: 0.0028
Val Loss: 2.5231
Val Accuracy: 88.48%



 92%|█████████▏| 15361/16675 [02:20<00:20, 63.24it/s] 

Epoch [23/25]
Train Loss: 0.0020
Val Loss: 3.0414
Val Accuracy: 88.36%



 96%|█████████▌| 16030/16675 [02:26<00:09, 65.40it/s] 

Epoch [24/25]
Train Loss: 0.0021
Val Loss: 2.4170
Val Accuracy: 88.18%



100%|██████████| 16675/16675 [02:32<00:00, 109.58it/s]

Epoch [25/25]
Train Loss: 0.0036
Val Loss: 2.3361
Val Accuracy: 88.34%






([0.3282160347514692,
  0.18543077609885758,
  0.13387718170914784,
  0.08351629696202756,
  0.050123033253580744,
  0.029642132420607353,
  0.01786273420699378,
  0.011876074707859684,
  0.010730550798761361,
  0.007907669472315129,
  0.005894199527146541,
  0.004613679294203084,
  0.004492594836513202,
  0.004710974080600115,
  0.004114080734020728,
  0.002182774950618086,
  0.0035357534323728255,
  0.0023360206829916652,
  0.0023684301156319046,
  0.00216392207141746,
  0.0029341196792600607,
  0.0027981520218564038,
  0.002019071843791756,
  0.00206646158823086,
  0.003569417101468822],
 [0.24862534227128513,
  0.26872654993109363,
  0.31412516286719344,
  0.37810112571644927,
  0.47826966289765466,
  0.634382216109264,
  0.8251633445138368,
  0.9719599700995727,
  1.169260451140741,
  1.491158683113698,
  1.6769974613624965,
  1.9331328971409043,
  2.001092997416547,
  2.1716940231961277,
  1.8855638121797433,
  2.1443299110828966,
  1.822386942705569,
  2.814374225200443,
  2.553

In [2]:
def compute_metrics(pred, true):
    # pred et true doivent être des tensors 0/1
    if type(pred) != np.ndarray:
        pred = pred.int()
        true = true.int()

    TP = ((pred == 1) & (true == 1)).sum().item()
    TN = ((pred == 0) & (true == 0)).sum().item()
    FP = ((pred == 1) & (true == 0)).sum().item()
    FN = ((pred == 0) & (true == 1)).sum().item()

    accuracy  = (TP + TN) / (TP + TN + FP + FN)
    precision = TP / (TP + FP) if (TP + FP) > 0 else 0
    recall    = TP / (TP + FN) if (TP + FN) > 0 else 0

    return accuracy, precision, recall


In [30]:
pred, true = model.predict_batch(test_loader)
accuracy, precision, recall = compute_metrics(pred, true)

print("Accuracy :", accuracy)
print("Precision:", precision)
print("Recall   :", recall)


Accuracy : 0.501207
Precision: 0.4645
Recall   : 0.483


### LogisticRegression

In [1]:
from sklearn.linear_model import LogisticRegression
import pandas as pd
import numpy as np

In [5]:
LR = LogisticRegression()
X_train = df_train.drop(columns="polarity").to_numpy()
y_train = df_train["polarity"].to_numpy()

X_test = df_test.drop(columns="polarity").to_numpy()
y_test = df_test["polarity"].to_numpy()

In [6]:
LR.fit(X_train,y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,100


In [6]:
print(LR.score(X_test,y_test))
predict = LR.predict(X_test)

0.894


In [9]:
accuracy, precision, recall = compute_metrics(predict, y_test)

print("Accuracy :", accuracy)
print("Precision:", precision)
print("Recall   :", recall)

Accuracy : 0.894
Precision: 0.8738269030239834
Recall   : 0.9020452099031216


In [7]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform

param_dist = {
    'C': uniform(loc=0, scale=4),
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear', 'saga'] 
}

logreg = LogisticRegression(max_iter=1000, random_state=42)

random_search = RandomizedSearchCV(
    estimator=logreg,
    param_distributions=param_dist,
    n_iter=50,        
    cv=5,             
    scoring='accuracy',
    random_state=42,
    n_jobs=-1        
)

random_search.fit(X_train, y_train)

print("best hyper-param :", random_search.best_params_)
print("best score CV :", random_search.best_score_)
print("training set core :", random_search.best_estimator_.score(X_test, y_test))

best hyper-param : {'C': np.float64(1.2468443043576438), 'penalty': 'l2', 'solver': 'saga'}
best score CV : 0.8964000000000001
training set core : 0.892
