In [66]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('darkgrid')

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix

from hyperopt import tpe, STATUS_OK, Trials, hp, fmin

import mlflow
mlflow.set_experiment("Mlflow test 2")

<Experiment: artifact_location='file:///c:/Users/petit/Openclassrooms/P7/mlruns/590560026109209413', creation_time=1741184161302, experiment_id='590560026109209413', last_update_time=1741184161302, lifecycle_stage='active', name='Mlflow test 2', tags={}>

In [67]:
def objective(params):
    with mlflow.start_run(nested=True):
        model = LogisticRegression(**params, random_state=42)
        
        scores = cross_val_score(model, X_train, y_train, cv=5, n_jobs=-1, scoring='accuracy')
        
        average_score = round(sum(scores) / 5, 3)
        
        loss = 1 - average_score
        
        
        mlflow.log_metric('Accuracy', average_score)
        mlflow.log_params(params)
        
        return {'loss': loss, 'params': params, 'status': STATUS_OK}

In [68]:
space = {
    'tol': hp.uniform('tol', 0.00001, 0.0001),
    'C': hp.uniform('C', 0.05, 3)
}

In [69]:
model_name = ""

In [70]:
df = pd.read_csv('preprocessed_tweet.csv')
df.head()

Unnamed: 0,Target,Tweet_preprocessed,Tweet_preprocessed_dl
0,0,awww thats a bummer you shoulda got david car...,"awww, that's a bummer. you shoulda got david..."
1,0,is upset that he cant update his facebook by t...,is upset that he can't update his facebook by ...
2,0,i dived many times for the ball managed to sav...,i dived many times for the ball. managed to sa...
3,0,my whole body feels itchy and like its on fire,my whole body feels itchy and like its on fire
4,0,no its not behaving at all im mad why am i her...,"no, it's not behaving at all. i'm mad. why am ..."


In [71]:
y = df['Target'].copy()
X = df['Tweet_preprocessed'].copy()

In [72]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [73]:
X_train.head()

612396     hey splogin connect speed should be ok ram may...
1445587    again wit the socks 2day huh hehe i like them ...
1310255               hey chick how you feeling this morning
351809       i feel like i never chill with you guys anymore
204527                               yep thats what happened
Name: Tweet_preprocessed, dtype: object

In [74]:
X_train[X_train.isna()]

Series([], Name: Tweet_preprocessed, dtype: object)

In [75]:
vectorizer_name = "tfidf"

vectorizer_params = {"stop_words": "english", "max_df": 0.95, "min_df": 1}

vectorizers = {
    'count-vectorizer': CountVectorizer(**vectorizer_params),
    'tfidf': TfidfVectorizer(**vectorizer_params)
}

vectorizer = vectorizers[vectorizer_name]

vectorizer.fit(X_train)

X_train = vectorizer.transform(X_train)
X_test = vectorizer.transform(X_test)

In [76]:
tpe_algorithm = tpe.suggest

bayes_trials = Trials()

with mlflow.start_run(run_name="hyperopt_logistic") as run:
    best = fmin(fn=objective, space=space, algo=tpe_algorithm, max_evals=10, trials=bayes_trials)
    best = {key:float(value) for key, value in best.items()}
    
    
    mlflow.log_dict(best, "best_params.json")

100%|██████████| 10/10 [02:46<00:00, 16.68s/trial, best loss: 0.22299999999999998]


In [77]:
print(best)

{'C': 0.6407203973611235, 'tol': 2.1539469728385794e-05}


In [None]:
from time import perf_counter
from itertools import product

params = {
    'tol': [0.00001, 0.0001],
    'C': [0.05, 0.1, 0.5, 1]
}


with mlflow.start_run():
    for t, c in product(params['tol'], params['C']):
    
        mlflow.log_params({"vectorizer": vectorizer_name})
        
        print(f"training model with params: tol:{t}, C:{c}")
        
        model = LogisticRegression(tol=t, C=c, max_iter=1000)
        
        start = perf_counter()

        model.fit(X_train, y_train)
        
        
        y_pred = model.predict(X_test)
        
        end = perf_counter()
        
        duration = round(end - start, 2)
        
        score = accuracy_score(y_test, y_pred)
        
        print(f"{score=}")
        mlflow.log_metric("accuracy", score)
        mlflow.log_metric("duration", duration)



training model with params: tol:1e-05, C:0.05




score=0.7682100939997142
training model with params: tol:1e-05, C:0.1




score=0.772068871276305
training model with params: tol:1e-05, C:0.5




score=0.7770158287595498
training model with params: tol:1e-05, C:1




score=0.7774270304966264
training model with params: tol:0.0001, C:0.05




score=0.7671770750016924
training model with params: tol:0.0001, C:0.1




score=0.772093944552956
training model with params: tol:0.0001, C:0.5




score=0.7768678964273088
training model with params: tol:0.0001, C:1




score=0.7766898761630866


In [None]:
mlflow.set_experiment("Bert_model")

with mlflow.start_run():
    #mlflow.log_params({"vectorizer": vectorizer_name})
    
    start = perf_counter()

    bert_model.fit
    
    
    y_pred = bert_model.predict(X_test)
    
    end = perf_counter()
    
    duration = round(end - start, 2)
    
    score = accuracy_score(y_test, y_pred)
    mlflow.log_metric("accuracy", score)
    mlflow.log_metric("duration", duration)