In [1]:
import os
import subprocess
import sys

# Install with the interpreter that backs this kernel (`sys.executable -m pip`)
# rather than whichever `pip` happens to be first on PATH, so the packages end
# up in the environment the following cells import from. `check_call` raises
# CalledProcessError on a non-zero exit instead of silently continuing, and
# returns 0 on success.
subprocess.check_call(
    [sys.executable, "-m", "pip", "install", "-q", "dagshub", "mlflow"]
)

0

In [None]:
import os
import mlflow
from mlflow.tracking import MlflowClient
from mlflow.models import infer_signature
import dagshub

In [None]:
# Connect this session to the DagsHub-hosted MLflow server.
dagshub.init(repo_owner='YogeshKumar-saini', repo_name='Fake-Review-Detection', mlflow=True)

# Set tracking URI
mlflow.set_tracking_uri("https://dagshub.com/YogeshKumar-saini/Fake-Review-Detection.mlflow")


client = MlflowClient()
default_experiment = client.get_experiment_by_name("Fake Review Detection")

# WARNING: this cell is destructive - it deletes every run returned for the
# experiment so the sweep below starts from a clean slate.
if default_experiment is None:
    # get_experiment_by_name returns None when the experiment does not exist
    # yet (e.g. first execution); there is nothing to purge in that case.
    runs = []
    print('Experiment "Fake Review Detection" not found; nothing to delete.')
else:
    runs = client.search_runs(experiment_ids=[default_experiment.experiment_id])

for run in runs:
    try:
        client.delete_run(run.info.run_id)
        print(f"Deleted run {run.info.run_id} from experiment {default_experiment.name}")
    except Exception as e:
        # Best effort: report the failure and keep deleting the remaining runs.
        print(f"Could not delete run {run.info.run_id}: {e}")


In [None]:
import os
import warnings
from sklearn.exceptions import ConvergenceWarning
warnings.filterwarnings("ignore", category=ConvergenceWarning)
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import mlflow
import mlflow.sklearn
import mlflow.keras
import mlflow.data
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, ExtraTreesClassifier
from sklearn.svm import SVC
import xgboost as xgb
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Embedding, LSTM, Bidirectional



# Tracking server and experiment that every run started below is recorded under.
TRACKING_URI = "https://dagshub.com/YogeshKumar-saini/Fake-Review-Detection.mlflow"
EXPERIMENT_NAME = "Fake Review Detection"

mlflow.set_tracking_uri(TRACKING_URI)
mlflow.set_experiment(EXPERIMENT_NAME)



# Input CSVs, grouped by kind. The directory prefixes are spelled once and the
# per-file stems listed explicitly, so the resulting path strings (and their
# order) are exactly the ones the experiment functions compare against.
_PROJECT_ROOT = "/home/yogesh/mlops/Mlop Projects/Fake Review Detection"
_FEATURE_DIR = f"{_PROJECT_ROOT}/data/Feature-engineered"
_EMBEDDING_DIR = f"{_PROJECT_ROOT}/data/embeddings"

# One engineered-feature CSV per text-preprocessing variant.
feature_files = [
    f"{_FEATURE_DIR}/preprocessed_{variant}_features.csv"
    for variant in (
        "lemmatization",
        "no_stopwords",
        "no_stopwords_no_lemmatization",
        "stemming",
        "stemming_no_stopwords",
    )
]

# One embedding CSV per (preprocessing variant, embedding type) pair.
embedding_files = [
    f"{_EMBEDDING_DIR}/preprocessed_{stem}.csv"
    for stem in (
        "lemmatization_bert",
        "lemmatization_glove",
        "lemmatization_tfidf",
        "no_stopwords_bert",
        "no_stopwords_glove",
        "no_stopwords_no_lemmatization_bert",
        "no_stopwords_no_lemmatization_glove",
        "no_stopwords_no_lemmatization_tfidf",
        "no_stopwords_tfidf",
        "stemming_bert",
        "stemming_glove",
        "stemming_no_stopwords_bert",
        "stemming_no_stopwords_glove",
        "stemming_no_stopwords_tfidf",
        "stemming_tfidf",
    )
]

# Processing order: feature files first, then embedding files.
files = [*feature_files, *embedding_files]



# Classical classifiers to sweep. Each entry maps a display name to
# (estimator class, hyper-parameter grid); the class is instantiated with its
# defaults and the grid is handed to GridSearchCV by run_ml_experiments.
models = {
    "LogisticRegression": (
        LogisticRegression,
        dict(C=[0.01, 0.1, 1], solver=["liblinear"], max_iter=[100, 200]),
    ),
    "RandomForest": (
        RandomForestClassifier,
        dict(n_estimators=[50, 100, 200], max_depth=[None, 10, 20]),
    ),
    "SVC": (
        SVC,
        dict(C=[0.1, 1, 10], kernel=["linear", "rbf"]),
    ),
    "XGBoost": (
        xgb.XGBClassifier,
        dict(
            n_estimators=[50, 100, 200],
            max_depth=[3, 5, 7],
            learning_rate=[0.01, 0.1, 0.2],
        ),
    ),
    "GradientBoosting": (
        GradientBoostingClassifier,
        dict(
            n_estimators=[50, 100, 200],
            max_depth=[3, 5, 7],
            learning_rate=[0.01, 0.1, 0.2],
        ),
    ),
    "AdaBoost": (
        AdaBoostClassifier,
        dict(n_estimators=[50, 100, 200], learning_rate=[0.01, 0.1, 1]),
    ),
    "ExtraTrees": (
        ExtraTreesClassifier,
        dict(n_estimators=[50, 100, 200], max_depth=[None, 10, 20]),
    ),
}



def build_1LSTM(vocab_size, max_length, num_classes):
    """Build and compile a classifier with one LSTM layer over a learned embedding.

    Args:
        vocab_size: Passed to the Embedding layer as ``input_dim``.
        max_length: Passed to the Embedding layer as ``input_length``.
        num_classes: Number of units in the final softmax layer.

    Returns:
        A compiled ``Sequential`` model (Adam optimizer, sparse categorical
        cross-entropy loss, accuracy metric).
    """
    stack = [
        Embedding(input_dim=vocab_size, output_dim=128, input_length=max_length),
        LSTM(64, dropout=0.3, recurrent_dropout=0.3),
        Dense(64, activation="relu"),
        Dropout(0.3),
        Dense(num_classes, activation="softmax"),
    ]
    network = Sequential(stack)
    network.compile(
        optimizer="adam",
        loss="sparse_categorical_crossentropy",
        metrics=["accuracy"],
    )
    return network

def build_2LSTM(vocab_size, max_length, num_classes):
    """Build and compile a classifier with two stacked LSTM layers.

    Args:
        vocab_size: Passed to the Embedding layer as ``input_dim``.
        max_length: Passed to the Embedding layer as ``input_length``.
        num_classes: Number of units in the final softmax layer.

    Returns:
        A compiled ``Sequential`` model (Adam optimizer, sparse categorical
        cross-entropy loss, accuracy metric).
    """
    stack = [
        Embedding(input_dim=vocab_size, output_dim=128, input_length=max_length),
        # return_sequences=True so the second LSTM receives the full sequence.
        LSTM(64, return_sequences=True, dropout=0.2, recurrent_dropout=0.2),
        LSTM(64, dropout=0.2, recurrent_dropout=0.2),
        Dense(64, activation="relu"),
        Dropout(0.2),
        Dense(num_classes, activation="softmax"),
    ]
    network = Sequential(stack)
    network.compile(
        optimizer="adam",
        loss="sparse_categorical_crossentropy",
        metrics=["accuracy"],
    )
    return network

def build_BiLSTM(vocab_size, max_length, num_classes):
    """Build and compile a classifier with one bidirectional LSTM layer.

    Args:
        vocab_size: Passed to the Embedding layer as ``input_dim``.
        max_length: Passed to the Embedding layer as ``input_length``.
        num_classes: Number of units in the final softmax layer.

    Returns:
        A compiled ``Sequential`` model (Adam optimizer, sparse categorical
        cross-entropy loss, accuracy metric).
    """
    stack = [
        Embedding(input_dim=vocab_size, output_dim=128, input_length=max_length),
        Bidirectional(LSTM(64, dropout=0.2, recurrent_dropout=0.2)),
        Dense(64, activation="relu"),
        Dropout(0.2),
        Dense(num_classes, activation="softmax"),
    ]
    network = Sequential(stack)
    network.compile(
        optimizer="adam",
        loss="sparse_categorical_crossentropy",
        metrics=["accuracy"],
    )
    return network

def build_2BiLSTM(vocab_size, max_length, num_classes):
    """Build and compile a classifier with two stacked bidirectional LSTM layers.

    Args:
        vocab_size: Passed to the Embedding layer as ``input_dim``.
        max_length: Passed to the Embedding layer as ``input_length``.
        num_classes: Number of units in the final softmax layer.

    Returns:
        A compiled ``Sequential`` model (Adam optimizer, sparse categorical
        cross-entropy loss, accuracy metric).
    """
    stack = [
        Embedding(input_dim=vocab_size, output_dim=128, input_length=max_length),
        # return_sequences=True so the second recurrent layer receives the full sequence.
        Bidirectional(LSTM(64, return_sequences=True, dropout=0.2, recurrent_dropout=0.2)),
        Bidirectional(LSTM(64, dropout=0.2, recurrent_dropout=0.2)),
        Dense(64, activation="relu"),
        Dropout(0.2),
        Dense(num_classes, activation="softmax"),
    ]
    network = Sequential(stack)
    network.compile(
        optimizer="adam",
        loss="sparse_categorical_crossentropy",
        metrics=["accuracy"],
    )
    return network

# Display name -> builder for each recurrent text model. Every builder takes
# (vocab_size, max_length, num_classes) and returns a compiled model.
text_models = dict(
    [
        ("1LSTM", build_1LSTM),
        ("2LSTM", build_2LSTM),
        ("BiLSTM", build_BiLSTM),
        ("2BiLSTM", build_2BiLSTM),
    ]
)



def build_dense_model(input_dim, num_classes):
    """Build and compile a fully connected classifier for fixed-width feature rows.

    Three ReLU layers (256, 128, 64 units), each followed by 30% dropout, feed
    a softmax output layer.

    Args:
        input_dim: Passed to the first Dense layer as ``input_dim``.
        num_classes: Number of units in the final softmax layer.

    Returns:
        A compiled ``Sequential`` model (Adam optimizer, sparse categorical
        cross-entropy loss, accuracy metric).
    """
    stack = [
        Dense(256, activation="relu", input_dim=input_dim),
        Dropout(0.3),
        Dense(128, activation="relu"),
        Dropout(0.3),
        Dense(64, activation="relu"),
        Dropout(0.3),
        Dense(num_classes, activation="softmax"),
    ]
    network = Sequential(stack)
    network.compile(
        optimizer="adam",
        loss="sparse_categorical_crossentropy",
        metrics=["accuracy"],
    )
    return network



# Resume support: run keys already recorded in the progress CSV are skipped by
# the experiment functions. Start with an empty set and fill it from the log
# when one exists.
progress_file = "progress_log.csv"
processed_keys = set()
if os.path.exists(progress_file):
    dfp = pd.read_csv(progress_file)
    processed_keys.update(dfp["run_key"].tolist())

def update_progress_log(new_row):
    """Append one result row to the CSV at ``progress_file``.

    Args:
        new_row: Seven values in the order run_key, File, Model, Accuracy,
            Precision, Recall, F1.

    The header line is written only when the file does not exist yet;
    otherwise the row is appended without a header.
    """
    column_names = ["run_key", "File", "Model", "Accuracy", "Precision", "Recall", "F1"]
    record = pd.DataFrame([new_row], columns=column_names)
    log_exists = os.path.exists(progress_file)
    record.to_csv(
        progress_file,
        mode="a" if log_exists else "w",
        index=False,
        header=not log_exists,
    )

def log_confusion_matrix(y_true, y_pred, run_key, prefix):
    """Render a confusion-matrix heatmap, save it as a PNG and log it to MLflow.

    Args:
        y_true: True labels.
        y_pred: Predicted labels.
        run_key: Identifier of the run; becomes part of the file name.
        prefix: Short tag placed before ``run_key`` in the file name
            (callers pass "ML" or "DL").

    The PNG is written under the project's ``reports/figures`` directory and
    then attached to the active MLflow run with ``mlflow.log_artifact``.
    """
    cm = confusion_matrix(y_true, y_pred)
    cm_path = f"/home/yogesh/mlops/Mlop Projects/Fake Review Detection/reports/figures/confusion_matrix_{prefix}_{run_key}.png"
    # savefig does not create missing directories; make sure the target exists.
    os.makedirs(os.path.dirname(cm_path), exist_ok=True)

    fig, ax = plt.subplots(figsize=(6, 4))
    try:
        sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", ax=ax)
        ax.set_xlabel("Predicted")
        ax.set_ylabel("Actual")
        fig.savefig(cm_path)
    finally:
        # Always release the figure, even if drawing or saving fails; this
        # function is called once per run and open figures would accumulate.
        plt.close(fig)
    mlflow.log_artifact(cm_path)

def log_dataset(df, source_file):
    """Attach the training dataset to the active MLflow run.

    Args:
        df: DataFrame that was loaded from ``source_file``.
        source_file: Path of the CSV the data came from.

    The DataFrame is wrapped with ``mlflow.data.from_pandas`` and recorded as
    a run input. If that fails for any reason, the CSV itself is uploaded as
    an artifact under ``dataset_csv`` so the run still carries its data.
    """
    try:
        ds = mlflow.data.from_pandas(df, source=source_file, name="dataset")
        # `mlflow.log_input` is the documented call for recording a dataset
        # against the active run.
        mlflow.log_input(ds, context="training")
    except Exception as exc:
        # Best-effort fallback; say why so a persistent failure is visible
        # instead of silently uploading the full CSV on every run.
        print(f"Dataset logging failed ({exc}); uploading {source_file} as an artifact instead.")
        mlflow.log_artifact(source_file, artifact_path="dataset_csv")



def run_ml_experiments(file):
    """Grid-search every classifier in ``models`` on one CSV and log each result to MLflow.

    Args:
        file: Path to a CSV with a ``label`` column. Returns without doing
            anything if the file is missing or has no ``label`` column.

    For each model whose run key is not already in ``processed_keys``, one
    MLflow run is created with the dataset, best hyper-parameters, test-set
    metrics, fitted model and confusion matrix; the result is then appended
    to the progress log.
    """
    if not os.path.exists(file):
        return
    df = pd.read_csv(file)
    if "label" not in df.columns:
        return
    df.dropna(inplace=True)
    y = df["label"].values
    if y.dtype == object:
        # String labels are mapped to integer codes.
        le = LabelEncoder()
        y = le.fit_transform(y)

    is_embedding_file = file in embedding_files

    if (file in feature_files) and ("processed_text" in df.columns) and ("lexical_diversity" in df.columns):
        numeric_cols = [
            "lexical_diversity", "avg_word_length", "sentiment_polarity",
            "subjectivity", "flesch_reading_ease", "sentence_length",
            "named_entity_count", "noun_count", "verb_count", "adj_count", "adv_count"
        ]
        available_cols = [col for col in numeric_cols if col in df.columns]
        X = df[available_cols].values
    else:
        # Keep only numeric/boolean columns: a text column such as
        # `processed_text` cannot be scaled and would make StandardScaler raise.
        numeric_part = df.drop(columns=["label"]).select_dtypes(include=["number", "bool"])
        X = numeric_part.to_numpy(dtype=float)

    if X.shape[0] == 0 or X.shape[1] == 0:
        print(f"Skipping {file}: no usable rows or numeric feature columns.")
        return

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # Fit the scaler on the training rows only and reuse its statistics for the
    # test rows, so no information from the held-out set leaks into training.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    name_prefix = os.path.basename(file)

    for m_name, (ModelClass, param_grid) in models.items():
        run_key = f"{name_prefix}_{m_name}"
        if run_key in processed_keys:
            continue
        # The context manager ends the run on exit; no explicit end_run needed.
        with mlflow.start_run(run_name=f"{m_name}_on_{name_prefix}"):
            mlflow.log_param("dataset_file", file)
            mlflow.log_param("dataset_type", "embedding" if is_embedding_file else "feature")
            mlflow.log_param("dataset_shape", X.shape)
            if is_embedding_file:
                mlflow.log_param("embedding_csv", file)
            mlflow.log_param("model_type", m_name)
            log_dataset(df, file)

            gs = GridSearchCV(ModelClass(), param_grid, cv=3, scoring="accuracy", n_jobs=1)
            gs.fit(X_train, y_train)
            best_model = gs.best_estimator_
            preds = best_model.predict(X_test)

            acc = accuracy_score(y_test, preds)
            prec = precision_score(y_test, preds, average="weighted")
            rec = recall_score(y_test, preds, average="weighted")
            f1 = f1_score(y_test, preds, average="weighted")

            mlflow.log_params(gs.best_params_)
            mlflow.log_metric("accuracy", acc)
            mlflow.log_metric("precision", prec)
            mlflow.log_metric("recall", rec)
            mlflow.log_metric("f1_score", f1)

            mlflow.sklearn.log_model(best_model, artifact_path=f"{m_name}_Model")
            log_confusion_matrix(y_test, preds, run_key, "ML")
        # Record completion only after the run closed without raising.
        update_progress_log([run_key, file, m_name, acc, prec, rec, f1])
        processed_keys.add(run_key)


def run_dl_text_experiments(file):
    """Train every model in ``text_models`` on the ``processed_text`` column of one CSV.

    Args:
        file: Path to a CSV with ``label`` and ``processed_text`` columns.
            Returns without doing anything if the file is missing or either
            column is absent.

    For each model whose run key is not already in ``processed_keys``, one
    MLflow run is created with the dataset, test-set metrics, trained model
    and confusion matrix; the result is then appended to the progress log.
    """
    if not os.path.exists(file):
        return
    df = pd.read_csv(file)
    if "label" not in df.columns or "processed_text" not in df.columns:
        return
    df.dropna(inplace=True)
    y = df["label"].values
    if y.dtype == object:
        # String labels are mapped to integer codes.
        le = LabelEncoder()
        y = le.fit_transform(y)

    texts = df["processed_text"].fillna("").astype(str).tolist()

    vocab_size = 10000
    max_length = 200

    # Split the raw texts first so the tokenizer vocabulary is learned from the
    # training portion only; words seen only in the test set map to <OOV>.
    texts_train, texts_test, y_train, y_test = train_test_split(
        texts, y, test_size=0.2, random_state=42
    )

    tokenizer_obj = Tokenizer(num_words=vocab_size, oov_token="<OOV>")
    tokenizer_obj.fit_on_texts(texts_train)

    def _encode(batch):
        # Texts -> integer sequences -> fixed-length, post-padded/truncated rows.
        sequences = tokenizer_obj.texts_to_sequences(batch)
        return pad_sequences(sequences, maxlen=max_length, padding="post", truncating="post")

    X_train = _encode(texts_train)
    X_test = _encode(texts_test)
    # Shape of the full encoded dataset (rows, padded length), logged per run.
    dataset_shape = (len(texts), max_length)

    num_classes = len(np.unique(y))
    name_prefix = os.path.basename(file)

    for model_name, build_fn in text_models.items():
        run_key = f"{model_name}_{name_prefix}"
        if run_key in processed_keys:
            continue
        # The context manager ends the run on exit; no explicit end_run needed.
        with mlflow.start_run(run_name=f"{model_name}_on_{name_prefix}"):
            mlflow.log_param("dataset_file", file)
            mlflow.log_param("model_type", model_name)
            mlflow.log_param("text_in_file", "Yes")
            mlflow.log_param("vocab_size", vocab_size)
            mlflow.log_param("max_length", max_length)
            mlflow.log_param("num_classes", num_classes)
            mlflow.log_param("dataset_shape", dataset_shape)
            log_dataset(df, file)

            model = build_fn(vocab_size, max_length, num_classes)
            model.fit(X_train, y_train, epochs=5, batch_size=32, validation_split=0.1, verbose=0)

            loss, acc = model.evaluate(X_test, y_test, verbose=0)
            preds_prob = model.predict(X_test)
            # Class with the highest softmax probability per row.
            preds = preds_prob.argmax(axis=1)

            prec = precision_score(y_test, preds, average="weighted")
            rec = recall_score(y_test, preds, average="weighted")
            f1 = f1_score(y_test, preds, average="weighted")

            mlflow.log_metric("accuracy", acc)
            mlflow.log_metric("precision", prec)
            mlflow.log_metric("recall", rec)
            mlflow.log_metric("f1_score", f1)

            mlflow.keras.log_model(model, artifact_path=f"{model_name}_Model")
            log_confusion_matrix(y_test, preds, run_key, "DL")
        # Record completion only after the run closed without raising.
        update_progress_log([run_key, file, model_name, acc, prec, rec, f1])
        processed_keys.add(run_key)

def run_dl_embedding_experiments(file):
    """Train the dense network from ``build_dense_model`` on one CSV and log it to MLflow.

    Args:
        file: Path to a CSV with a ``label`` column; every other numeric
            column is used as a feature. Returns without doing anything if
            the file is missing, has no ``label`` column, or its run key is
            already in ``processed_keys``.

    One MLflow run is created with the dataset, test-set metrics, trained
    model and confusion matrix; the result is then appended to the progress
    log.
    """
    if not os.path.exists(file):
        return

    name_prefix = os.path.basename(file)
    run_key = f"DenseNN_{name_prefix}"
    # Checked before loading anything so an already-finished run costs nothing.
    if run_key in processed_keys:
        return

    df = pd.read_csv(file)
    if "label" not in df.columns:
        return
    df.dropna(inplace=True)
    y = df["label"].values
    if y.dtype == object:
        # String labels are mapped to integer codes.
        le = LabelEncoder()
        y = le.fit_transform(y)

    # Keep only numeric/boolean columns: a text column cannot be scaled and
    # would make StandardScaler raise.
    numeric_part = df.drop(columns=["label"]).select_dtypes(include=["number", "bool"])
    X = numeric_part.to_numpy(dtype=float)
    if X.shape[0] == 0 or X.shape[1] == 0:
        print(f"Skipping {file}: no usable rows or numeric feature columns.")
        return

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # Fit the scaler on the training rows only and reuse its statistics for the
    # test rows, so no information from the held-out set leaks into training.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # The context manager ends the run on exit; no explicit end_run needed.
    with mlflow.start_run(run_name=f"DenseNN_on_{name_prefix}"):
        mlflow.log_param("dataset_file", file)
        mlflow.log_param("embedding_file", "Yes")
        mlflow.log_param("model_type", "Dense NN on Embeddings")
        mlflow.log_param("dataset_shape", X.shape)
        log_dataset(df, file)

        input_dim = X.shape[1]
        num_classes = len(np.unique(y))
        model = build_dense_model(input_dim, num_classes)
        model.fit(X_train, y_train, epochs=5, batch_size=32, validation_split=0.1, verbose=0)

        loss, acc = model.evaluate(X_test, y_test, verbose=0)
        # Class with the highest softmax probability per row.
        preds = model.predict(X_test).argmax(axis=1)

        prec = precision_score(y_test, preds, average="weighted")
        rec = recall_score(y_test, preds, average="weighted")
        f1 = f1_score(y_test, preds, average="weighted")

        mlflow.log_metric("accuracy", acc)
        mlflow.log_metric("precision", prec)
        mlflow.log_metric("recall", rec)
        mlflow.log_metric("f1_score", f1)

        mlflow.keras.log_model(model, artifact_path="DenseNN_Model")
        log_confusion_matrix(y_test, preds, run_key, "DL")
    # Record completion only after the run closed without raising.
    update_progress_log([run_key, file, "Dense NN on Embeddings", acc, prec, rec, f1])
    processed_keys.add(run_key)

# Driver: run every applicable experiment family on each input file that exists.
for f in files:
    if not os.path.exists(f):
        continue
    run_ml_experiments(f)
    # Only the column names are needed to decide whether the text models
    # apply, so read the header row alone instead of the whole CSV.
    header_columns = pd.read_csv(f, nrows=0).columns
    if "processed_text" in header_columns:
        run_dl_text_experiments(f)
    if f in embedding_files:
        run_dl_embedding_experiments(f)

print("All experiments completed.")


  return _dataset_source_registry.resolve(
  return _dataset_source_registry.resolve(


🏃 View run LogisticRegression_on_preprocessed_lemmatization_features.csv at: https://dagshub.com/YogeshKumar-saini/Fake-Review-Detection.mlflow/#/experiments/0/runs/72a9398d65bd4f8a998f7cc76e3b259a
🧪 View experiment at: https://dagshub.com/YogeshKumar-saini/Fake-Review-Detection.mlflow/#/experiments/0


  return _dataset_source_registry.resolve(
  return _dataset_source_registry.resolve(


🏃 View run RandomForest_on_preprocessed_lemmatization_features.csv at: https://dagshub.com/YogeshKumar-saini/Fake-Review-Detection.mlflow/#/experiments/0/runs/d23b623cc1514a5cb41f4ceeb1f02564
🧪 View experiment at: https://dagshub.com/YogeshKumar-saini/Fake-Review-Detection.mlflow/#/experiments/0


  return _dataset_source_registry.resolve(
  return _dataset_source_registry.resolve(


🏃 View run SVC_on_preprocessed_lemmatization_features.csv at: https://dagshub.com/YogeshKumar-saini/Fake-Review-Detection.mlflow/#/experiments/0/runs/9419da789ef64251bf024f77cd199b84
🧪 View experiment at: https://dagshub.com/YogeshKumar-saini/Fake-Review-Detection.mlflow/#/experiments/0


  return _dataset_source_registry.resolve(
  return _dataset_source_registry.resolve(


🏃 View run XGBoost_on_preprocessed_lemmatization_features.csv at: https://dagshub.com/YogeshKumar-saini/Fake-Review-Detection.mlflow/#/experiments/0/runs/b2d916b50b5b4ed8a0d730a9ac5cbea7
🧪 View experiment at: https://dagshub.com/YogeshKumar-saini/Fake-Review-Detection.mlflow/#/experiments/0


  return _dataset_source_registry.resolve(
  return _dataset_source_registry.resolve(
