In [None]:
# Install the `datasets` package (imported in the next cell) into the kernel's environment.
%pip install datasets



In [None]:
from datasets import load_dataset
# Load the IMDb movie-review dataset identified by this Hugging Face Hub repository id.
ds = load_dataset("jahjinx/IMDb_movie_reviews")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
# Display the loaded object to inspect its splits, feature names and row counts.
ds

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 36000
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 4000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 10000
    })
})

In [None]:
import pandas as pd
# Convert each split to a pandas DataFrame. `Dataset.to_pandas()` converts the
# split's table directly, instead of having `pd.DataFrame(...)` iterate over the
# dataset one row at a time.
train_df = ds['train'].to_pandas()
val_df = ds['validation'].to_pandas()
test_df = ds['test'].to_pandas()

In [None]:
# Drop rows that are exact duplicates (identical in every column) from each split.
# Rebinding the name instead of using `inplace=True` yields the same frames.
train_df = train_df.drop_duplicates()
val_df = val_df.drop_duplicates()
test_df = test_df.drop_duplicates()

In [None]:
# Report the (rows, columns) shape of every split, one line per split.
print(
    f"Shape of training data: {train_df.shape}",
    f"Shape of validation data: {val_df.shape}",
    f"Shape of testing data: {test_df.shape}",
    sep="\n",
)

Shape of training data: (35767, 2)
Shape of validation data: (3999, 2)
Shape of testing data: (9981, 2)


In [None]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK resources the preprocessing function relies on:
# the stopword lists and the WordNet data.
for corpus_name in ('stopwords', 'wordnet'):
    nltk.download(corpus_name)

# Define the preprocessing function
# Negation / contrast words that NLTK lists as stopwords but that carry
# sentiment, so they are kept in the text.
_SENTIMENT_STOPWORD_EXCEPTIONS = frozenset({'not', 'but', 'however', 'no', 'yet'})

# Default stopword set and lemmatizer, built on first use and then shared by
# every call instead of being rebuilt for each review.
_PREPROCESS_DEFAULTS = {}


def _default_resources():
    """Return the shared default (stop_words, lemmatizer) pair, building it once."""
    if not _PREPROCESS_DEFAULTS:
        _PREPROCESS_DEFAULTS['stop_words'] = (
            frozenset(stopwords.words('english')) - _SENTIMENT_STOPWORD_EXCEPTIONS
        )
        _PREPROCESS_DEFAULTS['lemmatizer'] = WordNetLemmatizer()
    return _PREPROCESS_DEFAULTS['stop_words'], _PREPROCESS_DEFAULTS['lemmatizer']


def preprocess_comment(comment, stop_words=None, lemmatizer=None):
    """Normalize one review into a space-separated string of lemmatized tokens.

    Steps: lowercase, replace HTML tags and URLs with spaces, delete remaining
    punctuation, drop stopwords, lemmatize each surviving token.

    Args:
        comment: Raw review text. ``None`` and float NaN are treated as empty
            text; any other non-string value is converted with ``str()``.
        stop_words: Optional container of lowercase words to drop. Defaults to
            NLTK's English stopwords minus 'not', 'but', 'however', 'no', 'yet'.
        lemmatizer: Optional object exposing ``lemmatize(word)``. Defaults to a
            shared ``WordNetLemmatizer`` instance.

    Returns:
        The cleaned text as a single string (possibly empty).
    """
    if not isinstance(comment, str):
        # NaN is the only float that is not equal to itself.
        if comment is None or (isinstance(comment, float) and comment != comment):
            return ''
        comment = str(comment)

    if stop_words is None or lemmatizer is None:
        default_stop_words, default_lemmatizer = _default_resources()
        if stop_words is None:
            stop_words = default_stop_words
        if lemmatizer is None:
            lemmatizer = default_lemmatizer

    text = comment.lower()

    # Replace tags with a space rather than deleting them, so that
    # "great.<br /><br />the" does not collapse into the single token "greatthe".
    # Tags are handled before URLs so a URL's \S+ cannot swallow an adjacent tag
    # and the text that follows it.
    text = re.sub(r'<.*?>', ' ', text)

    # Remove URLs
    text = re.sub(r'https?://\S+|www\.\S+', ' ', text)

    # Delete every character that is neither a word character nor whitespace
    text = re.sub(r'[^\w\s]', '', text)

    # split() already discards newlines and leading/trailing/repeated whitespace.
    tokens = [
        lemmatizer.lemmatize(word)
        for word in text.split()
        if word not in stop_words
    ]
    return ' '.join(tokens)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
# Overwrite the 'text' column of every split with its preprocessed version.
for split_df in (train_df, val_df, test_df):
    split_df['text'] = split_df['text'].apply(preprocess_comment)

In [None]:
# Install the experiment-tracking packages imported in the following cells.
%pip install mlflow dagshub



In [None]:
import dagshub
# Connect this session to the DagsHub repository. With `mlflow=True` this
# presumably also configures MLflow access for the repo (confirm in the dagshub
# docs). As the cell output shows, it may print a link that must be opened in a
# browser to authorize the client.
dagshub.init(repo_owner='abhishekramgarh13', repo_name='sentiment-analysis', mlflow=True)

Output()



Open the following link in your browser to authorize the client:
https://dagshub.com/login/oauth/authorize?state=905e620a-b3ad-4058-bab6-0a20cf4b618b&client_id=32b60ba385aa7cecf24046d8195a71c07dd345d9657977863b52e7748e0f0f28&middleman_request_id=d78d97eebaf55da3f6a91de9665b0da23a16573d251d828f940fa2ca9cfd56a8




In [None]:
import mlflow

# Point MLflow at the tracking-server URL of the DagsHub repository initialised above,
# so the runs logged later in this notebook are sent there.
mlflow.set_tracking_uri('https://dagshub.com/abhishekramgarh13/sentiment-analysis.mlflow')

In [None]:
# Select the experiment that subsequent runs are recorded under; the returned
# Experiment object is displayed as the cell output.
mlflow.set_experiment("Model-Selection")

<Experiment: artifact_location='mlflow-artifacts:/eb66d38530584046ad0eae80a7e64ef9', creation_time=1738027704054, experiment_id='1', last_update_time=1738027704054, lifecycle_stage='active', name='Model-Selection', tags={}>

In [None]:
# Install Optuna, used below for the hyper-parameter search.
%pip install optuna

Collecting optuna
  Downloading optuna-4.2.0-py3-none-any.whl.metadata (17 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Downloading optuna-4.2.0-py3-none-any.whl (383 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m383.4/383.4 kB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Installing collected packages: colorlog, optuna
Successfully installed colorlog-6.9.0 optuna-4.2.0


In [None]:
import optuna
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Target labels for each split.
y_train, y_val, y_test = train_df['label'], val_df['label'], test_df['label']

# TF-IDF features capped at 2000 terms. The vectorizer is fitted on the training
# text only; validation and test text are transformed with that fitted
# vectorizer so nothing is learned from them.
tfidf = TfidfVectorizer(max_features=2000)
X_train = tfidf.fit_transform(train_df['text'])
X_val = tfidf.transform(val_df['text'])
X_test = tfidf.transform(test_df['text'])

In [None]:
def log_mlflow(model_name, model, X_train, X_test, X_val, y_train, y_test, y_val, best_params):
    """Train `model` and record its parameters, metrics and artifact in one MLflow run.

    Args:
        model_name: Used as the run name, the `algo_name` parameter and the
            prefix of the logged model's artifact path.
        model: Estimator exposing `fit` and `predict`; it is fitted here on the
            training data.
        X_train, X_test, X_val: Feature inputs passed to `fit` / `predict`.
        y_train, y_test, y_val: Labels matching the feature inputs above.
        best_params: Mapping of hyper-parameter names to values, logged as run
            parameters.

    Logged metrics: `test_accuracy`, `val_accuracy`, and every nested entry of
    the test-set classification report as `<label>_<metric>`.
    """
    with mlflow.start_run(run_name=str(model_name)):
        mlflow.set_tag("experiment_type", "algorithm_comparison")

        # Parameters: the algorithm name plus the tuned hyper-parameters.
        mlflow.log_param("algo_name", model_name)
        mlflow.log_params(best_params)

        # Train on the training split, then predict the two held-out splits.
        model.fit(X_train, y_train)
        y_pred_test = model.predict(X_test)
        y_pred_val = model.predict(X_val)

        # Gather every metric first and send them in a single call: each
        # individual log_metric call is a separate request to the tracking server.
        metrics_to_log = {
            "test_accuracy": accuracy_score(y_test, y_pred_test),
            "val_accuracy": accuracy_score(y_val, y_pred_val),
        }

        classification_rep = classification_report(y_test, y_pred_test, output_dict=True)
        for label, scores in classification_rep.items():
            # Only the nested per-label / average dicts are flattened; scalar
            # entries of the report are skipped.
            if isinstance(scores, dict):
                for metric, value in scores.items():
                    metrics_to_log[f"{label}_{metric}"] = value

        mlflow.log_metrics(metrics_to_log)

        # Log the fitted model
        mlflow.sklearn.log_model(model, f"{model_name}_model")

In [None]:
# Optuna objective function for KNN
def objective_KNN(trial):
    """Return the validation accuracy of a KNN classifier for one Optuna trial.

    The trial samples `n_neighbors` (integer between 3 and 30) and `p`
    (1 or 2). The classifier is fitted on the notebook-level `X_train` /
    `y_train` and scored on `X_val` / `y_val`.
    """
    knn_params = {
        'n_neighbors': trial.suggest_int('n_neighbors', 3, 30),
        'p': trial.suggest_categorical('p', [1, 2]),
    }
    classifier = KNeighborsClassifier(**knn_params)
    classifier.fit(X_train, y_train)
    val_predictions = classifier.predict(X_val)
    return accuracy_score(y_val, val_predictions)

In [None]:
def run_optuna_experiment(n_trials=20, seed=42):
    """Tune KNN with Optuna and log only the best configuration to MLflow.

    Args:
        n_trials: Number of hyper-parameter configurations to evaluate.
        seed: Seed for the sampler so that repeated runs propose the same
            sequence of trials. Pass `None` for an unseeded search.

    Returns:
        The finished `optuna` study, so trials can be inspected afterwards.
    """
    # Seed the sampler explicitly; without it every run explores a different
    # set of configurations and the selected model is not reproducible.
    sampler = optuna.samplers.TPESampler(seed=seed)
    study = optuna.create_study(direction="maximize", sampler=sampler)
    study.optimize(objective_KNN, n_trials=n_trials)

    # Rebuild a fresh classifier from the best parameters; log_mlflow fits it.
    best_params = study.best_params
    best_model = KNeighborsClassifier(
        n_neighbors=best_params['n_neighbors'],
        p=best_params['p'],
    )
    # Log the best model with MLflow, passing the algo_name as "KNN"
    log_mlflow("KNN", best_model, X_train, X_test, X_val, y_train, y_test, y_val, best_params)
    return study


# Run the experiment for KNN (assigned so the study object is not echoed as cell output)
knn_study = run_optuna_experiment()