In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the tweet dataset from a CSV file located in the working directory.
df = pd.read_csv("Twitter_Data.csv")

In [3]:
# Display the loaded frame to eyeball its columns and a few rows.
df


Unnamed: 0,clean_text,category
0,when modi promised “minimum government maximum...,-1.0
1,talk all the nonsense and continue all the dra...,0.0
2,what did just say vote for modi welcome bjp t...,1.0
3,asking his supporters prefix chowkidar their n...,1.0
4,answer who among these the most powerful world...,1.0
...,...,...
162975,why these 456 crores paid neerav modi not reco...,-1.0
162976,dear rss terrorist payal gawar what about modi...,-1.0
162977,did you cover her interaction forum where she ...,0.0
162978,there big project came into india modi dream p...,0.0


In [4]:
# Count missing values per column before any cleaning.
df.isnull().sum()

clean_text    4
category      7
dtype: int64

In [5]:
# (rows, columns) of the frame before rows with missing values are dropped.
df.shape

(162980, 2)

In [6]:
# Keep only the rows that actually have tweet text.
df = df[df['clean_text'].notna()]


In [7]:
# Keep only the rows that carry a sentiment label.
df = df[df['category'].notna()]


In [8]:
# Re-count missing values per column to confirm none remain after the drops.
df.isnull().sum()

clean_text    0
category      0
dtype: int64

In [9]:
# (rows, columns) of the frame after rows with missing values were dropped.
df.shape

(162969, 2)

In [10]:
# Features are the tweet text column; the target is the sentiment label column.
X, y = df['clean_text'], df['category']


In [11]:
# Number of feature rows.
X.shape

(162969,)

In [12]:
# Number of labels; should match the number of feature rows.
y.shape

(162969,)

In [13]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# identical across re-runs.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts


In [14]:
# Size of the training portion of the split.
X_train.shape

(130375,)

In [15]:
# Size of the held-out test portion of the split.
X_test.shape

(32594,)

In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF over the tweets: English stop words removed, vocabulary capped at
# 5000 terms.
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)

# The vocabulary is fitted on the training text only; the test text is merely
# transformed with it.
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)


In [17]:
from sklearn.linear_model import LogisticRegression

# Baseline classifier trained on the TF-IDF features; max_iter is raised to
# 1000 iterations.
model = LogisticRegression(max_iter=1000).fit(X_train_vec, y_train)
model


0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,1000


In [18]:
# Predict labels for the held-out test features with the fitted model.
y_pred = model.predict(X_test_vec)


In [19]:
from sklearn.metrics import accuracy_score, classification_report

# Overall accuracy first, then the per-class precision/recall/F1 table.
test_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", test_accuracy)

per_class_report = classification_report(y_test, y_pred)
print(per_class_report)


Accuracy: 0.8550653494508191
              precision    recall  f1-score   support

        -1.0       0.86      0.74      0.80      7152
         0.0       0.81      0.96      0.88     11067
         1.0       0.90      0.83      0.87     14375

    accuracy                           0.86     32594
   macro avg       0.85      0.84      0.85     32594
weighted avg       0.86      0.86      0.85     32594



In [20]:
# Ad-hoc check on a single hand-written tweet: vectorize it with the fitted
# TF-IDF vocabulary, then predict its label.
tweet = ["Government policies are both less good and more bad"]
tweet_vec = vectorizer.transform(tweet)

model.predict(tweet_vec)


array([1.])

In [21]:
# Confirm MLflow is importable and show which version is installed.
import mlflow
print(mlflow.__version__)


3.8.1


In [22]:
import sys
!"{sys.executable}" -m pip install mlflow





In [23]:
# Re-check the MLflow version after the pip install cell above.
import mlflow
print(mlflow.__version__)


3.8.1


In [24]:
# Record tracking data in a local "mlruns" directory, given relative to the
# current working directory.
mlflow.set_tracking_uri("file:./mlruns")


In [25]:
# Select the experiment that the following runs are recorded under.
mlflow.set_experiment("Twitter Sentiment Analysis")


  return FileStore(store_uri, store_uri)


<Experiment: artifact_location='file:C:/Users/ANSHUL MODI/mlruns/580627740578866403', creation_time=1768223961462, experiment_id='580627740578866403', last_update_time=1768223961462, lifecycle_stage='active', name='Twitter Sentiment Analysis', tags={'mlflow.experimentKind': 'custom_model_development'}>

In [26]:
# Look the experiment up by name and echo it back as a sanity check.
active_experiment = mlflow.get_experiment_by_name("Twitter Sentiment Analysis")
print("Active experiment:", active_experiment.name)


Active experiment: Twitter Sentiment Analysis


In [27]:
# Smoke test: log one dummy parameter and one dummy metric to verify that
# tracking works before any real runs are recorded.
with mlflow.start_run():
    mlflow.log_param("test_param", "working")
    mlflow.log_metric("test_metric", 1.0)


In [28]:
import mlflow
import mlflow.sklearn

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import accuracy_score, f1_score


In [29]:
# Candidate classifiers to compare, keyed by the display name that is also
# used as the MLflow run name.
models = {}
models["Logistic Regression"] = LogisticRegression(max_iter=1000)
models["Naive Bayes"] = MultinomialNB()
models["Linear SVM"] = LinearSVC()
models["KNN"] = KNeighborsClassifier(n_neighbors=5)


In [30]:
# Train every candidate on the same TF-IDF features and record each one as its
# own MLflow run (named after the model).
for model_name, model in models.items():
    with mlflow.start_run(run_name=model_name):
        # Fit on the training features, then score on the held-out test set.
        model.fit(X_train_vec, y_train)
        y_pred = model.predict(X_test_vec)

        accuracy = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred, average="weighted")  # class-weighted F1

        # Record which model this run belongs to, then its scores.
        mlflow.log_param("model_name", model_name)
        for metric_key, metric_value in (("accuracy", accuracy), ("f1_score", f1)):
            mlflow.log_metric(metric_key, metric_value)

        # Persist the fitted estimator with the run.
        mlflow.sklearn.log_model(model, name="model")

        print(f"✅ {model_name} logged | Accuracy={accuracy:.4f}, F1={f1:.4f}")


✅ Logistic Regression logged | Accuracy=0.8551, F1=0.8538
✅ Naive Bayes logged | Accuracy=0.7042, F1=0.6934
✅ Linear SVM logged | Accuracy=0.8613, F1=0.8603
✅ KNN logged | Accuracy=0.4331, F1=0.3614


In [31]:
import mlflow
import mlflow.sklearn

from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, f1_score


In [32]:
# Switch to a separate experiment so the hyperparameter-tuning runs are kept
# apart from the model-comparison runs.
mlflow.set_experiment("Twitter Sentiment Analysis - Hyperparameter Tuning")


<Experiment: artifact_location='file:C:/Users/ANSHUL MODI/mlruns/390283142526069425', creation_time=1768225188983, experiment_id='390283142526069425', last_update_time=1768225188983, lifecycle_stage='active', name='Twitter Sentiment Analysis - Hyperparameter Tuning', tags={}>

In [33]:
# Grid for logistic regression: regularization strength C crossed with solver.
logreg_params = dict(
    C=[0.01, 0.1, 1, 10],
    solver=["liblinear", "lbfgs"],
)


In [34]:
# Grid for the linear SVM: only the regularization strength C is varied.
svm_params = dict(C=[0.01, 0.1, 1, 10])


In [35]:
# Grid search over logistic-regression settings: one MLflow run per
# (C, solver) combination, all trained on the same TF-IDF features.
# NOTE(review): every combination is scored on the same test split, so the best
# score is an optimistic estimate; consider a separate validation split.
for C in logreg_params["C"]:
    for solver in logreg_params["solver"]:

        with mlflow.start_run(run_name=f"LogReg_C={C}_solver={solver}"):

            model = LogisticRegression(
                C=C,
                solver=solver,
                max_iter=1000
            )

            model.fit(X_train_vec, y_train)
            y_pred = model.predict(X_test_vec)

            acc = accuracy_score(y_test, y_pred)
            f1 = f1_score(y_test, y_pred, average="weighted")

            # Log params
            mlflow.log_param("model", "LogisticRegression")
            mlflow.log_param("C", C)
            mlflow.log_param("solver", solver)

            # Log metrics
            mlflow.log_metric("accuracy", acc)
            mlflow.log_metric("f1_score", f1)

            # Log model. The name is passed via `name=` (as in the model
            # comparison loop) rather than positionally, where it would land
            # in the deprecated `artifact_path` parameter.
            mlflow.sklearn.log_model(model, name="model")

            print(f"LogReg | C={C}, solver={solver} | F1={f1:.4f}")




LogReg | C=0.01, solver=liblinear | F1=0.5358




LogReg | C=0.01, solver=lbfgs | F1=0.6058




LogReg | C=0.1, solver=liblinear | F1=0.7810




LogReg | C=0.1, solver=lbfgs | F1=0.8019




LogReg | C=1, solver=liblinear | F1=0.8494




LogReg | C=1, solver=lbfgs | F1=0.8538




LogReg | C=10, solver=liblinear | F1=0.8595




LogReg | C=10, solver=lbfgs | F1=0.8555


In [36]:
# Grid search over the linear SVM's regularization strength: one MLflow run
# per C value, all trained on the same TF-IDF features.
for C in svm_params["C"]:

    with mlflow.start_run(run_name=f"SVM_C={C}"):

        model = LinearSVC(C=C)

        model.fit(X_train_vec, y_train)
        y_pred = model.predict(X_test_vec)

        acc = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred, average="weighted")

        # Log params
        mlflow.log_param("model", "LinearSVM")
        mlflow.log_param("C", C)

        # Log metrics
        mlflow.log_metric("accuracy", acc)
        mlflow.log_metric("f1_score", f1)

        # Log model. The name is passed via `name=` (as in the model
        # comparison loop) rather than positionally, where it would land in
        # the deprecated `artifact_path` parameter.
        mlflow.sklearn.log_model(model, name="model")

        print(f"SVM | C={C} | F1={f1:.4f}")




SVM | C=0.01 | F1=0.7797




SVM | C=0.1 | F1=0.8505




SVM | C=1 | F1=0.8603




SVM | C=10 | F1=0.8605


In [37]:
import mlflow
import mlflow.sklearn

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, f1_score


In [38]:
# Switch to the experiment that collects the TF-IDF tuning runs.
mlflow.set_experiment("TFIDF Hyperparameter Tuning")


<Experiment: artifact_location='file:C:/Users/ANSHUL MODI/mlruns/613819196069423681', creation_time=1768225407755, experiment_id='613819196069423681', last_update_time=1768225407755, lifecycle_stage='active', name='TFIDF Hyperparameter Tuning', tags={}>

In [39]:
# Grid for the TF-IDF vectorizer: vocabulary cap, n-gram range and minimum
# document frequency.
tfidf_params = dict(
    max_features=[3000, 5000, 8000],
    ngram_range=[(1, 1), (1, 2)],
    min_df=[1, 3, 5],
)


In [None]:
# Grid search over TF-IDF settings with a fixed LinearSVC(C=1.0): one MLflow
# run per (max_features, ngram_range, min_df) combination.
# Note: this loop rebinds the notebook-level `vectorizer`, `X_train_vec` and
# `X_test_vec`; after it finishes they hold the LAST combination tried.
for max_feat in tfidf_params["max_features"]:
    for ngram in tfidf_params["ngram_range"]:
        for min_df in tfidf_params["min_df"]:

            run_name = f"TFIDF_mf={max_feat}_ng={ngram}_minDf={min_df}"

            with mlflow.start_run(run_name=run_name):

                # TF-IDF
                vectorizer = TfidfVectorizer(
                    max_features=max_feat,
                    ngram_range=ngram,
                    min_df=min_df,
                    stop_words="english"
                )

                # Vocabulary is fitted on the training text only.
                X_train_vec = vectorizer.fit_transform(X_train)
                X_test_vec = vectorizer.transform(X_test)

                # Model
                model = LinearSVC(C=1.0)
                model.fit(X_train_vec, y_train)

                # Predict
                y_pred = model.predict(X_test_vec)

                # Metrics
                acc = accuracy_score(y_test, y_pred)
                f1 = f1_score(y_test, y_pred, average="weighted")

                # Log TF-IDF params (the tuple is stringified for logging)
                mlflow.log_param("model", "LinearSVM")
                mlflow.log_param("tfidf_max_features", max_feat)
                mlflow.log_param("tfidf_ngram_range", str(ngram))
                mlflow.log_param("tfidf_min_df", min_df)

                # Log metrics
                mlflow.log_metric("accuracy", acc)
                mlflow.log_metric("f1_score", f1)

                # Log the classifier and its matching vectorizer. Names are
                # passed via `name=` rather than positionally, where they would
                # land in the deprecated `artifact_path` parameter.
                mlflow.sklearn.log_model(model, name="model")
                mlflow.sklearn.log_model(vectorizer, name="tfidf")

                print(f"{run_name} | F1={f1:.4f}")




TFIDF_mf=3000_ng=(1, 1)_minDf=1 | F1=0.8435




TFIDF_mf=3000_ng=(1, 1)_minDf=3 | F1=0.8437




TFIDF_mf=3000_ng=(1, 1)_minDf=5 | F1=0.8436




TFIDF_mf=3000_ng=(1, 2)_minDf=1 | F1=0.8315




TFIDF_mf=3000_ng=(1, 2)_minDf=3 | F1=0.8318




TFIDF_mf=3000_ng=(1, 2)_minDf=5 | F1=0.8316




TFIDF_mf=5000_ng=(1, 1)_minDf=1 | F1=0.8603




TFIDF_mf=5000_ng=(1, 1)_minDf=3 | F1=0.8607




TFIDF_mf=5000_ng=(1, 1)_minDf=5 | F1=0.8603




TFIDF_mf=5000_ng=(1, 2)_minDf=1 | F1=0.8502




TFIDF_mf=5000_ng=(1, 2)_minDf=3 | F1=0.8501




TFIDF_mf=5000_ng=(1, 2)_minDf=5 | F1=0.8499




TFIDF_mf=8000_ng=(1, 1)_minDf=1 | F1=0.8675




TFIDF_mf=8000_ng=(1, 1)_minDf=3 | F1=0.8674




TFIDF_mf=8000_ng=(1, 1)_minDf=5 | F1=0.8674




TFIDF_mf=8000_ng=(1, 2)_minDf=1 | F1=0.8598




In [None]:
import mlflow
import mlflow.sklearn

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, f1_score


In [None]:
# Switch to the experiment that collects the joint TF-IDF + SVM tuning runs.
mlflow.set_experiment("TFIDF + Model Joint Tuning")


In [None]:
# Vectorizer half of the joint grid.
tfidf_grid = dict(
    max_features=[3000, 5000],
    ngram_range=[(1, 1), (1, 2)],
    min_df=[1, 3],
)

# Classifier half of the joint grid: LinearSVC regularization strength.
svm_grid = dict(C=[0.1, 1, 10])


In [None]:
# Joint grid search over TF-IDF settings and LinearSVC's C: one MLflow run per
# (max_features, ngram_range, min_df, C) combination.
# Note: this loop rebinds the notebook-level `vectorizer`, `X_train_vec` and
# `X_test_vec`; after it finishes they hold the LAST TF-IDF combination tried.
for max_feat in tfidf_grid["max_features"]:
    for ngram in tfidf_grid["ngram_range"]:
        for min_df in tfidf_grid["min_df"]:

            # TF-IDF. The features depend only on the vectorizer settings, not
            # on C, so they are built once here and reused for every C value
            # instead of being refitted inside the innermost loop.
            vectorizer = TfidfVectorizer(
                max_features=max_feat,
                ngram_range=ngram,
                min_df=min_df,
                stop_words="english"
            )

            # Vocabulary is fitted on the training text only.
            X_train_vec = vectorizer.fit_transform(X_train)
            X_test_vec = vectorizer.transform(X_test)

            for C in svm_grid["C"]:

                run_name = (
                    f"TFIDF_mf={max_feat}_ng={ngram}_minDf={min_df}_C={C}"
                )

                with mlflow.start_run(run_name=run_name):

                    # Model
                    model = LinearSVC(C=C)
                    model.fit(X_train_vec, y_train)

                    # Predict
                    y_pred = model.predict(X_test_vec)

                    # Metrics
                    acc = accuracy_score(y_test, y_pred)
                    f1 = f1_score(y_test, y_pred, average="weighted")

                    # Log parameters (the tuple is stringified for logging)
                    mlflow.log_param("model", "LinearSVM")
                    mlflow.log_param("C", C)
                    mlflow.log_param("tfidf_max_features", max_feat)
                    mlflow.log_param("tfidf_ngram_range", str(ngram))
                    mlflow.log_param("tfidf_min_df", min_df)

                    # Log metrics
                    mlflow.log_metric("accuracy", acc)
                    mlflow.log_metric("f1_score", f1)

                    # Log the classifier and its matching vectorizer. Names are
                    # passed via `name=` rather than positionally, where they
                    # would land in the deprecated `artifact_path` parameter.
                    mlflow.sklearn.log_model(model, name="model")
                    mlflow.sklearn.log_model(vectorizer, name="tfidf")

                    print(f"✅ {run_name} | F1={f1:.4f}")


In [None]:
import mlflow

# Register the "model" artifact of one specific, hard-coded run in the model
# registry under the name "Twitter_Sentiment_Model".
# NOTE(review): the run ID below only exists in the tracking store it was
# created in; on a fresh store it must be replaced.
model_uri = "runs:/d654c08c86594aba80c94cca4848276c/model"
mlflow.register_model(model_uri=model_uri, name="Twitter_Sentiment_Model")


In [None]:
# Load the registered model back as a generic pyfunc model.
# The newest registered version is requested via the "latest" suffix: no cell
# in this notebook promotes a version to the "Production" stage, so a
# stage-based URI would fail to resolve after a clean run.
model = mlflow.pyfunc.load_model(
    model_uri="models:/Twitter_Sentiment_Model/latest"
)


In [None]:
# Vectorize one hand-written sentence and classify it with the loaded model.
# NOTE(review): `vectorizer` is whatever the most recently executed cell left
# in the notebook namespace; confirm it is the one the registered model was
# trained with, otherwise the features will not line up.
sample_texts = ["Government policies are disappointing"]
sample = vectorizer.transform(sample_texts)
model.predict(sample)


In [None]:
import mlflow

# Show which tracking URI MLflow is currently using.
print("Tracking URI:", mlflow.get_tracking_uri())


In [None]:
import mlflow

# make sure tracking URI matches your project
mlflow.set_tracking_uri("file:./mlruns")

# Paste a run ID here to register that specific run. If the placeholder is left
# untouched, the run with the highest logged `f1_score` is selected instead, so
# the cell also works on an unattended "Run All".
best_run_id = "<PASTE_YOUR_BEST_RUN_ID_HERE>"

if best_run_id.startswith("<"):
    # Search every experiment, ignore runs without an f1_score metric, and keep
    # only the top-scoring run.
    ranked_runs = mlflow.search_runs(
        search_all_experiments=True,
        filter_string="metrics.f1_score > 0",
        order_by=["metrics.f1_score DESC"],
        max_results=1,
    )
    if ranked_runs.empty:
        raise RuntimeError(
            "No MLflow run with an 'f1_score' metric was found; "
            "run the training cells first or set best_run_id manually."
        )
    best_run_id = ranked_runs["run_id"].iloc[0]

mlflow.register_model(
    model_uri=f"runs:/{best_run_id}/model",
    name="Twitter_Sentiment_Model"
)
