In [2]:
import kagglehub

path = kagglehub.dataset_download("saurabhshahane/twitter-sentiment-dataset")

print("Path to dataset files:", path)

  from .autonotebook import tqdm as notebook_tqdm


Path to dataset files: /home/bilal/.cache/kagglehub/datasets/saurabhshahane/twitter-sentiment-dataset/versions/1


In [3]:
import pandas as pd

In [4]:
data = pd.read_csv(path + "/Twitter_Data.csv")

In [5]:
data.head()

Unnamed: 0,clean_text,category
0,when modi promised “minimum government maximum...,-1.0
1,talk all the nonsense and continue all the dra...,0.0
2,what did just say vote for modi welcome bjp t...,1.0
3,asking his supporters prefix chowkidar their n...,1.0
4,answer who among these the most powerful world...,1.0


In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 162980 entries, 0 to 162979
Data columns (total 2 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   clean_text  162976 non-null  object 
 1   category    162973 non-null  float64
dtypes: float64(1), object(1)
memory usage: 2.5+ MB


In [7]:
data.shape

(162980, 2)

In [8]:
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score


In [9]:
data.dropna(inplace=True)  

In [10]:
data['clean_text'] = data['clean_text'].str.lower() 

In [11]:
data['clean_text'] = data['clean_text'].str.replace(r'[^\w\s]', '', regex=True)  

In [12]:
data['category'] = data['category'].astype(int)  

In [13]:
tfidf = TfidfVectorizer(max_features=500)
X = tfidf.fit_transform(data['clean_text'])
y = data['category']

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [15]:
models = {
    "Random Forest": RandomForestClassifier(),
    "Logistic Regression": LogisticRegression(),
    "Gradient Boosting": GradientBoostingClassifier(),
    "Naive Bayes": MultinomialNB(),
}

In [16]:
results = {}


In [17]:
for model_name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    results[model_name] = accuracy
    print(f"{model_name}:")
    print(classification_report(y_test, y_pred))

# Best model
best_model = max(results, key=results.get)
print(f"Best Model: {best_model} with accuracy {results[best_model]:.2f}")


Random Forest:
              precision    recall  f1-score   support

          -1       0.80      0.48      0.60      7152
           0       0.69      0.92      0.79     11067
           1       0.83      0.80      0.81     14375

    accuracy                           0.77     32594
   macro avg       0.78      0.73      0.73     32594
weighted avg       0.78      0.77      0.76     32594

Logistic Regression:
              precision    recall  f1-score   support

          -1       0.78      0.48      0.59      7152
           0       0.68      0.95      0.79     11067
           1       0.87      0.77      0.81     14375

    accuracy                           0.76     32594
   macro avg       0.77      0.73      0.73     32594
weighted avg       0.78      0.76      0.76     32594

Gradient Boosting:
              precision    recall  f1-score   support

          -1       0.82      0.43      0.56      7152
           0       0.68      0.84      0.75     11067
           1       0

In [25]:
best_model = max(results, key=results.get)
print(f"Best Model: {best_model} with accuracy {results[best_model]:.2f}")

final_model = models[best_model]


Best Model: Random Forest with accuracy 0.77


In [19]:
import joblib


# joblib.dump(final_model, "model.pkl")
joblib.dump(tfidf, "tfidf.pkl")

['tfidf.pkl']

In [40]:
import pickle
import os

def split_model(model, output_dir, chunk_size=50 * 1024 * 1024):
    """
    Split a large model into chunks and save them as .pkl files.

    Parameters:
    - model: The large model object to be saved.
    - output_dir: Directory where chunks will be saved.
    - chunk_size: Maximum size of each chunk in bytes (default: 50MB).
    """
    os.makedirs(output_dir, exist_ok=True)

    data = pickle.dumps(model)

    for i in range(0, len(data), chunk_size):
        chunk_path = os.path.join(output_dir, f"chunk_{i // chunk_size}.pkl")
        with open(chunk_path, 'wb') as chunk_file:
            chunk_file.write(data[i:i + chunk_size])

    print(f"Model saved in chunks to '{output_dir}'. Total chunks: {len(data) // chunk_size + 1}")


split_model(final_model, output_dir='model_chunks')


Model saved in chunks to 'model_chunks'. Total chunks: 11
