In [21]:
import pandas as pd

df=pd.read_csv("../data/sentiment_data/tweets_with_sentiment_1.csv")
print(df.shape)


(50000, 6)


## Preprocessing 

In [22]:
print("Columns in the dataset")
print(df.columns)
columns_to_drop = ['hashtags', 'is_retweet', 'lang','date'] 
#remove the columns which are not needed for the analysis

df = df.drop(columns=columns_to_drop)
print("Columns after dropping some columns")
print(df.columns)
print("New shape of the dataset ",df.shape)


Columns in the dataset
Index(['date', 'text', 'hashtags', 'is_retweet', 'lang', 'sentiment'], dtype='object')
Columns after dropping some columns
Index(['text', 'sentiment'], dtype='object')
New shape of the dataset  (50000, 2)


In [23]:
print(df.isnull().sum())
df = df[df['text'].notna()]  # Remove rows with NaN in 'text'


text         1
sentiment    0
dtype: int64


In [24]:
import emoji
import spacy
import re

# Load spaCy English model for lemmatization
nlp = spacy.load("en_core_web_sm")

def preprocess_text(text):
    # 1. Convert emojis to text like ":smile:"
    text = emoji.demojize(text, delimiters=(" ", " "))  # spaces instead of colons for cleaner tokenization
    
    # 2. Lowercase the text
    text = text.lower()
    
    
    # 3. Use spaCy to tokenize and lemmatize
    doc = nlp(text)
    
    # 4. Keep only lemmas that are alphabetic (filter out punctuation, numbers)
    lemmas = [token.lemma_ for token in doc if token.is_alpha]
    
    # 5. Join back to string
    return " ".join(lemmas)


In [26]:
df['preprocess_text']=df['text'].apply(preprocess_text)

## Vectorization of the text

In [39]:
X=df['preprocess_text']
y=df['sentiment']

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [40]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer=TfidfVectorizer(
    stop_words='english',
    lowercase=True, 
    max_features=10000,
    min_df=0.01# remove words that appear in less than 1% of the documents
)



X_train_tfidf=vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)
print("Shape of the feature matrix X: ",X_train_tfidf.shape)
print("Shape of the target vector Y: ",y_train.shape)


Shape of the feature matrix X:  (39999, 121)
Shape of the target vector Y:  (39999,)


## Random Forest model

In [45]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import classification_report
from scipy.stats import randint

# Define the parameter distribution
param_dist = {
    'n_estimators': randint(100, 500),
    'max_depth': randint(50, 1000),
    'min_samples_split': randint(2, 10),
    'min_samples_leaf': randint(1, 5),
    'criterion': ['gini', 'entropy'],
    'bootstrap': [True, False]
}

# Initialize the classifier
rf = RandomForestClassifier(random_state=42)

# Setup RandomizedSearchCV
random_search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_dist,
    n_iter=20,  # Try 20 random combinations
    cv=5,
    scoring='accuracy',  # Change to f1_macro, roc_auc, etc. if needed
    n_jobs=-1,
    verbose=2,
    random_state=42
)

# Fit the model
random_search.fit(X_train_tfidf, y_train)

# Get best model
best_rf = random_search.best_estimator_
print("Best Parameters:", random_search.best_params_)

# Predict and evaluate
y_pred = best_rf.predict(X_test_tfidf)
print(classification_report(y_test, y_pred))


Fitting 5 folds for each of 20 candidates, totalling 100 fits
Best Parameters: {'bootstrap': False, 'criterion': 'entropy', 'max_depth': 51, 'min_samples_leaf': 2, 'min_samples_split': 7, 'n_estimators': 153}
              precision    recall  f1-score   support

    negative       0.90      0.38      0.53      1392
     neutral       0.66      0.86      0.75      4295
    positive       0.78      0.70      0.74      4313

    accuracy                           0.72     10000
   macro avg       0.78      0.65      0.67     10000
weighted avg       0.75      0.72      0.71     10000

