### import dependencies

In [2]:
import nltk
import numpy as np
import pandas as pd
import re
import xgboost as xgb
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
import spacy
from sklearn.metrics import classification_report
from sklearn.metrics import precision_score, accuracy_score, recall_score, roc_auc_score

### Date ingestion

In [3]:
df = pd.read_csv('https://raw.githubusercontent.com/entbappy/Branching-tutorial/refs/heads/master/tweet_emotions.csv')
df.head()

Unnamed: 0,tweet_id,sentiment,content
0,1956967341,empty,@tiffanylue i know i was listenin to bad habi...
1,1956967666,sadness,Layin n bed with a headache ughhhh...waitin o...
2,1956967696,sadness,Funeral ceremony...gloomy friday...
3,1956967789,enthusiasm,wants to hang out with friends SOON!
4,1956968416,neutral,@dannycastillo We want to trade with someone w...


In [4]:
df.drop(columns=['tweet_id'], axis=0, inplace=True)
df.head()

Unnamed: 0,sentiment,content
0,empty,@tiffanylue i know i was listenin to bad habi...
1,sadness,Layin n bed with a headache ughhhh...waitin o...
2,sadness,Funeral ceremony...gloomy friday...
3,enthusiasm,wants to hang out with friends SOON!
4,neutral,@dannycastillo We want to trade with someone w...


In [5]:
df['sentiment'].value_counts()

sentiment
neutral       8638
worry         8459
happiness     5209
sadness       5165
love          3842
surprise      2187
fun           1776
relief        1526
hate          1323
empty          827
enthusiasm     759
boredom        179
anger          110
Name: count, dtype: int64

In [6]:
final_df = df[df['sentiment'].isin(["happiness", "sadness"])].copy()

In [7]:
final_df["sentiment"].value_counts()

sentiment
happiness    5209
sadness      5165
Name: count, dtype: int64

In [8]:
final_df.shape

(10374, 2)

In [9]:
final_df['sentiment'] = final_df['sentiment'].replace({
    'happiness': 0,
    'sadness': 1
}).astype(int)


In [10]:
final_df.head()

Unnamed: 0,sentiment,content
1,1,Layin n bed with a headache ughhhh...waitin o...
2,1,Funeral ceremony...gloomy friday...
6,1,"I should be sleep, but im not! thinking about ..."
8,1,@charviray Charlene my love. I miss you
9,1,@kelcouch I'm sorry at least it's Friday?


In [39]:
train_data,test_data = train_test_split(final_df, test_size=0.2, random_state=42)

### data preprocessing

In [12]:
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /home/breezy/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/breezy/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [46]:

# nlp = spacy.load("en_core_web_sm", disable=['parser', 'ner'])
#
# stop_words = set(stopwords.words('english'))
# negative_words = {
#     'no', 'not', 'nor', 'don', "don't", 'ain', 'aren', "aren't",
#     'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't",
#     'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't",
#     'isn', "isn't", 'mightn', "mightn't", 'mustn', "mustn't",
#     'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't",
#     'wasn', "wasn't", 'weren', "weren't", 'won', "won't",
#     'wouldn', "wouldn't"
# }
# custom_stop_words = stop_words - negative_words
#
#
# # text lemmatization
# def lemmatization(text):
#     return " ".join([token.lemma_ for token in nlp(text)])
#
#
# # remove st
# def remove_stopwords(text):
#     return " ".join([i for i in str(text).split() if i not in custom_stop_words])
#
#
# # removing numbers
# def remove_numbers(text):
#     return ' '.join([i for i in text.split() if not i.isdigit()])
#
#
# # lower the cases
# def lower_case(text):
#     return text.lower()
#
#
# # remove punctuations
# def remove_punctuations(text):
#     return re.sub(r'[^\w\s]', '', text)
#
#
# # remove urls
# def remove_url(text):
#     return re.sub(r'http\S+|www\.\S+', '', text)
#
#
# def clean_text_pipline(text):
#     text = lower_case(text)
#     text = remove_url(text)
#     text = remove_stopwords(text)
#     text = remove_numbers(text)
#     text = remove_punctuations(text)
#     text = lemmatization(text)
#     return text
#
#
# # remove normalize
# def normalize_text(df):
#     df["content"] = df.content.apply(lambda content: clean_text_pipline(content))
#     return df
#
#
# # normalize sentence
# def normalize_sentence(text):
#     return clean_text_pipline(text)

In [48]:
import pandas as pd
import re
import spacy
from nltk.corpus import stopwords

nlp = spacy.load("en_core_web_sm", disable=['parser', 'ner'])

url_pattern = re.compile(r'https?://\S+|www\.\S+')
punct_pattern = re.compile(r'[^\w\s]')

stop_words = set(stopwords.words('english'))
negative_words = {
    'no', 'not', 'nor', 'don', "don't", 'ain', 'aren', "aren't",
    'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't",
    'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't",
    'isn', "isn't", 'mightn', "mightn't", 'mustn', "mustn't",
    'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't",
    'wasn', "wasn't", 'weren', "weren't", 'won', "won't",
    'wouldn', "wouldn't"
}
custom_stop_words = stop_words - negative_words
def normalize_text(df, text_column="content"):
    print("Starting Batch Normalization...")

    clean_series = df[text_column].astype(str).str.lower()
    clean_series = clean_series.str.replace(url_pattern, '', regex=True)
    clean_series = clean_series.str.replace(punct_pattern, '', regex=True)

    cleaned_texts = []


    for doc in nlp.pipe(clean_series.tolist(), batch_size=2000, n_process=-1):
        tokens = [
            token.lemma_ for token in doc
            if token.text not in custom_stop_words
            and not token.text.isdigit()
            and token.text.strip() != ''
        ]
        cleaned_texts.append(" ".join(tokens))

    df[text_column] = cleaned_texts
    print("Finished Batch Normalization...")
    return df

def normalize_sentence(text):

    text = str(text).lower()
    text = url_pattern.sub('', text)
    text = punct_pattern.sub('', text)
    doc = nlp(text)

    tokens = [
        token.lemma_ for token in doc
        if token.text not in custom_stop_words
        and not token.text.isdigit()
        and token.text.strip() != ''
    ]

    return " ".join(tokens)

In [49]:
test_data = normalize_text(test_data)
train_data = normalize_text(train_data)

Starting Batch Normalization...
Finished Batch Normalization...
Starting Batch Normalization...
Finished Batch Normalization...


### feature engineering

In [50]:
X_train = train_data['content']
y_train = train_data['sentiment']
X_test = test_data['content']
y_test = test_data['sentiment']

In [51]:
# apply the bag of words
vectorizer = CountVectorizer()

X_train_bow = vectorizer.fit_transform(X_train)
X_test_bow = vectorizer.transform(X_test)

#### optional thing(only for visualize the dataframe)

In [52]:
train_df = pd.DataFrame(X_train_bow.toarray())

In [53]:
feature_names =vectorizer.get_feature_names_out()

In [54]:
df_view = pd.DataFrame(X_train_bow.toarray(), columns=feature_names)

### Model building

In [55]:
xgb_model = xgb.XGBClassifier(eval_metric='logloss')
xgb_model.fit(X_train_bow, y_train)

0,1,2
,"objective  objective: typing.Union[str, xgboost.sklearn._SklObjWProto, typing.Callable[[typing.Any, typing.Any], typing.Tuple[numpy.ndarray, numpy.ndarray]], NoneType] Specify the learning task and the corresponding learning objective or a custom objective function to be used. For custom objective, see :doc:`/tutorials/custom_metric_obj` and :ref:`custom-obj-metric` for more information, along with the end note for function signatures.",'binary:logistic'
,"base_score  base_score: typing.Union[float, typing.List[float], NoneType] The initial prediction score of all instances, global bias.",
,booster,
,"callbacks  callbacks: typing.Optional[typing.List[xgboost.callback.TrainingCallback]] List of callback functions that are applied at end of each iteration. It is possible to use predefined callbacks by using :ref:`Callback API `. .. note::  States in callback are not preserved during training, which means callback  objects can not be reused for multiple training sessions without  reinitialization or deepcopy. .. code-block:: python  for params in parameters_grid:  # be sure to (re)initialize the callbacks before each run  callbacks = [xgb.callback.LearningRateScheduler(custom_rates)]  reg = xgboost.XGBRegressor(**params, callbacks=callbacks)  reg.fit(X, y)",
,colsample_bylevel  colsample_bylevel: typing.Optional[float] Subsample ratio of columns for each level.,
,colsample_bynode  colsample_bynode: typing.Optional[float] Subsample ratio of columns for each split.,
,colsample_bytree  colsample_bytree: typing.Optional[float] Subsample ratio of columns when constructing each tree.,
,"device  device: typing.Optional[str] .. versionadded:: 2.0.0 Device ordinal, available options are `cpu`, `cuda`, and `gpu`.",
,"early_stopping_rounds  early_stopping_rounds: typing.Optional[int] .. versionadded:: 1.6.0 - Activates early stopping. Validation metric needs to improve at least once in  every **early_stopping_rounds** round(s) to continue training. Requires at  least one item in **eval_set** in :py:meth:`fit`. - If early stopping occurs, the model will have two additional attributes:  :py:attr:`best_score` and :py:attr:`best_iteration`. These are used by the  :py:meth:`predict` and :py:meth:`apply` methods to determine the optimal  number of trees during inference. If users want to access the full model  (including trees built after early stopping), they can specify the  `iteration_range` in these inference methods. In addition, other utilities  like model plotting can also use the entire model. - If you prefer to discard the trees after `best_iteration`, consider using the  callback function :py:class:`xgboost.callback.EarlyStopping`. - If there's more than one item in **eval_set**, the last entry will be used for  early stopping. If there's more than one metric in **eval_metric**, the last  metric will be used for early stopping.",
,enable_categorical  enable_categorical: bool See the same parameter of :py:class:`DMatrix` for details.,False


In [56]:
y_pred = xgb_model.predict(X_test_bow)

### Model evaluation

In [57]:
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("Classification Report:\n", classification_rep)

Accuracy: 0.7898795180722892
Classification Report:
               precision    recall  f1-score   support

           0       0.75      0.86      0.80      1015
           1       0.84      0.72      0.78      1060

    accuracy                           0.79      2075
   macro avg       0.80      0.79      0.79      2075
weighted avg       0.80      0.79      0.79      2075



In [58]:
# Make predictions
y_pred = xgb_model.predict(X_test_bow)
y_pred_proba = xgb_model.predict_proba(X_test_bow)[:, 1]

# Calculate evaluation metrics
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_pred_proba)

In [59]:
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"AUC: {auc}")

Precision: 0.8443708609271523
Recall: 0.7216981132075472
AUC: 0.8717622455618551
