In [12]:
# https://medium.com/@sacbis9/itsm-open-active-incidents-closing-prediction-incident-management-bdf1684d84dc

In [13]:
# basic libs
import re
from string import punctuation

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

# sklearn
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split

# NLP
import nltk
import spacy
import emoji
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import joblib  # or import pickle

# sentence-transformers
from sentence_transformers import SentenceTransformer

# local exception handler (keep if you have this module)
# from src.exception_handler import handle_exception

# load models / resources
# NOTE: make sure the spacy model 'en_core_web_sm' is installed in the environment.
nlp = spacy.load("en_core_web_sm", disable=['parser', 'ner'])  # faster if parser+ner unused
en = spacy.load("en_core_web_sm")  # if you need full pipeline elsewhere

# sentence-transformer embedder (device can be "cpu" or "cuda")
embedder = SentenceTransformer('all-MiniLM-L6-v2', device="cpu")


In [14]:
# NLTK resources (run once)
import nltk
nltk.download('punkt')
nltk.download('stopwords')
# optionally:
# nltk.download('wordnet')
# nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Akash\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Akash\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [15]:
train_data_path = "./data/train_sentiment_tweet.csv"
test_data_path = "./data/test_sentiment_tweet.csv"
train_data = pd.read_csv(train_data_path)
test_data = pd.read_csv(test_data_path)
# normalize column names
for i in [train_data, test_data]:
    i.columns = i.columns.str.lower().str.replace(" ", "_")

    

# ✅ Drop duplicates
print("Train data before dropping duplicates:", train_data.shape)
print("Test data before dropping duplicates:", test_data.shape)
train_data = train_data.drop_duplicates(keep="first")
test_data = test_data.drop_duplicates(keep="first")
print("Train data after dropping duplicates:", train_data.shape)
print("Test data after dropping duplicates:", test_data.shape)

# ✅ Reset index
train_data = train_data.reset_index(drop=True)
test_data = test_data.reset_index(drop=True)



Train data before dropping duplicates: (10000, 2)
Test data before dropping duplicates: (4913, 2)
Train data after dropping duplicates: (9591, 2)
Test data after dropping duplicates: (4809, 2)


In [16]:
import re
import emoji
from bs4 import BeautifulSoup


def clean_html_data_in_string(input_string: str) -> str:
    """
    Cleans HTML tags from the input string.

    Args:
        input_string (str): The string containing HTML data.

    Returns:
        str: The string with HTML content removed and only text retained.
    """
    soup = BeautifulSoup(input_string, "html.parser")
    return soup.get_text()


def remove_email_n_url(text: str) -> str:
    """
    Remove URLs, emails, and domain-like patterns from text.

    Args:
        text (str): Input text.

    Returns:
        str: Cleaned text without URLs/emails.
    """
    text = re.sub(
        r"https?://\S+|www\.\S+|\S+@\S+|[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}",
        "",
        text,
    )
    text = re.sub(r"\s+", " ", text).strip()
    return text


def give_emoji_free_text(text: str) -> str:
    """
    Remove emojis and emoticons from text.

    Args:
        text (str): Input text.

    Returns:
        str: Cleaned text without emojis/emoticons.
    """
    # Remove emojis using the emoji library
    text = emoji.replace_emoji(text, replace="")

    # Define regex for common emoticons
    emoticon_pattern = r"""
        (?:
          [<>]?
          [:;=8]                     # eyes
          [\-o\*\']?                  # optional nose
          [\)\]\(\[dDpP/\:\}\{@\|\\]  # mouth
        )
        |
        (?:
          [\)\]\(\[dDpP/\:\}\{@\|\\]  # mouth
          [\-o\*\']?                  # optional nose
          [:;=8]                      # eyes
          [<>]?
        )
    """
    text = re.sub(emoticon_pattern, "", text, flags=re.VERBOSE)
    text = re.sub(r"\s+", " ", text).strip()
    return text


def removing_punctuation(text: str) -> str:
    """
    Remove punctuation and underscores from the input text.

    Args:
        text (str): Input text.

    Returns:
        str: Cleaned text without punctuation/underscores.
    """
    # Remove punctuation (except underscore)
    text = re.sub(r"[^\w\s]", "", text)
    # Replace underscores with space
    text = text.replace("_", " ")
    # Clean up spaces
    text = re.sub(r"\s+", " ", text).strip()
    return text


def rem_numbers(text: str) -> str:
    """
    Remove all numeric digits from the input text.
    """
    return re.sub(r"\d+", "", text)


def removing_whitespaces(text: str) -> str:
    """
    Normalize multiple whitespaces into a single space.
    """
    return re.sub(r"\s+", " ", text).strip()


In [17]:

def tokenization(text: str, as_string: bool = False):
    """
    Tokenize the input text using Spacy.

    Args:
        text (str): The text to tokenize.
        as_string (bool): If True, returns tokens as a single string joined by spaces.
                          If False, returns a list of tokens.

    Returns:
        list[str] | str: Tokenized text.
    """
    doc = nlp(text)
    tokens = [token.text for token in doc]

    return " ".join(tokens) if as_string else tokens


def rem_stop_words(word_list):
    """
    Remove stop words from a list of words using Spacy's stop words list.

    Args:
        word_list (list[str]): List of words.

    Returns:
        str: Filtered words joined by spaces.
    """
    sw_list = nlp.Defaults.stop_words
    filtered_words = [word for word in word_list if word.lower() not in sw_list]
    return " ".join(filtered_words)


def lemmatization(text: str):
    """
    Lemmatizes the input text using Spacy.

    Args:
        text (str): The input text.

    Returns:
        list[str]: A list of lemmatized words.
    """
    doc = nlp(text)
    return [token.lemma_ for token in doc]


def remove_short_words(text: str):
    """
    Removes words with fewer than 3 characters.

    Args:
        text (str): The input text.

    Returns:
        str: Text with short words removed.
    """
    return " ".join(word for word in text.split() if len(word) > 2)


def sentence_transformation(corpus: list[str]) -> pd.DataFrame:
    """
    Encode sentences into embeddings.

    Args:
        corpus (list[str]): A list of sentences.

    Returns:
        pd.DataFrame: Sentence embeddings.
    """
    embeddings = embedder.encode(corpus)
    return pd.DataFrame(embeddings)


def preprocess_test_data(data: pd.DataFrame, required_columns: list[str]) -> pd.DataFrame:
    """
    Preprocess test data:
    - Merge required columns
    - Clean text
    - Tokenize, remove stopwords, lemmatize, etc.
    - Return sentence embeddings as DataFrame.

    Args:
        data (pd.DataFrame): Input data.
        required_columns (list[str]): Columns to merge into text.

    Returns:
        pd.DataFrame: Embeddings.
    """
    df = data.copy()

    # Merge columns into single text field
    try:
        df["text"] = df[required_columns].astype(str).apply(lambda x: " ".join(x), axis=1)
    except Exception as e:
        raise ValueError(f"Error while merging columns into text: {e}")

    try:
        df["text"] = df["text"].apply(clean_html_data_in_string)
        df["text"] = df["text"].apply(remove_email_n_url)
        df["text"] = df["text"].apply(give_emoji_free_text)
        df["text"] = df["text"].apply(removing_punctuation)
        df["text"] = df["text"].apply(rem_numbers)
        df["text"] = df["text"].apply(removing_whitespaces)
        df["text"] = df["text"].apply(tokenization)  # returns list
        df["text"] = df["text"].apply(rem_stop_words)
        df["text"] = df["text"].apply(lemmatization).apply(lambda x: " ".join(x))
        df["text"] = df["text"].apply(remove_short_words)
        df["text"] = df["text"].apply(str.strip).apply(str.lower)
    except Exception as e:
        raise ValueError(f"Error during preprocessing: {e}")

    # Generate embeddings
    try:
        bert_df = sentence_transformation(df["text"].fillna("").reset_index(drop=True).tolist())
    except Exception as e:
        raise ValueError(f"Error during sentence transformation: {e}")

    return bert_df


In [19]:
# %%
# 1. Preprocess the TRAINING data to get BERT embeddings (X_train)
required_columns = ['tweet']  # Replace with the actual column name(s) containing the text

# Preprocess training data
X_train = preprocess_test_data(train_data, required_columns)
# Extract the target variable from TRAINING data (replace 'label' with your actual column name)
y_train = train_data['label']

# 2. Preprocess the TESTING data to get BERT embeddings (X_test)
X_test = preprocess_test_data(test_data, required_columns)
# Extract the target variable from TESTING data (replace 'label' with your actual column name)
y_test = test_data['label']

# 3. Initialize and train the SVM model on TRAINING data
from sklearn.svm import SVC
svm_model = SVC(
    # random_state=42, 
   
    random_state=42,
     kernel='linear',

    # n_estimators=100,
    max_depth=None,        # Let trees grow deep
    min_samples_split=2,   # Minimum samples to split a node
    min_samples_leaf=1,    # Minimum samples at a leaf node
    class_weight='balanced'
                )  # Changed to SVC with linear kernel
svm_model.fit(X_train, y_train)

# 4. Make predictions on TEST data and evaluate
y_pred = svm_model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"SVM Model Accuracy: {accuracy:.4f}")

# Print a detailed classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Print the confusion matrix
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

# 5. (Optional) Save the trained model for later use
# joblib.dump(svm_model, 'svm_sentiment_model.pkl')
# To load it later: loaded_model = joblib.load('svm_sentiment_model.pkl')

TypeError: SVC.__init__() got an unexpected keyword argument 'max_depth'

In [None]:
# accuracy previously = 0.7105

In [None]:
# Create a DataFrame from your list
custom_test_data = pd.DataFrame({
    'tweet': ["this is the best post i have ever seen in my life this is fantastic",
              "this is the worst post i have ever seen in my life this is bad",
              "i am happy today because i got a new job",
              "i am sad today because i lost my job",
              "this movie is ok to watch once but not great",
              "the food was terrible and the service was worse",]
})

# Preprocess the custom test data
X_custom_test = preprocess_test_data(custom_test_data, required_columns)

# Make predictions on the custom test data
y_custom_pred = svm_model.predict(X_custom_test)
print("Predictions for custom sentences:", y_custom_pred)

# Optional: Print the sentences with their predictions
for sentence, prediction in zip(custom_test_data['tweet'], y_custom_pred):
    sentiment = "Happy" if prediction == 1 else "Sad"
    print(f"'{sentence}' -> {sentiment} ({prediction})")

Predictions for custom sentences: [ 1 -1  1 -1  1 -1]
'this is the best post i have ever seen in my life this is fantastic' -> Happy (1)
'this is the worst post i have ever seen in my life this is bad' -> Sad (-1)
'i am happy today because i got a new job' -> Happy (1)
'i am sad today because i lost my job' -> Sad (-1)
'this movie is ok to watch once but not great' -> Happy (1)
'the food was terrible and the service was worse' -> Sad (-1)
