# English Tweets Sentiment Classifier

> **Note:**  
> Code comments throughout this notebook include references to experiments conducted during the development process.  
> These experiments are discussed and analyzed in detail in the accompanying report.


* **Downloads and Unzips**

In [None]:
!pip install nltk
!pip install wordnet
!pip install negspacy
!unzip /usr/share/nltk_data/corpora/wordnet.zip -d /usr/share/nltk_data/corpora/

* **Imports all libraries**

In [None]:
import pandas as pd
import numpy as np
import optuna

from sklearn.model_selection import cross_val_score
from sklearn.utils import resample


import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import spacy
from negspacy.negation import Negex
from spacy.tokens import Doc
from sklearn.metrics import roc_curve, auc
from sklearn.preprocessing import label_binarize

from wordcloud import WordCloud
import re#Word Frequency

from collections import Counter
import string
# from sklearn.preprocessing import StandardScaler
from nltk.tokenize import TweetTokenizer
from textblob import TextBlob
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk import pos_tag
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import learning_curve
from html import unescape 

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.model_selection import RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer

from sklearn.base import TransformerMixin, BaseEstimator


* **Loads the training, validation, and test datasets**

In [None]:
# Read the three competition splits from the Kaggle input directory into DataFrames.
# Later cells read a 'Text' column from every split, a 'Label' column from the
# train/validation splits, and an 'ID' column from the test split.
train_data = pd.read_csv('/kaggle/input/ai-2-deep-learning-for-nlp-homework-1/train_dataset.csv')
test_data = pd.read_csv('/kaggle/input/ai-2-deep-learning-for-nlp-homework-1/test_dataset.csv')
val_data = pd.read_csv('/kaggle/input/ai-2-deep-learning-for-nlp-homework-1/val_dataset.csv')

* **Performs EDA**

In [None]:
# Splits inspected in this cell, keyed by the name used in the printed headings.
eda_splits = {"Train": train_data, "Validation": val_data, "Test": test_data}

for split_name, split_df in eda_splits.items():
    print(f"{split_name} Data Head:\n", split_df.head())

for split_name, split_df in eda_splits.items():
    print(f"\n{split_name} Data Info:")
    # DataFrame.info() prints its report itself and returns None, so it must not be
    # wrapped in print() (that would emit a stray "None" line after each report).
    split_df.info()


def report_and_fill_missing(frame, split_name, fill_label=True):
    """Print per-column null counts for one split and fill the gaps.

    Args:
        frame: DataFrame with a 'Text' column (and a 'Label' column when
            ``fill_label`` is True). Its columns are reassigned, so the caller's
            frame object is updated.
        split_name: Name used in the printed heading (e.g. "Train").
        fill_label: When True, missing 'Label' values are replaced with the most
            frequent label of that split. Pass False for unlabelled splits.
    """
    print(f"\nMissing Values in {split_name} Data:")
    print(frame.isnull().sum())

    if frame['Text'].isnull().any():
        print("\nFilling missing values in 'Text' column with 'No Text'")
        # Assign the result back instead of `frame['Text'].fillna(..., inplace=True)`:
        # the chained in-place form operates on a temporary Series, warns on
        # pandas 2.x and silently does nothing once copy-on-write is enabled.
        frame['Text'] = frame['Text'].fillna("No Text")

    # For the Label column, fill with the most frequent value (mode)
    if fill_label and frame['Label'].isnull().any():
        mode_value = frame['Label'].mode()[0]
        print(f"\nFilling missing values in 'Label' column with the most frequent value: {mode_value}")
        frame['Label'] = frame['Label'].fillna(mode_value)


report_and_fill_missing(train_data, "Train")
report_and_fill_missing(val_data, "Validation")
# The test split is only cleaned on 'Text'; its labels are what the model predicts.
report_and_fill_missing(test_data, "Test", fill_label=False)

# nunique() counts distinct values per COLUMN (not distinct words), so the heading
# says exactly that.
for split_name, split_df in eda_splits.items():
    print(f"\n Unique values per column in {split_name.lower()} dataset:")
    print(split_df.nunique())

# Summary statistics of the Label column, one row per labelled split
label_columns = {"Training": train_data['Label'], "Validation": val_data['Label']}
stats_df = pd.DataFrame({
    "Dataset": list(label_columns),
    "Count": [labels.count() for labels in label_columns.values()],
    "Mean": [labels.mean() for labels in label_columns.values()],
    "Std": [labels.std() for labels in label_columns.values()],
    "Min": [labels.min() for labels in label_columns.values()],
    "Max": [labels.max() for labels in label_columns.values()]
})

# Render the table as an annotated heatmap (rows = splits, columns = statistics)
plt.figure(figsize=(8, 4))
sns.heatmap(
    stats_df.set_index("Dataset"),
    annot=True,
    cmap="Blues",
    fmt=".2f",
    linewidths=0.5,
)
plt.title("Statistical Overview of Labels in Training and Validation Datasets")
plt.show()

# Class balance of the labelled splits: print the counts, then draw a count plot.
for split_label, labelled_df in (("Train", train_data), ("Validation", val_data)):
    heading = f"Label Distribution in {split_label} Data"
    print(f"\n{heading}:")
    print(labelled_df['Label'].value_counts())
    sns.countplot(x='Label', data=labelled_df)
    plt.title(heading)
    plt.show()

# Word cloud over the whole training corpus (all tweets joined with single spaces)
text = ' '.join(train_data['Text'].tolist())
wordcloud = WordCloud(
    width=800,
    height=400,
    background_color='white',
).generate(text)

cloud_fig, cloud_ax = plt.subplots(figsize=(10, 5))
cloud_ax.imshow(wordcloud, interpolation='bilinear')
cloud_ax.axis('off')
cloud_ax.set_title('Word Cloud from Texts')
plt.show()

# Twenty most frequent lowercase word tokens (runs of \w characters) in the same corpus
words = re.findall(r'\b\w+\b', text.lower())
word_freq = Counter(words).most_common(20)
word_freq_df = pd.DataFrame(word_freq, columns=['Word', 'Frequency'])

# Horizontal bar chart, most frequent word at the top
plt.figure(figsize=(10, 6))
freq_ax = sns.barplot(x='Frequency', y='Word', data=word_freq_df)
freq_ax.set_title('20 Most Common Words')
freq_ax.set_xlabel('Frequency')
freq_ax.set_ylabel('Words')
plt.show()

# Step 7: Outlier check on the Label column (only for numerical columns) - report only
if np.issubdtype(train_data['Label'].dtype, np.number):
    Q1 = train_data['Label'].quantile(0.25)
    Q3 = train_data['Label'].quantile(0.75)
    IQR = Q3 - Q1
    outliers = (train_data['Label'] < (Q1 - 1.5 * IQR)) | (train_data['Label'] > (Q3 + 1.5 * IQR))
    print("\nNumber of outliers in Label column:", outliers.sum())

    # Flagged rows are intentionally NOT removed. 'Label' is the classification
    # target used to fit the model, so its values are class ids rather than
    # measurements. When one class covers more than 75% of the rows, Q1 == Q3,
    # the IQR is 0 and every row of the other class is flagged; dropping those
    # rows would delete an entire class from the training data.


# Text Length per Label (train dataset)
# .assign() returns a new frame with the extra column. Writing the column with
# `train_data['Text_Length'] = ...` raises SettingWithCopyWarning whenever
# `train_data` was produced by a boolean filter in an earlier step.
train_data = train_data.assign(Text_Length=train_data['Text'].apply(len))  # length in characters
sns.boxplot(x='Label', y='Text_Length', data=train_data)
plt.title('Text Length per Label')
plt.show()

def plot_top_ngrams(texts, n, label, top_k=20):
    """Count word n-grams in ``texts`` and plot the most frequent ones.

    Args:
        texts: Iterable of raw documents passed to CountVectorizer.
        n: N-gram size; only n-grams of exactly this length are counted.
        label: Singular name of the n-gram kind (e.g. 'Bigram'); used as the
            DataFrame column name and, pluralised with an 's', in the plot labels.
        top_k: Number of n-grams to keep.

    Returns:
        DataFrame with columns [label, 'Frequency'], sorted by descending frequency.
    """
    vectorizer = CountVectorizer(ngram_range=(n, n))
    counts = vectorizer.fit_transform(texts)
    # Column sums of the document-term matrix = corpus-wide count of each n-gram
    totals = counts.sum(axis=0).tolist()[0]
    ranked = sorted(
        zip(vectorizer.get_feature_names_out(), totals),
        key=lambda pair: pair[1],
        reverse=True,
    )[:top_k]

    ngram_df = pd.DataFrame(ranked, columns=[label, 'Frequency'])

    plt.figure(figsize=(10, 6))
    sns.barplot(x='Frequency', y=label, data=ngram_df)
    plt.title(f'Top {top_k} Most Frequent {label}s')
    plt.xlabel('Frequency')
    plt.ylabel(f'{label}s')
    plt.show()
    return ngram_df


# 1. Bigram Analysis (2-word combinations)
bigram_df = plot_top_ngrams(train_data['Text'], 2, 'Bigram')

# 2. Trigram Analysis (3-word combinations)
trigram_df = plot_top_ngrams(train_data['Text'], 3, 'Trigram')

* **Defines text preprocessing function**

In [None]:

# lemmatizer = WordNetLemmatizer() # Initialize the stemmer
# stemmer = PorterStemmer()
# Initial stopwords list
# stop_words = set(stopwords.words("english"))
# Remove negations from stopwords list
# negation_words = {"no", "nor", "not", "don't", "doesn't", "didn't", "won't", "wouldn't", "isn't", "wasn't", "aren't", "weren't", "haven't", "hasn't", "hadn't", "can't", "couldn't", "shouldn't", "mustn't", "mightn't"}
# stop_words = stop_words - negation_words
# custom_stopwords = {"i", "to", "the", "a", "my", "and", "it", "you", "is", "for", "in", "s", "of", "t", "that", "on", "me", "so", "have", "m"}

# Load spaCy's small English pipeline.
# NOTE(review): `nlp` is not referenced by any function visible in this notebook -
# confirm it is still needed before paying for the model load.
nlp = spacy.load("en_core_web_sm")

# Single-token negation cues that handle_negation() looks for in the token stream.
NEGATION_WORDS = {"no", "not", "none", "neither", "nor", "never"}
# Multi-word expressions that handle_negation() rewrites to the underscore-joined
# value before tokenizing, so they are not treated as a plain "not".
NEGATION_PHRASES = {
    "not at all": "not_at_all",
    "not only": "not_only"
}

def handle_negation(text):
    """Mark negated words in ``text``.

    The text is lowercased, every key of NEGATION_PHRASES is replaced by its
    single-token value, and the result is tokenized with ``word_tokenize``.
    Each token found in NEGATION_WORDS is replaced by "[NEG]" and the next word
    token is prefixed with "not_". A punctuation token directly after a negation
    cue closes the negation instead of being prefixed.

    Args:
        text: Input string.

    Returns:
        The processed tokens joined by single spaces (always lowercase).
    """
    # Lowercase BEFORE matching phrases so "Not at all" is recognised as well;
    # the phrase keys are lowercase and the output is lowercase either way.
    text = text.lower()

    # Process phrases first
    for phrase, replacement in NEGATION_PHRASES.items():
        text = text.replace(phrase, replacement)

    processed_words = []
    negated = False

    for word in word_tokenize(text):
        if word in NEGATION_WORDS:
            negated = True
            processed_words.append("[NEG]")  # Keep negation token
        elif negated and not any(ch.isalnum() for ch in word):
            # Punctuation ends the negation scope ("no, thanks"): emit it as is
            # rather than producing a meaningless "not_," feature.
            processed_words.append(word)
            negated = False
        elif negated:
            processed_words.append(f"not_{word}")
            negated = False
        else:
            processed_words.append(word)

    return " ".join(processed_words)

# Emoticon-to-token mapping. Every replacement is padded with a space on BOTH
# sides so the token never fuses with a neighbouring word.
_EMOTICON_MAP = {
    # Positive
    ":‑)": " [happy] ",  # variant typed with a non-ASCII hyphen
    ":-)": " [happy] ",  # plain ASCII hyphen variant
    ":)": " [happy] ", ":-]": " [happy] ",
    ":]": " [happy] ", ":-3": " [happy] ", ":3": " [happy] ",
    "=]": " [happy] ", "=)": " [happy] ",
    "=D": " [happy] ", "<3": " [love] ",

    # Negative
    ":-(": " [sad] ",
    "=(": " [sad] ", ":(": " [sad] ", ":-c": " [sad] ",
    ":-[": " [sad] ", ":[": " [sad] ", ":{": " [sad] ",
}


def replace_emoticons(text):
    """
    Replace common emoticons with bracketed sentiment tokens.

    After the replacement, any character repeated three or more times in a row
    is reduced to two occurrences and whitespace runs are collapsed to a single
    space; leading/trailing whitespace is stripped.

    NOTE(review): matching is plain, case-sensitive substring replacement, so
    "=D" only matches with a capital D and ":3" inside a time such as "5:30"
    is replaced as well.

    Args:
        text: Input string; emoticons must still be present (i.e. punctuation
            must not have been stripped yet).

    Returns:
        The cleaned string with emoticons replaced by "[happy]", "[love]" or "[sad]".
    """
    # Replace emoticons
    for emoticon, replacement in _EMOTICON_MAP.items():
        text = text.replace(emoticon, replacement)

    # Handle repeated characters ("Heyyyy" -> "Heyy")
    text = re.sub(r'(.)\1{2,}', r'\1\1', text)

    # Clean whitespace
    text = re.sub(r'\s+', ' ', text).strip()

    return text

def replace_social_sentiment(text):
    """
    Replace social-media interjections with sentiment words.

    Matching is done on a lowercased copy of the text. If at least one pattern
    matches, that lowercased, substituted copy is returned; otherwise the input
    is returned unchanged (original casing preserved).

    Args:
        text: Input value. Non-strings and empty/whitespace-only strings are
            returned as they are.

    Returns:
        The substituted lowercase text, or the original ``text`` when nothing matched.
    """
    if not isinstance(text, str) or not text.strip():
        return text  # Return original for non-text

    SOCIAL_PATTERNS = {
        # Laughter of any length: "haha", "hahaha", "hahahaha", "hahahaaa", ...
        # (the previous r"\bhahaha+\b" missed "haha" and "hahahaha").
        r"\b(?:ha){2,}a*\b": "happy",
        # "hehe", "hehehe", "heheee", ... (previously "hehehe" was missed).
        r"\b(?:he){2,}e*\b": "happy",
        r"\blol\b": "happy",
        r"\blmao\b": "happy",
        r"\bbruh+\b": "awkward",
        r"\boof\b": "awkward",
        r"\bwow\b": "surprise",
        r"\bomg\b": "surprise",
        r"\bugh\b": "annoyed",
        r"\bsmh\b": "disappointed"
    }

    modified = False
    new_text = text.lower()  # Work in lowercase for consistency

    for pattern, replacement in SOCIAL_PATTERNS.items():
        # subn reports the number of substitutions, so one scan per pattern suffices
        new_text, hits = re.subn(pattern, replacement, new_text)
        if hits:
            modified = True

    return new_text if modified else text

def remove_unicode(text):
    """Drop non-ASCII characters and most symbols, then normalise whitespace.

    Only word characters, whitespace and the punctuation marks . , ! ? ; survive;
    runs of whitespace become one space and the ends are trimmed.
    """
    ascii_only = text.encode('ascii', 'ignore').decode('ascii')
    # Remove every symbol outside the whitelist
    without_symbols = re.sub(r"[^\w\s.,!?;]", "", ascii_only)
    return re.sub(r"\s+", " ", without_symbols).strip()

def preprocess_text(text):
    """Main text preprocessing pipeline.

    Order matters: remove_unicode() deletes every character other than word
    characters, whitespace and . , ! ? ; - including '@', ':', '(', ')', '=',
    '<', '[' and ']'. Mentions and emoticons therefore have to be handled
    BEFORE it runs, and emoticons before lowercasing (the "=D" key is
    case-sensitive). With the previous order those two steps never matched.

    Args:
        text: Raw tweet text.

    Returns:
        The cleaned, lowercased, negation-marked text.
    """
    # Remove mentions while the '@' marker is still present
    text = re.sub(r"@[\w\-_]+", "", text)
    # Map emoticons to sentiment tokens while their symbols are still present;
    # remove_unicode() below strips the brackets, leaving the bare sentiment word
    text = replace_emoticons(text)
    text = remove_unicode(text)
    text = text.lower()
    text = replace_social_sentiment(text)
    # Remove numbers
    text = re.sub(r'\d+', '', text)
    # Handle negation
    text = handle_negation(text)
    return text

    # text = text.translate(str.maketrans('', '', string.punctuation)) 
    
    # # Correct the spelling mistakes
    # text = re.sub(r"\b(luv)\b", "love", text)            
    # text = re.sub(r"\b(amzing)\b", "amazing", text)
    # text = re.sub(r"\b(terible)\b", "terrible", text)
    # text = re.sub(r"\b(excelent)\b", "excellent", text)
    # text = re.sub(r"\b(perfonmence)\b", "performance", text)
    # text = re.sub(r"\b(gud)\b", "good", text)
    # text = re.sub(r"\b(vry)\b", "very", text)
    # text = re.sub(r"\b(fanstic)\b", "fantastic", text)
    # text = re.sub(r"\b(gr8)\b", "great", text)
    # text = re.sub(r"\b(horrble)\b", "horrible", text)
    # text = re.sub(r"\b(u)\b", "you", text)
    # text = re.sub(r"\b(guyz)\b", "guys", text)
    # text = re.sub(r"\b(knw)\b", "know", text)
    # text = re.sub(r"\b(da)\b", "the", text)
    # text = re.sub(r"\b(btw)\b", "by the way", text)
    # text = re.sub(r"\b(r)\b", "are", text)
    # text = re.sub(r"\b(cuz)\b", "because", text)
    # text = re.sub(r"\b(tho)\b", "though", text)
    # text = re.sub(r"\b(lol)\b", "laugh out loud", text)
    # text = re.sub(r"\b(ur)\b", "your", text)


    
        # tokens = tweet_tokenizer.tokenize(text)  # Tokenization using TweetTokenizer

    # return " ".join(tokens)  # Return processed text as string

    # #Tokenization
    # words = word_tokenize(text)
    # return " ".join(words)

    # Remove custom stopwords or 
    # Remove stopwords but keep negations
    # words = [word for word in words if word not in stop_words]
    # processed_text = ' '.join(words)
    
    # words = [word for word in words if word not in custom_stopwords]

    # Lemmatization 
    # lemmatized_words = [lemmatizer.lemmatize(word) for word in words]
    # processed_text = ' '.join(lemmatized_words)
    # return processed_text

    # tagged_words = pos_tag(words)
    # # Keep only nouns (NN) and verbs (VB) //no
    # filtered_words = [word for word, tag in tagged_words if tag.startswith('NN') or tag.startswith('VB')]
    # # Rejoin words into text
    # processed_text = ' '.join(filtered_words)
    # return processed_text

    # # Stemming
    # stemmed_words = [stemmer.stem(word) for word in words] #probably not
    # # Join words back into a single string
    # processed_text = ' '.join(stemmed_words)
    # return processed_text


# def correct_spelling(text):
#     blob = TextBlob(text)
#     corrected_text = str(blob.correct())
#     return corrected_text


# Spelling correction
# train_data['Text'] = train_data['Text'].apply(correct_spelling)
# test_data['Text'] = test_data['Text'].apply(correct_spelling)
# val_data['Text'] = val_data['Text'].apply(correct_spelling)


* **Performs EDA after preprocessing**

In [None]:
def _plot_frequency_bars(pairs, title, ylabel, tight_layout=False):
    """Draw a horizontal bar chart for (label, frequency) pairs in the given order."""
    plt.figure(figsize=(10, 6))
    # The labels are wrapped in a pandas Series to avoid seaborn warnings
    sns.barplot(
        x=[freq for _, freq in pairs],
        y=pd.Series([label for label, _ in pairs])
    )
    plt.title(title)
    plt.xlabel('Frequency')
    plt.ylabel(ylabel)
    if tight_layout:
        plt.tight_layout()  # Prevents label cutoff
    plt.show()


def _top_ngrams(texts, n, top_k=20):
    """Return up to ``top_k`` (ngram, count) pairs for n-grams found in at least two texts.

    Returns an empty list when CountVectorizer raises ValueError, e.g. because no
    n-gram satisfies ``min_df=2`` and the vocabulary would be empty.
    """
    vectorizer = CountVectorizer(
        ngram_range=(n, n),        # Only n-grams of exactly this size
        token_pattern=r'\b\w+\b',  # Same word tokenization as the word-frequency step
        min_df=2                   # Ignore n-grams appearing in only one text
    )
    try:
        counts = vectorizer.fit_transform(texts)
    except ValueError:
        return []
    totals = counts.sum(axis=0).A1  # Convert to 1D array
    return sorted(
        zip(vectorizer.get_feature_names_out(), totals),
        key=lambda pair: pair[1],
        reverse=True
    )[:top_k]


def visualize_preprocessing_effects(original_texts, preprocessed_texts, sample_size=5):
    """
    Compares original and preprocessed texts through multiple visualizations.

    Shows, in order: a side-by-side table of the first ``sample_size`` texts, a
    boxplot of word counts before/after, a word cloud of the processed texts,
    and bar charts of the 20 most frequent words, bigrams and trigrams of the
    processed texts. An n-gram chart is skipped (with a printed notice) when no
    n-gram occurs in at least two texts.

    Args:
        original_texts: Sequence of raw strings.
        preprocessed_texts: Sequence of processed strings, aligned with
            ``original_texts``.
        sample_size: Number of leading examples shown in the comparison table.
    """
    # Sample comparison - raw vs processed text examples
    df_comparison = pd.DataFrame({
        'Original': original_texts[:sample_size],
        'Processed': preprocessed_texts[:sample_size]
    })
    print("\n--- Text samples before and after preprocessing ---")
    display(df_comparison)

    # Text length comparison - shows if preprocessing significantly changes word counts
    plt.figure(figsize=(10, 5))
    lengths = pd.DataFrame({
        'Original': [len(text.split()) for text in original_texts],
        'Processed': [len(text.split()) for text in preprocessed_texts]
    })
    sns.boxplot(data=lengths)
    plt.title('Text Length Comparison (in words)')
    plt.ylabel('Word Count')
    plt.show()

    # Processed text word cloud - most prominent terms after preprocessing
    text_processed = ' '.join(preprocessed_texts)
    wordcloud = WordCloud(
        width=800,
        height=400,
        background_color='white',
        max_words=200  # Limits number of words shown for better readability
    ).generate(text_processed)

    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.title('Processed Text Word Cloud')
    plt.show()

    # Most frequent words: lowercase \w-runs of the joined processed text
    words_processed = re.findall(r'\b\w+\b', text_processed.lower())
    word_freq_processed = Counter(words_processed).most_common(20)
    _plot_frequency_bars(
        word_freq_processed,
        'Top 20 Most Frequent Words (Processed)',
        'Words'
    )

    # Bigram and trigram analysis - common phrases in the processed text
    for n, plural in ((2, 'Bigrams'), (3, 'Trigrams')):
        top_items = _top_ngrams(preprocessed_texts, n)
        if not top_items:
            print(f"No {plural.lower()} occur in at least two texts; skipping the plot.")
            continue
        _plot_frequency_bars(
            top_items,
            f'Top 20 Most Frequent {plural} (Processed)',
            plural,
            tight_layout=True
        )


# Preview the preprocessing on a reproducible random sample of training tweets.
# The sample size is capped at the number of available rows because
# Series.sample() raises ValueError when asked for more rows than exist
# (sampling is without replacement).
sample_texts = train_data['Text'].sample(min(1000, len(train_data)), random_state=42).tolist()
processed_samples = [preprocess_text(text) for text in sample_texts]
visualize_preprocessing_effects(sample_texts, processed_samples)

* **Builds a TF-IDF + Logistic Regression classification pipeline and evaluates its performance on the validation set.**

In [None]:
def preprocess_documents(docs):
    """Apply preprocess_text to every document and return the results as a list."""
    return [preprocess_text(doc) for doc in docs]


# A named module-level function is used instead of a lambda: a FunctionTransformer
# that wraps a lambda cannot be pickled, which prevents saving the fitted pipeline.
text_preprocessor = FunctionTransformer(preprocess_documents)

# Text cleaning -> TF-IDF over 1- to 3-grams -> L2-regularised logistic regression
best_model = Pipeline([
    ('preprocessor', text_preprocessor),
    ('tfidf', TfidfVectorizer(
        min_df=8,             # drop n-grams found in fewer than 8 documents
        max_df=0.3,           # drop n-grams found in more than 30% of documents
        ngram_range=(1, 3),
        stop_words=None,
        sublinear_tf=True     # use 1 + log(tf) instead of raw term frequency
    )),
    ('model', LogisticRegression(
        C=1.0,
        penalty='l2',
        solver='lbfgs',
        max_iter=300,
        # NOTE(review): `multi_class` is deprecated in recent scikit-learn
        # releases - confirm against the installed version.
        multi_class='ovr',
        random_state=42
    ))
])

# Fit the fixed-hyperparameter pipeline on the training split (no grid search here)
best_model.fit(train_data['Text'], train_data['Label'])

# Score the validation split: accuracy plus macro-averaged precision/recall/F1
y_val_true = val_data['Label']
y_val_pred = best_model.predict(val_data['Text'])
print("Validation Metrics:")
print("------------------")
print(f"Accuracy: {accuracy_score(y_val_true, y_val_pred):.6f}")
for metric_name, metric_fn in (
    ("Precision", precision_score),
    ("Recall", recall_score),
    ("F1-Score", f1_score),
):
    print(f"{metric_name} (Macro): {metric_fn(y_val_true, y_val_pred, average='macro'):.4f}")

# Score the training split too; a large gap to validation accuracy signals overfitting
y_train_pred = best_model.predict(train_data['Text'])
print("Train Metrics:")
print("------------------")
print(f"Accuracy: {accuracy_score(train_data['Label'], y_train_pred):.6f}")


* **Experiments with grid search in comments below**

In [None]:
# # Set random seed for reproducibility
# np.random.seed(42)

# # Initialize text preprocessor
# text_preprocessor = FunctionTransformer(lambda docs: [preprocess_text(doc) for doc in docs])

# # Define main processing pipeline
# pipeline = Pipeline([
#     ('preprocessor', text_preprocessor),  # Custom text preprocessing
#     ('tfidf', TfidfVectorizer()),        # TF-IDF vectorization
#     ('model', LogisticRegression(random_state=42))  # Classification model
# ])


# # Parameters for Grid Search optimization
# param_grid = {
#     # TF-IDF parameters
#     'tfidf__min_df': [8],                # Minimum document frequency (ignore rare words)
#     'tfidf__max_df': [0.3],              # Maximum document frequency (ignore common words)
#     'tfidf__ngram_range': [(1,3)],       # Use unigrams, bigrams and trigrams
#     'tfidf__stop_words': [None],         # No additional stopword removal
#     'tfidf__sublinear_tf': [True],       # Apply sublinear TF scaling
    
#     # Logistic Regression parameters
#     'model__C': [0.4],                   # Inverse regularization strength
#     'model__penalty': ['l2'],            # L2 regularization
#     'model__solver': ['saga'],           # Optimization algorithm
#     'model__max_iter': [200],            # Maximum iterations
#     'model__multi_class': ['multinomial'],  # Multiclass strategy
#     'model__class_weight': [None]        # No class weighting
# }

# # Define evaluation metrics for Grid Search
# scoring = {
#     'accuracy': 'accuracy',
#     'precision': 'precision_macro',
#     'recall': 'recall_macro',
#     'f1': 'f1_macro'
# }

# # Configure Grid Search with 5-fold cross-validation
# grid_search = GridSearchCV(
#     pipeline, 
#     param_grid, 
#     cv=5,                  # 5-fold cross-validation or 3 cross-validation (faster)
#     scoring=scoring,       # Multiple evaluation metrics
#     refit='accuracy',      # Refit best model on accuracy
#     n_jobs=-1             # Use all available CPU cores
# )

# # Execute Grid Search
# grid_search.fit(train_data['Text'], train_data['Label'])

# # Retrieve and display results
# results_df = pd.DataFrame(grid_search.cv_results_)
# print(results_df[[
#     'mean_test_accuracy', 
#     'mean_test_precision', 
#     'mean_test_recall', 
#     'mean_test_f1'
# ]])

# # Get best performing model
# best_model = grid_search.best_estimator_

# # Display training information
# n_iterations = best_model.named_steps['model'].n_iter_
# print(f"Number of iterations completed: {n_iterations}")

# # Print best parameters and performance
# print(f"Best parameters: {grid_search.best_params_}")
# print("Best validation accuracy:", grid_search.best_score_)

* **Learning Curve**

In [None]:

# Transform the data with the already fitted pipeline steps. The text must go
# through the fitted 'preprocessor' step before the TF-IDF step: the vectorizer's
# vocabulary was learned from preprocessed text, so feeding it raw tweets would
# produce features that do not match what the classifier was trained on.
fitted_steps = best_model.named_steps
X_transformed = fitted_steps['tfidf'].transform(
    fitted_steps['preprocessor'].transform(train_data['Text'])
)

# Learning Curve ONLY for the classifier (without re-running tfidf).
# Note: the TF-IDF vocabulary/IDF weights were fitted on the full training split,
# so the cross-validation folds below share that information.
train_sizes, train_scores, val_scores = learning_curve(
    best_model.named_steps['model'], X_transformed, train_data['Label'], cv=2, scoring="accuracy",
    train_sizes=np.linspace(0.1, 1.0, 10)
)
# Calculate mean and standard deviation across the CV folds
train_mean = np.mean(train_scores, axis=1)
train_std = np.std(train_scores, axis=1)

val_mean = np.mean(val_scores, axis=1)
val_std = np.std(val_scores, axis=1)

# Learning Curve plot (shaded band = +/- one standard deviation)
plt.plot(train_sizes, train_mean, 'o-', color="r", label="Training score")
plt.fill_between(train_sizes, train_mean - train_std, train_mean + train_std, alpha=0.1, color="r")

plt.plot(train_sizes, val_mean, 'o-', color="b", label="Cross-validation score")
plt.fill_between(train_sizes, val_mean - val_std, val_mean + val_std, alpha=0.1, color="b")

# Add the full pipeline's validation accuracy as a horizontal reference line
y_val_pred = best_model.predict(val_data['Text'])
final_accuracy = accuracy_score(val_data['Label'], y_val_pred)
plt.axhline(y=final_accuracy, color='green', linestyle='--',
            label=f'Final Val Accuracy: {final_accuracy:.4f}')

plt.title("Learning Curve")
plt.xlabel("Training examples")
plt.ylabel("Score")
plt.legend(loc="best")
plt.show()

* **ROC curve**

In [None]:

# ROC Curve Plot
# Use a dedicated figure. plt.subplot(1, 2, 2) would draw the curve in the right
# half of a two-column grid whose left half is never filled.
plt.figure(figsize=(8, 6))

# Get scores for the validation set.
# NOTE(review): column 1 of predict_proba is used as the positive-class score,
# which assumes a binary problem - confirm against the label set.
if hasattr(best_model.named_steps['model'], 'predict_proba'):
    y_probs = best_model.predict_proba(val_data['Text'])[:, 1]
else:  # For models without predict_proba (like SVM), use decision function
    # roc_curve only depends on the ranking of the scores, so the raw decision
    # values are used directly; min-max scaling is unnecessary and would divide
    # by zero when all scores are equal.
    y_scores = best_model.decision_function(val_data['Text'])
    y_probs = y_scores

# Calculate ROC curve
fpr, tpr, thresholds = roc_curve(val_data['Label'], y_probs)
roc_auc = auc(fpr, tpr)

# Plot ROC curve together with the chance diagonal
plt.plot(fpr, tpr, color='darkorange', lw=2,
         label=f'ROC curve (AUC = {roc_auc:.2f})')
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic')
plt.legend(loc="lower right")

plt.tight_layout()
plt.show()

* **Generates submission file from test predictions and saves it as CSV**

In [None]:
# Predict a label for every test tweet with the fitted pipeline
test_predictions = best_model.predict(test_data['Text'])

# Two-column submission table: the test IDs followed by the predicted labels
submission = test_data[['ID']].assign(Label=test_predictions)
submission.to_csv('submission.csv', index=False)

print("Submission file created successfully!")
print(submission.head())