In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import cross_val_score
# Load the cleaned dataset
file_path = './cleaned_tweets.csv'
tweets_data = pd.read_csv(file_path)

# Split the data into training and testing sets
train_data, test_data, train_labels, test_labels = train_test_split(
    tweets_data['cleaned_text'], 
    tweets_data['airline_sentiment'], 
    test_size=0.1, 
    random_state=3, 
    stratify=tweets_data['airline_sentiment']  # Ensure class distribution is preserved
)

# Print class distribution
print("Training set class distribution:")
print(train_labels.value_counts())
print("\nTest set class distribution:")
print(test_labels.value_counts())


# Initialize TF-IDF vectorizer
tfidf = TfidfVectorizer(max_features=5000)

# Fit and transform the training data
X_train = tfidf.fit_transform(train_data)
X_test = tfidf.transform(test_data)


# Initialize the model
svm_model = SGDClassifier(
    loss="hinge",
    penalty="l2",
    alpha=1e-4,
    max_iter=100,
    tol=None,
    shuffle=True,
    random_state=3
)

# Train the model
svm_model.fit(X_train, train_labels)


# Perform 10-fold cross-validation
cv_scores = cross_val_score(svm_model, X_train, train_labels, cv=10, scoring="accuracy")

# Calculate mean validation accuracy
validation_accuracy = cv_scores.mean()
print(f"10-Fold Cross-Validation Accuracy: {validation_accuracy:.4f}")
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Predict on test data
test_predictions = svm_model.predict(X_test)

# Calculate accuracy
test_accuracy = accuracy_score(test_labels, test_predictions)
print(f"Test Set Accuracy: {test_accuracy:.4f}")

# Print classification report
print("\nClassification Report:")
print(classification_report(test_labels, test_predictions))

# Print confusion matrix
print("\nConfusion Matrix:")
print(confusion_matrix(test_labels, test_predictions))


Training set class distribution:
negative    8170
neutral     2663
positive    1995
Name: airline_sentiment, dtype: int64

Test set class distribution:
negative    908
neutral     296
positive    222
Name: airline_sentiment, dtype: int64
10-Fold Cross-Validation Accuracy: 0.8009
Test Set Accuracy: 0.8114

Classification Report:
              precision    recall  f1-score   support

    negative       0.84      0.94      0.89       908
     neutral       0.68      0.51      0.58       296
    positive       0.81      0.69      0.75       222

    accuracy                           0.81      1426
   macro avg       0.78      0.71      0.74      1426
weighted avg       0.80      0.81      0.80      1426


Confusion Matrix:
[[851  44  13]
 [120 152  24]
 [ 40  28 154]]


# Optuna Find the Best Combination 

In [None]:
import optuna
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import re
from nltk.stem import WordNetLemmatizer
import pandas as pd

# Initialize lemmatizer and contractions
lemmatizer = WordNetLemmatizer()
contractions = {"don't": "do not", "can't": "cannot", "i'm": "i am"}

# Define a cleaning function with toggleable options
def clean_text_variant(
    text, remove_mentions=True, remove_urls=True, lemmatize=True, expand_contractions=True
):
    text = re.sub(r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b", "", text)  # Remove emails
    text = re.sub(r"\$\d+(?:\.\d{2})?", "", text)  # Remove currency
    text = re.sub(r"[^\w\s,]", "", text, flags=re.UNICODE)  # Remove emojis
    text = re.sub(r"&[a-z]+;", "", text)  # Remove HTML escaped chars
    text = re.sub(r"[^\w\s]", "", text)  # Remove punctuation
    text = re.sub(r"[\u4e00-\u9fff]", "", text)  # Remove Chinese characters
    if remove_mentions:
        text = re.sub(r"@\w+", "", text)  # Remove mentions
    if remove_urls:
        text = re.sub(r"http[s]?://\S+", "", text)  # Remove URLs
    if expand_contractions:
        text = " ".join([contractions[word] if word in contractions else word for word in text.split()])
    text = re.sub(r"(.)\1{2,}", r"\1", text)  # Remove repeated characters
    text = " ".join([word for word in text.split() if 2 <= len(word) <= 15])  # Limit word length
    if lemmatize:
        text = " ".join([lemmatizer.lemmatize(word) for word in text.split()])  # Lemmatize
    text = re.sub(r"\s+", " ", text).strip()  # Remove extra spaces
    return text

# Load dataset
file_path = './cleaned_tweets.csv'
tweets_data = pd.read_csv(file_path)

# Store detailed results for all combinations
detailed_results = []

# Define the optimization objective
def objective(trial):
    # Suggest cleaning steps
    remove_mentions = trial.suggest_categorical("remove_mentions", [True, False])
    remove_urls = trial.suggest_categorical("remove_urls", [True, False])
    lemmatize = trial.suggest_categorical("lemmatize", [True, False])
    expand_contractions = trial.suggest_categorical("expand_contractions", [True, False])
    
    # Apply cleaning with the suggested configuration
    tweets_data['variant_cleaned_text'] = tweets_data['text'].apply(
        lambda x: clean_text_variant(
            x,
            remove_mentions=remove_mentions,
            remove_urls=remove_urls,
            lemmatize=lemmatize,
            expand_contractions=expand_contractions,
        )
    )
    
    # TF-IDF vectorization
    tfidf = TfidfVectorizer(max_features=5000)
    X = tfidf.fit_transform(tweets_data['variant_cleaned_text'])
    y = tweets_data['airline_sentiment']
    
    # Split data into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.1, random_state=3, stratify=y
    )
    
    # Train a model
    svm_model = SGDClassifier(
        loss="hinge", penalty="l2", alpha=1e-4, max_iter=100, tol=None, shuffle=True, random_state=3
    )
    svm_model.fit(X_train, y_train)
    
    # Perform 10-fold cross-validation
    cv_scores = cross_val_score(svm_model, X_train, y_train, cv=10, scoring="accuracy")
    mean_accuracy = cv_scores.mean()
    
    # Evaluate on the test set
    y_pred = svm_model.predict(X_test)
    test_accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred, output_dict=True)
    confusion = confusion_matrix(y_test, y_pred)
    
    # Store detailed results
    detailed_results.append({
        "trial_params": trial.params,
        "cross_val_accuracy": mean_accuracy,
        "test_accuracy": test_accuracy,
        "classification_report": report,
        "confusion_matrix": confusion,
    })
    
    return mean_accuracy

# Use Optuna to optimize
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=256, n_jobs=-1)

# Display the best results
best_trial = study.best_trial
best_params = best_trial.params

print("\nBest Cleaning Steps:")
print(best_params)
print(f"Best Cross-Validation Accuracy: {study.best_value:.4f}")

# Extract the best trial's detailed results
best_result = next(
    res for res in detailed_results if res["trial_params"] == best_params
)
print("\nBest Test Set Accuracy:")
print(best_result["test_accuracy"])
print("\nBest Classification Report:")
print(pd.DataFrame(best_result["classification_report"]).T)
print("\nBest Confusion Matrix:")
print(best_result["confusion_matrix"])


# Best Combination 

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import pandas as pd
import re
from nltk.stem import WordNetLemmatizer

# Initialize lemmatizer and contractions
lemmatizer = WordNetLemmatizer()
contractions = {"don't": "do not", "can't": "cannot", "i'm": "i am"}

# Define the cleaning function with the best parameters
def clean_text_variant(
    text, remove_mentions=False, remove_urls=True, lemmatize=True, expand_contractions=False
):
    if remove_mentions:
        text = re.sub(r"@\w+", "", text)  # Remove mentions
    if remove_urls:
        text = re.sub(r"http[s]?://\S+", "", text)  # Remove URLs
    text = re.sub(r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b", "", text)  # Remove emails
    text = re.sub(r"\$\d+(?:\.\d{2})?", "", text)  # Remove currency
    text = re.sub(r"[^\w\s,]", "", text, flags=re.UNICODE)  # Remove emojis
    text = re.sub(r"&[a-z]+;", "", text)  # Remove HTML escaped chars
    text = re.sub(r"[^\w\s]", "", text)  # Remove punctuation
    text = re.sub(r"[\u4e00-\u9fff]", "", text)  # Remove Chinese characters
    if expand_contractions:
        text = " ".join([contractions[word] if word in contractions else word for word in text.split()])
    text = re.sub(r"(.)\1{2,}", r"\1", text)  # Remove repeated characters
    text = " ".join([word for word in text.split() if 2 <= len(word) <= 15])  # Limit word length
    if lemmatize:
        text = " ".join([lemmatizer.lemmatize(word) for word in text.split()])  # Lemmatize
    text = re.sub(r"\s+", " ", text).strip()  # Remove extra spaces
    return text

# Load dataset
file_path = './cleaned_tweets.csv'
tweets_data = pd.read_csv(file_path)

# Apply the best cleaning steps
tweets_data['cleaned_text'] = tweets_data['text'].apply(clean_text_variant)

# TF-IDF vectorization
tfidf = TfidfVectorizer(max_features=5000)
X = tfidf.fit_transform(tweets_data['cleaned_text'])
y = tweets_data['airline_sentiment']

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=3, stratify=y
)

# Train a model
svm_model = SGDClassifier(
    loss="hinge", penalty="l2", alpha=1e-4, max_iter=100, tol=None, shuffle=True, random_state=3
)
svm_model.fit(X_train, y_train)

# Evaluate on the test set
y_pred = svm_model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred, output_dict=False)
confusion = confusion_matrix(y_test, y_pred)

# Output results
print(f"Test Set Accuracy: {test_accuracy:.6f}\n")
print("Classification Report:")
print(report)
print("\nConfusion Matrix:")
print(confusion)


Test Set Accuracy: 0.809958

Classification Report:
              precision    recall  f1-score   support

    negative       0.84      0.94      0.89       908
     neutral       0.67      0.51      0.58       296
    positive       0.81      0.68      0.74       222

    accuracy                           0.81      1426
   macro avg       0.77      0.71      0.73      1426
weighted avg       0.80      0.81      0.80      1426


Confusion Matrix:
[[853  44  11]
 [119 152  25]
 [ 41  31 150]]
