In [1]:
import re
import joblib
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, roc_curve, auc
import matplotlib.pyplot as plt
from scipy.sparse import hstack
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from textblob import TextBlob

In [3]:
# Function definitions for cleaning and processing tweets
def clean_text(text):
    """Strip Twitter-specific noise from a raw tweet.

    The following are removed, in this order:

    1. ``@`` mentions (letters, digits and underscores after the ``@``),
    2. links starting with ``http`` or ``www``,
    3. the ``#`` symbol (the hashtag word itself is kept),
    4. runs of digits,
    5. any remaining character that is neither a word character nor
       whitespace.

    Whitespace is not collapsed, so removed tokens can leave consecutive
    spaces behind.

    Args:
        text: Raw tweet text as a ``str``.

    Returns:
        The cleaned string.

    Raises:
        TypeError: If ``text`` is not a string (raised by ``re.sub``).
    """
    # Handles may contain underscores; without '_' in the class, '@user_name'
    # would leave '_name' behind, and '_' survives the punctuation pass below
    # because it counts as a word character.
    text = re.sub(r'@[A-Za-z0-9_]+', '', text)
    # 'http\S+' already matches 'https...' links, so no separate alternative.
    text = re.sub(r'http\S+|www\S+', '', text)
    text = re.sub(r'#', '', text)  # Drop only the '#' marker, keep the tag word
    text = re.sub(r'\d+', '', text)  # Remove numbers
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    return text

In [4]:
def get_sentiment(content):
    """Return the TextBlob sentiment polarity for a piece of tweet text."""
    blob = TextBlob(content)
    polarity = blob.sentiment.polarity
    return polarity

In [5]:
# Stop-word sets keyed by language, filled on first use so the NLTK corpus is
# only touched once per kernel/session rather than once per token.
_STOPWORD_CACHE = {}


def _english_stopwords():
    """Return the NLTK English stop words as a frozenset, loading them once."""
    if 'english' not in _STOPWORD_CACHE:
        _STOPWORD_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOPWORD_CACHE['english']


def lemmatize_text(content):
    """Normalise text to lower-case, stop-word-free, lemmatized tokens.

    Every character outside ``a-z``/``A-Z`` is replaced by a space, the text
    is lower-cased and split on whitespace, English stop words are dropped,
    and each remaining token is passed through ``WordNetLemmatizer``.

    Args:
        content: Text to process (``str``).

    Returns:
        The surviving lemmatized tokens joined by single spaces; an empty
        string when no token survives.
    """
    lemmatizer = WordNetLemmatizer()
    # The stop-word list used to be re-evaluated for every token and searched
    # as a list; a cached frozenset gives the same membership result.
    stop_words = _english_stopwords()
    tokens = re.sub('[^a-zA-Z]', ' ', content).lower().split()
    return ' '.join(
        lemmatizer.lemmatize(token) for token in tokens if token not in stop_words
    )

In [6]:
def load_data(file_path):
    """Read a CSV into a DataFrame, replacing missing values with a single space."""
    raw_frame = pd.read_csv(file_path)
    filled_frame = raw_frame.fillna(' ')
    return filled_frame

In [7]:
def print_metrics(y_true, y_pred, dataset_name):
    """Report accuracy, precision, recall and F1 for one dataset split.

    All four scores are computed from ``y_true`` and ``y_pred`` first and
    then printed to four decimal places under a header naming the split.
    """
    scorers = (accuracy_score, precision_score, recall_score, f1_score)
    accuracy, precision, recall, f1 = [scorer(y_true, y_pred) for scorer in scorers]

    print(f"Metrics for {dataset_name} Set:")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}\n")

In [None]:
def _plot_confusion_matrix(y_true, y_pred, title):
    """Show a confusion-matrix heatmap for one dataset split."""
    matrix = confusion_matrix(y_true, y_pred)
    ConfusionMatrixDisplay(confusion_matrix=matrix).plot(cmap=plt.cm.Blues)
    plt.title(title)
    plt.show()


def _plot_roc_curve(y_true, positive_scores, title, color):
    """Show the ROC curve for one dataset split, with its AUC in the legend."""
    fpr, tpr, _ = roc_curve(y_true, positive_scores)
    roc_auc = auc(fpr, tpr)

    plt.figure(figsize=(8, 5))
    plt.plot(fpr, tpr, color=color, lw=2, label='ROC curve (area = {:.2f})'.format(roc_auc))
    plt.plot([0, 1], [0, 1], color='red', lw=2, linestyle='--')  # chance diagonal
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(title)
    plt.legend(loc='lower right')
    plt.show()


def main(data_path='Twitterdataset.csv'):
    """Train, evaluate and persist the tweet logistic-regression classifier.

    Steps:
        1. Read the CSV and derive the cleaned text, sentiment polarity,
           lemmatized text and word-count features from the ``tweet`` column.
        2. Split rows into stratified train/test sets (80/20, seed 42).
        3. Fit the TF-IDF vectorizer and the scaler on the training rows only
           and apply them to the test rows.
        4. Grid-search ``LogisticRegression`` hyper-parameters with 5-fold CV.
        5. Print metrics, show confusion matrices and ROC curves for both
           splits.
        6. Save the best model, the vectorizer and the scaler with joblib.

    Args:
        data_path: Path of the CSV to load. It must provide the ``tweet`` and
            ``BinaryNumTarget`` columns. Defaults to ``'Twitterdataset.csv'``.
    """
    # Load and preprocess data
    twitter_df = pd.read_csv(data_path)
    # Missing tweets are read as NaN (a float), which the regex-based cleaner
    # cannot handle; treat them as blank text before any text processing.
    twitter_df['tweet'] = twitter_df['tweet'].fillna(' ').astype(str)
    twitter_df['cleaned_tweet'] = twitter_df['tweet'].apply(clean_text)
    twitter_df['sentiment'] = twitter_df['cleaned_tweet'].apply(get_sentiment)
    twitter_df['processed_content'] = twitter_df['cleaned_tweet'].apply(lemmatize_text)
    twitter_df['tweet_length'] = twitter_df['cleaned_tweet'].apply(lambda x: len(x.split()))

    # Prepare features and target variable
    numeric_cols = ['sentiment', 'tweet_length']
    X = twitter_df[['processed_content'] + numeric_cols]
    y = twitter_df['BinaryNumTarget']

    # Train-test split comes first so that nothing fitted below ever sees the
    # held-out rows (fitting TF-IDF / scaling on all rows leaks test data).
    X_train_df, X_test_df, Y_train, Y_test = train_test_split(
        X, y, test_size=0.2, stratify=y, random_state=42
    )

    # Feature extraction: vocabulary and IDF weights come from training rows only
    vector = TfidfVectorizer(max_features=5000)
    train_text = vector.fit_transform(X_train_df['processed_content'])
    test_text = vector.transform(X_test_df['processed_content'])
    X_train_raw = hstack((train_text, X_train_df[numeric_cols].values), format='csr')
    X_test_raw = hstack((test_text, X_test_df[numeric_cols].values), format='csr')

    # Scale the features (with_mean=False keeps the matrix sparse)
    scaler = StandardScaler(with_mean=False)
    X_train = scaler.fit_transform(X_train_raw)
    X_test = scaler.transform(X_test_raw)

    # Hyperparameter tuning with GridSearchCV
    # NOTE(review): the vectorizer and scaler are fitted on the whole training
    # split, so the CV folds below still share those statistics; only the
    # held-out test set is fully isolated.
    param_grid = {
        'C': [0.1, 1, 10, 100],
        'solver': ['liblinear'],  # Focus on solvers that typically converge better
        'max_iter': [1000, 5000]  # Increase max_iter to allow for more iterations
    }

    grid_search = GridSearchCV(LogisticRegression(), param_grid, cv=5, scoring='accuracy')
    grid_search.fit(X_train, Y_train)

    # Best parameters
    print("Best parameters found: ", grid_search.best_params_)

    # Get the best model
    best_model = grid_search.best_estimator_

    # Predictions and metrics for training and test sets
    logistic_train_pred = best_model.predict(X_train)
    logistic_test_pred = best_model.predict(X_test)

    print_metrics(Y_train, logistic_train_pred, "Training")
    print_metrics(Y_test, logistic_test_pred, "Test")

    # Confusion matrices
    _plot_confusion_matrix(Y_train, logistic_train_pred, "Confusion Matrix - Training Set")
    _plot_confusion_matrix(Y_test, logistic_test_pred, "Confusion Matrix - Test Set")

    # ROC curves, built from the predicted probability of the positive class
    logistic_train_prob = best_model.predict_proba(X_train)[:, 1]
    _plot_roc_curve(
        Y_train,
        logistic_train_prob,
        'Receiver Operating Characteristic (ROC) Curve - Training Set',
        'green',
    )

    logistic_test_prob = best_model.predict_proba(X_test)[:, 1]
    _plot_roc_curve(
        Y_test,
        logistic_test_prob,
        'Receiver Operating Characteristic (ROC) Curve - Test Set',
        'blue',
    )

    # Save the best model, vectorizer, and scaler
    joblib.dump(best_model, "Best_Logistic_Regression_Model.pkl")
    joblib.dump(vector, "Vectorizer.pkl")
    joblib.dump(scaler, "Scaler.pkl")
    print("Best model, vectorizer, and scaler are saved.")

# Entry point: run the whole train/evaluate/save pipeline when this code is
# executed directly. In a notebook kernel __name__ is also "__main__", so
# running this cell starts the full pipeline.
if __name__ == "__main__":
    main()