In [None]:
# Mount Google Drive into the Colab runtime at /content/drive.
# Later cells write the trained model and vectorizer under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Helper Function for Text Cleaning:

Implement a helper function as described in the Text Preprocessing notebook, and complete the following pipeline.

# Build a Text Cleaning Pipeline

In [None]:
def text_cleaning_pipeline(dataset, rule = "lemmatize"):
  """
  Clean one raw text string and return it as a space-separated token string.

  Steps, in order: lowercase, strip URLs, strip emojis, strip mentions and
  every remaining non-letter character, split on whitespace, drop English
  stopwords, then lemmatize or stem each surviving token.

  Args:
    dataset (str): Raw text to clean.
    rule (str): "lemmatize" (default) or "stem". Any other value prints a
      hint and returns the stopword-filtered tokens without normalization.

  Returns:
    str: Cleaned tokens joined by single spaces, or "" when nothing
      survives cleaning.

  Note:
    Non-empty input needs the NLTK "stopwords" corpus, plus "wordnet" when
    rule is "lemmatize"; download them before calling.
  """
  import re

  # Convert the input to small/lower order.
  data = dataset.lower()
  # Remove URLs
  data = re.sub(r'https?://\S+|www\.\S+', ' ', data)
  # Remove emojis
  data = re.sub(
      '[\U0001F1E0-\U0001F1FF\U0001F300-\U0001FAFF\u2600-\u27BF\uFE0F\u200D]+',
      ' ',
      data,
  )
  # Remove all other unwanted characters.
  # Mentions go first so the handle itself is dropped, not just the "@".
  data = re.sub(r'@\w+', ' ', data)
  # Replace (rather than delete) so words separated only by punctuation
  # do not get glued together.
  data = re.sub(r'[^a-z\s]', ' ', data)
  # Create tokens.
  tokens = data.split()
  if not tokens:
    # Nothing left to normalize; skip the NLTK work entirely.
    return ""

  # Imported lazily: this cell runs before the notebook's import cell, and
  # empty input never needs NLTK at all.
  from nltk.corpus import stopwords
  from nltk.stem import PorterStemmer, WordNetLemmatizer

  # Remove stopwords:
  # The stopword list is cached on the function so it is read from disk once,
  # not once per cleaned document.
  stop_words = getattr(text_cleaning_pipeline, "_stop_words", None)
  if stop_words is None:
    stop_words = frozenset(stopwords.words('english'))
    text_cleaning_pipeline._stop_words = stop_words
  tokens = [token for token in tokens if token not in stop_words]
  if rule == "lemmatize":
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
  elif rule == "stem":
    stemmer = PorterStemmer()
    tokens = [stemmer.stem(token) for token in tokens]
  else:
    print("Pick between lemmatize or stem")


  return " ".join(tokens)


# Text Classification using Machine Learning Models


### 📝 Instructions: Trump Tweet Sentiment Classification

1. **Load the Dataset**  
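   Load the dataset named `"trump_tweet_sentiment_analysis.csv"` using `pandas`. Ensure the dataset contains at least two columns: `"text"` and `"label"`. (In the copy loaded in this notebook, the file is named `trum_tweet_sentiment_analysis.csv` and the label column is called `Sentiment`.)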

2. **Text Cleaning and Tokenization**  
   Apply a text preprocessing pipeline to the `"text"` column. This should include:
   - Lowercasing the text  
   - Removing URLs, mentions, punctuation, and special characters  
   - Removing stopwords  
   - Tokenization (optional: stemming or lemmatization)
   - Complete the `text_cleaning_pipeline` function defined above

3. **Train-Test Split**  
   Split the cleaned and tokenized dataset into **training** and **testing** sets using `train_test_split` from `sklearn.model_selection`.

4. **TF-IDF Vectorization**  
   Import and use the `TfidfVectorizer` from `sklearn.feature_extraction.text` to transform the training and testing texts into numerical feature vectors.

5. **Model Training and Evaluation**  
   Import **Logistic Regression** (or any machine learning model of your choice) from `sklearn.linear_model`. Train it on the TF-IDF-embedded training data, then evaluate it using the test set.  
   - Print the **classification report** using `classification_report` from `sklearn.metrics`.


In [None]:
# Install required libraries (run if not already installed)
!pip install pandas numpy scikit-learn nltk

# Import libraries
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import string
import joblib
import os

# Download NLTK resources with verification
# Sub-directories of an nltk_data folder in which a package may be installed.
_NLTK_DATA_CATEGORIES = (
    'tokenizers', 'corpora', 'taggers', 'chunkers', 'grammars',
    'misc', 'models', 'sentiment', 'stemmers', 'help',
)


def _nltk_resource_is_installed(resource_name):
    """Return True if `resource_name` can be located on the NLTK data path."""
    # nltk.data.find expects "<category>/<package>"; a bare package id such as
    # "punkt" never matches, so probe every known category for bare ids.
    if '/' in resource_name:
        candidates = [resource_name]
    else:
        candidates = [f"{category}/{resource_name}" for category in _NLTK_DATA_CATEGORIES]
    for candidate in candidates:
        try:
            nltk.data.find(candidate)
        except LookupError:
            continue
        return True
    return False


def download_nltk_resource(resource_name):
    """
    Make sure an NLTK data package is available, downloading it if needed.

    Args:
        resource_name (str): Package id (e.g. 'punkt') or a full resource
            path (e.g. 'tokenizers/punkt').

    Raises:
        RuntimeError: If nltk.download reports that the download failed.
    """
    if _nltk_resource_is_installed(resource_name):
        print(f"{resource_name} is already downloaded.")
        return

    print(f"Downloading {resource_name}...")
    # nltk.download takes the package id only, so strip any category prefix.
    package_id = resource_name.rsplit('/', 1)[-1]
    if not nltk.download(package_id, quiet=False):
        raise RuntimeError(f"Failed to download {resource_name}. Please check your internet connection and try again.")
    print(f"{resource_name} downloaded successfully.")

# NLTK packages this notebook needs.
# The tagger is requested under its 'averaged_perceptron_tagger_eng' id,
# which replaced the earlier 'averaged_perceptron_tagger' entry in this list.
nltk_resources = [
    'punkt',
    'punkt_tab',
    'stopwords',
    'wordnet',
    'averaged_perceptron_tagger_eng',
]

# Fetch and verify the packages one at a time.
for resource in nltk_resources:
    download_nltk_resource(resource)

# Show the directories NLTK searches for its data.
print("\nNLTK Data Paths:", nltk.data.path, sep="\n")

Downloading punkt...
punkt downloaded successfully.
Downloading punkt_tab...
punkt_tab downloaded successfully.
Downloading stopwords...
stopwords downloaded successfully.
Downloading wordnet...
wordnet downloaded successfully.
Downloading averaged_perceptron_tagger_eng...
averaged_perceptron_tagger_eng downloaded successfully.

NLTK Data Paths:
['/root/nltk_data', '/usr/nltk_data', '/usr/share/nltk_data', '/usr/lib/nltk_data', '/usr/share/nltk_data', '/usr/local/share/nltk_data', '/usr/lib/nltk_data', '/usr/local/lib/nltk_data']


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.


In [None]:
# Define helper function for POS tagging
def get_wordnet_pos(treebank_tag):
    """
    Translate a Penn Treebank POS tag into a WordNet POS letter.

    Args:
        treebank_tag (str): Tag such as 'JJ', 'VBG', 'NN' or 'RB'.

    Returns:
        str: 'a' (adjective), 'v' (verb), 'r' (adverb) or 'n' (noun).
            Tags with any other prefix fall back to 'n'.
    """
    # Noun needs no entry of its own: it is also the fallback value.
    prefix_to_wordnet = (
        ('J', 'a'),  # Adjective
        ('V', 'v'),  # Verb
        ('R', 'r'),  # Adverb
    )
    for prefix, wordnet_pos in prefix_to_wordnet:
        if treebank_tag.startswith(prefix):
            return wordnet_pos
    return 'n'  # Noun, and the default for everything else

# Define the text cleaning pipeline
# Patterns are compiled once here rather than on every call; the pipeline is
# applied row by row to the whole dataset.
_URL_PATTERN = re.compile(r'http\S+|www\S+|https\S+', flags=re.MULTILINE)
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # flags
    "\U00002702-\U000027B0"
    "\U000024C2-\U0001F251"
    "]+", flags=re.UNICODE
)
_MENTION_HASHTAG_PATTERN = re.compile(r'@\w+|#\w+')
_NON_LETTER_PATTERN = re.compile(r'[^a-zA-Z\s]')
_VALID_RULES = ("lemmatize", "stem")

# Lazily filled with the stopword set, lemmatizer and stemmer (see below).
_PIPELINE_RESOURCES = {}


def _get_pipeline_resources():
    """Build the stopword set, lemmatizer and stemmer once and reuse them."""
    if not _PIPELINE_RESOURCES:
        _PIPELINE_RESOURCES["stop_words"] = frozenset(stopwords.words('english'))
        _PIPELINE_RESOURCES["lemmatizer"] = WordNetLemmatizer()
        _PIPELINE_RESOURCES["stemmer"] = PorterStemmer()
    return _PIPELINE_RESOURCES


def text_cleaning_pipeline(dataset, rule="lemmatize"):
    """
    Cleans text data by applying preprocessing steps:
    - Lowercasing
    - Removing URLs, emojis, mentions, hashtags, and special characters
    - Tokenizing
    - POS tagging
    - Removing stopwords
    - Applying lemmatization (with POS tagging) or stemming

    Removed spans are replaced by a space rather than deleted, so words that
    were separated only by punctuation ("don't", "a,b") are split apart
    instead of being fused into a new token.

    Args:
        dataset (str): Input text to be cleaned. Non-string values
            (e.g. None or NaN) are treated as empty text.
        rule (str): Either 'lemmatize' or 'stem' for normalization

    Returns:
        str: Cleaned and processed text. "" is returned for an unknown rule
            (after printing a hint), for non-string input, and when nothing
            is left after cleaning.
    """
    # Reject an unknown rule before doing any cleaning work.
    if rule not in _VALID_RULES:
        print("Pick between lemmatize or stem")
        return ""

    if not isinstance(dataset, str):
        return ""

    # Convert to lowercase
    data = dataset.lower()

    # Remove URLs
    data = _URL_PATTERN.sub(' ', data)

    # Remove emojis
    data = _EMOJI_PATTERN.sub(' ', data)

    # Remove mentions and hashtags (whole handle/tag, not just the symbol)
    data = _MENTION_HASHTAG_PATTERN.sub(' ', data)

    # Remove special characters, digits and punctuation
    data = _NON_LETTER_PATTERN.sub(' ', data)

    # Nothing left to tokenize: skip the NLTK work entirely.
    if not data.strip():
        return ""

    resources = _get_pipeline_resources()

    # Tokenize, then POS-tag before stopword removal so the tagger sees the
    # full sentence context.
    pos_tags = nltk.pos_tag(word_tokenize(data))

    # Remove stopwords, preserving POS tags
    stop_words = resources["stop_words"]
    filtered_tokens = [(token, pos) for token, pos in pos_tags if token not in stop_words]

    # Apply lemmatization or stemming
    if rule == "lemmatize":
        lemmatize = resources["lemmatizer"].lemmatize
        tokens = [lemmatize(token, pos=get_wordnet_pos(pos)) for token, pos in filtered_tokens]
    else:  # rule == "stem"; validated above
        stem = resources["stemmer"].stem
        tokens = [stem(token) for token, _ in filtered_tokens]

    # Join tokens back into a string
    return " ".join(tokens)

# Smoke-test the pipeline on one hand-written tweet.
sample_text = "I am loving this policy! https://example.com @user #Trump2024 😊"

# Debug view: tokens and POS tags of the merely lowercased text, before any
# cleaning has been applied.
tokens = word_tokenize(sample_text.lower())
pos_tags = nltk.pos_tag(tokens)
print(f"Tokens and POS Tags: {pos_tags}")

# Run the full pipeline and show input and output one above the other.
cleaned_text = text_cleaning_pipeline(sample_text, rule="lemmatize")
print(
    f"Original: {sample_text}",
    f"Cleaned: {cleaned_text}",
    sep="\n",
)

Tokens and POS Tags: [('i', 'NN'), ('am', 'VBP'), ('loving', 'VBG'), ('this', 'DT'), ('policy', 'NN'), ('!', '.'), ('https', 'NN'), (':', ':'), ('//example.com', 'NN'), ('@', 'NNP'), ('user', 'RB'), ('#', '#'), ('trump2024', 'NN'), ('😊', 'NN')]
Original: I am loving this policy! https://example.com @user #Trump2024 😊
Cleaned: love policy


In [None]:
# Read the tweet CSV into a DataFrame.
dataset_path = '/content/trum_tweet_sentiment_analysis.csv'  # Update with your actual file path
df = pd.read_csv(dataset_path)

# Quick look at the first rows and the column names.
print("Dataset Head:", df.head(), sep="\n")
print("\nColumns:", df.columns)

# Count missing values per column.
print("\nMissing Values:", df.isna().sum(), sep="\n")

# Keep only rows that have both a tweet and a label.
# The label column in this file is 'Sentiment' (not 'label').
df = df.dropna(subset=['text', 'Sentiment'])

# Class balance of the label column.
print("\nLabel Distribution:", df['Sentiment'].value_counts(), sep="\n")

Dataset Head:
                                                text  Sentiment
0  RT @JohnLeguizamo: #trump not draining swamp b...          0
1  ICYMI: Hackers Rig FM Radio Stations To Play A...          0
2  Trump protests: LGBTQ rally in New York https:...          1
3  "Hi I'm Piers Morgan. David Beckham is awful b...          0
4  RT @GlennFranco68: Tech Firm Suing BuzzFeed fo...          0

Columns: Index(['text', 'Sentiment'], dtype='object')

Missing Values:
text         0
Sentiment    0
dtype: int64

Label Distribution:
Sentiment
0    1244211
1     605912
Name: count, dtype: int64


In [None]:
# Clean every tweet; extra keyword arguments to Series.apply are forwarded
# to the pipeline function.
df['cleaned_text'] = df['text'].apply(text_cleaning_pipeline, rule="lemmatize")

# Show raw and cleaned text side by side for the first rows.
print("\nOriginal vs Cleaned Text:", df[['text', 'cleaned_text']].head(), sep="\n")

In [None]:
# Features are the cleaned tweets; labels come from the 'Sentiment' column.
X, y = df['cleaned_text'], df['Sentiment']

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Report how many rows ended up in each split.
print(f"Training set size: {len(X_train)}\nTesting set size: {len(X_test)}")


Training set size: 1480098
Testing set size: 370025


In [None]:
# TF-IDF features, capped at 5000 vocabulary terms.
tfidf_vectorizer = TfidfVectorizer(max_features=5000)

# Learn the vocabulary from the training texts only, then reuse it for the
# test texts so no test information leaks into the fit.
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Report the shape of both matrices.
print(
    f"Training TF-IDF shape: {X_train_tfidf.shape}",
    f"Testing TF-IDF shape: {X_test_tfidf.shape}",
    sep="\n",
)

Training TF-IDF shape: (1480098, 5000)
Testing TF-IDF shape: (370025, 5000)


In [None]:
# Logistic Regression classifier with the solver iteration cap set to 1000.
model = LogisticRegression(max_iter=1000)

# Fit on the TF-IDF training matrix; kept as the last expression so the
# notebook displays the fitted estimator.
model.fit(X=X_train_tfidf, y=y_train)

In [None]:
# Predict labels for the held-out tweets.
y_pred = model.predict(X_test_tfidf)

# Per-class precision / recall / F1 on the test set.
print("Classification Report:", classification_report(y_test, y_pred), sep="\n")

Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.95      0.94    248563
           1       0.90      0.86      0.88    121462

    accuracy                           0.92    370025
   macro avg       0.92      0.91      0.91    370025
weighted avg       0.92      0.92      0.92    370025



In [None]:
# Persist the trained classifier to the mounted Drive folder.
joblib.dump(value=model, filename='/content/drive/MyDrive/logistic_regression_model.pkl')

# Persist the fitted vectorizer next to it; inference needs the same vocabulary.
joblib.dump(value=tfidf_vectorizer, filename='/content/drive/MyDrive/tfidf_vectorizer.pkl')

['/content/drive/MyDrive/tfidf_vectorizer.pkl']

In [None]:
# End-to-end check on one hand-written tweet: clean -> vectorize -> predict.
sample_tweet = "I love this amazing policy! #Trump2024 😊"
cleaned_tweet = text_cleaning_pipeline(sample_tweet, rule="lemmatize")
# transform() expects an iterable of documents, hence the one-element list.
tfidf_tweet = tfidf_vectorizer.transform([cleaned_tweet])
prediction = model.predict(tfidf_tweet)

# Show the input, its cleaned form and the predicted label.
print(
    f"Sample Tweet: {sample_tweet}",
    f"Cleaned Tweet: {cleaned_tweet}",
    f"Predicted Sentiment: {prediction[0]}",
    sep="\n",
)

Sample Tweet: I love this amazing policy! #Trump2024 😊
Cleaned Tweet: love amazing policy
Predicted Sentiment: 1
