#### Step 1: Import Libraries and Define Paths



In [56]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import StandardScaler
import os
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import contractions
import logging

# Setup for NLTK: download the 'punkt', 'stopwords' and 'wordnet' resources
# that the tokenization, stop-word and lemmatization steps below rely on
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Set paths for raw and processed data (relative to the current working directory)
data_path = "../data/"
raw_data_file = os.path.join(data_path, "Twitter_Data.csv")

# Logging configuration: records at INFO level and above are written to
# data_preprocessing.log (a relative path, so it lands in the working directory)
logging.basicConfig(filename='data_preprocessing.log', level=logging.INFO)

# Shared text-processing resources used by the functions defined further down
# emoji_dict: emoji -> replacement word, read by normalize_text
emoji_dict = {"😊": "happy", "😢": "sad", "❤️": "love"}
# stop_words: English stop words held in a set, used for membership tests in remove_stopwords
stop_words = set(stopwords.words('english'))
# lemmatizer: single WordNetLemmatizer instance reused by lemmatize_tokens
lemmatizer = WordNetLemmatizer()


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Girija\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Girija\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Girija\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


#### Step 2: Analyze Dataset



In [57]:
def analyze_dataset(file_path):
    """
    Read the CSV at `file_path` and print a structural overview of it.

    The overview covers the frame's shape, per-column dtypes, the first rows
    and the per-column count of missing values. A summary line is also sent
    to the logging module.

    Returns the loaded DataFrame, or None when the file does not exist
    (in which case the error is logged rather than raised).
    """
    try:
        df = pd.read_csv(file_path)
    except FileNotFoundError as e:
        logging.error(f"File not found: {file_path}, Error: {e}")
        return None

    print("\n--- Dataset Analysis ---")
    print(f"Dataset Shape: {df.shape}")

    # Each section is a heading followed by the pandas object it describes.
    report_sections = (
        ("\nColumns and Data Types:", df.dtypes),
        ("\nFirst Few Rows:", df.head()),
        ("\nMissing Values Count:", df.isnull().sum()),
    )
    for heading, payload in report_sections:
        print(heading)
        print(payload)

    logging.info(f"Dataset loaded and analyzed: Shape - {df.shape}, Columns - {list(df.columns)}")
    return df

# Analyze the dataset
df_analysis = analyze_dataset(raw_data_file)
if df_analysis is None:
    # analyze_dataset only records a missing file in the log, so surface the
    # problem in the notebook itself. Raising stops "Run All" at this cell with
    # a readable message, instead of calling exit(), which ends the kernel
    # session without telling the reader what went wrong.
    raise FileNotFoundError(f"Dataset file not found: {raw_data_file}")



--- Dataset Analysis ---
Dataset Shape: (162980, 2)

Columns and Data Types:
clean_text     object
category      float64
dtype: object

First Few Rows:
                                          clean_text  category
0  when modi promised “minimum government maximum...      -1.0
1  talk all the nonsense and continue all the dra...       0.0
2  what did just say vote for modi  welcome bjp t...       1.0
3  asking his supporters prefix chowkidar their n...       1.0
4  answer who among these the most powerful world...       1.0

Missing Values Count:
clean_text    4
category      7
dtype: int64


#### Step 3: Load Dataset

**Why**: Load the raw dataset into a DataFrame so it can be used for processing.



In [58]:
def load_dataset(file_path):
    """
    Read the CSV at `file_path` into a DataFrame.

    On success the frame's shape is both printed and logged, and the frame is
    returned. If the file does not exist, the error is logged and None is
    returned instead of raising.
    """
    try:
        df = pd.read_csv(file_path)
    except FileNotFoundError as e:
        logging.error(f"File not found: {file_path}, Error: {e}")
        return None

    # Same message goes to the cell output and to the log.
    message = f"Dataset loaded successfully with shape: {df.shape}"
    print(message)
    logging.info(message)
    return df

# Load the dataset
df = load_dataset(raw_data_file)
if df is None:
    # load_dataset only logs a missing file; raise here so the notebook stops
    # with a visible error rather than calling exit(), which ends the kernel
    # session without telling the reader what went wrong.
    raise FileNotFoundError(f"Dataset file not found: {raw_data_file}")


Dataset loaded successfully with shape: (162980, 2)


#### Step 4: Handle Contractions and Preprocess Text Data

**Why**: Expand contractions to ensure uniformity in the text (e.g., "don't" → "do not"), which can improve tokenization and overall model performance.



In [59]:
def expand_contractions(text):
    """
    Expand contractions in `text` via `contractions.fix`.

    Anything that is not a string (for example a NaN from a missing cell)
    is mapped to an empty string.
    """
    # Guard clause: non-string values are replaced by "" rather than passed on.
    if not isinstance(text, str):
        return ""
    return contractions.fix(text)

# Apply contraction expansion to the dataset's text column.
# The column is overwritten in place; because expand_contractions returns ""
# for non-string cells, every row holds a string afterwards.
df['clean_text'] = df['clean_text'].apply(expand_contractions)


#### Step 5: Text Normalization

**Why**: Normalize the text data by removing unnecessary characters, URLs, hashtags, mentions, and converting to lowercase for uniformity.



In [60]:
def normalize_text(text, emoji_map=None):
    """
    Normalize a piece of text for downstream tokenization.

    Steps, in order: lowercase, drop URLs, drop @mentions and #hashtags,
    replace known emojis with words, strip everything that is not a lowercase
    ASCII letter or whitespace, and collapse runs of whitespace.

    Parameters:
        text: The text to normalize. Non-string values (e.g. NaN) yield "".
        emoji_map: Optional mapping of emoji -> replacement word. Defaults to
            the module-level `emoji_dict`.

    Returns:
        The normalized string.
    """
    # Mirror expand_contractions: missing / non-string cells become "".
    if not isinstance(text, str):
        return ""
    if emoji_map is None:
        emoji_map = emoji_dict

    # Convert to lowercase
    text = text.lower()
    # Remove URLs ('http\S+' also covers https links)
    text = re.sub(r'http\S+|www\S+', '', text)
    # Remove mentions and hashtags
    text = re.sub(r'@\w+|#\w+', '', text)
    # Replace emojis with words. This has to happen BEFORE the special-character
    # filter below, which would otherwise delete the emojis first. Padding with
    # spaces keeps the word from fusing with its neighbours ("so😢" -> "so sad").
    for emoji, word in emoji_map.items():
        text = text.replace(emoji, f" {word.lower()} ")
    # Remove special characters and numbers
    text = re.sub(r'[^a-z\s]', '', text)
    # Remove extra spaces
    return re.sub(r'\s+', ' ', text).strip()

# Apply text normalization; 'clean_text' is overwritten with the normalized
# string for each row
df['clean_text'] = df['clean_text'].apply(normalize_text)


#### Step 6: Tokenization

**Why**: Tokenize text into individual words to facilitate further text processing.



In [61]:
nltk.download('punkt_tab')
def tokenize_text(text):
    """
    Split `text` into word tokens with NLTK's `word_tokenize`
    and return the resulting token sequence.
    """
    tokens = word_tokenize(text)
    return tokens

# Apply tokenization: adds a new 'tokens' column holding the token list for
# each row's 'clean_text' (the text column itself is left unchanged)
df['tokens'] = df['clean_text'].apply(tokenize_text)


[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Girija\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


#### Step 7: Remove Noise - Stop Words and Short Tokens

**Why**: Remove stop words (e.g., "is", "the") that don’t add significant meaning and remove short or meaningless tokens.



In [62]:
def remove_stopwords(tokens):
    """
    Filter a token list, keeping only tokens that are not in `stop_words`
    and that are longer than two characters. Order is preserved.
    """
    kept = []
    for word in tokens:
        # Skip stop words and tokens of length 1 or 2.
        if word in stop_words or len(word) <= 2:
            continue
        kept.append(word)
    return kept

# Apply stopword removal: overwrites 'tokens' with the filtered list per row
df['tokens'] = df['tokens'].apply(remove_stopwords)
# Show the frame after filtering ('clean_text' still contains the stop words;
# only 'tokens' was changed)
print(df)


                                               clean_text  category  \
0       when modi promised minimum government maximum ...      -1.0   
1       talk all the nonsense and continue all the dra...       0.0   
2       what did just say vote for modi welcome bjp to...       1.0   
3       asking his supporters prefix chowkidar their n...       1.0   
4       answer who among these the most powerful world...       1.0   
...                                                   ...       ...   
162975  why these crores paid neerav modi not recovere...      -1.0   
162976  dear rss terrorist payal gawar what about modi...      -1.0   
162977  did you cover her interaction forum where she ...       0.0   
162978  there big project came into india modi dream p...       0.0   
162979  have you ever listen about like gurukul where ...       1.0   

                                                   tokens  
0       [modi, promised, minimum, government, maximum,...  
1           [talk, nonsense

#### Step 8: Lemmatization

**Why**: Lemmatize words to reduce them to their base form (e.g., "running" → "run") for consistent representation.



In [63]:
def lemmatize_tokens(tokens):
    """
    Run every token through the shared `lemmatizer` and return the
    lemmatized tokens as a new list, in the original order.
    """
    lemmas = []
    for token in tokens:
        lemmas.append(lemmatizer.lemmatize(token))
    return lemmas

# Apply lemmatization: overwrites 'tokens' with the lemmatized list per row
df['tokens'] = df['tokens'].apply(lemmatize_tokens)


#### Step 9: Feature Engineering

**Why**: Add new features to help the model understand text better, such as text length, word count, and sentiment scores.



In [64]:
from textblob import TextBlob

def get_sentiment_score(text):
    """
    Return the TextBlob sentiment polarity of `text`.
    """
    blob = TextBlob(text)
    return blob.sentiment.polarity

# Adding text length and word count as features
# text_length: number of characters in the normalized 'clean_text'
df['text_length'] = df['clean_text'].apply(len)
# word_count: number of entries in 'tokens', i.e. what is left after stop-word
# removal and lemmatization, not the raw word count of the text
df['word_count'] = df['tokens'].apply(len)

# Adding sentiment score as a feature (TextBlob polarity of 'clean_text',
# computed row by row)
df['sentiment_score'] = df['clean_text'].apply(get_sentiment_score)


#### Step 10: Validate and Balance Dataset

**Why**: Ensure the data is balanced. If there is a significant imbalance, apply methods to handle it, such as using SMOTE.

- The `imblearn` (imbalanced-learn) library is commonly used for handling imbalanced datasets, particularly through techniques like SMOTE (Synthetic Minority Oversampling Technique).
- SMOTE creates synthetic examples of the minority class by interpolating between existing minority samples, which helps balance the dataset and improve model training.


- **Check for Label Balance**:
    - **Imbalanced Dataset**: Detect and balance classes to ensure the model doesn’t favor a dominant class.



In [65]:
from imblearn.over_sampling import SMOTE

def validate_and_handle_imbalance(df, imbalance_threshold=0.1):
    """
    Report the label distribution and, if it is strongly imbalanced, oversample with SMOTE.

    Parameters:
        df: DataFrame with a 'category' label column and a 'clean_text' column.
        imbalance_threshold: The data counts as imbalanced when the rarest
            class has fewer than `imbalance_threshold` times the samples of
            the most common class. Defaults to 0.1.

    Returns:
        (X_resampled, y_resampled, vectorizer_tfidf) when SMOTE was applied,
        where X_resampled is the resampled TF-IDF matrix; otherwise
        (None, None, None).

    NOTE(review): when triggered, the TF-IDF vectorizer and SMOTE are fitted on
    every labelled row passed in. If this runs before the train/validation/test
    split, synthetic samples derived from test rows can leak into training —
    confirm how the resampled output is meant to be used.
    """
    # Display unique labels in 'category'
    print("Unique categories:", df['category'].unique())
    # value_counts() ignores NaN labels, so the balance check is based on
    # labelled rows only.
    label_counts = df['category'].value_counts()
    print("Label distribution:", label_counts)

    # No labelled rows at all: nothing to balance.
    if label_counts.empty:
        return None, None, None

    # Check for imbalance and apply SMOTE if necessary
    if label_counts.min() < imbalance_threshold * label_counts.max():
        print("Warning: Significant imbalance detected. Applying SMOTE to handle imbalance.")
        # Rows without a label cannot be resampled; passing NaN targets to
        # SMOTE would fail, so restrict to labelled rows first.
        labeled = df.dropna(subset=['category'])
        smote = SMOTE(random_state=42)
        # SMOTE needs numeric features, so resample in TF-IDF space
        vectorizer_tfidf = TfidfVectorizer()
        X = vectorizer_tfidf.fit_transform(labeled['clean_text'])
        y = labeled['category']
        X_resampled, y_resampled = smote.fit_resample(X, y)
        return X_resampled, y_resampled, vectorizer_tfidf

    # Return placeholders if no significant imbalance is found
    return None, None, None

# Apply label validation and handling imbalance if required.
# All three results are None when the function finds no significant imbalance.
X_resampled, y_resampled, tfidf_vectorizer = validate_and_handle_imbalance(df)


Unique categories: [-1.  0.  1. nan]
Label distribution: category
 1.0    72250
 0.0    55213
-1.0    35510
Name: count, dtype: int64


#### Step 11: Split Dataset

**Why**: Divide the data into training, validation, and test sets (roughly 64% / 16% / 20% here) so the model is trained on one portion and evaluated on data it has not seen, which makes overfitting detectable.

- **Training Set**: Used to train the model.
- **Validation Set**: Used to tune the model and check its performance during training.
- **Test Set**: Used to evaluate the model's final performance.



In [66]:
# Step 1: Drop rows with NaN values in the target variable ('category')
df = df.dropna(subset=['category'])

# Step 2: Convert 'category' to integers.
# DataFrame.astype returns a new frame whose column really has an integer
# dtype. Assigning through df.loc[:, 'category'] writes the values into the
# existing float64 column on pandas >= 2.0, so the labels would stay floats.
df = df.astype({'category': int})

# Step 3: Split the Dataset
def split_dataset(df):
    """
    Split `df` into train, validation and test frames, stratified on 'category'.

    20% of the rows are first held out as the test set; 20% of the remaining
    rows then become the validation set. Both splits use random_state=42.
    The shape of each part is printed.

    Returns:
        (train, val, test) DataFrames.
    """
    # First cut: everything except the test set.
    train_val, test = train_test_split(
        df, test_size=0.2, random_state=42, stratify=df['category']
    )
    # Second cut: carve the validation set out of the remainder.
    train, val = train_test_split(
        train_val, test_size=0.2, random_state=42, stratify=train_val['category']
    )

    print(f"Training set size: {train.shape}")
    print(f"Validation set size: {val.shape}")
    print(f"Test set size: {test.shape}")
    return train, val, test

# Apply split function: produces the three frames used by the vectorization
# and saving steps below
train_df, val_df, test_df = split_dataset(df)


Training set size: (104302, 6)
Validation set size: (26076, 6)
Test set size: (32595, 6)


#### Step 12: Numerical Transformation

**Why**: Machine learning models need numerical inputs. We need to convert the cleaned text into numerical formats using different methods such as Bag of Words (BoW), TF-IDF, and Word Embeddings.

##### **Bag of Words (BoW) Representation**

- Converts text into a vector of word frequencies.
- **Training**: Fit on training data.
- **Validation and Test**: Transform using the same vectorizer.



In [71]:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd

# ---- Bag of Words (BoW) features ----
print("Generating Bag of Words (BoW) representations...\n")

# Learn the vocabulary from the training split only, then reuse the fitted
# vectorizer to encode the validation and test splits.
# NOTE(review): the input is 'clean_text', so the stop-word removal and
# lemmatization stored in 'tokens' do not influence these features —
# confirm this is intended.
vectorizer_bow = CountVectorizer()
X_train_bow = vectorizer_bow.fit_transform(train_df['clean_text'])
X_val_bow = vectorizer_bow.transform(val_df['clean_text'])
X_test_bow = vectorizer_bow.transform(test_df['clean_text'])

print("Bag of Words Transformation Completed!\n")

# One feature per vocabulary entry
print(f"Number of features (vocabulary size): {len(vectorizer_bow.get_feature_names_out())}")

# ---- Demonstration on the first training sample ----
sample_index = 0
feature_names = vectorizer_bow.get_feature_names_out()
sample_text = train_df['clean_text'].iloc[sample_index]
bow_counts = X_train_bow[sample_index]

# Densify only this one sparse row so it lines up with the feature names
dense_bow = bow_counts.toarray().flatten()
bow_data = pd.DataFrame({'Word': feature_names, 'Count': dense_bow})

# Keep the words that occur in the sample, highest counts first, at most ten
top_bow_data = (
    bow_data.loc[bow_data['Count'] > 0]
    .sort_values(by='Count', ascending=False)
    .head(10)
)

# Show the sample text followed by its strongest BoW features
print("\nExample Text (Training Sample):", sample_text, sep="\n")
print("\nTop 10 Words with Highest Counts in the Example Text:", top_bow_data, sep="\n")


Generating Bag of Words (BoW) representations...

Bag of Words Transformation Completed!

Number of features (vocabulary size): 76876

Example Text (Training Sample):
galaxy bjp leaders led modi and party chief amit shah will hit the campaign trail telangana

Top 10 Words with Highest Counts in the Example Text:
           Word  Count
2591       amit      1
2808        and      1
8320        bjp      1
10399  campaign      1
12075     chief      1
25771    galaxy      1
30219       hit      1
38741   leaders      1
38885       led      1
43774      modi      1


##### **TF-IDF (Term Frequency-Inverse Document Frequency) Representation**

- Weights each word by how often it appears in a document, scaled down by how many documents in the corpus contain it.
- Helps in reducing the impact of common words that appear frequently but may not be informative.



In [70]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

# ---- TF-IDF features ----
print("Generating TF-IDF representations...\n")

# Fit on the training split only, then reuse the fitted vectorizer to encode
# the validation and test splits.
# NOTE(review): the input is 'clean_text', so the stop-word removal and
# lemmatization stored in 'tokens' do not influence these features —
# confirm this is intended.
vectorizer_tfidf = TfidfVectorizer()
X_train_tfidf = vectorizer_tfidf.fit_transform(train_df['clean_text'])
X_val_tfidf = vectorizer_tfidf.transform(val_df['clean_text'])
X_test_tfidf = vectorizer_tfidf.transform(test_df['clean_text'])

print("TF-IDF Transformation Completed!\n")

# One feature per vocabulary entry
print(f"Number of features (vocabulary size): {len(vectorizer_tfidf.get_feature_names_out())}")

# ---- Demonstration on the first training sample ----
sample_index = 0
feature_names = vectorizer_tfidf.get_feature_names_out()
sample_text = train_df['clean_text'].iloc[sample_index]
tfidf_scores = X_train_tfidf[sample_index]

# Densify only this one sparse row so it lines up with the feature names
dense_tfidf = tfidf_scores.toarray().flatten()
tfidf_data = pd.DataFrame({'Word': feature_names, 'TF-IDF Score': dense_tfidf})

# Keep the words that occur in the sample, highest scores first, at most ten
top_tfidf_data = (
    tfidf_data.loc[tfidf_data['TF-IDF Score'] > 0]
    .sort_values(by='TF-IDF Score', ascending=False)
    .head(10)
)

# Show the sample text followed by its strongest TF-IDF features
print("\nExample Text (Training Sample):", sample_text, sep="\n")
print("\nTop 10 Words with Highest TF-IDF Scores in the Example Text:", top_tfidf_data, sep="\n")



Generating TF-IDF representations...

TF-IDF Transformation Completed!

Number of features (vocabulary size): 76876

Example Text (Training Sample):
galaxy bjp leaders led modi and party chief amit shah will hit the campaign trail telangana

Top 10 Words with Highest TF-IDF Scores in the Example Text:
            Word  TF-IDF Score
25771     galaxy      0.456295
70127      trail      0.387074
67973  telangana      0.317364
30219        hit      0.284731
38885        led      0.283387
2591        amit      0.251792
12075      chief      0.250234
38741    leaders      0.232948
62025       shah      0.228080
10399   campaign      0.226465


#### Step 13: Save Processed Data

**Why**: Save the cleaned and transformed datasets for reuse. Saving the numerical representations like BoW and TF-IDF helps speed up the process during model training.

##### **Save Cleaned and Split Datasets**



In [69]:
import joblib

def save_datasets(dataframes, filenames, data_path):
    """
    Save each DataFrame as a CSV file (without the index) inside `data_path`.

    Parameters:
        dataframes: Iterable of DataFrames to write.
        filenames: Iterable of file names, one per DataFrame, in the same order.
        data_path: Target directory; it is created if it does not exist yet.

    Raises:
        ValueError: If the number of DataFrames and file names differ. Without
            this check, zip() would silently skip the unmatched entries.
    """
    dataframes = list(dataframes)
    filenames = list(filenames)
    if len(dataframes) != len(filenames):
        raise ValueError(
            f"Got {len(dataframes)} dataframes but {len(filenames)} filenames; "
            "they must match one-to-one."
        )

    # An empty data_path means "current directory", which needs no creation.
    if data_path:
        os.makedirs(data_path, exist_ok=True)

    for frame, filename in zip(dataframes, filenames):
        file_path = os.path.join(data_path, filename)
        frame.to_csv(file_path, index=False)
        print(f"Saved {filename} to {data_path}")
        logging.info(f"Saved {filename} to {data_path}")

# Save datasets: Cleaned, Train, Validation, and Test
# Make sure the target directory exists before writing into it
os.makedirs(data_path, exist_ok=True)
# NOTE: the 'tokens' column holds Python lists; in a CSV these are written as
# their text representation, so they come back as strings when the file is re-read
save_datasets(
    [df, train_df, val_df, test_df],
    ["processed_data.csv", "train_data.csv", "val_data.csv", "test_data.csv"],
    data_path
)


Saved processed_data.csv to ../data/
Saved train_data.csv to ../data/
Saved val_data.csv to ../data/
Saved test_data.csv to ../data/
