# Data Preprocessing

This notebook preprocesses the collected data: it cleans the raw text, encodes the class labels as integers, splits the data into training and test sets, and saves the processed training split to disk.

In [None]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Read the raw CSV into a DataFrame; the path is resolved against the
# current working directory.
raw_data_path = '../data/raw/data.csv'
data = pd.read_csv(filepath_or_buffer=raw_data_path)

# Preview the first five rows (rendered as the cell's output)
data.head(n=5)

In [None]:
# Data Cleaning Function
# Patterns are compiled once at cell level instead of on every call.
_URL_PATTERN = re.compile(r'http\S+|www\S+', re.IGNORECASE)
_NON_ALNUM_PATTERN = re.compile(r'[^a-zA-Z0-9 ]')


def clean_text(text):
    """Return a lowercased copy of *text* with URLs and special characters removed.

    Processing order:
      1. Strip every run of non-whitespace that starts with ``http`` or
         ``www`` (matched case-insensitively, so ``HTTP://...`` is removed too).
      2. Drop every character that is not an ASCII letter, digit, or space.
      3. Lowercase the result.

    Removed spans are replaced with nothing, so the surrounding spaces are
    kept as-is (two words separated by a URL end up separated by two spaces).

    Args:
        text: The value to clean. Missing values (``None`` or float NaN, which
            is what pandas yields for empty CSV cells) produce an empty string;
            any other non-string value is converted with ``str()`` first.

    Returns:
        The cleaned string.
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)

    text = _URL_PATTERN.sub('', text)  # Remove URLs
    text = _NON_ALNUM_PATTERN.sub('', text)  # Remove special characters
    return text.lower()  # Convert to lowercase

# Build the 'cleaned_text' column by running clean_text on every entry of 'text'
data['cleaned_text'] = data.loc[:, 'text'].apply(clean_text)

# Preview the raw and cleaned text side by side (first five rows)
data.loc[:, ['text', 'cleaned_text']].head(n=5)

In [None]:
from pathlib import Path

# Encode the class labels as integer codes. The column is overwritten in
# place; the fitted encoder is kept in `label_encoder`.
label_encoder = LabelEncoder()
data['label'] = label_encoder.fit_transform(data['label'])

# Split the data into training and testing sets (20% held out for testing;
# the fixed random_state makes the split reproducible)
X_train, X_test, y_train, y_test = train_test_split(
    data['cleaned_text'],
    data['label'],
    test_size=0.2,
    random_state=42,
)

# Save processed data
# NOTE(review): only the training split is written here; X_test / y_test are
# not persisted anywhere in this cell -- confirm that is intended.
processed_data_path = '../data/processed/processed_data.csv'
# to_csv does not create missing directories, so make sure the target
# folder exists before writing.
Path(processed_data_path).parent.mkdir(parents=True, exist_ok=True)
processed_data = pd.DataFrame({'text': X_train, 'label': y_train})
processed_data.to_csv(processed_data_path, index=False)

# Display the shape of the training and testing sets
print(f'Training set shape: {X_train.shape}, Testing set shape: {X_test.shape}')