# In [1]:
# Import necessary libraries
import os
import re

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

# Read the raw bug-report training set into a DataFrame.
# The path is relative, so it is resolved against the current working directory.
train_data = pd.read_csv(
    '../data/raw/bugs-train.csv',
)

# Define a function to clean text data
def clean_text(text):
    """Normalise a raw text string before vectorisation.

    The text is lowercased, words of one or two characters are dropped,
    punctuation is stripped, and runs of whitespace are collapsed to a
    single space. Leading and trailing whitespace is trimmed.

    Args:
        text: The string to clean. Must be a ``str``; missing values have
            to be replaced by the caller beforehand.

    Returns:
        The cleaned string (possibly empty).
    """
    # Order matters: short words are removed while punctuation still acts as
    # a word boundary, so the "t" in "don't" is dropped before the apostrophe.
    # Underscores count as word characters and therefore survive.
    substitutions = (
        (r'\b\w{1,2}\b', ''),  # words of one or two characters
        (r'[^\w\s]', ''),      # anything that is neither word char nor space
        (r'\s+', ' '),         # collapse whitespace runs
    )
    cleaned = text.lower()
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()

# Replace missing summaries with empty strings (clean_text calls str methods,
# so it cannot take NaN), then store the cleaned text in 'summary_clean'.
train_data['summary_clean'] = (
    train_data['summary']
    .fillna('')
    .apply(clean_text)
)

# Hold out 20% of the rows for validation. The fixed random_state makes the
# shuffle, and therefore the split, reproducible between runs.
train_data_split, val_data_split = train_test_split(
    train_data,
    random_state=42,
    test_size=0.2,
)

# Turn the cleaned summaries into TF-IDF feature matrices. The vocabulary is
# capped at 1000 terms and is learned from the training split only; the
# validation split is transformed with that already-fitted vocabulary.
vectorizer = TfidfVectorizer(
    max_features=1000,
)
X_train = vectorizer.fit_transform(
    train_data_split['summary_clean'],
)
X_val = vectorizer.transform(
    val_data_split['summary_clean'],
)

# Save the processed data
# DataFrame.to_csv does not create missing parent directories, so make sure
# the output folder exists before anything is written into it.
os.makedirs('../data/processed', exist_ok=True)

# Dense TF-IDF matrices, one column per vocabulary term. Rows are in the same
# order as the matching *_cleaned.csv file written below, so features and
# labels can be re-aligned by position.
feature_names = vectorizer.get_feature_names_out()
pd.DataFrame(X_train.toarray(), columns=feature_names).to_csv(
    '../data/processed/train_preprocessed.csv', index=False
)
pd.DataFrame(X_val.toarray(), columns=feature_names).to_csv(
    '../data/processed/val_preprocessed.csv', index=False
)

# Save the cleaned summary columns (with their severity labels) for future use
train_data_split[['summary_clean', 'severity']].to_csv(
    '../data/processed/train_cleaned.csv', index=False
)
val_data_split[['summary_clean', 'severity']].to_csv(
    '../data/processed/val_cleaned.csv', index=False
)
