# In [1]:
# Import necessary libraries
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import re

# Read the raw bug-report CSVs: the training split first, then the testing
# split. Both are loaded with pandas' default parsing options.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/raw/bugs-train.csv',
        '../data/raw/bugs-test.csv',
    )
)

# Define a function to clean text data
def clean_text(text) -> str:
    """Normalise a free-text value before it is vectorised.

    The steps are applied in this order:

    1. lowercase the text;
    2. delete standalone words made of one or two word characters;
    3. delete every character that is neither a word character nor
       whitespace (underscores count as word characters and are kept);
    4. collapse each run of whitespace into a single space and strip the
       ends.

    Args:
        text: The value to clean. ``None`` and float NaN are treated as
            missing and yield an empty string; any other non-string value
            is converted with ``str()`` first.

    Returns:
        The cleaned string (possibly empty).
    """
    # Missing values would otherwise fail on ``.lower()``; NaN is the only
    # float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    text = str(text).lower()
    # Short words go first, while punctuation still acts as a word boundary,
    # so fragments such as the "t" in "can't" are dropped as well.
    text = re.sub(r'\b\w{1,2}\b', '', text)  # remove short words
    text = re.sub(r'[^\w\s]', '', text)  # remove punctuation
    text = re.sub(r'\s+', ' ', text)  # remove extra spaces
    return text.strip()

# Build the 'summary_clean' column on both frames: missing summaries become
# empty strings and every entry is then passed through clean_text.
for frame in (train_data, test_data):
    filled_summaries = frame['summary'].fillna('')
    frame['summary_clean'] = filled_summaries.apply(clean_text)

# Turn the cleaned summaries into TF-IDF features. The vectorizer (capped at
# max_features=1000) is fitted on the training summaries only; the test
# summaries are transformed with that same fitted vectorizer.
vectorizer = TfidfVectorizer(max_features=1000)
X_train = vectorizer.fit_transform(train_data['summary_clean'])
X_test = vectorizer.transform(test_data['summary_clean'])

# Write the TF-IDF matrices to CSV, one column per vectorizer feature name.
# Each matrix is converted to a dense array before being written.
feature_names = vectorizer.get_feature_names_out()
for matrix, out_path in (
    (X_train, '../data/processed/train_preprocessed.csv'),
    (X_test, '../data/processed/test_preprocessed.csv'),
):
    dense_frame = pd.DataFrame(matrix.toarray(), columns=feature_names)
    dense_frame.to_csv(out_path, index=False)

# Keep the cleaned summaries for later use. The training file also carries
# the 'severity' column; the test file holds only the cleaned text.
cleaned_train = train_data[['summary_clean', 'severity']]
cleaned_train.to_csv('../data/processed/train_cleaned.csv', index=False)
cleaned_test = test_data[['summary_clean']]
cleaned_test.to_csv('../data/processed/test_cleaned.csv', index=False)
