In [None]:
import pandas as pd
import re
from transformers import DistilBertTokenizer


In [None]:
# Load the sampled commit dataset.
raw_data = pd.read_csv('sampled20k.csv')

# Load the DistilBERT tokenizer
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Keep necessary columns including 'commit'. Take an explicit copy so the
# column assignments below modify `data` itself rather than a slice of
# `raw_data` (which would raise pandas' SettingWithCopyWarning).
data = raw_data[['commit', 'author', 'date', 'repo', 'message']].copy()

# Normalize text: lowercase, trim, and collapse every whitespace run
# (newlines included) into a single space. No separate newline-removal pass
# is needed afterwards because `\s+` already covers `\n`.
data['message'] = data['message'].str.lower().str.strip().str.replace(r'\s+', ' ', regex=True)

# Remove anything within [...]
data['message'] = data['message'].str.replace(r'\[.*?\]', '', regex=True)

# Remove two words directly before <...> together with the bracketed part.
# This has to run BEFORE the generic <...> removal below: once every <...>
# span has been deleted, this pattern can no longer match anything.
data['message'] = data['message'].str.replace(r'\b\w+\s+\w+\s+<.*?>', '', regex=True)

# Remove any remaining content in angle brackets
data['message'] = data['message'].str.replace(r'<.*?>', '', regex=True)

import re

def clean_text(text):
    """Strip separator-joined word pairs from *text* and tidy its whitespace.

    Tokens of the form ``word:word``, ``word=word``, ``word/word`` and
    ``word//word`` are deleted, one pattern after another in that order.
    Afterwards every whitespace run is collapsed to a single space and the
    result is trimmed at both ends.
    """
    # Order matters: each substitution sees the output of the previous one.
    separator_patterns = (
        r'\b\w+:\w+\b',   # word1:word2
        r'\b\w+=\w+\b',   # word1=word2
        r'\b\w+/\w+\b',   # word1/word2
        r'\b\w+//\w+\b',  # word1//word2
    )
    for pattern in separator_patterns:
        text = re.sub(pattern, '', text)

    # Deletions leave gaps behind; squeeze them and trim the edges.
    squeezed = re.sub(r'\s+', ' ', text)
    return squeezed.strip()
# Treat missing messages as empty text; a bare astype(str) would turn NaN
# into the literal word "nan".
data['message'] = data['message'].fillna('').astype(str)

# The next three rules run BEFORE clean_text on purpose: clean_text deletes
# word=word and word/word tokens, which would otherwise swallow "bug=123"
# completely (so "bug" could not be retained) and chop URLs into fragments
# before the URL pattern gets to see them.

# Remove email addresses
data['message'] = data['message'].str.replace(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b', '', regex=True)

# Remove all other URLs and UUID-like identifiers
data['message'] = data['message'].str.replace(
    r'\bhttps?://\S+|www\.\S+\b|git-svn-id:.*?\s|svn://\S+|git://\S+|url:\s*[a-f0-9\-]{36}\b',
    '',
    regex=True
)

# Retain the word "bug" but remove the "=<value>" that follows it
data['message'] = data['message'].str.replace(r'\bbug\b\s*=\s*\S+', 'bug', regex=True)

# Apply the cleaning function (removes word:word, word=word, word/word, word//word)
data['message'] = data['message'].apply(clean_text)

# Remove metadata fields and anything following them. Field names are escaped
# so they are always matched literally, whatever characters they contain.
metadata_fields = ['reviewed-on:', 'commit-queue:', 'change-id:', 'commit-position:']
for field in metadata_fields:
    data['message'] = data['message'].str.replace(rf'{re.escape(field)}\s*.*', '', regex=True)

# Remove greater than (>) and less than (<) signs
data['message'] = data['message'].str.replace(r'[<>]', '', regex=True)

# Remove metadata like "signed-off-by," "reviewed-by," "acked-by".
# NOTE: only the trailer and the first whitespace-delimited token after it
# are removed; any further words on that trailer stay in the message.
data['message'] = data['message'].str.replace(r'\b(signed-off-by|reviewed-by|acked-by):\s+.*?(\s|$)', '', regex=True)

# Remove standalone alphanumeric strings (e.g., abc123, test456)
data['message'] = data['message'].str.replace(r'\b[A-Za-z]*\d+[A-Za-z]*\b', '', regex=True)

# Remove all numbers (standalone or embedded within words)
data['message'] = data['message'].str.replace(r'\d+', '', regex=True)

# Remove words containing the same character three or more times in a row
data['message'] = data['message'].str.replace(r'\b\w*(\w)\1{2,}\w*\b', '', regex=True)

# Remove runs of two or more full stops and/or commas
data['message'] = data['message'].str.replace(r'[.,]{2,}', '', regex=True)

# Remove all special characters except for ",", "!", "?", ".", and "'"
data['message'] = data['message'].str.replace(r"[^\w\s,!?\.']", '', regex=True)

# Normalize spaces (remove excess spaces left from deletions)
data['message'] = data['message'].str.replace(r'\s+', ' ', regex=True).str.strip()

# Function to count tokens using the tokenizer
def count_tokens(message):
    """Return how many token ids the module-level ``tokenizer`` yields for *message*.

    The message is encoded without truncation or padding, so the result is the
    full length of the encoded sequence exactly as the tokenizer emits it
    (including whatever tokens it adds by default).

    The ids are kept as a plain Python list: only the length is needed, so
    there is no reason to build a PyTorch tensor (and require torch to be
    installed) for every message.
    """
    encoding = tokenizer(message, truncation=False, padding=False)
    return len(encoding['input_ids'])
# Keep messages with 512 tokens or fewer
token_counts = data['message'].apply(count_tokens)
data = data.loc[token_counts <= 512]

# Filter out very short or empty messages: keep those with more than 2
# whitespace-separated words
word_counts = data['message'].str.split().str.len()
data = data.loc[word_counts > 2]

# Save the preprocessed dataset with 'commit' column retained
output_file = "cleaned20k.csv"
data.to_csv(output_file, index=False)

print(f"Preprocessed dataset saved to {output_file}")
