# Introduction
In any NLP task, cleaning or preprocessing the dataset is as important as model building.
Some of the common preprocessing steps are shown below, with an example.

In [3]:
import re
import pandas as pd
import nltk
from nltk.stem import PorterStemmer

# Single Porter stemmer instance shared by every call to the pipeline.
stemmer = PorterStemmer()

# Lowercase words that are dropped from the token stream before stemming.
STOP_WORDS = {
    # first / second person pronouns
    'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your',
    # third person pronouns
    'he', 'him', 'his', 'she', 'her', 'it', 'its', 'they', 'them', 'their',
    # forms of "to be"
    'am', 'is', 'are', 'was', 'were', 'be', 'been',
    # articles
    'a', 'an', 'the',
    # conjunctions
    'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while',
    # prepositions / particles
    'of', 'at', 'by', 'for', 'with', 'about', 'to', 'from',
    'up', 'down', 'in', 'out', 'on', 'off',
}

# Delimiters used for tokenizing. The capture group makes re.split() return
# the delimiters themselves as tokens, so they must be filtered out afterwards.
_TOKEN_SPLIT_RE = re.compile(r'([,.:;?_!"()\']|--|\s)')

# Punctuation-only tokens to discard. This is a set of whole tokens rather
# than a string: a substring test against a string would let the '--' token
# through, because '--' is not a substring of '.,:;?!"()\'-_'.
_PUNCTUATION_TOKENS = frozenset('.,:;?!"()\'-_') | {'--'}


def preprocessing_pipeline(text):
    """Return a cleaned, stemmed, space-joined version of ``text``.

    Steps, in order:
      1. Lowercase the text.
      2. Split it into tokens on whitespace and on the punctuation marks
         matched by ``_TOKEN_SPLIT_RE``.
      3. Drop empty tokens, stop words (``STOP_WORDS``) and punctuation tokens.
      4. Stem each remaining token with the module-level Porter ``stemmer``.

    Args:
        text: The raw string to process. Missing values (``None``, ``NaN``,
            ``pd.NA``), such as those pandas produces for empty CSV cells,
            are accepted.

    Returns:
        The stemmed tokens joined by single spaces; an empty string when
        ``text`` is missing or contains no tokens that survive filtering.
    """
    # Empty CSV cells arrive as NaN (a float), which has no .lower().
    if pd.isna(text):
        return ""

    # Lowercasing
    text = text.lower()

    # Splitting the text into tokens (whitespace-only pieces become '')
    tokens = (piece.strip() for piece in _TOKEN_SPLIT_RE.split(text))

    # Stop Word & Punctuation Removal
    clean_tokens = [
        t for t in tokens
        if t and t not in STOP_WORDS and t not in _PUNCTUATION_TOKENS
    ]

    # Stemming
    return " ".join(stemmer.stem(t) for t in clean_tokens)

# Load the raw dataset and add a column holding the cleaned version of each
# entry in the 'text' column.
df = pd.read_csv('customer_support_dataset.csv')
df['processed_text'] = df['text'].apply(preprocessing_pipeline)

# Preview: original and processed text side by side for the first rows.
print("--- Preprocessing Complete ---")
print(df.loc[:, ['text', 'processed_text']].head())

# Save the preprocessed data to a CSV file, leaving out the index column.
df.to_csv('preprocessed_customer_support.csv', index=False)

--- Preprocessing Complete ---
                           text        processed_text
0     I want to return my order     want return order
1     My charger is not working      charger not work
2     The delivery is very late    deliveri veri late
3  I received a damaged product  receiv damag product
4               I need a refund           need refund
