# Data Cleaning and Preprocessing

In [4]:
import pandas as pd
import re

# Location of the raw CSV; it is expected to sit next to this notebook
file_path = "Combined Data.csv"

# Build the working frame in a single pipeline; the step order matters
# (duplicates are collapsed *before* the text is lowercased).
df = (
    pd.read_csv(file_path)
    # Discard the leftover index column when the CSV carries one
    .drop(columns=["Unnamed: 0"], errors="ignore")
    # Keep only rows that actually have a statement
    .dropna(subset=["statement"])
    # Collapse rows that are exact duplicates of an earlier row
    .drop_duplicates()
    # Normalise the statement text to lowercase
    .assign(statement=lambda frame: frame["statement"].str.lower())
)

# Patterns are compiled once at import time so that applying clean_text over
# a whole column does not repeat the pattern lookup for every row.
_NON_LETTER_RE = re.compile(r"[^a-zA-Z\s]")  # anything but ASCII letters/whitespace
_WHITESPACE_RE = re.compile(r"\s+")  # any run of whitespace


def clean_text(text):
    """Return *text* reduced to ASCII letters separated by single spaces.

    Every character that is not an ASCII letter or whitespace (digits,
    punctuation, symbols, non-ASCII letters) is deleted, runs of whitespace
    are collapsed to one space, and leading/trailing whitespace is stripped.
    Deleted characters are not replaced by a space, so "don't" becomes
    "dont".

    Args:
        text: The raw statement. Values that are not ``str`` (for example
            ``None`` or the float ``NaN`` pandas uses for missing cells)
            are treated as empty text instead of raising ``TypeError``.

    Returns:
        The cleaned string; ``""`` when nothing remains or when *text* is
        not a string.
    """
    # Missing / non-text cells would make re.sub raise TypeError and abort a
    # whole Series.apply call, so map them to an empty statement instead.
    if not isinstance(text, str):
        return ""
    letters_only = _NON_LETTER_RE.sub("", text)
    return _WHITESPACE_RE.sub(" ", letters_only).strip()

# Destination for the cleaned dataset
cleaned_file_path = "cleaned_data.csv"

# Run every statement through the cleaning helper
df["statement"] = df["statement"].map(clean_text)

# Persist the result (without the row index) for later steps
df.to_csv(cleaned_file_path, index=False)

# Report where the cleaned data was written
print(f"Data cleaning complete! Cleaned dataset saved successfully as '{cleaned_file_path}'.")

# Preview the first few rows (must stay the last expression of the cell)
df.head()

Data cleaning complete! Cleaned dataset saved successfully as 'cleaned_data.csv'.


Unnamed: 0,statement,status
0,oh my gosh,Anxiety
1,trouble sleeping confused mind restless heart ...,Anxiety
2,all wrong back off dear forward doubt stay in ...,Anxiety
3,ive shifted my focus to something else but im ...,Anxiety
4,im restless and restless its been a month now ...,Anxiety
