In [1]:
import re
from nltk.stem import WordNetLemmatizer
import pandas as pd

# Only these columns are needed downstream, so nothing else is read from disk
relevant_columns = [
    'airline',
    'airline_sentiment',
    'negativereason',
    'text',
]

# Location of the raw tweets CSV, and the frame holding the selected columns
file_path = './Tweets.csv'
tweets_data = pd.read_csv(file_path, usecols=relevant_columns)

# Single WordNet lemmatizer instance, created once for the whole script
lemmatizer = WordNetLemmatizer()

# Lookup table: contraction -> expanded form
contractions = {
    "don't": "do not",
    "can't": "cannot",
    "i'm": "i am",
}

# Precompiled patterns for clean_text. They live at module level so they are
# built once rather than once per tweet.
_URL_RE = re.compile(r"https?://\S+")
_EMAIL_RE = re.compile(r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b")
_MENTION_RE = re.compile(r"@\w+")
_HTML_ENTITY_RE = re.compile(r"&(?:#\d+|#x[0-9A-Fa-f]+|[A-Za-z][A-Za-z0-9]*);")
_CURRENCY_RE = re.compile(r"\$\d+(?:,\d{3})*(?:\.\d+)?")
_TIME_DATE_RE = re.compile(
    r"\b\d{1,2}[:/]\d{1,2}(?:[:/]\d{2,4})?(?:\s?[ap]m)?\b", re.IGNORECASE
)
_APOSTROPHE_WORD_RE = re.compile(r"[A-Za-z]+'[A-Za-z]+")
_NON_WORD_RE = re.compile(r"[^\w\s]")
_CJK_RE = re.compile(r"[\u4e00-\u9fff]")
_REPEAT_RE = re.compile(r"(\D)\1{2,}")


# Function for data cleaning
def clean_text(text):
    """Return a normalized version of a single tweet.

    The steps run in an order that lets each pattern see the punctuation it
    relies on: every structured token (URL, email, mention, HTML entity,
    currency amount, time/date, contraction) is handled *before* punctuation
    is stripped. Stripping punctuation first would delete the ``&``, ``;``,
    ``:``, ``/``, ``@`` and ``'`` characters those patterns need.

    Reads the module-level ``contractions`` mapping and ``lemmatizer`` object.

    Args:
        text: The raw tweet. Any non-string value (for example a NaN from a
            missing CSV cell) is treated as empty.

    Returns:
        The cleaned text as space-separated tokens, or ``""`` when nothing
        survives cleaning.
    """
    if not isinstance(text, str):
        return ""

    # 1. URLs go first so that '&amp;' or '@' inside a link cannot split it.
    text = _URL_RE.sub("", text)
    # 2. Emails must be removed before mentions; otherwise '@domain' is eaten
    #    by the mention pattern and the address is left half-deleted.
    text = _EMAIL_RE.sub("", text)
    # 3. Mentions
    text = _MENTION_RE.sub("", text)
    # 4. HTML escaped characters (named and numeric); replaced by a space so
    #    the words on either side do not get glued together.
    text = _HTML_ENTITY_RE.sub(" ", text)
    # 5. Currency values, including thousands separators and decimals
    text = _CURRENCY_RE.sub("", text)
    # 6. Times and dates such as 5:30pm, 10/12 or 10/12/2015
    text = _TIME_DATE_RE.sub("", text)
    # 7. Expand contractions while the apostrophe is still present. Curly
    #    apostrophes are normalized first and the lookup is case-insensitive,
    #    because the keys of `contractions` are lowercase.
    text = text.replace("\u2019", "'")
    text = _APOSTROPHE_WORD_RE.sub(
        lambda match: contractions.get(match.group(0).lower(), match.group(0)),
        text,
    )
    # 8. Remove everything that is neither a word character nor whitespace
    #    (punctuation, emojis and other symbols).
    text = _NON_WORD_RE.sub("", text)
    # 9. Chinese characters count as word characters, so strip them separately
    text = _CJK_RE.sub("", text)
    # 10. Collapse runs of three or more identical characters ("sooo" -> "so").
    #     Digits are excluded so numbers such as 1000 are not corrupted.
    text = _REPEAT_RE.sub(r"\1", text)
    # 11. Drop tokens that are too short or too long, then lemmatize the rest.
    #     Splitting and re-joining on single spaces also normalizes whitespace.
    tokens = [
        lemmatizer.lemmatize(word)
        for word in text.split()
        if 2 <= len(word) <= 15
    ]
    return " ".join(tokens)

# Run the cleaner over every tweet and keep the result in a new column
tweets_data['cleaned_text'] = tweets_data['text'].apply(clean_text)

# Keep one row per (cleaned text, sentiment) pair, then discard rows whose
# cleaned text came out empty
tweets_data = (
    tweets_data
    .drop_duplicates(subset=['cleaned_text', 'airline_sentiment'])
    .loc[lambda frame: frame['cleaned_text'] != ""]
)

# Write the result next to the input file, without the index column
cleaned_file_path = './cleaned_tweets.csv'
tweets_data.to_csv(cleaned_file_path, index=False)

print(f"Cleaned data saved to: {cleaned_file_path}")


Cleaned data saved to: ./cleaned_tweets.csv
