In [None]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.preprocessing import LabelEncoder

In [None]:
# NLTK corpora required by the helpers defined in this notebook:
#   - 'wordnet'   backs WordNetLemmatizer, used by lemmatize(); without it the
#                 lemmatizer raises LookupError the first time it is called.
#   - 'stopwords' backs stopwords.words('english'), used by remove_stopwords().
# Both calls are no-ops (apart from a log line) when the corpus is already
# present locally.
nltk.download('wordnet')
nltk.download('stopwords')

In [None]:
# Preview the English stop-word list as a set (the cell's displayed value).
{*stopwords.words('english')}

In [None]:
def remove_stopwords(text: str) -> str:
    """Drop English stop words from a whitespace-tokenized string.

    The input is split on whitespace, every token that appears verbatim in
    ``stopwords.words('english')`` is discarded, and the surviving tokens are
    re-joined with single spaces. Matching is an exact string comparison, so
    the caller is responsible for any case normalization.

    Args:
        text: The string to filter.

    Returns:
        The remaining tokens joined by single spaces (empty string if none).
    """
    english_stops = set(stopwords.words('english'))
    kept_tokens = (token for token in text.split() if token not in english_stops)
    return ' '.join(kept_tokens)

In [None]:
def lemmatize(text: str) -> str:
    """Lemmatize each whitespace-separated token of ``text``.

    Every token is passed to ``WordNetLemmatizer.lemmatize`` with its default
    arguments and the results are re-joined with single spaces.

    Args:
        text: The string whose tokens should be lemmatized.

    Returns:
        The lemmatized tokens joined by single spaces.
    """
    to_lemma = WordNetLemmatizer().lemmatize
    return ' '.join(map(to_lemma, text.split()))

In [None]:
def clean_text(text: str) -> str:
    """Normalize a raw post into lowercase ASCII words separated by single spaces.

    Steps, in order:
      1. Missing values (``None`` / float NaN) become the empty string.
      2. The value is stringified and lowercased.
      3. HTML tags, URLs and hashtags are removed.
      4. Every remaining non-word character is replaced by a space.
      5. Digits and non-ASCII characters are dropped.
      6. Runs of whitespace are collapsed and the result is stripped.

    Args:
        text: The raw text. Non-string values are converted with ``str()``.

    Returns:
        The cleaned text; may be the empty string.
    """
    # Without this guard str() would turn missing values into the fake words
    # 'none' / 'nan'. `text != text` is only true for NaN.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    text = str(text).lower()

    # Markup-aware removals must run BEFORE the generic special-character pass:
    # they depend on '<', '>', '#', ':' and '/', which that pass erases.
    # Tags go first so a URL inside an attribute cannot swallow following text.
    # Replacing with a space keeps neighbouring words from being glued together.
    text = re.sub(r'<.*?>', ' ', text)         # Remove HTML tags
    text = re.sub(r'http\S+', ' ', text)       # Remove URLs
    text = re.sub(r'#\w+', ' ', text)          # Remove hashtags

    # \W also covers punctuation and emoji, so no separate punctuation pass
    # is needed afterwards.
    text = re.sub(r'\W', ' ', text)            # Remove special characters
    text = re.sub(r'\d', '', text)             # Remove numbers
    text = text.encode('ascii', 'ignore').decode('ascii')  # Drop non-ASCII chars

    # Collapse whitespace last so gaps left by the removals above disappear too.
    text = re.sub(r'\s+', ' ', text).strip()   # Remove extra spaces

    # Optional steps, disabled by default:
    #text = remove_stopwords(text)
    #text = lemmatize(text)

    return text

In [None]:
def process_file(input_path: str, output_path: str):
    """Clean a sentiment CSV and write the processed copy to ``output_path``.

    The input CSV must contain the columns ``id``, ``entity``, ``sentiment``
    and ``post``. Processing steps:
      1. Rows whose sentiment is ``'Irrelevant'`` are removed.
      2. The ``id`` and ``entity`` columns are dropped.
      3. Rows are shuffled with a fixed seed (42) for reproducibility.
      4. Rows with a missing ``post`` or ``sentiment`` are removed.
      5. ``post`` is normalized with ``clean_text`` and rows whose cleaned
         text is empty are removed.
      6. ``sentiment`` is integer-encoded into a new ``label`` column.

    Args:
        input_path: Path of the raw CSV to read.
        output_path: Path the processed CSV is written to (without the index).
    """
    raw = pd.read_csv(input_path)

    shuffled = (
        raw[raw['sentiment'] != 'Irrelevant']
        .drop(columns=['id', 'entity'])
        .sample(frac=1, random_state=42)
        .reset_index(drop=True)
    )

    # Drop missing values BEFORE cleaning. Stringifying NaN to 'nan' and then
    # filtering on that literal would also discard genuine posts whose cleaned
    # text happens to be exactly "nan". Unlabelled rows are dropped as well,
    # since they cannot be encoded.
    labelled = shuffled.dropna(subset=['post', 'sentiment'])

    # .assign builds a new frame, which avoids writing into a filtered slice.
    cleaned = labelled.assign(post=labelled['post'].apply(clean_text))

    # Keep only rows that still contain text after cleaning.
    has_text = cleaned['post'].astype(str).str.strip().ne('')
    cleaned = cleaned[has_text]

    le = LabelEncoder()
    cleaned = cleaned.assign(label=le.fit_transform(cleaned['sentiment']))

    cleaned.to_csv(output_path, index=False)


In [None]:
# Run the preprocessing pipeline. Fill in both placeholders before executing
# this cell; the empty strings are not usable paths.
process_file(
    # input_path: the raw CSV to read; output_path: where the processed CSV is written
    input_path='',
    output_path=''
)