In [None]:
import pandas as pd
import os
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import string
import zipfile

In [None]:
# ========== Step 1: Load and Analyze Dataset ========== #

# Folder holding the raw CSVs; kept in one place so both paths change together.
raw_data_dir = "C:/Users/nesil.bor/Desktop/Folders/master/DI725/DI725-transformer-sentiment-analysis/data/raw"
train_path = f"{raw_data_dir}/train.csv"
test_path = f"{raw_data_dir}/test.csv"

# Load datasets
train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)

# Display dataset information.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that would emit a stray "None" line).
print("Train Dataset Info:")
train_df.info()

print("\nTest Dataset Info:")
test_df.info()

# Display first few rows
print("\nTrain Dataset Sample:")
print(train_df.head())


In [None]:
# ========== Step 2: Column Selection ========== #

# Only the sentiment label and the raw conversation text are carried forward.
selected_columns = ['customer_sentiment', 'conversation']

# .copy() gives independent frames, so later column additions do not touch the originals.
train_cleaned = train_df.loc[:, selected_columns].copy()
test_cleaned = test_df.loc[:, selected_columns].copy()

print("\nColumns selected successfully!")


In [None]:
# ========== Step 3: Extract & Load NLTK Data ========== #

# Directory the NLTK resources are unpacked into, and the archive they come from.
nltk_data_path = "/mnt/data/nltk_data"
nltk_zip_path = "/mnt/data/nltk_data.zip"  # Ensure you upload this file before running

if os.path.exists(nltk_zip_path):
    with zipfile.ZipFile(nltk_zip_path, "r") as zip_ref:
        zip_ref.extractall(nltk_data_path)
    print("NLTK data extracted successfully!")
else:
    # Not fatal: the resources may already be available in another NLTK search
    # location. Say so explicitly, so a later LookupError is easy to trace back.
    print(f"NLTK archive not found at {nltk_zip_path}; relying on already-installed NLTK data.")

# Register the directory only once, so re-running this cell does not keep
# appending duplicate entries to NLTK's search path.
if nltk_data_path not in nltk.data.path:
    nltk.data.path.append(nltk_data_path)

# Load stopwords and lemmatizer (used by the preprocessing step).
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

In [None]:
# ========== Step 4: Text Preprocessing Function ========== #

# Individual characters that count as punctuation when filtering tokens.
_PUNCTUATION_CHARS = frozenset(string.punctuation)


def preprocess_text(text):
    """Normalize one conversation into a space-joined string of lemmatized tokens.

    Steps: lowercase, tokenize with ``word_tokenize``, drop tokens found in
    ``stop_words`` and tokens made up solely of punctuation characters, then
    lemmatize what remains.

    Args:
        text: Raw conversation text. Non-string values (e.g. the NaN that pandas
            produces for an empty CSV cell, or None) are treated as empty.

    Returns:
        str: The cleaned tokens joined by single spaces; "" for non-string input.
    """
    # Missing cells arrive as float NaN / None, which have no .lower().
    if not isinstance(text, str):
        return ""

    tokens = word_tokenize(text.lower())

    cleaned_tokens = [
        lemmatizer.lemmatize(word)
        for word in tokens
        if word not in stop_words
        # `word in string.punctuation` is a substring test, so multi-character
        # tokens such as "...", "``" or "''" slip through; check per character.
        and not all(ch in _PUNCTUATION_CHARS for ch in word)
    ]
    return " ".join(cleaned_tokens)

# Add a 'processed_conversation' column to both splits by running every
# conversation through preprocess_text.
for frame in (train_cleaned, test_cleaned):
    frame['processed_conversation'] = frame['conversation'].apply(preprocess_text)

In [None]:
# ========== Step 5: Verify Cleaned Data ========== #

print("\nPreprocessed Data Sample:")
print(train_cleaned[['customer_sentiment', 'processed_conversation']].head())

# Output folder for the cleaned CSVs.
processed_dir = "C:/Users/nesil.bor/Desktop/Folders/master/DI725/DI725-transformer-sentiment-analysis/data/processed"

# to_csv does not create missing folders; make sure the target exists so the
# save does not fail with OSError when data/processed has not been created yet.
os.makedirs(processed_dir, exist_ok=True)

# Save the cleaned datasets (index=False keeps the row index out of the files).
train_cleaned.to_csv(f"{processed_dir}/train_cleaned.csv", index=False)
test_cleaned.to_csv(f"{processed_dir}/test_cleaned.csv", index=False)

print("\nPreprocessing complete! Cleaned datasets saved as train_cleaned.csv and test_cleaned.csv.")