In [None]:
## Rotten Tomatoes Data

In [None]:
! pip install nltk

In [None]:
import pandas as pd
import string
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
import zipfile

In [None]:
# Download NLTK resources if not already present.
# Without force=True, nltk.download() leaves an installed, up-to-date package
# alone, so re-running this cell does not fetch the data again.
import nltk
nltk.download('punkt')
nltk.download('stopwords')

In [None]:
# Directory holding the raw corpus files and receiving the final output.
# Later cells build file names with plain string concatenation (path + name),
# so the trailing "/" is required.
path = "/repos/LAL/Public_Datasets/subjectivity/"

In [None]:
# Read both sentence files with 'latin1' decoding. Each list holds one entry
# per line of the file, with the trailing newline still attached.
subjective_file = path + "quote.tok.gt9.5000"
objective_file = path + "plot.tok.gt9.5000"

with open(subjective_file, mode="r", encoding="latin1") as f_subj:
    subjective = list(f_subj)

with open(objective_file, mode="r", encoding="latin1") as f_obj:
    objective = list(f_obj)

In [None]:
# Wrap each list of sentences in a two-column frame: the raw line in "text"
# and a constant class label in "target" (1 = subjective, 0 = objective).
subj_df = pd.DataFrame(
    {
        "text": subjective,
        "target": 1,
    }
)
obj_df = pd.DataFrame(
    {
        "text": objective,
        "target": 0,
    }
)

In [None]:
# Stack the subjective rows on top of the objective rows and renumber the
# index from zero so every row has a unique position.
labelled_frames = [subj_df, obj_df]
data = pd.concat(labelled_frames, axis=0, ignore_index=True)

In [None]:
# Preview the first three rows of the combined frame.
data.head(n=3)

In [None]:
from nltk.tokenize import TreebankWordTokenizer

tokenizer = TreebankWordTokenizer()

# Lookup tables read by preprocess(). They are defined here, next to the
# function, so the notebook runs on a fresh kernel (Restart & Run All)
# instead of relying on variables left over from an earlier session.
# `string` and `stopwords` come from the notebook's import cell.
punct_table = str.maketrans("", "", string.punctuation)  # deletes ASCII punctuation
stop_words = set(stopwords.words("english"))  # set for O(1) membership tests


def preprocess(text):
    """Normalise one raw sentence into a space-separated string of tokens.

    Steps, in order:
      1. lowercase the text and delete every character in ``string.punctuation``;
      2. split it into tokens with the Treebank tokenizer;
      3. keep only purely alphabetic tokens that are not English stop words.

    Parameters
    ----------
    text : str
        The raw sentence.

    Returns
    -------
    str
        The surviving tokens joined by single spaces; an empty string when
        no token survives the filtering.
    """
    text = text.lower().translate(punct_table)
    tokens = tokenizer.tokenize(text)
    tokens = [word for word in tokens if word.isalpha() and word not in stop_words]
    return " ".join(tokens)

In [None]:
# Run preprocess() on every raw sentence and keep the result in a new column,
# leaving the original "text" column untouched.
data["clean_text"] = data["text"].map(preprocess)

In [None]:
# Binary bag-of-words: binary=True records only whether a term occurs in a
# sentence (1) or not (0), never how many times.
cleaned_sentences = data["clean_text"]
vectorizer = CountVectorizer(binary=True)
X = vectorizer.fit_transform(cleaned_sentences)

In [None]:
# Densify the term matrix into a DataFrame. Columns get the anonymous names
# f_1, f_2, ... (1-based) in place of the vocabulary terms.
feature_names = [f"f_{position}" for position in range(1, X.shape[1] + 1)]
X_df = pd.DataFrame(X.toarray(), columns=feature_names)

In [None]:
# Preview the first three rows of the feature matrix.
X_df.head(n=3)

In [None]:
# Sanity check: list the distinct values found anywhere in the feature
# matrix (flattened to one dimension first).
flat_values = X_df.values.ravel()
pd.unique(flat_values)

In [None]:
# Append the class label as the last column. The label's index is reset so
# it lines up positionally with the rows of the feature frame.
target_column = data["target"].reset_index(drop=True)
final_df = pd.concat([X_df, target_column], axis="columns")

In [None]:
# Preview the first three rows of features plus target.
final_df.head(n=3)

In [None]:
# Relative frequency of each class label in the full dataset
# (normalize=True gives shares that sum to 1 rather than raw counts).
class_shares = final_df["target"].value_counts(normalize=True)
print(class_shares)

In [None]:
# Build an imbalanced dataset by down-sampling class 1 and keeping all of
# class 0.
is_minority = final_df["target"].eq(1)
is_majority = final_df["target"].eq(0)
minority_df = final_df.loc[is_minority]
majority_df = final_df.loc[is_majority]

# Keep a reproducible 20% random sample of the class-1 rows.
minority_sample = minority_df.sample(frac=0.2, random_state=42)

# Sampled class-1 rows first, then every class-0 row, with a fresh index.
imbalanced_df = pd.concat([minority_sample, majority_df], ignore_index=True)

# NOTE: the rows are NOT shuffled here, so the frame stays grouped by class.
# To shuffle, enable the following line:
#imbalanced_df = imbalanced_df.sample(frac=1, random_state=42).reset_index(drop=True)

# Report the class shares after down-sampling.
imbalanced_shares = imbalanced_df["target"].value_counts(normalize=True)
print(imbalanced_shares)


In [None]:
# Preview the first three rows of the imbalanced dataset.
imbalanced_df.head(n=3)

In [None]:
# Save as zipped CSV (without the index column) next to the raw data.
# The file name is held in one variable so the confirmation message below
# always reports the file that was actually written.
output_name = "Final_dataset_subjectivity.csv.zip"
imbalanced_df.to_csv(path + output_name, index=False, compression="zip")

print(f"✅ Preprocessed dataset saved as '{output_name}'")