In [None]:
## Rotten Tomatoes Data: subjective vs. objective sentences (binary bag-of-words, imbalanced export)

In [None]:
! pip install nltk

In [1]:
import os
import string
import zipfile

import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
# Download NLTK resources if not already present.
# force=True was dropped: it re-downloaded and re-unzipped both packages on
# every run, contradicting "if not already present". Without it,
# nltk.download() skips packages that are already installed and up to date.
# (nltk itself is imported in the imports cell at the top of the notebook.)
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /home/ubuntu/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /home/ubuntu/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [3]:
# Directory that holds the raw subjectivity files and receives the processed
# parquet file. Defaults to the original location; set SUBJECTIVITY_DATA_DIR
# to point the notebook at another copy without editing code.
# os.path.join(..., "") guarantees a trailing separator, which the other cells
# rely on because they build file names by plain concatenation (path + "name").
path = os.path.join(
    os.environ.get(
        "SUBJECTIVITY_DATA_DIR",
        "/repos/smote_msfb/public_datasets/subjectivity/",
    ),
    "",
)

In [4]:
# Read the two raw corpora, decoded as latin1.
# Iterating a text file yields one string per line (trailing newline kept),
# so each list holds one entry per line of its file.
with open(path + "quote.tok.gt9.5000", mode="r", encoding="latin1") as f_subj:
    subjective = list(f_subj)

with open(path + "plot.tok.gt9.5000", mode="r", encoding="latin1") as f_obj:
    objective = list(f_obj)

In [5]:
# Wrap each list of lines in a DataFrame; the scalar label is broadcast to
# every row (1 = subjective, 0 = objective).
subj_df = pd.DataFrame(dict(text=subjective, target=1))
obj_df = pd.DataFrame(dict(text=objective, target=0))

In [6]:
# Stack the subjective rows on top of the objective rows and renumber the
# index from 0 so it is unique across the combined frame.
data = pd.concat(objs=(subj_df, obj_df), ignore_index=True)

In [7]:
# Preview the first three rows of the combined frame.
data.head(n=3)

Unnamed: 0,text,target
0,"smart and alert , thirteen conversations about...",1
1,"color , musical bounce and warm seas lapping o...",1
2,it is not a mass-market entertainment but an u...,1


In [11]:
from nltk.tokenize import TreebankWordTokenizer

# Module-level helpers are built once and reused by every preprocess() call.
tokenizer = TreebankWordTokenizer()
stop_words = set(stopwords.words("english"))
# Translation table that deletes every character in string.punctuation.
punct_table = str.maketrans("", "", string.punctuation)


def preprocess(text):
    """Normalise one raw sentence into a space-separated string of kept tokens.

    The text is lower-cased, every ``string.punctuation`` character is deleted,
    the result is tokenized with the Treebank tokenizer, and only tokens that
    are purely alphabetic and not in ``stop_words`` are kept.

    Args:
        text: The raw sentence as a ``str``.

    Returns:
        The surviving tokens joined by single spaces (may be the empty string).

    NOTE(review): punctuation is deleted rather than replaced by a space, so a
    hyphenated word such as "mass-market" becomes "massmarket". Stop-word
    entries that contain an apostrophe also cannot match, because apostrophes
    are removed before the comparison. Confirm both effects are intended.
    """
    stripped = text.lower().translate(punct_table)
    kept = []
    for token in tokenizer.tokenize(stripped):
        if not token.isalpha():
            continue
        if token in stop_words:
            continue
        kept.append(token)
    return " ".join(kept)

In [12]:
# Run preprocess() over every raw sentence and store the result in a new column.
data["clean_text"] = data["text"].map(preprocess)

In [13]:
# Binary bag-of-words: binary=True records presence/absence of each
# vocabulary term instead of its count.
vectorizer = CountVectorizer(binary=True)
X = vectorizer.fit_transform(raw_documents=data["clean_text"])

In [14]:
# Densify the sparse matrix and label the columns f_1 ... f_N (1-based),
# discarding the vocabulary terms as column names.
X_df = pd.DataFrame(
    data=X.toarray(),
    columns=[f"f_{idx}" for idx in range(1, X.shape[1] + 1)],
)

In [15]:
# Preview the first three rows of the feature matrix.
X_df.head(n=3)

Unnamed: 0,f_1,f_2,f_3,f_4,f_5,f_6,f_7,f_8,f_9,f_10,...,f_22403,f_22404,f_22405,f_22406,f_22407,f_22408,f_22409,f_22410,f_22411,f_22412
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [16]:
# Sanity check: list the distinct values present anywhere in the feature matrix.
pd.unique(X_df.to_numpy().ravel())

array([0, 1])

In [17]:
# Append the label column to the feature matrix (column-wise concat, aligned
# on the row index; the target index is reset so it starts at 0).
final_df = pd.concat(
    (X_df, data["target"].reset_index(drop=True)),
    axis="columns",
)

In [18]:
# Preview the first three rows of features plus label.
final_df.head(n=3)

Unnamed: 0,f_1,f_2,f_3,f_4,f_5,f_6,f_7,f_8,f_9,f_10,...,f_22404,f_22405,f_22406,f_22407,f_22408,f_22409,f_22410,f_22411,f_22412,target
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [19]:
# Show the share of rows in each class before any re-sampling.
print(final_df.loc[:, "target"].value_counts(normalize=True))

target
1    0.5
0    0.5
Name: proportion, dtype: float64


In [20]:
# Build an imbalanced dataset by under-sampling class 1.
# First separate the rows by label.
minority_df = final_df.loc[final_df["target"].eq(1)]
majority_df = final_df.loc[final_df["target"].eq(0)]

# Keep a reproducible 20% sample of class 1 (fixed random_state) ...
minority_sample = minority_df.sample(frac=0.2, random_state=42)

# ... and every class-0 row, renumbering the index from 0.
imbalanced_df = pd.concat((minority_sample, majority_df), ignore_index=True)

# Rows are not shuffled here: the sampled class-1 rows come first, followed by
# all class-0 rows. Shuffle downstream if row order matters.

# Report the resulting class proportions.
print(imbalanced_df.loc[:, "target"].value_counts(normalize=True))


target
0    0.833333
1    0.166667
Name: proportion, dtype: float64


In [21]:
# Preview the first three rows of the imbalanced dataset.
imbalanced_df.head(n=3)

Unnamed: 0,f_1,f_2,f_3,f_4,f_5,f_6,f_7,f_8,f_9,f_10,...,f_22404,f_22405,f_22406,f_22407,f_22408,f_22409,f_22410,f_22411,f_22412,target
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [22]:
# Force every column label to a plain str before writing the file.
imbalanced_df.columns = list(map(str, imbalanced_df.columns))

In [23]:
# Persist the imbalanced dataset in the same directory as the raw files.
# `path` already ends with a separator (the loading cell appends file names
# directly onto it), so the file name is appended without an extra "/" to
# avoid a "//" in the output path.
imbalanced_df.to_parquet(
    path + "processed_dataset.parquet",
    engine="pyarrow",
    compression="snappy",
    index=False
)