In [None]:
! pip install nltk

In [2]:
import os
import tarfile
import pandas as pd

In [3]:
# Path to your downloaded .tar.gz file
file_path = "/repos/smote_msfb/public_datasets/polarity/review_polarity.tar.gz"

In [4]:
# Extract the archive
with tarfile.open(file_path, "r:gz") as tar:
    tar.extractall(path="./polarity_dataset")

In [5]:
# Folder paths
pos_folder = "./polarity_dataset/txt_sentoken/pos/"
neg_folder = "./polarity_dataset/txt_sentoken/neg/"

In [6]:
# Read all files and assign sentiment label
def read_reviews(folder, label):
    reviews = []
    for file in os.listdir(folder):
        with open(os.path.join(folder, file), encoding="utf-8") as f:
            text = f.read()
            reviews.append((text, label))
    return reviews

In [7]:
# Load both positive and negative reviews
pos_reviews = read_reviews(pos_folder, 1)
neg_reviews = read_reviews(neg_folder, 0)

In [8]:
# Combine and shuffle
all_reviews = pos_reviews + neg_reviews
df = pd.DataFrame(all_reviews, columns=["review", "sentiment"])

In [9]:
df.shape

(2000, 2)

In [None]:
# Optional: Shuffle rows
# df = df.sample(frac=1, random_state=42).reset_index(drop=True)

In [10]:
# Show sample
df.head()

Unnamed: 0,review,sentiment
0,pitch black is a sheep in wolf's clothing . \n...,1
1,"voices . . . . . trey parker , matt stone , ge...",1
2,"along his carreer , mel gibson has collected s...",1
3,"when quentin tarantino made "" pulp fiction "" ,...",1
4,john cusack is the kind of actor who seems to ...,1


In [11]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

nltk.download("stopwords")
stop_words = set(stopwords.words("english"))
stemmer = PorterStemmer()

def clean_text(text):
    text = text.lower()
    text = re.sub(r"[^a-z\s]", "", text)  # Remove punctuation/numbers
    tokens = text.split()
    tokens = [stemmer.stem(word) for word in tokens if word not in stop_words]
    return " ".join(tokens)

df["clean_review"] = df["review"].apply(clean_text)

[nltk_data] Downloading package stopwords to /home/ubuntu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [12]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(binary=True, max_features=15000)  # or remove max_features for full vocabulary
X = vectorizer.fit_transform(df["clean_review"])
y = df["sentiment"].values

In [13]:
# === Step 3: Combine features and label into one DataFrame ===
final_data = pd.DataFrame(X.toarray(), columns=[f"f_{i}" for i in range(X.shape[1])])
final_data["target"] = y

In [14]:
# Identify covariate columns (start with "f_")
covariate_cols = [col for col in final_data.columns if col.startswith("f_")]

# Keep only covariate columns that are not all zero
nonzero_covariates = final_data[covariate_cols].loc[:, (final_data[covariate_cols] != 0).any(axis=0)]

# Combine with the target column
final_data_cleaned = pd.concat([nonzero_covariates, final_data["target"]], axis=1)

In [18]:
final_data.head(3)

Unnamed: 0,f_0,f_1,f_2,f_3,f_4,f_5,f_6,f_7,f_8,f_9,...,f_14991,f_14992,f_14993,f_14994,f_14995,f_14996,f_14997,f_14998,f_14999,target
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [17]:
# Drop columns where all values are 0
final_data = final_data.loc[:, (final_data != 0).any(axis=0)]

In [19]:
# Counts of each class
print("Counts:\n", final_data["target"].value_counts())

# Proportions of each class
print("\nProportions:\n", final_data["target"].value_counts(normalize=True))

Counts:
 target
1    1000
0    1000
Name: count, dtype: int64

Proportions:
 target
1    0.5
0    0.5
Name: proportion, dtype: float64


In [20]:
import os
import zipfile

In [21]:
# Separate majority and minority classes
majority_class = final_data[final_data["target"] == 1]
minority_class = final_data[final_data["target"] == 0]

# Define percentages
percentages = [0.10, 0.15, 0.20, 0.25, 0.30]

# Output directory (you can change this)
output_dir = "/repos/smote_msfb/public_datasets/polarity/"
os.makedirs(output_dir, exist_ok=True)

# Create and save each variant
for pct in percentages:
    n_samples = int(len(majority_class) * pct)
    minority_sample = minority_class.sample(n=n_samples, random_state=42)
    
    # Combine with full majority
    imbalanced_data = pd.concat([majority_class, minority_sample], ignore_index=True)
    
    # Shuffle
    imbalanced_data = imbalanced_data.sample(frac=1.0, random_state=42).reset_index(drop=True)
    
    # File path (Parquet instead of CSV)
    parquet_filename = f"imbalanced_{int(pct * 100)}pct.parquet"
    parquet_path = os.path.join(output_dir, parquet_filename)
    
    # Save Parquet file with compression
    imbalanced_data.to_parquet(
        parquet_path, 
        engine="pyarrow", 
        compression="snappy", 
        index=False
    )

print("All 5 imbalanced datasets saved as Parquet files (compressed).")


All 5 imbalanced datasets saved as Parquet files (compressed).


In [22]:
# Separate majority and minority classes
majority_class = final_data[final_data["target"] == 1]
minority_class = final_data[final_data["target"] == 0]

# Define percentages
percentages = [0.10, 0.15, 0.20, 0.25, 0.30]

# Output directory (you can change this)
output_dir = "/repos/smote_msfb/public_datasets/polarity/"
os.makedirs(output_dir, exist_ok=True)

# Create and save each variant
for pct in percentages:
    n_samples = int(len(majority_class) * pct)
    minority_sample = minority_class.sample(n=n_samples, random_state=42)
    
    # Combine with full majority
    imbalanced_data = pd.concat([majority_class, minority_sample], ignore_index=True)
    
    # Shuffle
    imbalanced_data = imbalanced_data.sample(frac=1.0, random_state=42).reset_index(drop=True)

    # File paths
    csv_filename = f"imbalanced_{int(pct * 100)}pct.csv"
    zip_filename = f"imbalanced_{int(pct * 100)}pct.zip"
    csv_path = os.path.join(output_dir, csv_filename)
    zip_path = os.path.join(output_dir, zip_filename)

    # Save CSV
    imbalanced_data.to_csv(csv_path, index=False)

    # Zip the file
    with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
        zipf.write(csv_path, arcname=csv_filename)

    # Remove the raw CSV to keep only zipped version
    os.remove(csv_path)

print("All 5 imbalanced datasets saved as zipped CSV files.")

All 5 imbalanced datasets saved as zipped CSV files.
