In [1]:
import string
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import pandas as pd
import numpy as np

# Download stopwords if not already done
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/ubuntu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
path = "/domino/datasets/local/CustomerSegmentation/public_datasets/scale/scaledata/"
author = "Scott+Renshaw"
output_path = "/domino/datasets/local/CustomerSegmentation/public_datasets/scale/"

In [3]:
# Set the path to the specific author's file
file_path = path + author + '/subj.' + author

# Read all lines (each line = one review)
with open(file_path, 'r', encoding='utf-8') as f:
    reviews = f.readlines()

# Optional: strip newline characters
reviews = [review.strip() for review in reviews]

# Example: print first 5 reviews
for i, review in enumerate(reviews[:5]):
    print(f"Review {i+1}:\n{review}\n")


Review 1:
i'm guessing -- and from the available evidence , it's not a great guess -- that burn hollywood burn began life as an insider satire of hollywood excess , stupidity , ego and power-mongering . if so , the film that ended up on the screen ranks as one of the most spectacularly ironic unintentional jokes in film history . that left a film satire in the hands of writer joe eszterhas , as humorless a hack as ever put finger to word processor . and you could just smell the disaster brewing . if you're unfortunate enough to sit through burn hollywood burn , you'll still be smelling that disaster long after the lights come up . it's bad enough that burn hollywood burn has -- literally -- not a single laugh for its entire , blissfully brief 84 minutes . what makes it even worse is eszterhas' insistence upon telling every bad joke not once , but twice or even three times . michael ovitz references , showgirls references , oral sex references , whoopi goldberg/ted danson references -- 

In [4]:
# Initialize tools
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

# Sample list: replace with your loaded reviews
# reviews = [...]

# Step 1: Text preprocessing
def preprocess_text(text):
    # Lowercase
    text = text.lower()
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Tokenize and remove stopwords
    tokens = text.split()
    tokens = [stemmer.stem(word) for word in tokens if word not in stop_words]
    return ' '.join(tokens)

# Apply preprocessing
preprocessed_reviews = [preprocess_text(review) for review in reviews]

# Step 2: Binary Bag-of-Words vectorization
vectorizer = CountVectorizer(binary=True)
X = vectorizer.fit_transform(preprocessed_reviews)

# Optional: Convert to DataFrame for easier inspection
import pandas as pd
X_df = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names())

In [5]:
X_df.shape

(902, 17443)

In [6]:
# Show first few rows
X_df.head(2)

Unnamed: 0,00,000,000yearold,007,05,10,100,100th,101,102,...,zoomin,zoomtocloseup,zord,zorro,zs,zsigmond,zucker,zuckerabrahamszuck,zweibel,zwick
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [7]:
## Rename the covariate space as f_1
X_df.columns = [f"f_{i+1}" for i in range(X_df.shape[1])]

In [8]:
## Get the unique values across the entire covariate space
unique_values = pd.unique(X_df.values.ravel())
print(unique_values)

[0 1]


In [9]:
# Replace with the correct path to the file
file_path = path + author + "/rating." + author

# Read the ratings into a list of floats
with open(file_path, 'r') as f:
    ratings = [float(line.strip()) for line in f]

# Optionally convert to a DataFrame
ratings_df = pd.DataFrame(ratings, columns=["rating"])

In [10]:
# Display first few rows
ratings_df.head(3)

Unnamed: 0,rating
0,0.0
1,0.0
2,0.0


In [11]:
ratings_df.shape

(902, 1)

In [12]:
# Create binary response variable based on threshold
ratings_df["target"] = (ratings_df["rating"] >= 0.8).astype(int)

# Print distribution of target classes
print("Counts:\n", ratings_df["target"].value_counts())
print("\nProportions:\n", ratings_df["target"].value_counts(normalize=True))


Counts:
 0    744
1    158
Name: target, dtype: int64

Proportions:
 0    0.824834
1    0.175166
Name: target, dtype: float64


In [13]:
# Combine features and target into a single DataFrame
final_df = pd.concat([X_df, ratings_df["target"].reset_index(drop=True)], axis=1)

In [14]:
final_df.head(3)

Unnamed: 0,f_1,f_2,f_3,f_4,f_5,f_6,f_7,f_8,f_9,f_10,...,f_17435,f_17436,f_17437,f_17438,f_17439,f_17440,f_17441,f_17442,f_17443,target
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [15]:
final_df.to_csv(output_path + author +".csv")