In [None]:
! pip install nltk

In [1]:
import string
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import pandas as pd
import numpy as np

# Download stopwords if not already done
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/ubuntu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
path = "/repos/smote_msfb/public_datasets/scale/scaledata/"
author = "Steve+Rhodes"
output_path = "/repos/smote_msfb/public_datasets/scale/"

In [3]:
# Set the path to the specific author's file
file_path = path + author + '/subj.' + author

# Read all lines (each line = one review)
with open(file_path, 'r', encoding='utf-8') as f:
    reviews = f.readlines()

# Optional: strip newline characters
reviews = [review.strip() for review in reviews]

# Example: print first 5 reviews
for i, review in enumerate(reviews[:2]):
    print(f"Review {i+1}:\n{review}\n")


Review 1:
this bit of lame physical humor is typical of disney's meet the deedles , a movie more to be endured that watched . and she's right , it is that bad . ) directed without any imagination by steve boyum , whose long background in film is mainly in stunts and in second unit direction , the film limps along at best . boyum attempts to keep the pace moving by staging stunts , stunts and more stunts . amazingly for someone with his background , he seems incapable of finding any fresh ones , and we have a car go off the road five different times -- maybe more . and then there is the script by james herzfeld , whose only other film , tapeheads from a decade ago , was so awful that it is considered a cult classic . meet the deedles , however , is painfully bad rather than laughably bad . it will probably be in and out of the theaters like a tornado and is in no danger of becoming a classic anything . herzfeld treats us to gratingly abysmal dialog that includes " your geyser's a geezer

In [4]:
# Initialize tools
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

# Sample list: replace with your loaded reviews
# reviews = [...]

# Step 1: Text preprocessing
def preprocess_text(text):
    # Lowercase
    text = text.lower()
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Tokenize and remove stopwords
    tokens = text.split()
    tokens = [stemmer.stem(word) for word in tokens if word not in stop_words]
    return ' '.join(tokens)

# Apply preprocessing
preprocessed_reviews = [preprocess_text(review) for review in reviews]

# Step 2: Binary Bag-of-Words vectorization
vectorizer = CountVectorizer(binary=True)
X = vectorizer.fit_transform(preprocessed_reviews)

# Optional: Convert to DataFrame for easier inspection
import pandas as pd
X_df = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())

In [5]:
X_df.shape

(1770, 17802)

In [6]:
# Show first few rows
X_df.head(2)

Unnamed: 0,00,000,000pound,007,00am,01,02,03,04,05,...,zsa,zsigmond,zuber,zurinaga,zweibel,zwick,zwigoff,zwiller,zylberstein,zzzzzzz
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [7]:
## Rename the covariate space as f_1
X_df.columns = [f"f_{i+1}" for i in range(X_df.shape[1])]

In [8]:
## Get the unique values across the entire covariate space
unique_values = pd.unique(X_df.values.ravel())
print(unique_values)

[0 1]


In [9]:
# Replace with the correct path to the file
file_path = path + author + "/rating." + author

# Read the ratings into a list of floats
with open(file_path, 'r') as f:
    ratings = [float(line.strip()) for line in f]

# Optionally convert to a DataFrame
ratings_df = pd.DataFrame(ratings, columns=["rating"])

In [10]:
# Display first few rows
ratings_df.head(3)

Unnamed: 0,rating
0,0.1
1,0.1
2,0.1


In [11]:
ratings_df.shape

(1770, 1)

In [12]:
# Create binary response variable based on threshold
ratings_df["target"] = (ratings_df["rating"] >= 0.8).astype(int)

# Print distribution of target classes
print("Counts:\n", ratings_df["target"].value_counts())
print("\nProportions:\n", ratings_df["target"].value_counts(normalize=True))


Counts:
 target
0    1483
1     287
Name: count, dtype: int64

Proportions:
 target
0    0.837853
1    0.162147
Name: proportion, dtype: float64


In [13]:
# Combine features and target into a single DataFrame
final_df = pd.concat([X_df, ratings_df["target"].reset_index(drop=True)], axis=1)

In [14]:
final_df.head(3)

Unnamed: 0,f_1,f_2,f_3,f_4,f_5,f_6,f_7,f_8,f_9,f_10,...,f_17794,f_17795,f_17796,f_17797,f_17798,f_17799,f_17800,f_17801,f_17802,target
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [15]:
# Drop columns where all values are 0
final_df = final_df.loc[:, (final_df != 0).any(axis=0)]

In [17]:
# final_df.to_csv(output_path + author +".csv")

# Save in Parquet format (using pyarrow)
final_df.to_parquet(
    output_path + author + ".parquet", 
    engine="pyarrow", 
    compression="snappy", 
    index=False
)