词袋模型 (Bag-of-Words Model)

In [None]:
import pandas as pd
import warnings
# Suppress all warnings for cleaner output (note: this is a blanket filter, so it also hides scikit-learn convergence and fit-failure warnings)
warnings.filterwarnings('ignore')

# ---------------------- 1. Data Loading
# Read the training and test CSV files into DataFrames.
data_train = pd.read_csv('/Users/jujusmacbook/Documents/NLP_Lab/Data/train.csv')
data_test = pd.read_csv('/Users/jujusmacbook/Documents/NLP_Lab/Data/test.csv')

# Data exploration
# Row counts and column names are taken from the frames themselves instead of
# being hard-coded in the message, so the summary always matches the files.
print("Training Set Basic Info:")
print(f"Training set shape: {data_train.shape} ({len(data_train)} rows, columns: {list(data_train.columns)})")
# Normalised value counts show how balanced the 'sentiment' classes are.
print(f"Training set sentiment label distribution:\n{data_train['sentiment'].value_counts(normalize=True).round(4)}")
print("\nTest Set Basic Info:")
print(f"Test set shape: {data_test.shape} ({len(data_test)} rows, columns: {list(data_test.columns)})")

# ---------------------- 2. Data Preprocessing: Extract Text and Labels
# Review text and sentiment labels from the training frame.
train_sentences = data_train['review']
label = data_train['sentiment']  # sentiment labels (per the original notes: 1 = positive, 0 = negative)

# Test-set text must come from data_test, not from the training frame.
test_sentences = data_test['review']

# Concatenate training and test text; the combined corpus is what the
# Bag-of-Words vocabulary is fitted on further down.
sentences = pd.concat([train_sentences, test_sentences])
# Report the actual per-split counts rather than a hard-coded total.
print(f"\nCombined corpus size: {sentences.shape} ({len(train_sentences)} train + {len(test_sentences)} test rows)")

# ---------------------- 3. Text Feature Engineering: Bag-of-Words Model Construction
from sklearn.feature_extraction.text import CountVectorizer

# Load the stopword list (one entry per line). The context manager guarantees
# the file handle is closed, and blank lines / surrounding whitespace are
# dropped because such entries could never match a token anyway.
with open('/Users/jujusmacbook/Documents/NLP_Lab/Data/stop_words.txt', encoding='utf-8') as stop_words_file:
    raw_stop_words = stop_words_file.read().splitlines()
stop_words = [word.strip() for word in raw_stop_words if word.strip()]
print(f"\nStopword list size: {len(stop_words)} stopwords (e.g., 'the', 'and', etc.)")

# Build the Bag-of-Words model (CountVectorizer).
co = CountVectorizer(
    analyzer='word',          # Tokenise text into words
    ngram_range=(1, 2),       # Keep unigrams and bigrams to retain some word-order information
    stop_words=stop_words,    # Filter out the stopwords loaded above
    max_features=5000         # Cap the vocabulary at the 5000 most frequent terms
)

# Fit the vocabulary on the combined train + test corpus.
# NOTE(review): because `sentences` includes test text, vocabulary selection
# sees data outside the training split — confirm this is intended.
co.fit(sentences)
print(f"Bag-of-Words model vocabulary size: {len(co.vocabulary_)} feature words (top 5000 high-frequency words retained)")

# ---------------------- 4. Train Set Splitting: Training Subset and Validation Subset ----------------------
from sklearn.model_selection import train_test_split

# Split the labelled data into a training subset (80%) and a validation subset
# (20%). test_size is passed explicitly because train_test_split otherwise
# defaults to a 75/25 split. stratify=label keeps the label distribution the
# same in both subsets; random_state makes the split reproducible.
# The validation subset is stored in x_test / y_test.
x_train, x_test, y_train, y_test = train_test_split(
    train_sentences, label, test_size=0.2, random_state=1234, stratify=label
)
print(f"\nSplit sizes: Training subset {x_train.shape}, Validation subset {x_test.shape}")

# Convert both subsets to dense Bag-of-Words count matrices with the fitted vectorizer.
x_train_bow = co.transform(x_train).toarray()  # Training-subset features
x_test_bow = co.transform(x_test).toarray()    # Validation-subset features
print(f"Bag-of-Words feature dimensions: Training subset {x_train_bow.shape}, Validation subset {x_test_bow.shape}")

# ---------------------- 5. Model Training: Baseline Model Comparison
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB

# 5.1 Logistic Regression with default settings, fitted on the training
# subset and scored (mean accuracy) on the validation subset.
lg1 = LogisticRegression().fit(x_train_bow, y_train)
lg1_acc = lg1.score(x_test_bow, y_test)
lg1_report = f"\nBaseline Logistic Regression (Bag-of-Words features): Validation set accuracy {lg1_acc:.4f}"
print(lg1_report)

# 5.2 Multinomial Naive Bayes on the same features, for comparison.
classifier = MultinomialNB().fit(x_train_bow, y_train)
nb_acc = classifier.score(x_test_bow, y_test)
nb_report = f"Naive Bayes Model (Bag-of-Words features): Validation set accuracy {nb_acc:.4f}"
print(nb_report)

# ---------------------- 6. Model Optimization: Logistic Regression Hyperparameter Grid Search
from sklearn.model_selection import GridSearchCV

# Hyperparameter search space: regularization strength C and primal/dual formulation.
param_grid = {
    'C': range(1, 10),        # Larger C = weaker regularization
    'dual': [True, False],    # Dual vs. primal formulation
}

# The dual formulation is only implemented for the liblinear solver with an L2
# penalty. With the default lbfgs solver every dual=True candidate fails to
# fit, so half of the grid would be scored as NaN instead of being evaluated.
# Selecting liblinear makes the whole grid valid.
lgGS = LogisticRegression(solver='liblinear')
# 3-fold cross-validation, using all CPU cores.
grid = GridSearchCV(lgGS, param_grid=param_grid, cv=3, n_jobs=-1)
# Search on the training subset only; the validation subset stays held out.
grid.fit(x_train_bow, y_train)

# Report the best hyperparameters and score the refitted best model on the validation subset.
print(f"\nGrid Search optimal hyperparameters: {grid.best_params_}")
lg_final = grid.best_estimator_  # Logistic Regression refitted with the best hyperparameters
lg_final_acc = lg_final.score(x_test_bow, y_test)
print(f"Optimal Logistic Regression (Bag-of-Words features): Validation set accuracy {lg_final_acc:.4f}")

# ---------------------- 7. Test Set Prediction: Generate Sentiment Labels
# Transform the test reviews with the same fitted vectorizer used for training.
test_X_bow = co.transform(data_test['review']).toarray()
# Counts in the messages are read from the arrays instead of being hard-coded.
print(f"\nTest set Bag-of-Words feature dimensions: {test_X_bow.shape} ({test_X_bow.shape[0]} rows for {len(data_test)} test reviews)")

# Predict test-set sentiment labels with the tuned Logistic Regression model.
predictions = lg_final.predict(test_X_bow)
print(f"Test set prediction result size: {predictions.shape} ({len(predictions)} sentiment labels, 0 = negative, 1 = positive)")

# ---------------------- 8. Generate Submission File
# Attach the predicted labels to the test frame as a 'sentiment' column.
data_test.loc[:, 'sentiment'] = predictions
# Keep only the 'id' and 'sentiment' columns for the submission.
final_data = data_test.loc[:, ['id', 'sentiment']]

# Show the first 5 rows to verify the format.
print("\nFirst 5 rows of final submission file:")
print(final_data.head())

# Save as CSV. The path is held in one variable so the confirmation message
# always reports the file that was actually written.
output_path = '/Users/jujusmacbook/Documents/NLP_Lab/Data/Juju_StudentID_predictions.csv'
final_data.to_csv(output_path, index=False)
print(f"\nSubmission file saved: {output_path}")