In [2]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_recall_fscore_support

# Seed NumPy's global RNG once so the np.random.choice draws made while
# building the synthetic dataset repeat on every fresh run
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

## 1. Create synthetic dataset

In [5]:
def generate_synthetic_data(num_samples=100, random_state=None):
    """
    Build a shuffled, labelled DataFrame of synthetic customer-feedback texts.

    Texts are drawn with replacement from ten fixed positive and ten fixed
    negative phrases, with a small punctuation variation applied to each draw.

    Args:
        num_samples (int): Number of rows to generate. Positive and negative
            rows are produced in pairs; when the count is odd, the one extra
            row is a positive sample. Values below 1 give an empty frame.
        random_state (int | None): Seed for a private NumPy RandomState used
            for both phrase selection and the final shuffle. When None (the
            default), phrases are drawn from NumPy's global RNG and the
            shuffle uses the fixed seed 42, so the result depends on any
            earlier ``np.random.seed`` call.

    Returns:
        pandas.DataFrame: Columns 'text' (feedback string) and 'label'
        (1 = good, 0 = bad), shuffled and re-indexed from 0.
    """
    # Sample positive (good) and negative (bad) feedback phrases
    positive_phrases = [
        "Great product, works perfectly!",
        "Excellent quality, very happy with purchase",
        "Fast delivery and good packaging",
        "Exactly as described, would buy again",
        "Highly recommend this item",
        "Perfect fit, love the design",
        "Works better than expected",
        "Good value for the price",
        "Customer service was helpful",
        "Arrived early, no issues"
    ]

    negative_phrases = [
        "Poor quality, broke immediately",
        "Not as described, very disappointed",
        "Late delivery and damaged package",
        "Waste of money, doesn't work",
        "Would not recommend to anyone",
        "Wrong size, difficult to return",
        "Defective product, needs replacement",
        "Overpriced for what you get",
        "Customer service was unhelpful",
        "Missing parts, incomplete order"
    ]

    # With no explicit seed, keep drawing from the global RNG (np.random exposes
    # the same .choice API) so existing seeded notebooks get unchanged output.
    rng = np.random if random_state is None else np.random.RandomState(random_state)
    shuffle_seed = 42 if random_state is None else random_state

    def make_positive():
        # Only the first positive phrase contains "!", so the variation
        # (drop it, or swap it for " :)") affects that phrase alone.
        base_phrase = rng.choice(positive_phrases)
        return base_phrase.replace("!", rng.choice(["", " :)", ""]))

    def make_negative():
        # Commas are kept, turned into "...", or removed.
        base_phrase = rng.choice(negative_phrases)
        return base_phrase.replace(",", rng.choice([",", "...", ""]))

    # Clamp at zero so a negative request still yields an empty frame
    # instead of tripping the odd-count branch below.
    pair_count, leftover = divmod(max(num_samples, 0), 2)

    samples = []
    labels = []

    for _ in range(pair_count):
        samples.append(make_positive())
        labels.append(1)  # 1 for good

        samples.append(make_negative())
        labels.append(0)  # 0 for bad

    if leftover:
        # Odd request: add one more row so exactly num_samples rows come back.
        samples.append(make_positive())
        labels.append(1)

    # Convert to DataFrame
    df = pd.DataFrame({'text': samples, 'label': labels})

    # Shuffle the dataset so the strict good/bad alternation is not visible
    return df.sample(frac=1, random_state=shuffle_seed).reset_index(drop=True)

# Build the 100-row synthetic feedback frame that the later cells split and vectorize
df = generate_synthetic_data(num_samples=100)

## 2. Preprocess text using TfidfVectorizer

In [8]:
# TF-IDF settings: lowercase the text, drop scikit-learn's built-in English
# stop words, and cap the vocabulary at 300 terms.
# NOTE(review): the built-in English stop list appears to include negations
# such as "not" -- confirm that dropping them is acceptable for sentiment text.
# NOTE(review): the recorded run below reports 56 features, so the 300-term
# cap does not look like it is reached on this dataset.
tfidf_settings = {
    'stop_words': 'english',
    'lowercase': True,
    'max_features': 300,
}
vectorizer = TfidfVectorizer(**tfidf_settings)

## 3. Split dataset into training (75%) and testing (25%) sets

In [11]:
# Hold out 25% of the rows for evaluation; the fixed seed keeps the split repeatable
corpus = df['text']
targets = df['label']
X_train, X_test, y_train, y_test = train_test_split(
    corpus, targets, random_state=42, test_size=0.25
)

# Fit the vectorizer on the training texts only, then reuse that fitted
# vectorizer to transform the held-out texts
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

## 4. Train Logistic Regression model and evaluate it on the test set

In [14]:
# Fit the classifier on the TF-IDF training matrix (fit returns the estimator itself)
model = LogisticRegression(random_state=42).fit(X_train_vec, y_train)

# Predict labels for the held-out texts
y_pred = model.predict(X_test_vec)

# Binary-averaged scores; the call returns four values and only the first
# three (precision, recall, F1) are reported here
scores = precision_recall_fscore_support(y_test, y_pred, average='binary')
precision, recall, f1 = scores[:3]

print(f"Precision: {precision:.3f}")
print(f"Recall: {recall:.3f}")
print(f"F1-score: {f1:.3f}")

Precision: 1.000
Recall: 1.000
F1-score: 1.000


## 5. Helper function: vectorize new text with the fitted TfidfVectorizer

In [17]:
def text_preprocess_vectorize(texts, vectorizer):
    """
    Preprocess and vectorize text samples using a fitted TfidfVectorizer.

    Args:
        texts (str | iterable of str): Text samples to process. A single
            string is treated as one document rather than as a sequence of
            characters.
        vectorizer (TfidfVectorizer): Fitted TfidfVectorizer instance. Only
            its ``transform`` method is used.

    Returns:
        scipy.sparse.csr_matrix: Vectorized feature matrix, i.e. whatever
        ``vectorizer.transform`` returns for the given texts.
    """
    if isinstance(texts, str):
        # A bare string is itself an iterable of characters; wrap it so it is
        # handled as exactly one document.
        texts = [texts]
    return vectorizer.transform(texts)

# Demonstrate the helper on two short texts, reusing the vectorizer fitted earlier
sample_texts = ["Great product", "Poor quality"]
vectorized_samples = text_preprocess_vectorize(
    texts=sample_texts, vectorizer=vectorizer
)
print("\nExample vectorization output shape:", vectorized_samples.shape)


Example vectorization output shape: (2, 56)
