In [1]:
%run data_preprocessing.ipynb
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder
# Step 1: Label Encoding
def encode_labels(data):
    label_encoder = LabelEncoder()
    data['label_encoded'] = label_encoder.fit_transform(data['label']) 
    return data

# Step 2: Split the data into features (X) and labels (y), then into training and testing sets
def split_data(data):
    """
    Split the data into features (X) and target labels (y), then into training and testing sets.
    :param data: DataFrame with document vectors and labels.
    :return: Split data: X_train, X_test, y_train, y_test.
    """
    # Features: Document vectors (X)
    X = list(data['document_vector'])
    
    # Target: Encoded labels (y)
    y = data['label_encoded']
    
    # Split into training (80%) and testing (20%) sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    return X_train, X_test, y_train, y_test

# Step 3: Train a Logistic Regression Classifier for multi-class classification
def train_classifier(X_train, y_train):
    """
    Train a Logistic Regression model for multi-class classification.
    :param X_train: Training features (document vectors).
    :param y_train: Training labels (encoded labels).
    :return: Trained model.
    """
    model = LogisticRegression(max_iter=1000, multi_class='ovr')  # One-vs-Rest strategy
    model.fit(X_train, y_train)
    return model
    
# Step 4: Evaluate the trained model
def evaluate_classifier(model, X_test, y_test):
    """
    Evaluate the trained model on the test data.
    :param model: Trained model.
    :param X_test: Testing features (document vectors).
    :param y_test: True labels for the test data.
    """
    # Make predictions on the test set
    y_pred = model.predict(X_test)
    
    # Calculate and display accuracy
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Accuracy: {accuracy * 100:.2f}%")
    
    # Classification report (precision, recall, F1-score)
    print("Classification Report:")
    print(classification_report(y_test, y_pred, target_names=["spam", "ham", ]))

file_path = 'sms_spam.csv'
data = preprocess_and_vectorize(file_path)

data = encode_labels(data)

# Step 2: Encode the labels
data = encode_labels(data)

# Step 3: Split data into training and testing sets
X_train, X_test, y_train, y_test = split_data(data)

# Step 4: Train the classifier
model = train_classifier(X_train, y_train)

# Step 5: Evaluate the classifier
evaluate_classifier(model, X_test, y_test)


[nltk_data] Downloading package punkt to /Users/zat_km/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/zat_km/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/zat_km/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Accuracy: 91.39%
Classification Report:
              precision    recall  f1-score   support

        spam       0.91      1.00      0.95       965
         ham       0.98      0.37      0.53       150

    accuracy                           0.91      1115
   macro avg       0.95      0.68      0.74      1115
weighted avg       0.92      0.91      0.90      1115



