In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
import re
import matplotlib.pyplot as plt
import pickle
import os
from datetime import datetime
from google.colab import drive

# Mount Google Drive
drive.mount('/content/drive')

class NeuralNetwork:
    def __init__(self, input_size, hidden_size, output_size, learning_rate=0.01):
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.learning_rate = learning_rate
        self.initialize_weights()

    def initialize_weights(self):
        self.weights1 = np.random.randn(self.input_size, self.hidden_size) * np.sqrt(2.0/self.input_size)
        self.weights2 = np.random.randn(self.hidden_size, self.output_size) * np.sqrt(2.0/self.hidden_size)
        self.bias1 = np.zeros((1, self.hidden_size))
        self.bias2 = np.zeros((1, self.output_size))

    def get_parameters(self):
        return {
            'weights1': self.weights1.copy(),
            'weights2': self.weights2.copy(),
            'bias1': self.bias1.copy(),
            'bias2': self.bias2.copy()
        }

    def set_parameters(self, parameters):
        self.weights1 = parameters['weights1'].copy()
        self.weights2 = parameters['weights2'].copy()
        self.bias1 = parameters['bias1'].copy()
        self.bias2 = parameters['bias2'].copy()

    def relu(self, x):
        return np.maximum(0, x)

    def relu_derivative(self, x):
        return np.where(x > 0, 1, 0)

    def softmax(self, x):
        exp_x = np.exp(x - np.max(x, axis=1, keepdims=True))
        return exp_x / np.sum(exp_x, axis=1, keepdims=True)

    def forward(self, X):
        self.layer1 = np.dot(X, self.weights1) + self.bias1
        self.layer1_activation = self.relu(self.layer1)
        self.layer2 = np.dot(self.layer1_activation, self.weights2) + self.bias2
        self.output = self.softmax(self.layer2)
        return self.output

    def backward(self, X, y, output):
        batch_size = X.shape[0]
        self.output_error = output - y

        self.layer2_delta = self.output_error
        self.layer1_error = np.dot(self.layer2_delta, self.weights2.T)
        self.layer1_delta = self.layer1_error * self.relu_derivative(self.layer1)

        # Add L2 regularization
        lambda_reg = 0.01
        self.weights2 -= self.learning_rate * (np.dot(self.layer1_activation.T, self.layer2_delta) / batch_size + lambda_reg * self.weights2)
        self.bias2 -= self.learning_rate * np.mean(self.layer2_delta, axis=0, keepdims=True)
        self.weights1 -= self.learning_rate * (np.dot(X.T, self.layer1_delta) / batch_size + lambda_reg * self.weights1)
        self.bias1 -= self.learning_rate * np.mean(self.layer1_delta, axis=0, keepdims=True)

    def calculate_loss(self, y_true, y_pred):
        return -np.mean(np.sum(y_true * np.log(y_pred + 1e-15), axis=1))

    def train_batch(self, X, y, batch_size=32):
        indices = np.random.permutation(len(X))
        total_loss = 0

        for start_idx in range(0, len(X), batch_size):
            batch_indices = indices[start_idx:start_idx + batch_size]
            X_batch = X[batch_indices]
            y_batch = y[batch_indices]

            output = self.forward(X_batch)
            self.backward(X_batch, y_batch, output)
            total_loss += self.calculate_loss(y_batch, output)

        return total_loss / (len(X) // batch_size)

def preprocess_text(text):
    # Convert to lowercase and remove special characters
    text = re.sub(r'[^a-zA-Z\s]', '', str(text).lower())
    return text

class ModelCheckpoint:
    def __init__(self, filepath, verbose=1):
        self.filepath = filepath
        self.verbose = verbose
        self.best_val_loss = float('inf')

    def save_checkpoint(self, model, val_loss, epoch):
        if val_loss < self.best_val_loss:
            self.best_val_loss = val_loss
            parameters = model.get_parameters()
            with open(self.filepath, 'wb') as f:
                pickle.dump(parameters, f)
            if self.verbose:
                print(f'\nEpoch {epoch}: Validation loss improved to {val_loss:.4f}, saving model to {self.filepath}')
            return True
        return False

def load_and_preprocess_data(train_path, test_path):
    # Load datasets
    train_df = pd.read_csv(train_path)
    test_df = pd.read_csv(test_path)

    # Combine title and description
    train_df['text'] = train_df['Title'] + ' ' + train_df['Description']
    test_df['text'] = test_df['Title'] + ' ' + test_df['Description']

    # Preprocess text
    train_df['text'] = train_df['text'].apply(preprocess_text)
    test_df['text'] = test_df['text'].apply(preprocess_text)

    # Convert class index to one-hot encoding
    label_encoder = LabelEncoder()
    train_labels = label_encoder.fit_transform(train_df['Class Index'] - 1)
    test_labels = label_encoder.transform(test_df['Class Index'] - 1)

    train_labels_onehot = np.eye(4)[train_labels]
    test_labels_onehot = np.eye(4)[test_labels]

    # TF-IDF Vectorization
    vectorizer = TfidfVectorizer(max_features=5000)
    X_train = vectorizer.fit_transform(train_df['text']).toarray()
    X_test = vectorizer.transform(test_df['text']).toarray()

    # Save vectorizer to Google Drive
    vectorizer_path = '/content/drive/MyDrive/ML Proj Dataset/tfidf_vectorizer.pkl'
    with open(vectorizer_path, 'wb') as f:
        pickle.dump(vectorizer, f)
    print(f"Vectorizer saved to: {vectorizer_path}")

    return X_train, train_labels_onehot, X_test, test_labels_onehot, label_encoder

def create_training_curves(train_losses, val_losses, output_dir):
    plt.figure(figsize=(10, 6))
    plt.plot(train_losses, label='Training Loss')
    plt.plot(val_losses, label='Validation Loss')
    plt.title('Training and Validation Loss over Epochs')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.legend()
    plt.grid(True)
    plt.savefig(os.path.join(output_dir, 'training_curves.png'))
    plt.close()

def main():
    # Create directory for model artifacts in Google Drive
    base_dir = '/content/drive/MyDrive/ML Proj Dataset'
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    output_dir = os.path.join(base_dir, f'model_artifacts_{timestamp}')
    os.makedirs(output_dir, exist_ok=True)

    # Load and preprocess data
    X_train, y_train, X_test, y_test, label_encoder = load_and_preprocess_data(
        os.path.join(base_dir, 'train.csv'),
        os.path.join(base_dir, 'test.csv')
    )

    # Split training data into train and validation sets
    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

    # Neural Network parameters
    input_size = X_train.shape[1]
    hidden_size = 256
    output_size = 4
    learning_rate = 0.001
    epochs = 100
    batch_size = 32
    early_stopping_patience = 10

    # Initialize neural network
    nn = NeuralNetwork(input_size, hidden_size, output_size, learning_rate)

    # Initialize checkpoint with updated path
    checkpoint = ModelCheckpoint(os.path.join(output_dir, 'best_model.pkl'))

    # Training loop with early stopping
    train_losses = []
    val_losses = []
    patience_counter = 0

    for epoch in range(epochs):
        # Training
        train_loss = nn.train_batch(X_train, y_train, batch_size)
        train_losses.append(train_loss)

        # Validation
        val_predictions = nn.forward(X_val)
        val_loss = nn.calculate_loss(y_val, val_predictions)
        val_losses.append(val_loss)

        # Save checkpoint if validation loss improves
        if checkpoint.save_checkpoint(nn, val_loss, epoch):
            patience_counter = 0
        else:
            patience_counter += 1

        # Print progress
        print(f'Epoch {epoch+1}/{epochs}, Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}')

        # Early stopping
        if patience_counter >= early_stopping_patience:
            print(f'\nEarly stopping triggered after {epoch+1} epochs')
            break

    # Create and save training curves
    create_training_curves(train_losses, val_losses, output_dir)

    # Load best model for evaluation
    with open(os.path.join(output_dir, 'best_model.pkl'), 'rb') as f:
        best_parameters = pickle.load(f)
    nn.set_parameters(best_parameters)

    # Evaluate on test set
    test_predictions = nn.forward(X_test)
    predicted_labels = np.argmax(test_predictions, axis=1)
    true_labels = np.argmax(y_test, axis=1)

    # Calculate and save metrics
    accuracy = accuracy_score(true_labels, predicted_labels)
    class_names = ['World', 'Sports', 'Business', 'Science']
    classification_rep = classification_report(true_labels, predicted_labels, target_names=class_names)

    # Save results to file
    results_file = os.path.join(output_dir, 'results.txt')
    with open(results_file, 'w') as f:
        f.write(f'Test Accuracy: {accuracy * 100:.2f}%\n\n')
        f.write('Classification Report:\n')
        f.write(classification_rep)

    print(f'\nTest Accuracy: {accuracy * 100:.2f}%')
    print('\nClassification Report:')
    print(classification_rep)
    print(f'\nAll model artifacts saved in: {output_dir}')

if __name__ == "__main__":
    main()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Vectorizer saved to: /content/drive/MyDrive/ML Proj Dataset/tfidf_vectorizer.pkl

Epoch 0: Validation loss improved to 1.3824, saving model to /content/drive/MyDrive/ML Proj Dataset/model_artifacts_20241125_082544/best_model.pkl
Epoch 1/100, Train Loss: 1.3845, Val Loss: 1.3824

Epoch 1: Validation loss improved to 1.3784, saving model to /content/drive/MyDrive/ML Proj Dataset/model_artifacts_20241125_082544/best_model.pkl
Epoch 2/100, Train Loss: 1.3804, Val Loss: 1.3784

Epoch 2: Validation loss improved to 1.3742, saving model to /content/drive/MyDrive/ML Proj Dataset/model_artifacts_20241125_082544/best_model.pkl
Epoch 3/100, Train Loss: 1.3762, Val Loss: 1.3742

Epoch 3: Validation loss improved to 1.3698, saving model to /content/drive/MyDrive/ML Proj Dataset/model_artifacts_20241125_082544/best_model.pkl
Epoch 4/100, Train Loss: 1.3719, Val Loss: 1.369

In [3]:
import pickle
import numpy as np
import re

class TextPredictor:
    def __init__(self, model_path, vectorizer_path):
        """
        Initialize the predictor with paths to the saved model and vectorizer.

        Args:
            model_path (str): Path to the saved model pickle file
            vectorizer_path (str): Path to the saved vectorizer pickle file
        """
        self.class_names = ['World', 'Sports', 'Business', 'Science']
        self.model = None
        self.vectorizer = None
        self.load_artifacts(model_path, vectorizer_path)

    def load_artifacts(self, model_path, vectorizer_path):
        """Load the saved model and vectorizer."""
        try:
            # Load the vectorizer
            with open(vectorizer_path, 'rb') as f:
                self.vectorizer = pickle.load(f)

            # Load the model parameters
            with open(model_path, 'rb') as f:
                model_params = pickle.load(f)

            # Recreate the neural network with loaded parameters
            input_size = model_params['weights1'].shape[0]
            hidden_size = model_params['weights1'].shape[1]
            output_size = model_params['weights2'].shape[1]

            self.model = NeuralNetwork(input_size, hidden_size, output_size)
            self.model.set_parameters(model_params)

            print("Model and vectorizer loaded successfully!")

        except Exception as e:
            raise Exception(f"Error loading model artifacts: {str(e)}")

    def preprocess_text(self, text):
        """Preprocess the input text."""
        # Convert to lowercase and remove special characters
        text = re.sub(r'[^a-zA-Z\s]', '', str(text).lower())
        return text

    def predict(self, text):
        """
        Predict the category of the input text.

        Args:
            text (str): Input text to classify

        Returns:
            dict: Prediction results containing category and confidence scores
        """
        try:
            # Preprocess the text
            processed_text = self.preprocess_text(text)

            # Transform text using the vectorizer
            text_vectorized = self.vectorizer.transform([processed_text]).toarray()

            # Get model predictions
            predictions = self.model.forward(text_vectorized)

            # Get the predicted class and probabilities
            predicted_class_idx = np.argmax(predictions[0])
            probabilities = predictions[0]

            # Create results dictionary
            results = {
                'predicted_category': self.class_names[predicted_class_idx],
                'confidence': float(probabilities[predicted_class_idx]),
                'probabilities': {
                    category: float(prob)
                    for category, prob in zip(self.class_names, probabilities)
                }
            }

            return results

        except Exception as e:
            raise Exception(f"Error making prediction: {str(e)}")

class NeuralNetwork:
    """Simplified version of the neural network for prediction only."""
    def __init__(self, input_size, hidden_size, output_size, learning_rate=0.01):
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.learning_rate = learning_rate
        self.initialize_weights()

    def initialize_weights(self):
        self.weights1 = np.zeros((self.input_size, self.hidden_size))
        self.weights2 = np.zeros((self.hidden_size, self.output_size))
        self.bias1 = np.zeros((1, self.hidden_size))
        self.bias2 = np.zeros((1, self.output_size))

    def set_parameters(self, parameters):
        self.weights1 = parameters['weights1'].copy()
        self.weights2 = parameters['weights2'].copy()
        self.bias1 = parameters['bias1'].copy()
        self.bias2 = parameters['bias2'].copy()

    def relu(self, x):
        return np.maximum(0, x)

    def softmax(self, x):
        exp_x = np.exp(x - np.max(x, axis=1, keepdims=True))
        return exp_x / np.sum(exp_x, axis=1, keepdims=True)

    def forward(self, X):
        layer1 = np.dot(X, self.weights1) + self.bias1
        layer1_activation = self.relu(layer1)
        layer2 = np.dot(layer1_activation, self.weights2) + self.bias2
        output = self.softmax(layer2)
        return output

In [6]:
# 1. First, set up your file paths correctly
model_path = "/content/drive/MyDrive/ML Proj Dataset/model_artifacts_20241125_082544/best_model.pkl"    # Replace with your model file path
vectorizer_path = "/content/drive/MyDrive/ML Proj Dataset/tfidf_vectorizer.pkl"  # Replace with your vectorizer file path

# 2. Create the predictor
predictor = TextPredictor(model_path, vectorizer_path)

# 3. Input your sentence and get prediction
sentence = """
While the synergy between our vertically integrated strategies continues to amplify market penetration,
the Q4 projections remain contingent on the elasticity of discretionary consumer spending amidst fluctuating interest rate policies.
"""  # Put your sentence here
result = predictor.predict(sentence)

# 4. See the results
print(f"\nText: {sentence}")
print(f"Predicted Category: {result['predicted_category']}")
print(f"Confidence: {result['confidence']:.2%}")
print("\nProbabilities for all categories:")
for category, prob in result['probabilities'].items():
    print(f"{category}: {prob:.2%}")

Model and vectorizer loaded successfully!

Text: 
While the synergy between our vertically integrated strategies continues to amplify market penetration, 
the Q4 projections remain contingent on the elasticity of discretionary consumer spending amidst fluctuating interest rate policies.

Predicted Category: Business
Confidence: 38.42%

Probabilities for all categories:
World: 16.87%
Sports: 14.65%
Business: 38.42%
Science: 30.06%


In [None]:
import os

# Check the folder in Google Drive
output_dir = '/content/drive/MyDrive/model_artifacts_20231125_153015'
if os.path.exists(output_dir):
    print(f'Folder found: {output_dir}')
else:
    print(f'Folder not found: {output_dir}')


Folder not found: /content/drive/MyDrive/model_artifacts_20231125_153015
