In [67]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import spacy
import json
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import roc_curve, auc, precision_recall_curve
import joblib 


In [68]:
# Download the NLTK 'punkt' and 'stopwords' resources; quiet=True is passed so
# the downloader is asked not to print progress into the cell output.
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)
# Shared spaCy pipeline, read as the global `nlp` by extract_features().
nlp = spacy.load('en_core_web_sm')

In [69]:
def load_data(json_file_path):
    """Load a JSON dataset from disk into a pandas DataFrame.

    Args:
        json_file_path: Path of the JSON file to read. Its parsed content is
            handed to ``pd.DataFrame`` unchanged.

    Returns:
        pd.DataFrame: The frame built from the parsed JSON.

    Raises:
        FileNotFoundError: If ``json_file_path`` does not exist. The problem
            is reported on stdout and the exception is re-raised so that the
            caller decides how to react; a helper must not terminate the
            interpreter (or the notebook kernel) on its own.
    """
    try:
        # JSON is UTF-8 by specification; do not depend on the platform's
        # locale encoding, which differs between operating systems.
        with open(json_file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except FileNotFoundError:
        print(f"Error: File '{json_file_path}' not found.")
        raise
    return pd.DataFrame(data)

In [70]:
# Create engineered features
def extract_features(text_series):
    """Build hand-crafted numeric features for every text in ``text_series``.

    The resulting columns, in order, are: basic length statistics, a
    question-mark flag, one ``starts_with_<word>`` flag per question word and
    per question verb (plus an aggregate flag for each group), a flag for
    summarization keywords, and spaCy-derived part-of-speech counts,
    first-token / imperative flags and a sentence count.

    The ``starts_with_<word>`` flags match *whole words* only: the text has to
    begin with the word followed by a word boundary. A plain prefix test would
    also fire for "Island ..." (is), "Document ..." (do) or "Whose ..." (who).

    Args:
        text_series: pandas Series of strings. Its index is reused for the
            returned frame.

    Returns:
        pd.DataFrame: One row per input text, indexed like ``text_series``.

    Notes:
        The spaCy part relies on the module-level ``nlp`` pipeline. Any
        exception raised while processing a single text is printed and that
        row keeps whatever values were already written (0 by default).
    """
    features = pd.DataFrame(index=text_series.index)  # Ensure indices match

    # Lower-case and strip each text once instead of once per keyword.
    normalized = text_series.apply(lambda x: x.lower().strip())
    word_lists = text_series.apply(lambda x: x.split())

    # 1. Basic text features
    features['text_length'] = text_series.apply(len)
    features['word_count'] = word_lists.apply(len)
    features['avg_word_length'] = word_lists.apply(
        lambda words: np.mean([len(w) for w in words]) if len(words) > 0 else 0
    )

    # 2. Question-related features
    features['has_question_mark'] = text_series.apply(lambda x: 1 if '?' in x else 0)

    def add_leading_word_flags(words):
        """Add one whole-word ``starts_with_<word>`` column per entry of ``words``."""
        columns = []
        for word in words:
            # re.match anchors at the start; \b rejects longer words that
            # merely share the prefix.
            pattern = re.compile(re.escape(word) + r'\b')
            column = f'starts_with_{word}'
            features[column] = normalized.apply(
                lambda text: 1 if pattern.match(text) else 0
            )
            columns.append(column)
        return columns

    # 3. Check for question words
    question_words = ['what', 'who', 'when', 'where', 'why', 'how', 'which', 'whose']
    question_word_columns = add_leading_word_flags(question_words)
    features['starts_with_question_word'] = features[question_word_columns].max(axis=1)

    # 4. Check for question-asking verbs
    question_verbs = ['can', 'could', 'would', 'will', 'should', 'is', 'are', 'do', 'does', 'did']
    question_verb_columns = add_leading_word_flags(question_verbs)
    features['starts_with_question_verb'] = features[question_verb_columns].max(axis=1)

    # 5. Check for summarization keywords (substring match, as some entries
    # are multi-word phrases)
    summarization_words = ['summarize', 'summary', 'summarization', 'condense', 'shorten',
                          'brief', 'overview', 'digest', 'recap', 'synopsis', 'tldr',
                          'key points', 'main points', 'highlight', 'gist', 'bullet']

    features['contains_summarization_word'] = normalized.apply(
        lambda text: 1 if any(word in text for word in summarization_words) else 0
    )

    # 6. NLP-based features using spaCy
    # Initialize columns for NLP features to ensure consistent shape
    counted_pos_tags = ['VERB', 'NOUN', 'ADJ', 'ADV', 'PRON', 'DET', 'ADP', 'NUM']
    nlp_feature_columns = counted_pos_tags + ['first_token_is_verb', 'has_imperative', 'sentence_count']
    for col in nlp_feature_columns:
        features[col] = 0

    # Process each text with spaCy
    for idx, text in text_series.items():
        try:
            doc = nlp(text[:5000])  # Limit to 5000 chars to avoid memory issues

            # Count different parts of speech
            pos_counts = dict.fromkeys(counted_pos_tags, 0)
            for token in doc:
                if token.pos_ in pos_counts:
                    pos_counts[token.pos_] += 1

            # Update feature dataframe with POS counts
            for pos, count in pos_counts.items():
                features.at[idx, pos] = count

            # Check if the first token is a verb (common in questions)
            first_is_verb = len(doc) > 0 and doc[0].pos_ == 'VERB'
            features.at[idx, 'first_token_is_verb'] = 1 if first_is_verb else 0

            # A leading verb with no nominal subject anywhere in the text is
            # treated as an imperative (command) - common in summarization requests
            has_imperative = 0
            if first_is_verb and not any(token.dep_ == 'nsubj' for token in doc):
                has_imperative = 1

            features.at[idx, 'has_imperative'] = has_imperative
            features.at[idx, 'sentence_count'] = len(list(doc.sents))

        except Exception as e:
            print(f"Error processing text at index {idx}: {str(e)}")
            # Keep default values (0) for this document

    return features

In [71]:
# Vectorize the text for bag-of-words features
def vectorize_text(train_texts, test_texts):
    """Fit count and TF-IDF vectorizers on the training texts and apply both.

    Each vectorizer is fitted on ``train_texts`` only and then used to
    transform ``test_texts``.

    Args:
        train_texts: Iterable of training documents.
        test_texts: Iterable of test documents.

    Returns:
        tuple: ``((train_counts, test_counts), (train_tfidf, test_tfidf),
        count_vectorizer, tfidf_vectorizer)``.
    """
    # Both vectorizers use the same settings: word unigrams and bigrams,
    # at most 500 features, English stop words removed.
    shared_settings = {
        'analyzer': 'word',
        'ngram_range': (1, 2),
        'max_features': 500,
        'stop_words': 'english',
    }

    # 1. Count Vectorizer (n-grams)
    count_vectorizer = CountVectorizer(**shared_settings)
    count_matrices = (
        count_vectorizer.fit_transform(train_texts),
        count_vectorizer.transform(test_texts),
    )

    # 2. TF-IDF Vectorizer
    tfidf_vectorizer = TfidfVectorizer(**shared_settings)
    tfidf_matrices = (
        tfidf_vectorizer.fit_transform(train_texts),
        tfidf_vectorizer.transform(test_texts),
    )

    return count_matrices, tfidf_matrices, count_vectorizer, tfidf_vectorizer

In [72]:
# Plot model performance comparison
def plot_model_comparison(results):
    """Save a bar chart of per-model accuracy to ``model_comparison.png``.

    Args:
        results: Mapping of model name to a dict that has an ``'accuracy'``
            entry.
    """
    fig, ax = plt.subplots(figsize=(12, 6))

    # One bar per model, in the mapping's iteration order
    model_names = list(results.keys())
    scores = [results[model_name]['accuracy'] for model_name in model_names]

    ax.bar(model_names, scores, color=['blue', 'green', 'red', 'purple'])
    ax.set_ylim(0, 1.0)
    ax.set_xlabel('Models')
    ax.set_ylabel('Accuracy')
    ax.set_title('Model Accuracy Comparison')

    # Write each accuracy just above its bar
    for bar_position, score in enumerate(scores):
        ax.text(bar_position, score + 0.01, f"{score:.4f}", ha='center')

    fig.tight_layout()
    fig.savefig('model_comparison.png')
    plt.close(fig)
    

In [73]:
def plot_feature_importance(feature_names, importances, title='Feature Importance'):
    """Save a horizontal bar chart of the 20 largest importances.

    The figure is written to ``feature_importance.png`` and then closed.

    Args:
        feature_names: Sequence of names, indexable by the positions of
            ``importances``.
        importances: Array of importance scores; it is indexed with an
            integer array, so it must support NumPy-style fancy indexing.
        title: Title drawn above the chart.
    """
    # Positions of the 20 largest scores, largest first
    descending_order = np.argsort(importances)[::-1]
    top_indices = descending_order[:20]
    top_features = [feature_names[position] for position in top_indices]
    top_importances = importances[top_indices]

    fig, ax = plt.subplots(figsize=(12, 8))
    bar_positions = range(len(top_features))
    ax.barh(bar_positions, top_importances, align='center')
    ax.set_yticks(bar_positions)
    ax.set_yticklabels(top_features)
    ax.set_xlabel('Importance')
    ax.set_title(title)
    fig.tight_layout()
    fig.savefig('feature_importance.png')
    plt.close(fig)

In [74]:
def train_classifier(json_file_path, test_size=0.2):
    """Train several text classifiers and return the most accurate one.

    Steps: load the dataset, make a stratified train/test split, build
    engineered features plus TF-IDF features, fit Logistic Regression,
    Multinomial Naive Bayes, a linear SVM and a Random Forest, report 5-fold
    cross-validation and test metrics for each, and keep the model with the
    highest test accuracy (the first one wins ties).

    Side effects: prints progress and metrics, pickles the fitted TF-IDF
    vectorizer to ``vectorizer.pkl`` and, when the winner is a Random Forest,
    writes ``feature_importance.png`` via ``plot_feature_importance``.

    Args:
        json_file_path: Path of the JSON dataset; it must provide the columns
            ``'input'`` (text) and ``'classification'`` (label).
        test_size: Fraction of the samples held out for testing.

    Returns:
        dict: ``'model'`` (best fitted estimator), ``'model_name'``,
        ``'accuracy'`` (its test accuracy), ``'feature_extractor'`` (callable
        taking a list of texts), ``'vectorizer'`` (fitted TF-IDF vectorizer)
        and ``'feature_names'`` (engineered names followed by TF-IDF terms).

    Raises:
        ValueError: If the dataset lacks the ``'classification'`` or
            ``'input'`` column.
    """
    # Load the data
    df = load_data(json_file_path)

    print(f"Dataset loaded with {len(df)} samples")

    # Validate the column before it is read, so a malformed dataset produces
    # the descriptive ValueError instead of a bare KeyError.
    if 'classification' not in df.columns:
        raise ValueError("The dataset doesn't have a 'classification' column")

    print(f"Class distribution: {df['classification'].value_counts().to_dict()}")

    # Check if 'input' column exists
    if 'input' not in df.columns:
        raise ValueError("The dataset doesn't have an 'input' column")

    # Split the data into train and test sets - ensure indices are reset
    texts = df['input'].reset_index(drop=True)
    labels = df['classification'].reset_index(drop=True)
    X_train, X_test, y_train, y_test = train_test_split(
        texts,
        labels,
        test_size=test_size,
        random_state=42,
        stratify=labels
    )

    print(f"Training set size: {len(X_train)}")
    print(f"Test set size: {len(X_test)}")

    # Verify X_train and X_test indices
    print(f"X_train index range: {X_train.index.min()} to {X_train.index.max()}")
    print(f"X_test index range: {X_test.index.min()} to {X_test.index.max()}")

    # Extract engineered features
    print("Extracting features...")
    X_train_features = extract_features(X_train)
    X_test_features = extract_features(X_test)

    #print the features names
    print(f"Feature names: {X_train_features.columns.tolist()}")

    # Verify dimensions of engineered features
    print(f"X_train_features shape: {X_train_features.shape}")
    print(f"X_train row count: {len(X_train)}")

    # Vectorize the text (only the TF-IDF matrices and vectorizer are used below)
    print("Vectorizing text...")
    _, (X_train_tfidf, X_test_tfidf), _, tfidf_vectorizer = vectorize_text(X_train, X_test)

    # Verify dimensions of vectorized text
    print(f"X_train_tfidf shape: {X_train_tfidf.shape}")

    # Verify that dimensions match before concatenation
    if len(X_train_features) != X_train_tfidf.shape[0]:
        print(f"WARNING: Dimension mismatch! X_train_features has {len(X_train_features)} rows, but X_train_tfidf has {X_train_tfidf.shape[0]} rows.")
        print("Adjusting X_train_features...")
        # Make sure they have same number of samples
        if len(X_train_features) > X_train_tfidf.shape[0]:
            X_train_features = X_train_features.iloc[:X_train_tfidf.shape[0]].copy()
        else:
            # This would be unusual, but handle it just in case
            X_train_tfidf = X_train_tfidf[:len(X_train_features), :]
            X_train = X_train.iloc[:len(X_train_features)]
            y_train = y_train.iloc[:len(X_train_features)]

    # Same check for test data
    if len(X_test_features) != X_test_tfidf.shape[0]:
        print(f"WARNING: Test dimension mismatch! X_test_features has {len(X_test_features)} rows, but X_test_tfidf has {X_test_tfidf.shape[0]} rows.")
        print("Adjusting X_test_features...")
        # Make sure they have same number of samples
        if len(X_test_features) > X_test_tfidf.shape[0]:
            X_test_features = X_test_features.iloc[:X_test_tfidf.shape[0]].copy()
        else:
            X_test_tfidf = X_test_tfidf[:len(X_test_features), :]
            X_test = X_test.iloc[:len(X_test_features)]
            y_test = y_test.iloc[:len(X_test_features)]

    # Final verification of dimensions
    print(f"Final X_train_features shape: {X_train_features.shape}")
    print(f"Final X_train_tfidf shape: {X_train_tfidf.shape}")
    print(f"Final y_train length: {len(y_train)}")

    # Convert sparse matrices to dense arrays for concatenation with other features
    print("Combining features and vectorized text...")
    X_train_combined = np.hstack([
        X_train_features.values,
        X_train_tfidf.toarray()
    ])

    X_test_combined = np.hstack([
        X_test_features.values,
        X_test_tfidf.toarray()
    ])

    print(f"X_train_combined shape: {X_train_combined.shape}")
    print(f"X_test_combined shape: {X_test_combined.shape}")

    # Train models and evaluate
    models = {
        'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42),
        'Naive Bayes': MultinomialNB(),
        'SVM': SVC(kernel='linear', probability=True, random_state=42),
        'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42)
    }

    best_model = None
    best_model_name = None
    best_accuracy = 0
    # NOTE(review): `results` is filled for every model but is neither returned
    # nor plotted by this function.
    results = {}
    classes = sorted(list(df['classification'].unique()))

    for name, model in models.items():
        print(f"\nTraining {name}...")
        model.fit(X_train_combined, y_train)

        # Cross-validation score
        cv_scores = cross_val_score(model, X_train_combined, y_train, cv=5)
        print(f"Cross-validation scores: {cv_scores}")
        print(f"Mean CV score: {np.mean(cv_scores):.4f}")

        # Predictions on test set
        y_pred = model.predict(X_test_combined)
        accuracy = accuracy_score(y_test, y_pred)

        print(f"{name} Accuracy: {accuracy:.4f}")
        print(classification_report(y_test, y_pred))

        cm = confusion_matrix(y_test, y_pred)
        print("Confusion Matrix:")
        print(cm)
        print("-" * 50)

        # Store results
        results[name] = {
            'accuracy': accuracy,
            'confusion_matrix': cm,
            'classes': classes
        }

        # For ROC curve and precision-recall curve (if applicable)
        if hasattr(model, "predict_proba"):
            y_prob = model.predict_proba(X_test_combined)

            # Store probability predictions
            results[name]['probabilities'] = y_prob

            # If binary classification
            if len(classes) == 2:
                # ROC curve
                fpr, tpr, _ = roc_curve(y_test == classes[1], y_prob[:, 1])
                roc_auc = auc(fpr, tpr)

                results[name]['fpr'] = fpr
                results[name]['tpr'] = tpr
                results[name]['auc'] = roc_auc

                # Precision-Recall curve
                precision, recall, _ = precision_recall_curve(y_test == classes[1], y_prob[:, 1])

                results[name]['precision'] = precision
                results[name]['recall'] = recall

        # NOTE(review): the winner is chosen by accuracy on the test set, so
        # the reported best accuracy is an optimistic estimate.
        # Remember the name together with the model; the first model is always
        # accepted so that a winner exists even if every accuracy is 0.
        if best_model is None or accuracy > best_accuracy:
            best_accuracy = accuracy
            best_model = model
            best_model_name = name

    print(f"\nBest model: {best_model_name}")
    print(f"Best accuracy: {best_accuracy:.4f}")

    joblib.dump(tfidf_vectorizer, "vectorizer.pkl")
    print("Vectorizer saved to vectorizer.pkl")

    # Column order of the combined matrix: engineered features, then TF-IDF terms
    feature_names = list(X_train_features.columns) + list(tfidf_vectorizer.get_feature_names_out())

    # Feature importance analysis if it's a Random Forest
    if isinstance(best_model, RandomForestClassifier):
        importances = best_model.feature_importances_

        print("\nTop 20 most important features:")
        indices = np.argsort(importances)[::-1]
        for i in range(min(20, len(feature_names))):
            print(f"{feature_names[indices[i]]}: {importances[indices[i]]:.4f}")

        # Plot feature importance
        plot_feature_importance(np.array(feature_names), importances)

    return {
        'model': best_model,
        'model_name': best_model_name,
        'accuracy': best_accuracy,
        'feature_extractor': lambda texts: extract_features(pd.Series(texts)),
        'vectorizer': tfidf_vectorizer,
        'feature_names': feature_names
    }


In [75]:
def predict(text, classifier_dict):
    """Classify a single text and report the model's confidence.

    Components are taken from ``classifier_dict`` when it provides them and
    are otherwise obtained the way the notebook persists them:

    * ``'feature_extractor'``: callable taking a list of texts; fallback is
      ``extract_features`` on a one-element Series.
    * ``'vectorizer'``: fitted vectorizer; fallback is ``vectorizer.pkl``.
    * ``'model'``: fitted estimator; fallback is ``classifier_model.pkl``.

    Args:
        text: The text to classify.
        classifier_dict: Dict as returned by ``train_classifier`` (entries may
            be missing, e.g. after the model has been popped), or ``None``.

    Returns:
        dict: ``'prediction'`` (the predicted label) and ``'confidence'``
        (largest class probability as a float, or ``None`` when the model has
        no ``predict_proba``).
    """
    components = classifier_dict or {}

    # Extract features
    feature_extractor = components.get('feature_extractor')
    if feature_extractor is not None:
        features = feature_extractor([text])
    else:
        features = extract_features(pd.Series([text]))

    # Vectorize
    vectorizer = components.get('vectorizer')
    if vectorizer is None:
        # NOTE: joblib.load unpickles the file - only load trusted artifacts.
        vectorizer = joblib.load("vectorizer.pkl")
    text_vectorized = vectorizer.transform([text])

    # Combine features in the same column order used for training:
    # engineered features first, then the vectorized text.
    X_combined = np.hstack([
        features.values,
        text_vectorized.toarray()
    ])

    model = components.get('model')
    if model is None:
        # NOTE: joblib.load unpickles the file - only load trusted artifacts.
        model = joblib.load("classifier_model.pkl")

    prediction = model.predict(X_combined)[0]

    confidence = None
    if hasattr(model, 'predict_proba'):
        proba = model.predict_proba(X_combined)[0]
        confidence = float(max(proba))

    return {
        'prediction': prediction,
        'confidence': confidence
    }


In [76]:

if __name__ == "__main__":
    # Train the classifier
    json_file_path = 'qa_summarization_dataset.json'
    print(f"Training classifier on {json_file_path}...")

    try:
        classifier_dict = train_classifier(json_file_path)

        # Persist the fitted model with joblib. It is popped out of the
        # dictionary, so from here on classifier_dict no longer has a 'model' entry.
        model = classifier_dict.pop("model", None)
        # Compare against None explicitly: the truth value of an estimator is
        # not a reliable "is there a model" test.
        if model is not None:
            joblib.dump(model, "classifier_model.pkl")  # Save the model to a .pkl file
            print("Model saved to classifier_model.pkl")

        print("\nTesting with examples:")
        test_examples = [
            "What is the capital of France?",
            "Can you summarize the key points from this article?",
            "Explain how photosynthesis works",
            "Create a brief summary of the meeting minutes",
            "What are the benefits of regular exercise?",
            "Would you mind creating a digest of these research findings?"
        ]

        for example in test_examples:
            prediction = predict(example, classifier_dict)
            print(f"Text: {example}")
            print(f"Prediction: {prediction['prediction']}")
            # predict() reports None when the model has no probability output
            if prediction['confidence'] is not None:
                print(f"Confidence: {prediction['confidence']:.4f}")
            print()

    except Exception as e:
        print(f"An error occurred: {e}")

Training classifier on qa_summarization_dataset.json...
Dataset loaded with 505 samples
Class distribution: {'Summarization': 255, 'QA': 250}
Training set size: 404
Test set size: 101
X_train index range: 0 to 504
X_test index range: 1 to 503
Extracting features...
Feature names: ['text_length', 'word_count', 'avg_word_length', 'has_question_mark', 'starts_with_what', 'starts_with_who', 'starts_with_when', 'starts_with_where', 'starts_with_why', 'starts_with_how', 'starts_with_which', 'starts_with_whose', 'starts_with_question_word', 'starts_with_can', 'starts_with_could', 'starts_with_would', 'starts_with_will', 'starts_with_should', 'starts_with_is', 'starts_with_are', 'starts_with_do', 'starts_with_does', 'starts_with_did', 'starts_with_question_verb', 'contains_summarization_word', 'VERB', 'NOUN', 'ADJ', 'ADV', 'PRON', 'DET', 'ADP', 'NUM', 'first_token_is_verb', 'has_imperative', 'sentence_count']
X_train_features shape: (404, 36)
X_train row count: 404
Vectorizing text...
X_train_

In [77]:
# Write an example text and predict its class.
# Relies on `classifier_dict` left in the kernel by the training cell above.
example = "the recent news of today"
prediction = predict(example, classifier_dict) 
print(f"Text: {example}")
print(f"Prediction: {prediction['prediction']}")   

Text: the recent news of today
Prediction: Summarization
