In [1]:
!pip install pandas numpy scikit-learn matplotlib seaborn joblib




In [2]:
# Cell 1: Import Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import LogisticRegression, Perceptron
from sklearn.svm import LinearSVC
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import hamming_loss, f1_score, precision_score
from sklearn.pipeline import Pipeline
import pickle
import os
import joblib
import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
# Cell 2: Load and Preprocess Data
import pandas as pd

def load_and_preprocess_data(file_path):
    """Load the defect-report CSV and split it into report text and labels.

    The CSV must contain a ``report`` column holding the free text; every
    column whose name starts with ``type_`` is treated as a label column.
    A short summary (shape, head, missing values, label distribution) is
    printed as a side effect.

    Args:
        file_path: Location of the CSV, in any form ``pandas.read_csv`` accepts.

    Returns:
        tuple: ``(X, y, label_columns)`` where ``X`` is the ``report`` column
        as a string Series (missing reports become empty strings), ``y`` is a
        DataFrame of the label columns, and ``label_columns`` lists their
        names in file order.

    Raises:
        KeyError: If the CSV has no ``report`` column.
    """
    print(f"Loading data from {file_path}...")

    df = pd.read_csv(file_path)

    print(f"Dataset shape: {df.shape}")
    print("\nFirst few rows:")
    print(df.head())

    # Report missing values on the raw frame, before any clean-up below.
    missing_values = df.isnull().sum()
    print("\nMissing values per column:")
    print(missing_values)

    # A missing report is read as NaN, which text vectorizers reject.
    # Replace it with an empty document so row alignment with y is kept.
    X = df['report'].fillna('').astype(str)
    label_columns = [col for col in df.columns if col.startswith('type_')]
    y = df[label_columns]

    print("\nLabel distribution:")
    total_count = len(y)
    for col in label_columns:
        positive_count = y[col].sum()
        # Guard the ratio so a header-only CSV prints 0.00% instead of nan.
        share = positive_count / total_count if total_count else 0.0
        print(f"{col}: {positive_count} positive ({share:.2%})")

    return X, y, label_columns

# Load the dataset once at notebook level so that X, y and label_columns are
# available for inspection in later cells. The path is hard-coded below;
# change it if your copy of the CSV lives somewhere else.
file_path = '/content/dataset.csv'
X, y, label_columns = load_and_preprocess_data(file_path)


Loading data from /content/dataset.csv...
Dataset shape: (1386, 8)

First few rows:
                                              report  type_blocker  \
0  The mention of Fix Super Stream Example in Doc...             0   
1  It seems like you need a concise summary relat...             0   
2  The issue AMQP 838 opened by Gary Russell invo...             0   
3  I m unable to access external content directly...             0   
4  In the discussion around AMQP 815 https jira s...             0   

   type_regression  type_bug  type_documentation  type_enhancement  type_task  \
0                0         1                   1                 0          0   
1                0         1                   1                 0          0   
2                0         1                   1                 0          0   
3                0         1                   1                 0          0   
4                0         1                   1                 0          0   

   type_

In [11]:
# Cell 3: Feature Engineering
def create_text_features(X_train, X_test=None):
    """Turn raw report text into TF-IDF features.

    The vectorizer is fitted on the training text only; the optional test
    text is transformed with that same fitted vocabulary.

    Args:
        X_train: Iterable of training documents.
        X_test: Optional iterable of held-out documents.

    Returns:
        ``(X_train_tfidf, vectorizer)`` when ``X_test`` is ``None``,
        otherwise ``(X_train_tfidf, X_test_tfidf, vectorizer)``.
    """
    vectorizer_settings = dict(
        max_features=5000,     # cap the vocabulary size
        min_df=2,              # minimum document frequency
        max_df=0.95,           # maximum document frequency
        stop_words='english',
        ngram_range=(1, 2),    # unigrams and bigrams
    )
    tfidf_vectorizer = TfidfVectorizer(**vectorizer_settings)

    # Fitting happens here, on the training split only.
    X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)

    if X_test is None:
        return X_train_tfidf, tfidf_vectorizer

    X_test_tfidf = tfidf_vectorizer.transform(X_test)
    return X_train_tfidf, X_test_tfidf, tfidf_vectorizer

In [12]:
# Cell 4: Model Building Functions
def train_logistic_regression(X_train, y_train):
    """Fit a multi-label logistic regression and return the fitted wrapper.

    Args:
        X_train: Training feature matrix, handed to ``fit`` unchanged.
        y_train: Training label matrix with one column per label.

    Returns:
        The fitted ``MultiOutputClassifier`` wrapping ``LogisticRegression``.
    """
    print("Training Logistic Regression model...")

    # The base estimator is wrapped so that it is fitted per label column.
    base_estimator = LogisticRegression(C=1.0, solver='liblinear', max_iter=1000)
    multilabel_lr = MultiOutputClassifier(base_estimator)
    multilabel_lr.fit(X_train, y_train)
    return multilabel_lr

def train_svm(X_train, y_train):
    """Fit a multi-label linear SVM and return the fitted wrapper.

    Args:
        X_train: Training feature matrix, handed to ``fit`` unchanged.
        y_train: Training label matrix with one column per label.

    Returns:
        The fitted ``MultiOutputClassifier`` wrapping ``LinearSVC``.
    """
    print("Training SVM model...")

    # The base estimator is wrapped so that it is fitted per label column.
    linear_svc = LinearSVC(C=1.0, max_iter=2000)
    multilabel_svm = MultiOutputClassifier(linear_svc)
    multilabel_svm.fit(X_train, y_train)
    return multilabel_svm

def train_perceptron(X_train, y_train, online_learning=False):
    """Train a multi-label Perceptron, optionally with an online-update hook.

    Args:
        X_train: Training feature matrix.
        y_train: Training label matrix with one column per label.
        online_learning: When true, the base Perceptron is created with
            ``warm_start=True`` and an ``update_model`` callable is returned
            alongside the model.

    Returns:
        The fitted ``MultiOutputClassifier`` when ``online_learning`` is
        false; otherwise the tuple ``(model, update_model)``.
    """
    print("Training Perceptron model...")

    # NOTE(review): alpha is a regularization strength; no penalty is set
    # here, so it presumably has no effect - confirm against the
    # scikit-learn Perceptron documentation.
    perceptron_kwargs = dict(alpha=0.0001, max_iter=1000)
    if online_learning:
        perceptron_kwargs['warm_start'] = True

    perceptron_model = MultiOutputClassifier(Perceptron(**perceptron_kwargs))
    perceptron_model.fit(X_train, y_train)

    if not online_learning:
        return perceptron_model

    def update_model(X_new, y_new):
        """Incrementally update every per-label estimator with new samples.

        Args:
            X_new: Feature matrix for the new samples.
            y_new: Label matrix (array-like or DataFrame) with one column
                per fitted label, in training order.

        Raises:
            ValueError: If ``y_new`` is not 2-D with one column per label.
        """
        # Labels usually arrive as a DataFrame, which does not support
        # positional [:, i] slicing; work on a plain array instead.
        y_array = np.asarray(y_new)
        n_labels = len(perceptron_model.estimators_)
        if y_array.ndim != 2 or y_array.shape[1] != n_labels:
            raise ValueError(
                f"y_new must be 2-D with {n_labels} label columns, "
                f"got shape {y_array.shape}"
            )
        for i, estimator in enumerate(perceptron_model.estimators_):
            estimator.partial_fit(X_new, y_array[:, i], classes=np.array([0, 1]))

    return perceptron_model, update_model

def train_dnn(X_train, y_train):
    """Train a two-hidden-layer MLP per label for multi-label classification.

    The hidden layer widths scale with the number of input features: up to
    200 units in the first layer and up to 100 in the second, and never
    fewer than one unit each.

    Args:
        X_train: Training feature matrix; ``X_train.shape[1]`` is read as
            the number of input features.
        y_train: Training label matrix with one column per label.

    Returns:
        The fitted ``MultiOutputClassifier`` wrapping ``MLPClassifier``.
    """
    print("Training Deep Neural Network model...")

    input_size = X_train.shape[1]
    # Clamp to at least one unit: with a single input feature the halved
    # second layer would otherwise be 0, which is not a valid layer size.
    hidden_layer_sizes = (
        max(1, min(200, input_size)),
        max(1, min(100, input_size // 2)),
    )

    dnn = MLPClassifier(
        hidden_layer_sizes=hidden_layer_sizes,
        activation='relu',
        solver='adam',
        alpha=0.0001,
        batch_size='auto',
        learning_rate='adaptive',
        max_iter=300,
        early_stopping=True,      # stop on a held-out validation score
        validation_fraction=0.1,
        verbose=True              # prints training progress
    )

    # The MLP is wrapped so that it is fitted per label column.
    dnn_model = MultiOutputClassifier(dnn)
    dnn_model.fit(X_train, y_train)

    return dnn_model


In [13]:
# Cell 5: Model Evaluation Functions
def evaluate_model(model, X_test, y_test, label_columns, model_name="Model"):
    """Score a fitted multi-label model and print a short report.

    Args:
        model: Fitted estimator exposing ``predict`` (and optionally
            ``predict_proba``).
        X_test: Held-out feature matrix.
        y_test: True labels, array-like or DataFrame, one column per label.
        label_columns: Label names, in the same order as the columns of
            ``y_test``.
        model_name: Display name used in the printed report and the result.

    Returns:
        dict: ``model_name``, ``hamming_loss``, ``micro_f1``, ``macro_f1``,
        ``has_probabilities`` and ``y_pred`` (the predictions as an array).
    """
    # The per-label slicing below needs positional [:, i] indexing, which
    # DataFrames do not support, so work on plain arrays throughout.
    y_true = np.asarray(y_test)
    y_pred = np.asarray(model.predict(X_test))

    hl = hamming_loss(y_true, y_pred)
    # zero_division=0 yields the same scores as the default while silencing
    # the undefined-metric warning, matching the per-label precision below.
    micro_f1 = f1_score(y_true, y_pred, average='micro', zero_division=0)
    macro_f1 = f1_score(y_true, y_pred, average='macro', zero_division=0)

    # Probe for probability support by calling predict_proba: depending on
    # the scikit-learn version, a wrapper around an estimator without it
    # fails either on attribute access or at call time, so hasattr() alone
    # is not reliable. Exception (not a bare except) keeps Ctrl-C working.
    try:
        model.predict_proba(X_test)
        has_probabilities = True
    except Exception:
        has_probabilities = False

    print(f"\n----- {model_name} Evaluation -----")
    print(f"Hamming Loss: {hl:.4f}")
    print(f"Micro-F1 Score: {micro_f1:.4f}")
    print(f"Macro-F1 Score: {macro_f1:.4f}")

    print("\nPer-class performance:")
    for i, label in enumerate(label_columns):
        class_f1 = f1_score(y_true[:, i], y_pred[:, i], average='binary', zero_division=0)
        class_precision = precision_score(y_true[:, i], y_pred[:, i], average='binary', zero_division=0)
        print(f"{label}: F1={class_f1:.4f}, Precision={class_precision:.4f}")

    return {
        'model_name': model_name,
        'hamming_loss': hl,
        'micro_f1': micro_f1,
        'macro_f1': macro_f1,
        'has_probabilities': has_probabilities,
        'y_pred': y_pred
    }

In [14]:
# Cell 6: Complete Training Pipeline
def train_and_evaluate_all_models(file_path, test_size=0.2, random_state=42):
    """Run the whole pipeline: load, split, vectorize, train, evaluate, save.

    Args:
        file_path: CSV path passed to ``load_and_preprocess_data``.
        test_size: Fraction of rows held out for evaluation.
        random_state: Seed for the train/test split.

    Returns:
        tuple: ``(vectorizer, label_columns, models)`` where ``models`` is
        ``(lr, svm, perceptron, online_perceptron, dnn)``.
    """
    X, y, label_columns = load_and_preprocess_data(file_path)

    # Drop labels that only ever take one value; they are removed from both
    # the label frame and the list of label names.
    single_value_cols = [col for col in y.columns if y[col].nunique() < 2]
    y = y.drop(columns=single_value_cols)
    label_columns = [col for col in label_columns if col not in single_value_cols]

    # Stratify on the first remaining label, but only when each of its
    # classes has at least two rows: a stratified split cannot place a
    # singleton class in both partitions and would raise instead.
    stratify_labels = None
    if y.shape[1] > 0:
        first_label = y.iloc[:, 0]
        if first_label.value_counts().min() >= 2:
            stratify_labels = first_label
        else:
            print(
                f"Skipping stratified split: '{first_label.name}' has a class "
                "with fewer than 2 samples."
            )

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state, stratify=stratify_labels
    )

    # The vectorizer is fitted on the training text only.
    X_train_features, X_test_features, vectorizer = create_text_features(X_train, X_test)

    lr_model = train_logistic_regression(X_train_features, y_train)
    svm_model = train_svm(X_train_features, y_train)
    perceptron_model = train_perceptron(X_train_features, y_train)
    dnn_model = train_dnn(X_train_features, y_train)

    # The online variant also returns an update hook, which is not used here.
    online_perceptron_model, _update_func = train_perceptron(
        X_train_features, y_train, online_learning=True
    )

    # Evaluate every model on the same held-out split, in report order.
    y_test_array = y_test.values
    models_in_report_order = [
        ("Logistic Regression", lr_model),
        ("SVM", svm_model),
        ("Perceptron", perceptron_model),
        ("Online Perceptron", online_perceptron_model),
        ("Deep Neural Network", dnn_model),
    ]
    results = [
        evaluate_model(model, X_test_features, y_test_array, label_columns, name)
        for name, model in models_in_report_order
    ]

    print("\n----- Model Comparison -----")
    comparison_df = pd.DataFrame([
        {
            'Model': r['model_name'],
            'Hamming Loss': r['hamming_loss'],
            'Micro-F1': r['micro_f1'],
            'Macro-F1': r['macro_f1']
        }
        for r in results
    ])
    print(comparison_df)

    # Persist the models together with the vectorizer and label names.
    save_models(lr_model, svm_model, perceptron_model, online_perceptron_model, dnn_model, vectorizer, label_columns)

    return vectorizer, label_columns, (lr_model, svm_model, perceptron_model, online_perceptron_model, dnn_model)

In [15]:
# Cell 7: Save and Load Models
def save_models(lr_model, svm_model, perceptron_model, online_perceptron_model, dnn_model, vectorizer, label_columns, model_dir='models'):
    """Persist the trained models, the vectorizer and the label names.

    Models and the vectorizer are written with ``joblib``; the label names
    are written with ``pickle``. File names are fixed (``lr_model.pkl``,
    ``svm_model.pkl``, ``perceptron_model.pkl``,
    ``online_perceptron_model.pkl``, ``dnn_model.pkl``, ``vectorizer.pkl``,
    ``label_columns.pkl``).

    Args:
        lr_model: Fitted logistic regression model.
        svm_model: Fitted SVM model.
        perceptron_model: Fitted perceptron model.
        online_perceptron_model: Fitted online perceptron model.
        dnn_model: Fitted neural network model.
        vectorizer: Fitted text vectorizer.
        label_columns: List of label names.
        model_dir: Target directory; created if missing. Defaults to
            ``'models'`` relative to the current working directory.
    """
    # exist_ok avoids the check-then-create race and is a no-op on re-runs.
    os.makedirs(model_dir, exist_ok=True)

    joblib_artifacts = {
        'lr_model.pkl': lr_model,
        'svm_model.pkl': svm_model,
        'perceptron_model.pkl': perceptron_model,
        'online_perceptron_model.pkl': online_perceptron_model,
        'dnn_model.pkl': dnn_model,
        'vectorizer.pkl': vectorizer,
    }
    for filename, artifact in joblib_artifacts.items():
        joblib.dump(artifact, os.path.join(model_dir, filename))

    # Label names stay in plain pickle format, as before.
    with open(os.path.join(model_dir, 'label_columns.pkl'), 'wb') as f:
        pickle.dump(label_columns, f)

    print("Models saved successfully.")

def load_models(model_dir='models'):
    """Load the artifacts written by ``save_models``.

    NOTE: a later cell in this notebook defines another ``load_models``
    (no parameters, five return values). Once that cell has run, it
    replaces this function in the notebook namespace.

    Args:
        model_dir: Directory holding the saved files. Defaults to
            ``'models'`` relative to the current working directory.

    Returns:
        tuple: ``(lr_model, svm_model, perceptron_model,
        online_perceptron_model, dnn_model, vectorizer, label_columns)``.
    """
    # SECURITY: joblib and pickle files can execute arbitrary code while
    # being loaded. Only load files that this notebook wrote itself.
    def _load(filename):
        return joblib.load(os.path.join(model_dir, filename))

    lr_model = _load('lr_model.pkl')
    svm_model = _load('svm_model.pkl')
    perceptron_model = _load('perceptron_model.pkl')
    online_perceptron_model = _load('online_perceptron_model.pkl')
    dnn_model = _load('dnn_model.pkl')

    vectorizer = _load('vectorizer.pkl')

    # Label names were written with plain pickle, so read them the same way.
    with open(os.path.join(model_dir, 'label_columns.pkl'), 'rb') as f:
        label_columns = pickle.load(f)

    return lr_model, svm_model, perceptron_model, online_perceptron_model, dnn_model, vectorizer, label_columns


In [17]:
# Cell 8: Main Execution
if __name__ == "__main__":
    # Entry point: run the complete train / evaluate / save pipeline on the
    # dataset at the path below.
    file_path = "/content/dataset.csv"  # Update with actual path
    vectorizer, label_columns, models = train_and_evaluate_all_models(file_path)

    print("\nTraining and evaluation complete!")
    # The prediction UI in this notebook is built with Gradio, so the hint
    # names Gradio rather than Streamlit.
    print("Run the Gradio app to make predictions.")

Loading data from /content/dataset.csv...
Dataset shape: (1386, 8)

First few rows:
                                              report  type_blocker  \
0  The mention of Fix Super Stream Example in Doc...             0   
1  It seems like you need a concise summary relat...             0   
2  The issue AMQP 838 opened by Gary Russell invo...             0   
3  I m unable to access external content directly...             0   
4  In the discussion around AMQP 815 https jira s...             0   

   type_regression  type_bug  type_documentation  type_enhancement  type_task  \
0                0         1                   1                 0          0   
1                0         1                   1                 0          0   
2                0         1                   1                 0          0   
3                0         1                   1                 0          0   
4                0         1                   1                 0          0   

   type_

In [18]:
!pip install gradio scikit-learn pandas numpy matplotlib seaborn joblib


Collecting gradio
  Downloading gradio-5.29.0-py3-none-any.whl.metadata (16 kB)
Collecting aiofiles<25.0,>=22.0 (from gradio)
  Downloading aiofiles-24.1.0-py3-none-any.whl.metadata (10 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Downloading fastapi-0.115.12-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.5.0-py3-none-any.whl.metadata (3.0 kB)
Collecting gradio-client==1.10.0 (from gradio)
  Downloading gradio_client-1.10.0-py3-none-any.whl.metadata (7.1 kB)
Collecting groovy~=0.1 (from gradio)
  Downloading groovy-0.1.2-py3-none-any.whl.metadata (6.1 kB)
Collecting pydub (from gradio)
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting python-multipart>=0.0.18 (from gradio)
  Downloading python_multipart-0.0.20-py3-none-any.whl.metadata (1.8 kB)
Collecting ruff>=0.9.3 (from gradio)
  Downloading ruff-0.11.8-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (25 kB)
Collecting safehttpx<0.2.0,>=0.1.6

In [21]:
!pip install gradio pandas numpy joblib plotly scikit-learn




In [23]:
import gradio as gr
import pandas as pd
import numpy as np
import joblib
import plotly.graph_objects as go
from sklearn.feature_extraction.text import TfidfVectorizer

# Load models and preprocessing components
def load_models():
    """Load the artifacts the prediction UI needs from ``/content/models``.

    Only the logistic regression, SVM and neural network models are loaded,
    plus the vectorizer and the label names.

    Returns:
        tuple: ``(lr_model, svm_model, dnn_model, vectorizer, label_columns)``.

    Raises:
        Exception: If any artifact fails to load; the message wraps the
            original error text.
    """
    # Order matters: it defines the order of the returned tuple.
    artifact_paths = (
        '/content/models/lr_model.pkl',
        '/content/models/svm_model.pkl',
        '/content/models/dnn_model.pkl',
        '/content/models/vectorizer.pkl',
        '/content/models/label_columns.pkl',
    )
    loaded = []
    try:
        for path in artifact_paths:
            loaded.append(joblib.load(path))
    except Exception as e:
        raise Exception(f"Error loading models: {e}")
    return tuple(loaded)

# Predict defect types
def _positive_class_probability(class_probs, classes=None):
    """Return the probability of class ``1`` from one label's probability row.

    Args:
        class_probs: Sequence of class probabilities for a single sample.
        classes: Optional class labels aligned with ``class_probs``. When
            omitted, the last entry is taken as the positive class.
    """
    if classes is None:
        return float(class_probs[-1])
    classes = list(classes)
    if 1 not in classes:
        # The estimator never saw a positive example for this label.
        return 0.0
    return float(class_probs[classes.index(1)])


def predict_defect_types(text, model, vectorizer, label_columns):
    """Predict defect types for one report and attach a confidence to each.

    Args:
        text: The defect report text.
        model: Fitted multi-label model exposing ``predict`` and optionally
            ``predict_proba``.
        vectorizer: Fitted vectorizer exposing ``transform``.
        label_columns: Label names in the order the model was trained on.

    Returns:
        list[dict]: One entry per label with keys ``label`` (name without
        the ``type_`` prefix), ``prediction`` (int) and ``probability``
        (float). When the model offers no probabilities, the 0/1 prediction
        is used as the probability.
    """
    X = vectorizer.transform([text])
    y_pred = model.predict(X)

    # Models without probability support fail with AttributeError on recent
    # scikit-learn and with ValueError on older releases.
    try:
        raw_proba = model.predict_proba(X)
    except (AttributeError, ValueError, NotImplementedError):
        raw_proba = None

    if raw_proba is None:
        # Fallback: use the hard 0/1 prediction as the confidence.
        y_prob = [float(p) for p in y_pred[0]]
    elif isinstance(raw_proba, (list, tuple)):
        # MultiOutputClassifier returns one (n_samples, n_classes) array per
        # label, so take this sample's row and pick the positive class.
        estimators = getattr(model, 'estimators_', None)
        y_prob = []
        for i, label_proba in enumerate(raw_proba):
            classes = None
            if estimators is not None:
                classes = getattr(estimators[i], 'classes_', None)
            y_prob.append(_positive_class_probability(label_proba[0], classes))
    else:
        # A single (n_samples, n_labels) array: one probability per label.
        y_prob = [float(p) for p in raw_proba[0]]

    results = []
    for i, lbl in enumerate(label_columns):
        results.append({
            'label': lbl.replace('type_', ''),
            'prediction': int(y_pred[0][i]),
            'probability': float(y_prob[i])
        })
    return results

# Confidence plot
def create_confidence_plot(results):
    """Build a horizontal bar chart of per-label confidence.

    Bars are ordered by ascending probability, coloured green for predicted
    labels and gray otherwise, and a dashed red line marks 0.5.

    Args:
        results: List of dicts with ``label``, ``prediction`` and
            ``probability`` keys.

    Returns:
        The plotly figure.
    """
    ordered = sorted(results, key=lambda item: item['probability'])

    bar_labels, bar_values, bar_colors, bar_text = [], [], [], []
    for item in ordered:
        bar_labels.append(item['label'])
        bar_values.append(item['probability'])
        bar_colors.append('green' if item['prediction'] == 1 else 'gray')
        bar_text.append(f"{item['probability']:.1%}")

    bars = go.Bar(
        x=bar_values, y=bar_labels, orientation='h',
        marker_color=bar_colors, text=bar_text, textposition='auto'
    )
    fig = go.Figure(bars)

    # Vertical reference line at 0.5, spanning all bars.
    threshold_style = dict(color='red', dash='dash')
    fig.add_shape(
        type='line', x0=0.5, x1=0.5, y0=-0.5, y1=len(bar_labels) - 0.5,
        line=threshold_style
    )

    fig.update_layout(
        title='Defect Type Confidence Scores',
        xaxis_title='Confidence', xaxis=dict(range=[0, 1]),
        height=400
    )
    return fig

# HTML table
def create_results_table(results):
    """Render the prediction results as an HTML table.

    Rows are ordered by descending probability and the probability is
    formatted as a percentage with one decimal.

    Args:
        results: List of dicts with ``label``, ``prediction`` and
            ``probability`` keys.

    Returns:
        str: HTML markup for the table, without the index column.
    """
    def _as_percent(value):
        return f"{value:.1%}"

    ranked = sorted(results, key=lambda item: item['probability'], reverse=True)
    display_names = {
        'label': 'Defect Type',
        'prediction': 'Predicted',
        'probability': 'Confidence',
    }
    table = pd.DataFrame(ranked).rename(columns=display_names)
    table['Confidence'] = table['Confidence'].map(_as_percent)
    return table.to_html(index=False)

# Main predict wrapper
def make_prediction(report_text, model_choice):
    """Gradio callback: classify one report with the chosen model.

    Args:
        report_text: Text entered by the user.
        model_choice: One of ``"Logistic Regression"``, ``"SVM"`` or
            ``"Deep Neural Network"``.

    Returns:
        tuple: ``(message, plot, table)``. On invalid input or on any
        failure, ``plot`` and ``table`` are ``None`` and ``message``
        explains the problem.
    """
    # Validate the input first so that an empty or whitespace-only report
    # does not trigger loading all the models from disk.
    if not report_text or not report_text.strip():
        return "Please enter a report.", None, None

    try:
        lr, svm, dnn, vec, lbls = load_models()
    except Exception as e:
        return f"⚠️ {e}", None, None

    model_map = {
        "Logistic Regression": lr,
        "SVM": svm,
        "Deep Neural Network": dnn
    }
    model = model_map.get(model_choice)
    if model is None:
        # Report an unknown choice instead of failing with a KeyError.
        return f"⚠️ Unknown model: {model_choice!r}", None, None

    # Report prediction/rendering problems in the UI, in the same style as
    # the model-loading failure above.
    try:
        results = predict_defect_types(report_text, model, vec, lbls)
        plot = create_confidence_plot(results)
        table = create_results_table(results)
    except Exception as e:
        return f"⚠️ Prediction failed: {e}", None, None

    preds = [r['label'] for r in results if r['prediction'] == 1]
    msg = "✅ Predicted: " + (", ".join(preds) if preds else "None")
    return msg, plot, table

# Build Gradio UI
# Two-column layout: inputs (model picker, report text, button) on the left,
# outputs (status message, confidence plot, results table) on the right.
with gr.Blocks() as demo:
    # NOTE(review): the heading text contains a stray apostrophe ("Zainab's'").
    gr.Markdown("## Zainab's' Software Defect Prediction")
    with gr.Row():
        with gr.Column(scale=2):
            # The choices must match the keys of model_map in make_prediction.
            model_choice = gr.Dropdown(
                ["Logistic Regression","SVM","Deep Neural Network"],
                value="Logistic Regression",
                label="Model"
            )
            report_text = gr.Textbox(
                lines=5, placeholder="Enter your defect report here…",
                label="Software Defect Report"
            )
            btn = gr.Button("Analyze", variant="primary")
        with gr.Column(scale=3):
            out_msg   = gr.Markdown()
            out_plot  = gr.Plot()
            out_table = gr.HTML()
    # Input order matches make_prediction(report_text, model_choice); output
    # order matches its (message, plot, table) return value.
    btn.click(make_prediction,
              inputs=[report_text, model_choice],
              outputs=[out_msg, out_plot, out_table])

# share=True gives you a public URL in Colab
# NOTE(review): anyone holding that URL can reach the app while it runs;
# drop share=True if the app should stay local.
demo.launch(share=True)


Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://e4f2ffa9093c7a6dc3.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


