# Objective: Web Services Classification
## Our goal is to build a baseline model with at least 80% accuracy

## 1. Load Python Modules

In [None]:
# Basic Libraries
import re
import os
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Text Preprocessing
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer

# Feature Extraction
from sklearn.feature_extraction.text import TfidfVectorizer
from sentence_transformers import SentenceTransformer

# Encoding and Scaling
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Model Selection & Evaluation
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.utils import class_weight

# ML Models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

from tabulate import tabulate
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline

from sklearn.utils.class_weight import compute_class_weight
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, LSTM, Bidirectional, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.utils import to_categorical

# Ignore warnings
# NOTE(review): this silences every warning category for the whole session,
# including deprecation notices from the ML libraries used below.
warnings.filterwarnings('ignore')
# Apply the seaborn "whitegrid" theme to all subsequent plots.
sns.set(style="whitegrid")
# IPython magic: render matplotlib figures inline (only valid inside a notebook).
%matplotlib inline

# Setup directories
# "charts" receives the confusion-matrix PNGs, "results" the pickled metrics.
os.makedirs("charts", exist_ok=True)
os.makedirs("results", exist_ok=True)


## 2. Reading the Web Services dataset

In [None]:
# Load the web-services dataset from the data directory.
input_csv = "../data/Balanced_Top_50_Web_Services.csv"
df = pd.read_csv(input_csv)
print(df)

# Target column overview: number of distinct categories, then rows per category.
print(df["Grouped Category"].nunique())
print(df["Grouped Category"].value_counts())

## 3. Basic Inspection on given dataset

In [None]:
def basic_inspection_dataset(table):
    """Print a quick structural overview of a DataFrame.

    Sections printed, in order: first and last five rows, column names,
    shape, dtypes, ``DataFrame.info()``, per-column null counts and
    ``DataFrame.describe()``.

    Parameters
    ----------
    table : pandas.DataFrame
        Dataset to inspect. It is only read, never modified.

    Returns
    -------
    None
        Everything is written to standard output.
    """
    print("Top 5 Records of dataset")
    print(table.head())
    print()

    print("Bottom 5 Records of dataset")
    print(table.tail())
    print()

    print("Column/features/Variable  - Names of Given dataset")
    print(table.columns)
    print()

    print("Shape(rows x columns) - of Given dataset")
    print(table.shape)
    print()

    print("Data types - Given Column Names")
    print(table.dtypes)
    print()

    print("Summry of dataset")
    # info() writes its report itself and returns None, so wrapping it in
    # print() would emit a stray "None" line after the report.
    table.info()
    print()

    print("To see the count of null/nan values in columns of dataset")
    # Per-column null counts; value_counts() on the boolean frame would count
    # distinct row patterns instead, which is not what the heading announces.
    print(table.isnull().sum())
    print()

    print("Dataset Summary ")
    print(table.describe())
    print()
    
# Print the inspection report for the dataset loaded into `df`.
basic_inspection_dataset(df)

## 4. Handling Missing Values

In [None]:
# Missing values per column (notebook displays the resulting Series).
df.isna().sum()

## 5. Categorical Univariate Analysis

In [None]:

class BarPieChartTransformer(BaseEstimator, TransformerMixin):
    """Pass-through transformer that reports on categorical columns.

    For every object-dtype column of the input it prints a frequency table
    and a relative-frequency table, then shows a bar chart next to a pie
    chart. The data itself is not altered.
    """

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """Nothing to learn; return the transformer unchanged."""
        return self

    def transform(self, X):
        """Print tables and draw charts for each categorical column of ``X``.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame whose object-dtype columns are analysed.

        Returns
        -------
        pandas.DataFrame
            ``X`` itself, untouched, so the step can be chained in a
            ``Pipeline`` and ``fit_transform`` yields usable data rather
            than ``None``.
        """
        df = X.copy()
        # get cat columns
        cat_cols = df.select_dtypes(include='object').columns
        for cat_name in cat_cols:
            # Count once; the same Series feeds both the tables and the charts.
            counts = df[cat_name].value_counts()

            freq_table = counts.reset_index()
            # Rename the columns
            freq_table.columns = ['Class', 'Frequency']

            # Print the result as a table
            print(f"{cat_name} frequency table")
            print(tabulate(freq_table, headers='keys', tablefmt='pretty'))

            # Calculate relative frequency
            total_count = freq_table['Frequency'].sum()
            freq_table['Relative Frequency %'] = round((freq_table['Frequency'] / total_count)*100, 2)

            # Print the result as a table
            print(f"{cat_name} Relative frequency table")
            print(tabulate(freq_table, headers='keys', tablefmt='pretty'))

            values = counts.values
            labels = counts.index

            fig, axs = plt.subplots(1, 2, figsize=(12, 6))  # 1 row, 2 columns
            # Create a bar graph
            axs[0].bar(labels, values)
            axs[0].set_title(f'Frequency of {cat_name}')
            axs[0].set_xlabel('Categories')  # Set x-label
            axs[0].set_ylabel('Count')       # Set y-label
            # Long category names overlap when drawn horizontally.
            axs[0].tick_params(axis='x', rotation=90)

            axs[1].pie(values, labels=labels, autopct='%1.1f%%', startangle=140)
            axs[1].set_title(f'Relative Frequency of {cat_name}')
            plt.tight_layout()
            # Show the plot
            plt.show()

        return X
            
# Single-step pipeline wrapping the chart/report transformer.
pipeline_cat_var = Pipeline(steps=[('cat_univaraite_analysis', BarPieChartTransformer())])

# Run the univariate analysis on the two categorical columns of interest.
processed_data = pipeline_cat_var.fit_transform(
    df[["Service Classification", "Grouped Category"]]
)

## 6. Preprocessing Web Service Descriptions

In [None]:
# Download necessary NLTK resources
nltk.download('punkt')
# Recent NLTK releases load the tokenizer tables from 'punkt_tab' instead of
# 'punkt'; fetching both keeps word_tokenize working across versions.
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

# Initialize tools
# English stop words as a set for O(1) membership tests during filtering.
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    """Normalise one service description into a space-joined token string.

    Steps: lower-case, drop every character that is not a-z or whitespace,
    tokenize, remove English stop words, stem each remaining token and then
    lemmatize the stem.

    Parameters
    ----------
    text : object
        Raw description. Anything that is not a ``str`` (for example NaN)
        yields an empty string.

    Returns
    -------
    str
        Processed tokens joined by single spaces.
    """
    if not isinstance(text, str):
        return ""

    letters_only = re.sub(r'[^a-z\s]', '', text.lower())

    normalized = []
    for token in word_tokenize(letters_only):
        if token in stop_words:
            continue
        # Stem first, then lemmatize the stem (same order as the pipeline design).
        normalized.append(lemmatizer.lemmatize(stemmer.stem(token)))

    return " ".join(normalized)


# Store the cleaned text in a new column; the raw 'Service Description' is kept.
df['Processed Description'] = df['Service Description'].apply(preprocess_text)


## 7. Feature Extraction from Web Service Descriptions using TF-IDF and Sentence Transformers (all-MiniLM-L6-v2)

In [None]:
def vectorize_descriptions(service_list, max_features=1000):
    """Build a dense TF-IDF feature table from service descriptions.

    Parameters
    ----------
    service_list : iterable of str
        Descriptions to vectorize; the vectorizer is fitted on exactly
        these texts.
    max_features : int, default 1000
        Upper bound on the vocabulary size handed to ``TfidfVectorizer``.

    Returns
    -------
    pandas.DataFrame
        One row per description, one column per vocabulary term.
    """
    tfidf = TfidfVectorizer(max_features=max_features)
    dense_weights = tfidf.fit_transform(service_list).toarray()
    return pd.DataFrame(dense_weights, columns=tfidf.get_feature_names_out())


def embed_descriptions_with_sbert(services, model_name='all-MiniLM-L6-v2'):
    """Encode service descriptions with a SentenceTransformer model.

    Parameters
    ----------
    services : pandas.Series or list of str
        Descriptions to embed.
    model_name : str, default 'all-MiniLM-L6-v2'
        Name passed to ``SentenceTransformer``.

    Returns
    -------
    pandas.DataFrame
        One row per description and one column per embedding dimension.
        When ``services`` is a Series its index is reused for the rows.
    """
    is_series = isinstance(services, pd.Series)
    row_index = services.index if is_series else None
    # Hand the encoder a plain list: integer indexing on a Series is
    # label-based, so a Series with a non-default index (e.g. after filtering
    # or dropna) could be read out of order or raise KeyError inside encode().
    sentences = services.tolist() if is_series else services

    model = SentenceTransformer(model_name)
    embeddings = model.encode(sentences, show_progress_bar=True)

    return pd.DataFrame(embeddings, index=row_index)


In [None]:
# Size tag used only to build the output file names below.
n=50

# TF-IDF features of the preprocessed descriptions, written without the index.
tfidf_df = vectorize_descriptions(df['Processed Description'])
tfidf_output_csv = f"Processed_Top_{n}_Web_Services_TFIDF.csv"
tfidf_df.to_csv(tfidf_output_csv, encoding='utf-8', index=False, header=True)
print(f"TF-IDF features saved to: {tfidf_output_csv}")

# SBERT embeddings of the same preprocessed column, written the same way.
# NOTE(review): the embeddings are computed on the stemmed/stop-word-filtered
# text rather than the raw descriptions — confirm this is intended.
embedding_df = embed_descriptions_with_sbert(df['Processed Description'])
embedding_output_csv = f"Processed_Top_{n}_Web_Services_SBERT_Embeddings.csv"
embedding_df.to_csv(embedding_output_csv, encoding='utf-8', index=False, header=True)
print(f"SBERT embeddings saved to: {embedding_output_csv}")


## 8. Models

### 8.1 TF-IDF + SBERT


## 9. Web Service Classification using LogReg, Random Forest, XGBoost

In [None]:
# Accumulates one metrics dict per trained model; later cells append to it.
ml_dl_bert_model_results = []

In [None]:
def train_and_report_logistic(X_train, X_test, y_train, y_test, name="Model"):
    """Fit a logistic-regression classifier and report its test performance.

    Prints the classification report, draws a confusion-matrix heatmap that
    is saved under ``charts/`` and shown, then returns weighted metrics.

    Parameters
    ----------
    X_train, X_test : array-like
        Feature matrices for fitting and evaluation.
    y_train, y_test : array-like
        Matching class labels.
    name : str, default "Model"
        Used in the printed header, the plot title and the PNG file name.

    Returns
    -------
    dict
        Keys ``model`` (always ``"LogReg"``), ``accuracy``, ``precision``,
        ``recall`` and ``f1_score``; the last three are weighted averages.
    """
    classifier = LogisticRegression(max_iter=1000)
    classifier.fit(X_train, y_train)
    predictions = classifier.predict(X_test)

    print(f"\n{name} Classification Report:\n")
    print(classification_report(y_test, predictions))

    # Only classes present in the test labels appear on the matrix axes.
    observed_classes = np.unique(y_test)
    matrix = confusion_matrix(y_test, predictions, labels=observed_classes)
    print("\nConfusion Matrix:\n")

    # Heatmap
    plt.figure(figsize=(10, 7))
    sns.heatmap(
        matrix,
        annot=True,
        fmt="d",
        cmap="Blues",
        xticklabels=observed_classes,
        yticklabels=observed_classes,
    )
    plt.title(f"Confusion Matrix: {name}")
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.xticks(rotation=90)
    plt.yticks(rotation=0)
    plt.tight_layout()
    chart_file = f"charts/ml_confusion_matrix_{name.replace(' ', '_')}.png"
    plt.savefig(chart_file)
    plt.show()

    weighted = {"average": 'weighted', "zero_division": 0}
    return {
        "model": "LogReg",
        "accuracy": accuracy_score(y_test, predictions),
        "precision": precision_score(y_test, predictions, **weighted),
        "recall": recall_score(y_test, predictions, **weighted),
        "f1_score": f1_score(y_test, predictions, **weighted),
    }


def train_and_report_random_forest(X_train, X_test, y_train, y_test, name="Model"):
    """Fit a class-weighted random forest and report its test performance.

    Balanced class weights are derived from ``y_train`` and handed to the
    forest. The function prints the classification report, draws a
    confusion-matrix heatmap that is saved under ``charts/`` and shown, then
    returns weighted metrics.

    Parameters
    ----------
    X_train, X_test : array-like
        Feature matrices for fitting and evaluation.
    y_train, y_test : array-like
        Matching class labels.
    name : str, default "Model"
        Used in the printed header, the plot title and the PNG file name.

    Returns
    -------
    dict
        Keys ``model`` (always ``"RF"``), ``accuracy``, ``precision``,
        ``recall`` and ``f1_score``; the last three are weighted averages.
    """
    # Compute class weights
    train_classes = np.unique(y_train)
    balanced = class_weight.compute_class_weight(
        class_weight='balanced', classes=train_classes, y=y_train
    )
    weight_by_class = dict(zip(train_classes, balanced))

    # Train model with class weights
    forest = RandomForestClassifier(
        n_estimators=200, class_weight=weight_by_class, random_state=42
    )
    forest.fit(X_train, y_train)
    predictions = forest.predict(X_test)

    print(f"\n {name} Classification Report:\n")
    print(classification_report(y_test, predictions))

    # Only classes present in the test labels appear on the matrix axes.
    observed_classes = np.unique(y_test)
    matrix = confusion_matrix(y_test, predictions, labels=observed_classes)
    print("\nConfusion Matrix:\n")

    # Heatmap
    plt.figure(figsize=(10, 7))
    sns.heatmap(
        matrix,
        annot=True,
        fmt="d",
        cmap="Blues",
        xticklabels=observed_classes,
        yticklabels=observed_classes,
    )
    plt.title(f"Confusion Matrix: {name}")
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.xticks(rotation=90)
    plt.yticks(rotation=0)
    plt.tight_layout()
    chart_file = f"charts/ml_confusion_matrix_{name.replace(' ', '_')}.png"
    plt.savefig(chart_file)
    plt.show()

    weighted = {"average": 'weighted', "zero_division": 0}
    return {
        "model": "RF",
        "accuracy": accuracy_score(y_test, predictions),
        "precision": precision_score(y_test, predictions, **weighted),
        "recall": recall_score(y_test, predictions, **weighted),
        "f1_score": f1_score(y_test, predictions, **weighted),
    }



# Module-level encoder used by train_and_report_xgboost; it is re-fitted on
# the training labels every time that function is called.
label_encoder = LabelEncoder()


def train_and_report_xgboost(X_train, X_test, y_train_text, y_test_text, name="Model"):
    """Fit a sample-weighted XGBoost classifier and report its test performance.

    Text labels are integer-encoded with the module-level ``label_encoder``
    (fitted on ``y_train_text``). Balanced class weights are turned into
    per-sample weights for fitting. The function prints the classification
    report on decoded labels, draws a confusion-matrix heatmap that is saved
    under ``charts/`` and shown, then returns weighted metrics.

    Parameters
    ----------
    X_train, X_test : array-like
        Feature matrices for fitting and evaluation.
    y_train_text, y_test_text : array-like of str
        Class names. Every test label must also occur in the training
        labels, otherwise ``label_encoder.transform`` raises.
    name : str, default "Model"
        Used in the printed header, the plot title and the PNG file name.

    Returns
    -------
    dict
        Keys ``model`` (always ``"XGB"``), ``accuracy``, ``precision``,
        ``recall`` and ``f1_score``; the last three are weighted averages.
    """
    # Encode class labels
    y_train = label_encoder.fit_transform(y_train_text)
    y_test = label_encoder.transform(y_test_text)

    # Compute class weights
    classes = np.unique(y_train)
    weights = class_weight.compute_class_weight(class_weight='balanced', classes=classes, y=y_train)
    class_weights_dict = dict(zip(classes, weights))
    # XGBoost takes weights per sample, so map each row's class to its weight.
    sample_weights = np.array([class_weights_dict[label] for label in y_train])

    # Train model
    model = XGBClassifier(n_estimators=200, random_state=42, use_label_encoder=False, eval_metric='mlogloss')
    model.fit(X_train, y_train, sample_weight=sample_weights)
    y_pred = model.predict(X_test)

    # Decode labels for reporting
    y_test_labels = label_encoder.inverse_transform(y_test)
    y_pred_labels = label_encoder.inverse_transform(y_pred)

    print(f"\n{name} Classification Report:\n")
    print(classification_report(y_test_labels, y_pred_labels))

    test_codes = np.unique(y_test)
    cm = confusion_matrix(y_test, y_pred, labels=test_codes)
    # Show class names on the axes (not the integer codes) so this heatmap
    # reads the same way as the LogReg and RF ones.
    tick_names = label_encoder.inverse_transform(test_codes)
    print("\nConfusion Matrix:\n")
    # Heatmap
    plt.figure(figsize=(10, 7))
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues",
                xticklabels=tick_names,
                yticklabels=tick_names)
    plt.title(f"Confusion Matrix: {name}")
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.xticks(rotation=90)
    plt.yticks(rotation=0)
    plt.tight_layout()
    plt.savefig(f"charts/ml_confusion_matrix_{name.replace(' ', '_')}.png")
    plt.show()

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
    recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
    f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)
    return {
        "model": "XGB",
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1_score": f1
    }


for n in [50]:
    # Reload the persisted features and the original labels for this size.
    tfidf_df = pd.read_csv(f"Processed_Top_{n}_Web_Services_TFIDF.csv")
    sbert_df = pd.read_csv(f"Processed_Top_{n}_Web_Services_SBERT_Embeddings.csv")
    original_df = pd.read_csv(f"../data/Balanced_Top_{n}_Web_Services.csv")
    labels = original_df['Grouped Category'].fillna("Unknown")

    # Column-wise concatenation: TF-IDF terms followed by SBERT dimensions.
    combined_df = pd.concat([tfidf_df, sbert_df], axis=1)

    X_train, X_test, y_train, y_test = train_test_split(
        combined_df, labels, test_size=0.2, random_state=42
    )

    # Same split for all three classical models, run in a fixed order.
    for trainer, prefix in (
        (train_and_report_logistic, "LogReg"),
        (train_and_report_random_forest, "RF"),
        (train_and_report_xgboost, "XGB"),
    ):
        result = trainer(
            X_train, X_test, y_train, y_test,
            name=f"{prefix} TFIDF + SBERT with {n} Web Services",
        )
        ml_dl_bert_model_results.append(result)
    


## 10. Web Service Classification using DL Model: BI-LSTM

In [None]:
def clean_text(text):
    """Strip URLs and non-letters from a description and lower-case it.

    Parameters
    ----------
    text : object
        Raw description. Anything that is not a ``str`` (for example NaN)
        yields an empty string, matching ``preprocess_text``.

    Returns
    -------
    str
        Lower-cased text in which every URL has been removed and every
        character outside a-z has been replaced by a single space.
    """
    if not isinstance(text, str):
        return ""
    # Match case-insensitively: the URL pass runs before lower-casing, so
    # "HTTP://..." would otherwise survive and leak into the vocabulary.
    without_urls = re.sub(r"http\S+", "", text, flags=re.IGNORECASE)
    return re.sub(r"[^a-zA-Z]", " ", without_urls.lower())


def build_dl_model(input_length, num_classes, model_type="rnn"):
    """Build and compile the stacked bidirectional-LSTM text classifier.

    Parameters
    ----------
    input_length : int
        Length of the padded token sequences fed to the embedding layer.
    num_classes : int
        Width of the softmax output layer.
    model_type : str, default "rnn"
        Accepted for interface compatibility; the value is not read and the
        same BiLSTM architecture is built regardless.

    Returns
    -------
    tensorflow.keras.models.Sequential
        Model compiled with categorical cross-entropy, Adam and accuracy.
    """
    layers = [
        # Vocabulary is capped at 10000 ids, each mapped to a 128-d vector.
        Embedding(input_dim=10000, output_dim=128, input_length=input_length),
        Bidirectional(LSTM(128, return_sequences=True)),
        Dropout(0.3),
        Bidirectional(LSTM(64)),
        Dense(128, activation='relu'),
        Dropout(0.4),
        Dense(num_classes, activation='softmax'),
    ]
    network = Sequential(layers)
    network.compile(
        loss='categorical_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )
    return network

def train_dl_model(df, model_type, name, max_len=150):
    """Train the deep-learning text classifier and report its test performance.

    Rows missing a description or category are dropped, descriptions are
    cleaned with ``clean_text``, tokenized (10000-word vocabulary) and padded
    to ``max_len``. Labels are one-hot encoded, the data is split 80/20 with
    stratification, and the model from ``build_dl_model`` is trained for 30
    epochs with balanced class weights (20% of the training part is held out
    for validation). Training curves and a confusion matrix are plotted, and
    the matrix is saved under ``charts/``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain "Service Description" and "Grouped Category". The
        caller's frame is not modified.
    model_type : str
        Forwarded to ``build_dl_model``.
    name : str
        Used for the confusion-matrix file name and as the ``model`` value
        of the returned dict.
    max_len : int, default 150
        Padded sequence length.

    Returns
    -------
    dict
        Keys ``model``, ``accuracy``, ``precision``, ``recall`` and
        ``f1_score``; the last three are weighted averages on the test split.
    """
    # Work on an explicit copy so the column assignment below never targets a
    # view of the caller's frame (avoids chained-assignment problems).
    df = df.dropna(subset=["Service Description", "Grouped Category"]).copy()
    df["Service Description"] = df["Service Description"].apply(clean_text)

    texts = df['Service Description'].values
    labels = df['Grouped Category'].values

    # The tokenizer vocabulary is fitted on all texts, before the split.
    tokenizer = Tokenizer(num_words=10000)
    tokenizer.fit_on_texts(texts)
    X_seq = tokenizer.texts_to_sequences(texts)
    X_pad = pad_sequences(X_seq, maxlen=max_len, padding='post')

    le = LabelEncoder()
    y_enc = le.fit_transform(labels)
    y_cat = to_categorical(y_enc)
    num_classes = y_cat.shape[1]

    X_train, X_test, y_train, y_test = train_test_split(X_pad, y_cat, test_size=0.2, random_state=42, stratify=y_cat)

    class_weights = compute_class_weight(class_weight='balanced', classes=np.unique(y_enc), y=y_enc)
    # Keras expects a {class_index: weight} mapping.
    class_weights = dict(enumerate(class_weights))

    model = build_dl_model(input_length=max_len, num_classes=num_classes, model_type=model_type)

    # Train and capture history
    history = model.fit(
        X_train, y_train,
        epochs=30,
        batch_size=32,
        validation_split=0.2,
        class_weight=class_weights,
        verbose=1
    )

    # Plotting training history
    plt.figure(figsize=(14, 6))

    # Loss plot
    plt.subplot(1, 2, 1)
    plt.plot(history.history['loss'], label='Train Loss')
    plt.plot(history.history['val_loss'], label='Val Loss')
    plt.title('Loss over Epochs')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.grid(True)
    plt.legend()

    # Accuracy plot
    plt.subplot(1, 2, 2)
    plt.plot(history.history['accuracy'], label='Train Accuracy')
    plt.plot(history.history['val_accuracy'], label='Val Accuracy')
    plt.title('Accuracy over Epochs')
    plt.xlabel('Epoch')
    plt.ylabel('Accuracy')
    plt.grid(True)
    plt.legend()

    plt.tight_layout()
    plt.show()

    y_pred = np.argmax(model.predict(X_test), axis=1)
    y_true = np.argmax(y_test, axis=1)
    class_ids = np.arange(num_classes)
    class_names = le.inverse_transform(class_ids)

    # Confusion matrix
    # Pin the label set: without it, a class missing from both y_true and
    # y_pred would shrink the matrix and misalign it with class_names.
    cm = confusion_matrix(y_true, y_pred, labels=class_ids)
    plt.figure(figsize=(10, 8))
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=class_names, yticklabels=class_names)
    plt.title("Confusion Matrix")
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.savefig(f"charts/dl_confusion_matrix_{name.replace(' ', '_')}.png")
    plt.show()

    # Classification report
    print("\nClassification Report:\n")
    # Same reason as above: target_names must line up with an explicit label list.
    print(classification_report(y_true, y_pred, labels=class_ids, target_names=class_names, zero_division=0))

    return {
        "model": name,
        "accuracy": accuracy_score(y_true, y_pred),
        "precision": precision_score(y_true, y_pred, average='weighted', zero_division=0),
        "recall": recall_score(y_true, y_pred, average='weighted', zero_division=0),
        "f1_score": f1_score(y_true, y_pred, average='weighted', zero_division=0)
    }


In [None]:
n=50
df = pd.read_csv(f"../data/Balanced_Top_{n}_Web_Services.csv")
#labels = original_df['Grouped Category'].fillna("Unknown")
name="BI-LSTM"
# model_type was never assigned before being passed below, which raised
# NameError; define it explicitly for the BiLSTM run.
model_type = "bilstm"
result = train_dl_model(df, model_type, name)
ml_dl_bert_model_results.append(result)

## 11. Summary

In [None]:
# pickle is not imported anywhere else in the notebook; import it here so the
# dump below does not raise NameError.
import pickle

print(ml_dl_bert_model_results)
# Save all results
with open("results/ml_dl_bert_model_results.pkl", "wb") as f:
    pickle.dump(ml_dl_bert_model_results, f)

In [None]:
import matplotlib.pyplot as plt
import pandas as pd

# Convert to DataFrame (one row per model, one column per metric)
df = pd.DataFrame(ml_dl_bert_model_results)

# Set plot style
plt.figure(figsize=(10, 6))
metrics = ['accuracy', 'precision', 'recall', 'f1_score']
bar_width = 0.2
x = range(len(df))

# Plot each metric as its own bar, offset within the model's group
for i, metric in enumerate(metrics):
    plt.bar([p + bar_width*i for p in x], df[metric], width=bar_width, label=metric.capitalize())

# Labeling
# Put each tick under the middle of its group of bars.
group_center = bar_width * (len(metrics) - 1) / 2
plt.xticks([p + group_center for p in x], df['model'])
plt.ylabel('Score')
# All four metrics live in [0, 1]; a fixed 0.7-0.8 window would clip or hide
# any model scoring outside that band (including the 80% target).
plt.ylim(0.0, 1.0)
plt.title('Model Comparison on 50 Web Services')
plt.legend()
plt.grid(axis='y', linestyle='--', alpha=0.7)

plt.tight_layout()
plt.show()
