<a href="https://colab.research.google.com/github/bhagyaborade/my_projects/blob/multi-label_text_classification/multi_label_text_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import cross_val_score, cross_val_predict, StratifiedKFold, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, StackingClassifier
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt
import xgboost as xgb
import nltk
from nltk.corpus import stopwords
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import CountVectorizer

# Download the NLTK stopword corpus only when it is not already installed.
# nltk.data.find raises LookupError for a missing resource, so repeated or
# offline runs skip the network round-trip entirely.
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')

# Read data
# df = main_data_df.copy()


# Build a synthetic transactions table so the script runs without real data.
N_ROWS = 10000   # number of synthetic transactions to generate
RANDOM_SEED = 1  # fixed seed so every run produces the same table

# Free-text descriptions; one is sampled uniformly for each transaction.
descriptions = [
    'Travel expenses for business meeting',
    'Monthly utility bill',
    'Employee training program',
    'Maintenance and repairs',
    'Purchase of raw materials',
    'Office supplies',
    'Research and development costs',
    'IT infrastructure upgrade',
    'Marketing campaign expenses',
    'Consulting services',
]

# Target categories; one is sampled uniformly for each transaction.
labels = [
    'Raw Materials', 'Operational Costs', 'R&D', 'Miscellaneous', 'Marketing',
]

# A dedicated seeded Generator keeps the data reproducible without touching
# NumPy's global random state.
rng = np.random.default_rng(RANDOM_SEED)

# NOTE: 'Category' is drawn independently of 'Description' and 'Amount', so
# the features carry no information about the label. Any classifier trained
# on this table is expected to score around 1 / len(labels).
data = {
    'TransactionID': range(1, N_ROWS + 1),
    'Date': pd.date_range(start='1/1/2023', periods=N_ROWS),  # one row per day
    'Amount': rng.integers(1, 1000, size=N_ROWS),             # integers in [1, 999]
    'Description': rng.choice(descriptions, size=N_ROWS),
    'Category': rng.choice(labels, size=N_ROWS),
}

df = pd.DataFrame(data)


# Parse the date column; errors='coerce' turns unparseable values into NaT
# instead of raising.
df['Date'] = pd.to_datetime(df['Date'], errors='coerce')

# Feature Engineering: expose the calendar components as separate columns.
for column, part in (('Year', 'year'), ('Month', 'month'), ('Day', 'day')):
    df[column] = getattr(df['Date'].dt, part)

# Encode labels: replace each category string with its integer class id.
# The fitted encoder is kept so ids can be mapped back to names later.
label_encoder = LabelEncoder()
label_encoder.fit(df['Category'])
df['Category'] = label_encoder.transform(df['Category'])

# Text preprocessing: English stopwords, held in a set for fast membership tests.
stop_words = {*stopwords.words('english')}

def preprocess_text(text, stopword_set=None):
    """Lowercase *text* and remove stopwords.

    Args:
        text: Raw description. Non-string values (for example ``None`` or the
            NaN of a missing cell) are treated as an empty document instead
            of raising ``AttributeError``.
        stopword_set: Optional collection of lowercase words to drop.
            Defaults to the module-level ``stop_words``.

    Returns:
        The surviving whitespace-separated tokens joined by single spaces.
    """
    if not isinstance(text, str):
        return ''
    # Compare against None so an explicitly empty collection is honoured.
    excluded = stop_words if stopword_set is None else stopword_set
    return ' '.join(word for word in text.lower().split() if word not in excluded)

# Clean every description with preprocess_text.
df['Description'] = df['Description'].map(preprocess_text)

# Vectorize text data: fit TF-IDF on the cleaned descriptions.
vectorizer = TfidfVectorizer()
X_text = vectorizer.fit_transform(df['Description'])

# Scale the numerical features before joining them to the text features.
scaler = StandardScaler()
numeric_features = scaler.fit_transform(df[['Amount', 'Year', 'Month', 'Day']].values)

# Concatenate all features: dense TF-IDF columns first, scaled numeric columns last.
X = np.hstack((X_text.toarray(), numeric_features))

# Apply SMOTE to handle class imbalance
# NOTE(review): the vectorizer, scaler and SMOTE are all fitted on the full
# dataset here. If X_resampled is later cross-validated, held-out folds will
# have influenced the training data. Consider fitting them per fold instead.
smote = SMOTE(random_state=1)
X_resampled, y_resampled = smote.fit_resample(X, df['Category'])

# Labels
y = y_resampled

# Define base models, each paired with the short name used to identify it.
logreg_clf = LogisticRegression(max_iter=1000)
svc_clf = SVC(kernel='linear', probability=True)
rf_clf = RandomForestClassifier()
gb_clf = GradientBoostingClassifier()

base_models = [
    ('logreg', logreg_clf),
    ('svc', svc_clf),
    ('rf', rf_clf),
    ('gb', gb_clf),
]

# Define meta-model for stacking: it combines the base models' outputs.
meta_model = LogisticRegression(max_iter=1000)

# Create StackingClassifier from the base models and the meta-model.
stacked_model = StackingClassifier(
    estimators=base_models,
    final_estimator=meta_model,
)

# Cross-validation setup: 5 shuffled, stratified folds with a fixed seed.
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)

# Evaluate models
results = {}               # model name -> overall cross-validated accuracy
per_class_accuracies = {}  # model name -> {category name: per-class accuracy}

# Hyper-parameter grids for the models that are tuned before evaluation.
# Models without an entry are evaluated with their constructor settings.
param_grids = {
    'logreg': {'C': [0.01, 0.1, 1, 10, 100]},
    'svc': {'C': [0.01, 0.1, 1, 10, 100]},
    'rf': {'n_estimators': [50, 100, 200], 'max_depth': [None, 10, 20, 30]},
}

models = base_models + [('stacked', stacked_model)]
for name, model in models:
    if name in param_grids:
        param_grid = param_grids[name]
        grid_search = GridSearchCV(model, param_grid, cv=kf, scoring='accuracy')
        grid_search.fit(X_resampled, y_resampled)
        # Only the loop variable is rebound: the entries in `models` (and the
        # estimators inside the stacked model) keep their original settings.
        model = grid_search.best_estimator_
        print(f"Best parameters for {name}: {grid_search.best_params_}")

    # Get cross-validated predictions: each sample is predicted by a model
    # that was fitted on the other folds.
    # NOTE(review): tuned models are scored on the same folds that selected
    # their hyper-parameters, so their accuracy is optimistically biased.
    y_pred = cross_val_predict(model, X_resampled, y_resampled, cv=kf)

    # Calculate overall accuracy
    overall_accuracy = accuracy_score(y_resampled, y_pred)

    # Calculate per-class accuracy: the share of each class's true samples
    # that were predicted correctly, which is that class's recall (not its
    # precision). Report keys that are not class ids ('accuracy',
    # 'macro avg', 'weighted avg') are skipped by the isdigit() filter.
    class_report = classification_report(y_resampled, y_pred, output_dict=True)
    per_class_accuracy = {
        label_encoder.inverse_transform([int(k)])[0]: v['recall']
        for k, v in class_report.items()
        if k.isdigit()
    }

    results[name] = overall_accuracy
    per_class_accuracies[name] = per_class_accuracy

    print(f"{name} - Overall Accuracy: {overall_accuracy:.4f}")
    for label, acc in per_class_accuracy.items():
        print(f"  {label} Accuracy: {acc:.4f}")

def _show_accuracy_bars(values_by_label, title):
    """Show a 10x6 bar chart with one bar per entry of *values_by_label*."""
    plt.figure(figsize=(10, 6))
    plt.bar(values_by_label.keys(), values_by_label.values())
    plt.title(title)
    plt.ylabel('Accuracy')
    plt.show()


# Plot overall accuracy results: one bar per evaluated model.
_show_accuracy_bars(results, 'Model Comparison - Overall Accuracy')

# Plot per-class accuracy for the best model (highest overall accuracy;
# on a tie the first model evaluated wins).
best_model_name = max(results, key=lambda model_name: results[model_name])
best_per_class_accuracy = per_class_accuracies[best_model_name]
_show_accuracy_bars(best_per_class_accuracy, f'Per-Class Accuracy - {best_model_name}')

print(f"Best Model: {best_model_name} with accuracy of {results[best_model_name]:.4f}")


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Best parameters for logreg: {'C': 10}
logreg - Overall Accuracy: 0.1968
  Marketing Accuracy: 0.1998
  Miscellaneous Accuracy: 0.1932
  Operational Costs Accuracy: 0.2003
  R&D Accuracy: 0.1967
  Raw Materials Accuracy: 0.1931
