In [None]:
!pip install xgboost lightgbm scikit-learn pandas joblib imbalanced-learn


In [4]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
import xgboost as xgb
import lightgbm as lgb
from sklearn.metrics import classification_report
import joblib
from google.colab import files
from imblearn.over_sampling import SMOTE
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from scipy.stats import uniform, randint
import warnings
# Suppress specific warnings
warnings.filterwarnings(action='ignore', category=UserWarning, module='xgboost')
warnings.filterwarnings(action='ignore', category=UserWarning, module='lightgbm')

# Download NLTK resources
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [6]:
def load_data() -> pd.DataFrame:
    """Load usernames and comments data from CSV files."""
    df = pd.read_csv('training_data.csv')
    return df

In [7]:
train_df = load_data()

In [8]:
def preprocess_text(text:str):
    # Convert to lowercase
    text = text.lower()

    # Remove extra spaces
    text = re.sub(r'\s+', ' ', text)

    # Remove special characters and digits
    text = re.sub(r'\W', ' ', text)
    text = re.sub(r'\d', ' ', text)

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    words = text.split()
    words = [word for word in words if word not in stop_words]

    # Lemmatize words
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in words]

    return ' '.join(words)


In [9]:

# Load and preprocess the dataset
def load_and_preprocess_data(df:pd.DataFrame):
    label_encoder = LabelEncoder()
    df['label'] = label_encoder.fit_transform(df['label'])

    # Preprocess text data
    df['comments'] = df['comments'].apply(preprocess_text)

    return df, label_encoder


In [10]:
df, label_encoder = load_and_preprocess_data(train_df)

In [None]:
# Vectorize text data
vectorizer = TfidfVectorizer(max_features=5000)
X_tfidf = vectorizer.fit_transform(df['comments'])
y = df['label']

# Apply SMOTE, we noticed that samples for the doctor class was extremely small
# with time, i would have used data augumentation
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_tfidf, y)

# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)


In [12]:
# Define parameter distributions for RandomizedSearchCV
xgb_param_dist = {
    'n_estimators': randint(100, 300),
    'learning_rate': uniform(0.01, 0.2),
    'max_depth': randint(3, 10),
    'min_child_weight': randint(1, 10),
    'subsample': uniform(0.6, 0.4),
    'colsample_bytree': uniform(0.6, 0.4),
    'gamma': uniform(0, 0.3),
    'scale_pos_weight': uniform(1, 3)
}

lgb_param_dist = {
    'num_leaves': randint(31, 128),
    'learning_rate': uniform(0.01, 0.2),
    'n_estimators': randint(100, 300),
    'max_depth': randint(3, 10),
    'min_child_weight': randint(1, 10),
    'subsample': uniform(0.6, 0.4),
    'colsample_bytree': uniform(0.6, 0.4),
    'scale_pos_weight': uniform(1, 3)
}

In [13]:
# Perform Randomized Search for XGBoost
xgb_model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')
xgb_random_search = RandomizedSearchCV(estimator=xgb_model, param_distributions=xgb_param_dist, n_iter=25, scoring='f1_macro', cv=3, verbose=1, random_state=42, n_jobs=-1)
xgb_random_search.fit(X_train, y_train)
print(f"Best parameters for XGBoost: {xgb_random_search.best_params_}")

Fitting 3 folds for each of 25 candidates, totalling 75 fits
Best parameters for XGBoost: {'colsample_bytree': 0.610167650697638, 'gamma': 0.032367428097991334, 'learning_rate': 0.01628583713734685, 'max_depth': 9, 'min_child_weight': 1, 'n_estimators': 151, 'scale_pos_weight': 2.689826715929151, 'subsample': 0.878206434570451}


In [14]:
# Perform Randomized Search for LightGBM
lgb_model = lgb.LGBMClassifier()
lgb_random_search = RandomizedSearchCV(estimator=lgb_model, param_distributions=lgb_param_dist, n_iter=25, scoring='f1_macro', cv=3, verbose=1, random_state=42, n_jobs=-1)
lgb_random_search.fit(X_train, y_train)
print(f"Best parameters for LightGBM: {lgb_random_search.best_params_}")

Fitting 3 folds for each of 25 candidates, totalling 75 fits
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.080356 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 78818
[LightGBM] [Info] Number of data points in the train set: 4099, number of used features: 2257
[LightGBM] [Info] Start training from score -1.094474
[LightGBM] [Info] Start training from score -1.107680
[LightGBM] [Info] Start training from score -1.093745
Best parameters for LightGBM: {'colsample_bytree': 0.6888431241882921, 'learning_rate': 0.033973073466736566, 'max_depth': 6, 'min_child_weight': 8, 'n_estimators': 158, 'num_leaves': 62, 'scale_pos_weight': 2.556371865230098, 'subsample': 0.881207583558071}


In [15]:
# Train and evaluate the best XGBoost model with early stopping
best_xgb_model = xgb.XGBClassifier(**xgb_random_search.best_params_, use_label_encoder=False, eval_metric='mlogloss')
best_xgb_model.fit(
    X_train, y_train,
    early_stopping_rounds=10,
    eval_set=[(X_test, y_test)],
    verbose=True
)
y_pred_xgb = best_xgb_model.predict(X_test)
print("XGBoost Classification Report:")
print(classification_report(y_test, y_pred_xgb, target_names=label_encoder.classes_))


[0]	validation_0-mlogloss:1.08205
[1]	validation_0-mlogloss:1.06561
[2]	validation_0-mlogloss:1.05016
[3]	validation_0-mlogloss:1.03377
[4]	validation_0-mlogloss:1.01895
[5]	validation_0-mlogloss:1.00369
[6]	validation_0-mlogloss:0.98892
[7]	validation_0-mlogloss:0.97503
[8]	validation_0-mlogloss:0.96134
[9]	validation_0-mlogloss:0.94860
[10]	validation_0-mlogloss:0.93519
[11]	validation_0-mlogloss:0.92268
[12]	validation_0-mlogloss:0.91027
[13]	validation_0-mlogloss:0.89774
[14]	validation_0-mlogloss:0.88560
[15]	validation_0-mlogloss:0.87488
[16]	validation_0-mlogloss:0.86332
[17]	validation_0-mlogloss:0.85172
[18]	validation_0-mlogloss:0.84080
[19]	validation_0-mlogloss:0.83010
[20]	validation_0-mlogloss:0.82043
[21]	validation_0-mlogloss:0.81055
[22]	validation_0-mlogloss:0.80032
[23]	validation_0-mlogloss:0.79127
[24]	validation_0-mlogloss:0.78135
[25]	validation_0-mlogloss:0.77190
[26]	validation_0-mlogloss:0.76296
[27]	validation_0-mlogloss:0.75372
[28]	validation_0-mlogloss:0.7

In [18]:
# Train and evaluate the best LightGBM model with early stopping
best_lgb_model = lgb.LGBMClassifier(**lgb_random_search.best_params_)
best_lgb_model.fit(
    X_train, y_train,
    eval_set=[(X_test, y_test)]
)
y_pred_lgb = best_lgb_model.predict(X_test)
print("LightGBM Classification Report:")
print(classification_report(y_test, y_pred_lgb, target_names=label_encoder.classes_))


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.146408 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 78818
[LightGBM] [Info] Number of data points in the train set: 4099, number of used features: 2257
[LightGBM] [Info] Start training from score -1.094474
[LightGBM] [Info] Start training from score -1.107680
[LightGBM] [Info] Start training from score -1.093745
LightGBM Classification Report:
                  precision    recall  f1-score   support

'Medical Doctor'       1.00      1.00      1.00       336
         'Other'       0.85      0.84      0.85       354
  'Veterinarian'       0.84      0.84      0.84       335

        accuracy                           0.89      1025
       macro avg       0.89      0.89      0.89      1025
    weighted avg       0.89      0.89      0.89      1025



In [19]:
# Save the models
def save_model(model, model_path):
    joblib.dump(model, model_path)


In [20]:
save_model(best_xgb_model, 'xgb_model.pkl')
save_model(best_lgb_model, 'lgb_model.pkl')
vectorizer_path = 'tfidf_vectorizer.pkl'
joblib.dump(vectorizer, vectorizer_path)

['tfidf_vectorizer.pkl']

In [23]:
# Load models
def load_model(model_path):
    return joblib.load(model_path)

vectorizer_path = 'tfidf_vectorizer.pkl'
loaded_xgb_model = load_model('xgb_model.pkl')
loaded_lgb_model = load_model('lgb_model.pkl')
loaded_vectorizer = load_model(vectorizer_path)

In [24]:
# Predict labels for new comments
def predict_labels(model, vectorizer, comments):
    comments_tfidf = vectorizer.transform(comments)
    predictions = model.predict(comments_tfidf)
    predicted_labels = label_encoder.inverse_transform(predictions)
    return predicted_labels

In [55]:
new_comments = ["I treat animals at a clinic", "I write codes"]
xgb_predictions = predict_labels(loaded_xgb_model, loaded_vectorizer, new_comments)
lgb_predictions = predict_labels(loaded_lgb_model, loaded_vectorizer, new_comments)

print("XGBoost Predictions:", xgb_predictions)
print("LightGBM Predictions:", lgb_predictions)

XGBoost Predictions: ["'Veterinarian'" "'Other'"]
LightGBM Predictions: ["'Veterinarian'" "'Other'"]
