## Hyperparameter Tuning

In [8]:
import pandas as pd

In [9]:
data=pd.read_csv('trail_healthcare.csv',index_col=0)

In [10]:
# Load the dataset
data['tag'] = data['tag'].apply(lambda x: x.lower())

data['sentiment_1'] = data['sentiment_1'].apply(lambda x: x.lower())

data.dropna(inplace=True)
data.head()

Unnamed: 0,text_processed,topic,sentiment_1,tag
0,wishing didnt life anymore looking fellow depr...,0.0,negative,depression
1,happy hate live live busiest street town full ...,2.0,negative,depression
2,depressed boyfriend want leave mum died hi ive...,2.0,negative,depression
3,said something embarrassing panic attack texti...,2.0,negative,anxiety
4,someone please reassure im overly thinking anx...,1.0,negative,anxiety


In [12]:
import pandas as pd
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC


def train_and_evaluate_model(model, param_grid, X_train, y_train, X_val, y_val, X_test, y_test):
    # Perform GridSearchCV to find the best hyperparameters
    grid_search = GridSearchCV(model, param_grid, cv=5, scoring='accuracy')
    grid_search.fit(X_train, y_train)

    # Get the best model
    best_model = grid_search.best_estimator_

    # Predict on the validation set
    y_pred_val = best_model.predict(X_val)

    # Evaluate the model
    val_accuracy = accuracy_score(y_val, y_pred_val)
    val_report = classification_report(y_val, y_pred_val, output_dict=True)

    # Predict on the test set
    y_pred_test = best_model.predict(X_test)

    # Evaluate the model on the test set
    test_accuracy = accuracy_score(y_test, y_pred_test)
    test_report = classification_report(y_test, y_pred_test, output_dict=True)

    return {
        "Validation Accuracy": val_accuracy,
        "Validation Report": val_report,
        "Test Accuracy": test_accuracy,
        "Test Report": test_report
    }



# Separate features and target variable
X = data[['text_processed', 'sentiment_1', 'topic']]  # Features: text and sentiment
y = data['tag']  # Target variable: tag

# Convert text data to numerical using TF-IDF vectorization
vectorizer = TfidfVectorizer(max_features=1000)  
X_text = vectorizer.fit_transform(X['text_processed'])

# Convert sentiment to numerical using LabelEncoder
encoder = LabelEncoder()
X_sentiment = encoder.fit_transform(X['sentiment_1'])

# Combine numerical features (excluding original text)
X_numerical = pd.concat([pd.DataFrame(X_text.toarray()), pd.DataFrame(X_sentiment, columns=['sentiment_1'])],
                        axis=1)

# Convert categorical target variable to numerical using LabelEncoder
y_encoded = encoder.fit_transform(y)

# Hyperparameters to search over for each model
rf_param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20],
}

dt_param_grid = {
    'max_depth': [None, 10, 20],
}

svm_param_grid = {
    'C': [0.1, 1, 10],
    'kernel': ['linear', 'rbf'],
}

# List of models with corresponding parameter grids
models = [("Random Forest", RandomForestClassifier(class_weight="balanced"), rf_param_grid),
          ("Decision Tree", DecisionTreeClassifier(class_weight="balanced"), dt_param_grid),
          ("SVM", SVC(class_weight="balanced"), svm_param_grid)]

# Different train-test-validation splits
splits = [0.75, 0.8, 0.85]

# Train and evaluate each model for each split
results = {}
for split in splits:
    print(f"Train-Test-Validation Split: {split}")
    # Stratified Split for data splitting (maintains class distribution)
    sss = StratifiedShuffleSplit(n_splits=1, test_size=(1 - split), random_state=42)
    for train_index, test_index in sss.split(X_numerical, y_encoded):
        X_train_val, X_test = X_numerical.iloc[train_index], X_numerical.iloc[test_index]
        y_train_val, y_test = y_encoded[train_index], y_encoded[test_index]
    for name, model, param_grid in models:
        print(f"Training and evaluating {name} model...")
        X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.25, random_state=42)
        # Ensure string column names for features (if necessary)
        if not all(isinstance(col, str) for col in X_train.columns):
            X_train.columns = X_train.columns.astype(str)
            X_val.columns = X_val.columns.astype(str)
            X_test.columns = X_test.columns.astype(str)
        results[f"{name}_{split}"] = train_and_evaluate_model(model, param_grid, X_train, y_train, X_val, y_val,
                                                              X_test, y_test)
        print("\n")

# Convert results to DataFrame
results_df = pd.DataFrame(results).T

# Display results DataFrame
print("Results DataFrame:")
results_df


Train-Test-Validation Split: 0.75
Training and evaluating Random Forest model...


Training and evaluating Decision Tree model...


Training and evaluating SVM model...


Train-Test-Validation Split: 0.8
Training and evaluating Random Forest model...


Training and evaluating Decision Tree model...


Training and evaluating SVM model...


Train-Test-Validation Split: 0.85
Training and evaluating Random Forest model...


Training and evaluating Decision Tree model...


Training and evaluating SVM model...


Results DataFrame:


Unnamed: 0,Validation Accuracy,Validation Report,Test Accuracy,Test Report
Random Forest_0.75,0.876,"{'0': {'precision': 0.9118303571428571, 'recal...",0.8886,"{'0': {'precision': 0.911839863713799, 'recall..."
Decision Tree_0.75,0.8096,"{'0': {'precision': 0.854419410745234, 'recall...",0.8198,"{'0': {'precision': 0.847465034965035, 'recall..."
SVM_0.75,0.888533,"{'0': {'precision': 0.9105121293800539, 'recal...",0.8932,"{'0': {'precision': 0.9038382170862567, 'recal..."
Random Forest_0.8,0.8965,"{'0': {'precision': 0.9170603674540683, 'recal...",0.88425,"{'0': {'precision': 0.9094804499196572, 'recal..."
Decision Tree_0.8,0.82275,"{'0': {'precision': 0.8523965141612201, 'recal...",0.81175,"{'0': {'precision': 0.8478382930937676, 'recal..."
SVM_0.8,0.906,"{'0': {'precision': 0.9169656586365966, 'recal...",0.88975,"{'0': {'precision': 0.9020217729393468, 'recal..."
Random Forest_0.85,0.886588,"{'0': {'precision': 0.905337361530715, 'recall...",0.886667,"{'0': {'precision': 0.9086402266288952, 'recal..."
Decision Tree_0.85,0.823294,"{'0': {'precision': 0.8665937670858392, 'recal...",0.814667,"{'0': {'precision': 0.8710443037974683, 'recal..."
SVM_0.85,0.897647,"{'0': {'precision': 0.9075258239055582, 'recal...",0.889667,"{'0': {'precision': 0.9013112491373361, 'recal..."


In [100]:
joblib.dump(vectorizer, 'tfidf_vectorizer.pkl')

['tfidf_vectorizer.pkl']

In [27]:
results=results_df[['Validation Accuracy','Test Accuracy']]

In [28]:
results

Unnamed: 0,Validation Accuracy,Test Accuracy
Random Forest_0.75,0.876,0.8886
Decision Tree_0.75,0.8096,0.8198
SVM_0.75,0.888533,0.8932
Random Forest_0.8,0.8965,0.88425
Decision Tree_0.8,0.82275,0.81175
SVM_0.8,0.906,0.88975
Random Forest_0.85,0.886588,0.886667
Decision Tree_0.85,0.823294,0.814667
SVM_0.85,0.897647,0.889667


In [29]:
results_df

Unnamed: 0,Validation Accuracy,Validation Report,Test Accuracy,Test Report
Random Forest_0.75,0.876,"{'0': {'precision': 0.9118303571428571, 'recal...",0.8886,"{'0': {'precision': 0.911839863713799, 'recall..."
Decision Tree_0.75,0.8096,"{'0': {'precision': 0.854419410745234, 'recall...",0.8198,"{'0': {'precision': 0.847465034965035, 'recall..."
SVM_0.75,0.888533,"{'0': {'precision': 0.9105121293800539, 'recal...",0.8932,"{'0': {'precision': 0.9038382170862567, 'recal..."
Random Forest_0.8,0.8965,"{'0': {'precision': 0.9170603674540683, 'recal...",0.88425,"{'0': {'precision': 0.9094804499196572, 'recal..."
Decision Tree_0.8,0.82275,"{'0': {'precision': 0.8523965141612201, 'recal...",0.81175,"{'0': {'precision': 0.8478382930937676, 'recal..."
SVM_0.8,0.906,"{'0': {'precision': 0.9169656586365966, 'recal...",0.88975,"{'0': {'precision': 0.9020217729393468, 'recal..."
Random Forest_0.85,0.886588,"{'0': {'precision': 0.905337361530715, 'recall...",0.886667,"{'0': {'precision': 0.9086402266288952, 'recal..."
Decision Tree_0.85,0.823294,"{'0': {'precision': 0.8665937670858392, 'recal...",0.814667,"{'0': {'precision': 0.8710443037974683, 'recal..."
SVM_0.85,0.897647,"{'0': {'precision': 0.9075258239055582, 'recal...",0.889667,"{'0': {'precision': 0.9013112491373361, 'recal..."


In [None]:
# SVM_0.8: 0.906

### Transformer models 

In [61]:
data.head()

Unnamed: 0,text_processed,topic,sentiment_1,tag
0,wishing didnt life anymore looking fellow depr...,0.0,negative,depression
1,happy hate live live busiest street town full ...,2.0,negative,depression
2,depressed boyfriend want leave mum died hi ive...,2.0,negative,depression
3,said something embarrassing panic attack texti...,2.0,negative,anxiety
4,someone please reassure im overly thinking anx...,1.0,negative,anxiety


In [62]:
# Separate features and target variable
X = data[['text_processed', 'sentiment_1', 'topic']]  # Features: text and sentiment
y = data['tag']  # Target variable: tag

In [63]:
X

Unnamed: 0,text_processed,sentiment_1,topic
0,wishing didnt life anymore looking fellow depr...,negative,0.0
1,happy hate live live busiest street town full ...,negative,2.0
2,depressed boyfriend want leave mum died hi ive...,negative,2.0
3,said something embarrassing panic attack texti...,negative,2.0
4,someone please reassure im overly thinking anx...,negative,1.0
...,...,...,...
19995,anyone anxiety give ibsgut issue dont anxiety ...,negative,0.0
19996,cant trust anyone sexual abuse fucked starting...,negative,2.0
19997,stop freaking dying pls need serious help best...,negative,3.0
19998,deal anxiety im always anxious work future met...,negative,0.0


In [64]:
y

0        depression
1        depression
2        depression
3           anxiety
4           anxiety
            ...    
19995       anxiety
19996       anxiety
19997       anxiety
19998       anxiety
19999    depression
Name: tag, Length: 19999, dtype: object

In [71]:
# !pip install simpletransformers
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
X['sentiment_1']=encoder.fit_transform(X['sentiment_1'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['sentiment_1']=encoder.fit_transform(X['sentiment_1'])


In [72]:
X

Unnamed: 0,text_processed,sentiment_1,topic
0,wishing didnt life anymore looking fellow depr...,0,0.0
1,happy hate live live busiest street town full ...,0,2.0
2,depressed boyfriend want leave mum died hi ive...,0,2.0
3,said something embarrassing panic attack texti...,0,2.0
4,someone please reassure im overly thinking anx...,0,1.0
...,...,...,...
19995,anyone anxiety give ibsgut issue dont anxiety ...,0,0.0
19996,cant trust anyone sexual abuse fucked starting...,0,2.0
19997,stop freaking dying pls need serious help best...,0,3.0
19998,deal anxiety im always anxious work future met...,0,0.0


In [74]:
from simpletransformers.classification import ClassificationModel
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder
import pandas as pd



# Combine text, topic, and sentiment in X
X['combined_text'] = X['text_processed'] + " " + X['topic'].astype(str) + " " + X['sentiment_1'].astype(str)


In [76]:
X['combined_text'][0]

'wishing didnt life anymore looking fellow depressed people gauge bad im going particularly stressful part life find often wishing didnt go motion anymore thing keeping going family pet would feel something happened werent im scared id think least every couple day anyone else feel way urgent seek help im already medicated recent stress enough seeing doctor next month 0.0 0'

In [77]:
# Encode categorical target variable
encoder = LabelEncoder()
y_encoded = encoder.fit_transform(y)

In [78]:
y_encoded

array([1, 1, 1, ..., 0, 0, 1])

In [79]:

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X['combined_text'], y_encoded, test_size=0.25, random_state=42)

In [80]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(14999,)
(5000,)
(14999,)
(5000,)


In [83]:
# Define the model
model = ClassificationModel('bert', 'bert-base-uncased', num_labels=2,use_cuda=False)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Downloading tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [84]:
# Train the model
model.train_model(pd.DataFrame({'text': X_train, 'labels': y_train}))

  0%|          | 0/29 [00:00<?, ?it/s]

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Running Epoch 1 of 1:   0%|          | 0/1875 [00:00<?, ?it/s]

(1875, 0.3217360502722363)

In [85]:
# Evaluate the model
result, model_outputs, wrong_predictions = model.eval_model(pd.DataFrame({'text': X_test, 'labels': y_test}))

  0%|          | 0/10 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/50 [00:00<?, ?it/s]

In [87]:
result

{'mcc': 0.8337657845358625,
 'accuracy': 0.9168,
 'f1_score': 0.9164323021293692,
 'tp': 2281,
 'tn': 2303,
 'fp': 232,
 'fn': 184,
 'auroc': 0.9705967969722065,
 'auprc': 0.9671113810723218,
 'eval_loss': 0.2591300995647907}