In [7]:
from text_analytics.sentiment_analysis.random_forest import RandomForest
import pandas as pd 
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from text_analytics.config import RAW_DATA_PATH, SENTIMENT_CLEANED_DATA_PATH, RANDOM_STATE


In [8]:
# Load the cleaned review dataset and separate the text feature from the label.
df = pd.read_csv(SENTIMENT_CLEANED_DATA_PATH)
X, y = df['preprocessed_review'], df['class']

# Share of rows labelled 1; a value close to 0.5 means the classes are balanced.
(y == 1).sum() / len(y)


0.5018756806905732

In [9]:
# Hold out 20% of the reviews for testing; stratifying on the label keeps the
# class ratio the same in both splits, and the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=RANDOM_STATE,
)

In [10]:
# Bag-of-words counts over unigrams and bigrams; terms that occur in fewer than
# 10 documents are dropped from the vocabulary.
count_vectorizer = CountVectorizer(min_df=10, ngram_range=(1, 2))

# Learn the vocabulary from the training text only, then reuse it for the test text.
# NOTE: X_train / X_test are rebound from raw text to feature matrices here, so
# re-running this cell without re-running the split cell feeds the matrices back
# into the vectorizer instead of the original text.
X_train = count_vectorizer.fit_transform(X_train)
X_test = count_vectorizer.transform(X_test)

In [11]:
# (rows, columns) of the train and test feature matrices.
print([X_train.shape, X_test.shape])

[(39665, 61088), (9917, 61088)]


In [12]:
# Hand the vectorized train/test splits and their labels to the project's
# RandomForest wrapper; later cells reach the data back through rf.X_test / rf.y_test.
rf = RandomForest(X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test)

In [14]:
# NOTE(review): in the recorded run this call fails with
# AttributeError: 'NoneType' object has no attribute 'estimator', preceded by the
# message that fitting must be done first. Presumably evaluate() needs the result
# of rf.train(), which has not been run at this point in the notebook — confirm.
rf.evaluate() 

Fitting must be done before evaluating of hyperparameter tuning process


AttributeError: 'NoneType' object has no attribute 'estimator'

In [8]:
# ~ 30 mins to run 
# Runs the cross-validated fit (the recorded log shows 5 folds for 1 candidate)
# and unpacks the two objects it returns; going by the names these are presumably
# the tuner and the best fitted model — confirm against RandomForest.train.
tuner, bst_model = rf.train() 

Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV 3/5; 1/1] START rf__colsample_bynode=0.675, rf__learning_rate=1.0, rf__max_depth=7, rf__n_estimators=100, rf__reg_lambda=0.875, rf__subsample=0.7
[CV 5/5; 1/1] START rf__colsample_bynode=0.675, rf__learning_rate=1.0, rf__max_depth=7, rf__n_estimators=100, rf__reg_lambda=0.875, rf__subsample=0.7[CV 4/5; 1/1] START rf__colsample_bynode=0.675, rf__learning_rate=1.0, rf__max_depth=7, rf__n_estimators=100, rf__reg_lambda=0.875, rf__subsample=0.7
[CV 2/5; 1/1] START rf__colsample_bynode=0.675, rf__learning_rate=1.0, rf__max_depth=7, rf__n_estimators=100, rf__reg_lambda=0.875, rf__subsample=0.7

[CV 1/5; 1/1] START rf__colsample_bynode=0.675, rf__learning_rate=1.0, rf__max_depth=7, rf__n_estimators=100, rf__reg_lambda=0.875, rf__subsample=0.7
[CV 5/5; 1/1] END rf__colsample_bynode=0.675, rf__learning_rate=1.0, rf__max_depth=7, rf__n_estimators=100, rf__reg_lambda=0.875, rf__subsample=0.7; AUC: (train=0.812, test=0.792) F_score: (t

In [12]:
# The helper is otherwise only imported in a later cell of this notebook, so a
# top-to-bottom run would hit NameError here; importing it in this cell makes the
# cell self-sufficient.
from text_analytics.helpers import calculate_report_metrics

# Hard class labels for the held-out split.
y_pred = rf.best_model.predict(rf.X_test)
# Column 1 of predict_proba: the score for class 1 under the usual
# scikit-learn class ordering.
y_pred_proba = rf.best_model.predict_proba(rf.X_test)[:, 1]

# Compute the performance report for the held-out split from labels and scores.
report = calculate_report_metrics(
    y_test=rf.y_test, y_pred=y_pred, y_pred_prob=y_pred_proba
)


    -----------
    PERFORMANCE
    -----------
    ACCURACY: 71.35%
    PRECISION: 66.12%
    RECALL: 88.00%
    F1: 75.51%
    ROC AUC: 79.70%
    


In [41]:
import numpy.typing as npt 
import matplotlib.pyplot as plt 
import seaborn as sns 
import numpy as np 
from text_analytics.config import ARTIFACTS_PATH
def save_confusion_matrix(cf_matrix: npt.ArrayLike, model_name: str) -> None:
    """Draw a 2x2 confusion matrix as an annotated heatmap and save it as a JPEG.

    The cell colours come from a fixed 2x2 identity matrix, so the two diagonal
    cells are always shaded one way and the off-diagonal cells the other,
    whatever values ``cf_matrix`` holds. ``cf_matrix`` only supplies the text
    printed inside each cell.

    Args:
        cf_matrix: 2x2 array-like of values to print in the cells. They are
            formatted with ``".2%"``, so presumably they are already normalised
            to fractions — confirm with the caller.
        model_name: File stem of the saved image.

    Returns:
        None. The image is written to
        ``{ARTIFACTS_PATH}/confusion_matrix/{model_name}.jpeg``.
    """
    from pathlib import Path

    output_path = Path(f"{ARTIFACTS_PATH}/confusion_matrix/{model_name}.jpeg")
    # Create the target folder on first use so saving does not fail on a fresh checkout.
    output_path.parent.mkdir(parents=True, exist_ok=True)

    fig, ax = plt.subplots(figsize=(12, 8))
    try:
        sns.heatmap(
            np.eye(2),  # colour source only: highlights the diagonal
            annot=cf_matrix,
            fmt=".2%",
            annot_kws={"size": 50},
            cmap="YlGnBu",
            cbar=False,
            xticklabels=["Negative", "Positive"],
            yticklabels=["Negative", "Positive"],
            ax=ax,
        )

        ax.set_xlabel("Predicted Sentiment", size=20)
        ax.set_ylabel("Actual Sentiment", size=20)
        # Save this specific figure rather than whatever pyplot considers current.
        fig.savefig(output_path)
    finally:
        # plt.clf() would only clear the figure and leave it registered with
        # pyplot; closing it releases the memory even when saving fails.
        plt.close(fig)

In [30]:
# Load a previously saved model by file name.
# NOTE(review): presumably this populates rf.best_model, which the following
# cells call, so the ~30 minute training cell can be skipped — confirm.
rf.load_model(file_name="sentiment_xgb_rf_12_13_13")

In [8]:
# Quick check: predicted class labels for the vectorized test split.
rf.best_model.predict(X_test)

array([1, 1, 0, ..., 0, 1, 1])

In [9]:
from text_analytics.helpers import calculate_report_metrics

In [11]:
# NOTE(review): the recorded output for this cell is
# AttributeError: 'NoneType' object has no attribute 'estimator'. Presumably
# evaluate() depends on an object that only rf.train() creates and load_model()
# does not restore — confirm; the metrics cell after this one does not use it.
rf.evaluate()

AttributeError: 'NoneType' object has no attribute 'estimator'

In [10]:
# Hard class labels for the held-out split.
y_pred = rf.best_model.predict(rf.X_test)
# Column 1 of predict_proba: the score for class 1 under the usual
# scikit-learn class ordering.
y_pred_proba = rf.best_model.predict_proba(rf.X_test)[:, 1]

# Compute the performance report for the held-out split from labels and scores.
report = calculate_report_metrics(
    y_test=rf.y_test,
    y_pred=y_pred,
    y_pred_prob=y_pred_proba,
)


    -----------
    PERFORMANCE
    -----------
    ACCURACY: 71.23%
    PRECISION: 65.96%
    RECALL: 88.21%
    F1: 75.47%
    ROC AUC: 79.82%
    
