In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score
from src.pipeline import TextAnalysisPipeline  
from src.agents.sentiment_agent import SentimentAgent
from src.agents.grammar_agent import GrammarAgent
from src.agents.url_agent import URLAgent
from src.agents.domain_agent import DomainAgent
from src.agents.cleaning_agent import CleaningAgent
from src.decision_engine import DecisionEngine
from ax import optimize
from hydra import initialize, compose

## Data:

In [3]:
data = pd.read_csv('spam_dataset_examples.csv')
val_df = data.groupby("label")\
             .apply(lambda group: group.sample(frac=0.5, random_state=42), include_groups=False)\
             .reset_index()\
             .set_index("level_1")
test_df = data.drop(val_df.index)

## Config:

In [4]:
with initialize(config_path="configs", version_base=None):
    cfg = compose(config_name="config")  # Adjust the config_name as necessary

decision_engine = DecisionEngine(cfg)

## Aggregate predictions:

In [48]:
sentiment_agent = SentimentAgent(cfg)
grammar_agent = GrammarAgent(cfg)
url_agent = URLAgent(cfg)

# Aggregate spam likelihood for validation set
val_spam_likelihoods = []
for index, row in val_df.iterrows():
    sentiment_result = sentiment_agent.analyze(row["text"])
    grammar_result = grammar_agent.analyze(row["text"])
    url_result = url_agent.analyze(row["text"])
    
    val_spam_likelihoods.append({
        "sentiment": sentiment_result["spam_likelihood"],
        "grammar": grammar_result["spam_likelihood"],
        "url": url_result["spam_likelihood"],
        "label": row["label"]
    })

# Aggregate spam likelihood for test set
test_spam_likelihoods = []
for index, row in test_df.iterrows():
    sentiment_result = sentiment_agent.analyze(row["text"])
    grammar_result = grammar_agent.analyze(row["text"])
    url_result = url_agent.analyze(row["text"])
    
    test_spam_likelihoods.append({
        "sentiment": sentiment_result["spam_likelihood"],
        "grammar": grammar_result["spam_likelihood"],
        "url": url_result["spam_likelihood"],
        "label": row["label"]
    })


Device set to use cpu


## Auxilary function:

In [49]:
def predict_val(weights):
    denom = sum(weights.values())
    weights = {k: v/denom for k, v in weights.items()}
    decision_engine.weights = weights
    y_true = []
    y_pred = []

    for index, likelihood in enumerate(val_spam_likelihoods):
        result = decision_engine.make_decision(
            sentiment_result={"spam_likelihood": likelihood["sentiment"], "reasoning": "Sentiment analysis"},
            grammar_result={"spam_likelihood": likelihood["grammar"], "reasoning": "Grammar analysis"},
            url_result={"spam_likelihood": likelihood["url"], "reasoning": "URL analysis"}
        )
        y_pred.append(result['is_spam'])
        y_true.append(likelihood["label"])

    f1 = f1_score(y_true, y_pred)
    return f1

def predict_test(weights):
    denom = sum(weights.values())
    weights = {k: v/denom for k, v in weights.items()}
    decision_engine.weights = weights
    y_true = []
    y_pred = []

    for index, likelihood in enumerate(test_spam_likelihoods):
        result = decision_engine.make_decision(
            sentiment_result={"spam_likelihood": likelihood["sentiment"], "reasoning": "Sentiment analysis"},
            grammar_result={"spam_likelihood": likelihood["grammar"], "reasoning": "Grammar analysis"},
            url_result={"spam_likelihood": likelihood["url"], "reasoning": "URL analysis"}
        )
        y_pred.append(result['is_spam'])
        y_true.append(likelihood["label"])

    f1 = f1_score(y_true, y_pred)
    return f1

In [53]:
best_parameters, best_values, experiment, model = optimize(
    evaluation_function=predict_val,
    parameters=[
        {"name": "sentiment", "type": "range", "bounds": [0.2, 0.7]},
        {"name": "grammar", "type": "range", "bounds": [0.2, 0.7]},
        {"name": "url", "type": "range", "bounds": [0.2, 0.7]},
        # {"name": "spam_threshold", "type": "range", "bounds": [0, 1]}
    ],
    total_trials=100,
    minimize=False
)

[INFO 03-29 21:55:58] ax.service.utils.instantiation: Inferred value type of ParameterType.FLOAT for parameter sentiment. If that is not the expected value type, you can explicitly specify 'value_type' ('int', 'float', 'bool' or 'str') in parameter dict.
[INFO 03-29 21:55:58] ax.service.utils.instantiation: Inferred value type of ParameterType.FLOAT for parameter grammar. If that is not the expected value type, you can explicitly specify 'value_type' ('int', 'float', 'bool' or 'str') in parameter dict.
[INFO 03-29 21:55:58] ax.service.utils.instantiation: Inferred value type of ParameterType.FLOAT for parameter url. If that is not the expected value type, you can explicitly specify 'value_type' ('int', 'float', 'bool' or 'str') in parameter dict.
[INFO 03-29 21:55:58] ax.service.utils.instantiation: Created search space: SearchSpace(parameters=[RangeParameter(name='sentiment', parameter_type=FLOAT, range=[0.2, 0.7]), RangeParameter(name='grammar', parameter_type=FLOAT, range=[0.2, 0.7]

### predict over test set:

In [54]:
predict_test(best_parameters)

0.391304347826087

In [55]:
best_parameters

{'sentiment': 0.6051302746623601, 'grammar': 0.22829291622531098, 'url': 0.2}