In [3]:
from pathlib import Path

import pandas as pd
from joblib import dump
from sklearn.linear_model import RidgeClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, FeatureUnion

import utils
from utils import TextPreprocessor, ColumnSelector, BodyTfidfTransformer, TitleTfidfTransformer

# Read all run settings from config.yaml in one place so nothing below is
# hard-coded. A missing key raises KeyError here, before any data is loaded.
config = utils.load_config('config.yaml')

# Dataset location and train/test split settings.
data_path, test_size, random_state = (
    config[key] for key in ('data_path', 'test_size', 'random_state')
)

# Keyword arguments forwarded to the title and body TF-IDF transformers.
tfidf_params_title, tfidf_params_body = (
    config[key] for key in ('tfidf_params_title', 'tfidf_params_body')
)

# Classifier keyword arguments and the weight passed to the body transformer.
ridge_classifier_params, body_weight = (
    config[key] for key in ('ridge_classifier_params', 'body_weight')
)

# Columns used as model inputs and the column holding the label to predict.
feature_columns = ['title', 'body']
target_column = 'category'

# Read the dataset from the configured CSV path.
df = pd.read_csv(data_path)

# Split the frame into the feature table (X) and the target series (y).
X = df[feature_columns]
y = df[target_column]

# Build the end-to-end text classification pipeline:
#   1. preprocessor - TextPreprocessor runs on the raw frame (presumably it
#      produces the 'title_cleaned' / 'body_cleaned' columns selected below;
#      verify against utils).
#   2. features     - two parallel branches whose outputs FeatureUnion
#      concatenates: TF-IDF of the cleaned title and TF-IDF of the cleaned
#      body (the body transformer additionally receives `body_weight`).
#   3. classifier   - RidgeClassifier configured from the config file.
# The transformer classes are referenced through the names imported at the
# top of the cell rather than through a `utils.` module prefix.
title_branch = Pipeline([
    ('selector', ColumnSelector('title_cleaned')),
    ('tfidf', TitleTfidfTransformer(tfidf_params=tfidf_params_title)),
])

body_branch = Pipeline([
    ('selector', ColumnSelector('body_cleaned')),
    ('tfidf', BodyTfidfTransformer(weight=body_weight, tfidf_params=tfidf_params_body)),
])

pipeline = Pipeline([
    ('preprocessor', TextPreprocessor()),
    ('features', FeatureUnion([
        ('title_tfidf', title_branch),
        ('body_tfidf', body_branch),
    ])),
    ('classifier', RidgeClassifier(**ridge_classifier_params)),
])

# Hold out a test partition; the size and seed come from the config so the
# split is reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=test_size,
    random_state=random_state,
)

# Train every pipeline stage on the training partition only.
pipeline.fit(X_train, y_train)

# Score on the held-out partition (mean accuracy for a classifier) and report it.
accuracy = pipeline.score(X_test, y_test)
print(f"Model accuracy: {accuracy:.2f}")

# Persist the fitted pipeline to disk. The output directory is created first
# (no-op if it already exists) so that a missing 'models/' folder cannot make
# the save fail after training has already completed.
model_path = 'models/pipeline.joblib'
Path(model_path).parent.mkdir(parents=True, exist_ok=True)
dump(pipeline, model_path)


2024-09-01 01:58:10,411 - INFO - Starting text preprocessing...
2024-09-01 01:58:29,429 - INFO - Text preprocessing completed.
2024-09-01 01:58:29,430 - INFO - Selecting column: title_cleaned
2024-09-01 01:58:29,431 - INFO - Fitting TitleTfidfTransformer...
2024-09-01 01:58:29,501 - INFO - Transforming title text to TF-IDF features...
2024-09-01 01:58:29,546 - INFO - Selecting column: body_cleaned
2024-09-01 01:58:29,547 - INFO - Fitting BodyTfidfTransformer...
2024-09-01 01:58:31,422 - INFO - Transforming body text to TF-IDF features...
2024-09-01 01:58:34,213 - INFO - Starting text preprocessing...
2024-09-01 01:58:39,003 - INFO - Text preprocessing completed.
2024-09-01 01:58:39,005 - INFO - Selecting column: title_cleaned
2024-09-01 01:58:39,006 - INFO - Transforming title text to TF-IDF features...
2024-09-01 01:58:39,022 - INFO - Selecting column: body_cleaned
2024-09-01 01:58:39,023 - INFO - Transforming body text to TF-IDF features...


Model accuracy: 0.84


['models/pipeline.joblib']