In [None]:
import json

# Select the trainer/extractor pair that produced the winning row. Any
# feature-extraction label other than 'TF-IDF' falls through to the Word2Vec pair.
best_trainer, best_feature_extractor, best_feature_name = (
    (trainer_tfidf, tfidf_extractor, 'tfidf')
    if best_model_info['Feature_Extraction'] == 'TF-IDF'
    else (trainer_w2v, w2v_extractor, 'w2v')
)

# Look the model up in the trainer's `models` mapping, keyed by the display
# name lower-cased with spaces replaced by underscores.
model_key = best_model_info['Model'].lower().replace(' ', '_')
best_model = best_trainer.models[model_key]

# Persist the selected model to disk
model_save_path = r'C:\Users\admin\Documents\Innomatics\Sentiment\sentiment_analysis_project\models\best_sentiment_model.pkl'
joblib.dump(best_model, model_save_path)
print(f"Best model saved to: {model_save_path}")

# Build the metadata record: (JSON key, results-table column) pairs copied
# from the winning row, plus the short name of the feature extractor.
metadata = {
    json_key: best_model_info[column]
    for json_key, column in (
        ('model_type', 'Model'),
        ('feature_extraction', 'Feature_Extraction'),
        ('f1_score', 'F1-Score'),
        ('accuracy', 'Accuracy'),
        ('precision', 'Precision'),
        ('recall', 'Recall'),
    )
}
metadata['feature_extractor_type'] = best_feature_name

# Write the metadata next to the model as indented JSON
metadata_path = r'C:\Users\admin\Documents\Innomatics\Sentiment\sentiment_analysis_project\models\model_metadata.json'
with open(metadata_path, 'w') as metadata_file:
    json.dump(metadata, metadata_file, indent=4)
print(f"Model metadata saved to: {metadata_path}")

## Save Best Model

In [None]:
# Create comprehensive comparison
# Tag each per-technique results table with the feature extraction it came
# from, then stack both tables and rank every model by F1-Score (best first).
results_tfidf['Feature_Extraction'] = 'TF-IDF'
results_w2v['Feature_Extraction'] = 'Word2Vec'

all_results = pd.concat([results_tfidf, results_w2v], ignore_index=True)
all_results = all_results.sort_values('F1-Score', ascending=False)

print("\n" + "="*80)
print("COMPREHENSIVE MODEL COMPARISON (Sorted by F1-Score)")
print("="*80)
print(all_results.to_string(index=False))

# Find the best model.
# idxmax() returns an index *label*. sort_values() reorders the rows but keeps
# their original labels, so labels no longer match row positions here. The row
# must therefore be fetched by label (.loc); fetching it by position (.iloc)
# would return whichever row happens to sit at that position in the sorted
# table, i.e. usually not the best model.
best_idx = all_results['F1-Score'].idxmax()
best_model_info = all_results.loc[best_idx]
print(f"\n{'='*80}")
print(f"BEST MODEL: {best_model_info['Model']} with {best_model_info['Feature_Extraction']} features")
print(f"F1-Score: {best_model_info['F1-Score']:.4f}")
print(f"Accuracy: {best_model_info['Accuracy']:.4f}")
print(f"Precision: {best_model_info['Precision']:.4f}")
print(f"Recall: {best_model_info['Recall']:.4f}")
print(f"{'='*80}")

## Compare All Models

In [None]:
# Split the Word2Vec feature matrix into train/test parts.
# Only the feature halves of the split are kept; the label halves are
# discarded. `y_train` / `y_test` are not defined in this cell and must
# already exist, so this relies on the split below selecting the same rows
# as the split that produced them.
sentiment_labels = df['Sentiment'].values
X_train_w2v, X_test_w2v, _, _ = train_test_split(
    w2v_features,
    sentiment_labels,
    test_size=0.2,
    random_state=42,
    stratify=sentiment_labels,
)

# Section banner
banner = "=" * 80
print("\n" + banner, "TRAINING MODELS ON WORD2VEC FEATURES", banner, sep="\n")

# Train and evaluate every model on the Word2Vec features
trainer_w2v, results_w2v = train_and_evaluate_all_models(
    X_train_w2v, y_train, X_test_w2v, y_test
)

print("\nResults on Word2Vec Features:")
print(results_w2v.to_string(index=False))

## Train Models on Word2Vec Features

In [None]:
# Stratified 80/20 split of the TF-IDF features and the sentiment labels.
# The label halves produced here (`y_train`, `y_test`) are the ones the rest
# of the notebook trains and evaluates against.
sentiment_labels = df['Sentiment'].values
X_train_tfidf, X_test_tfidf, y_train, y_test = train_test_split(
    tfidf_features,
    sentiment_labels,
    test_size=0.2,
    random_state=42,
    stratify=sentiment_labels,
)

# Quick sanity report on the split: row counts and label balance
print(f"Training set size: {X_train_tfidf.shape[0]}")
print(f"Test set size: {X_test_tfidf.shape[0]}")
print(f"Train pos/neg ratio: {(y_train==1).sum()} / {(y_train==0).sum()}")

# Section banner, then train and evaluate every model on the TF-IDF features
banner = "=" * 80
print("\n" + banner, "TRAINING MODELS ON TF-IDF FEATURES", banner, sep="\n")
trainer_tfidf, results_tfidf = train_and_evaluate_all_models(
    X_train_tfidf, y_train, X_test_tfidf, y_test
)

print("\nResults on TF-IDF Features:")
print(results_tfidf.to_string(index=False))

## Train Models on TF-IDF Features

In [None]:
# Load preprocessed data
data_path = r'C:\Users\admin\Documents\Innomatics\Sentiment\sentiment_analysis_project\data\preprocessed_reviews.csv'
df = pd.read_csv(data_path)

print(f"Dataset loaded: {len(df)} reviews")
print(f"Target distribution: {df['Sentiment'].value_counts().to_dict()}")

# read_csv parses empty fields as NaN by default, so a review whose cleaned
# text is empty comes back as a float NaN and would crash `text.split()` below.
# Normalise the column to plain strings in a separate array (leaving `df`
# untouched) and report how many rows were affected.
# NOTE(review): confirm both extractors accept empty documents / empty token lists.
missing_text_count = int(df['cleaned_text'].isna().sum())
if missing_text_count:
    print(f"Note: {missing_text_count} reviews have no cleaned text; treating them as empty strings")
cleaned_texts = df['cleaned_text'].fillna('').astype(str).values

# Load the previously saved feature extractors and generate features for every review
tfidf_extractor = joblib.load(r'C:\Users\admin\Documents\Innomatics\Sentiment\sentiment_analysis_project\models\tfidf_extractor.pkl')
tfidf_features = tfidf_extractor.transform(cleaned_texts)

# The Word2Vec extractor is given whitespace-tokenised text rather than raw strings
w2v_extractor = joblib.load(r'C:\Users\admin\Documents\Innomatics\Sentiment\sentiment_analysis_project\models\w2v_extractor.pkl')
tokenized_texts = [text.split() for text in cleaned_texts]
w2v_features = w2v_extractor.transform(tokenized_texts)

print(f"TF-IDF Features Shape: {tfidf_features.shape}")
print(f"Word2Vec Features Shape: {w2v_features.shape}")

## Load Data and Features

In [None]:
import pandas as pd
import numpy as np
import sys
sys.path.append(r'C:\Users\admin\Documents\Innomatics\Sentiment\sentiment_analysis_project\src')

from model_training import ModelTrainer, train_and_evaluate_all_models
from feature_extraction import BagOfWordsExtractor, TFIDFExtractor, Word2VecExtractor
from sklearn.model_selection import train_test_split
import joblib
import warnings
warnings.filterwarnings('ignore')

# 4. Model Training and Evaluation
## Training and Comparing Sentiment Classification Models

In this notebook, we will:
- Load features from different embedding techniques
- Train multiple machine learning models
- Evaluate models using F1-Score, Precision, Recall, and Accuracy
- Compare model performance
- Save the best model