# Comprehensive EDA and Modeling for MedVeritas Project

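This notebook loads and cleans the drug-review data, extracts text features, performs exploratory data analysis, engineers features, trains classification and regression models, runs LDA topic modeling, and saves the resulting figures, metrics, and models.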


In [None]:
import sys
from pathlib import Path

# Use the parent directory as the project root when the kernel runs inside a
# folder named 'notebooks'; otherwise use the current working directory.
# The root is put first on sys.path so the `src.*` imports below can resolve.
project_root = Path.cwd().parent if Path.cwd().name == 'notebooks' else Path.cwd()
sys.path.insert(0, str(project_root))

import pandas as pd
import numpy as np
import warnings
# NOTE(review): this installs a blanket "ignore" filter, so warnings raised by any
# later cell are hidden. Consider narrowing it to specific warning categories.
warnings.filterwarnings('ignore')

from src.data.processing import (
    load_data, clean_data, create_effectiveness_label, 
    extract_temporal_features, get_data_summary, prepare_train_test_split
)
from src.nlp.utils import extract_text_features, preprocess_reviews
from src.features.engineering import prepare_features
from src.models.train import ModelTrainer
from src.visualization.plots import (
    plot_rating_distribution_by_category, plot_wordclouds, plot_correlation_heatmap,
    plot_time_series_ratings, plot_feature_importance, plot_topic_modeling_results,
    plot_sentiment_distribution_by_condition, plot_confusion_matrix, plot_roc_curves,
    plot_top_effective_drugs_by_condition, plot_side_effects_by_category,
    plot_review_length_vs_rating
)
from gensim import corpora
from gensim.models import LdaModel


Note: you may need to restart the kernel to use updated packages.
Project root added to path: s:\Programming\MedVeritas
Current working directory: s:\Programming\MedVeritas\notebooks



[notice] A new release of pip is available: 24.3.1 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


All imports successful!


## Step 1: Data Loading and Cleaning


In [None]:
# Input workbook, resolved against the project root computed in the import cell.
data_path = project_root / 'data' / 'processed' / 'medVe_data_final_version.xlsx'

# Load -> clean -> effectiveness label -> temporal features, as one pipeline.
# Each helper receives the frame returned by the previous step.
df = (
    load_data(str(data_path))
    .pipe(clean_data)
    .pipe(create_effectiveness_label, threshold=7)
    .pipe(extract_temporal_features, date_col='date')
)

# Print the summary header followed by one indented "name: value" line per entry.
summary = get_data_summary(df)
print("\nData Summary:", *(f"  {key}: {value}" for key, value in summary.items()), sep="\n")


Loading data from: s:\Programming\MedVeritas\data\processed\medVe_data_final_version.xlsx
File exists: True
Loading data from s:\Programming\MedVeritas\data\processed\medVe_data_final_version.xlsx...
Loaded 606077 rows and 5 columns.
Cleaning data...
Cleaned data: 606077 -> 366815 rows (239262 removed)
Created effectiveness labels (threshold=7):
  Effective (>= 7): 220975 (60.2%)
  Not Effective (< 7): 145840 (39.8%)

Data Summary:
  total_reviews: 366815
  unique_drugs: 5711
  unique_conditions: 7798
  avg_rating: 6.516794569469623
  median_rating: 8.0
  avg_review_length: 386.3398825020787


## Step 2: Text Preprocessing and Feature Extraction


In [None]:
# Both helpers read the raw review text from the same column.
raw_text_col = 'review'

# First add the text-derived features, then write the preprocessed text to
# `review_processed`, which later cells use for TF-IDF, word clouds and LDA.
df = preprocess_reviews(
    extract_text_features(df, text_col=raw_text_col),
    text_col=raw_text_col,
    preprocessed_col='review_processed',
)


Extracting text features...
Computing sentiment scores for 366815 reviews (this may take a while)...
  Processing batch 1/37 (rows 1-10000)...
  Processing batch 2/37 (rows 10001-20000)...
  Processing batch 3/37 (rows 20001-30000)...
  Processing batch 4/37 (rows 30001-40000)...
  Processing batch 5/37 (rows 40001-50000)...
  Processing batch 6/37 (rows 50001-60000)...
  Processing batch 7/37 (rows 60001-70000)...
  Processing batch 8/37 (rows 70001-80000)...
  Processing batch 9/37 (rows 80001-90000)...
  Processing batch 10/37 (rows 90001-100000)...
  Processing batch 11/37 (rows 100001-110000)...
  Processing batch 12/37 (rows 110001-120000)...
  Processing batch 13/37 (rows 120001-130000)...
  Processing batch 14/37 (rows 130001-140000)...
  Processing batch 15/37 (rows 140001-150000)...
  Processing batch 16/37 (rows 150001-160000)...
  Processing batch 17/37 (rows 160001-170000)...
  Processing batch 18/37 (rows 170001-180000)...
  Processing batch 19/37 (rows 180001-190000)...


## Step 3: Exploratory Data Analysis - Statistical Visualizations


In [None]:
# Every figure produced in this step goes to the same directory.
figures_dir = 'results/figures'

# Same rating-distribution plot, once per grouping column.
for category_col in ('drugName', 'condition'):
    plot_rating_distribution_by_category(df, category_col=category_col, output_dir=figures_dir)

# threshold=7 is presumably the same rating cut-off used for the effectiveness
# label in Step 1 -- confirm against plot_wordclouds.
plot_wordclouds(
    df,
    text_col='review_processed',
    rating_col='rating',
    threshold=7,
    output_dir=figures_dir,
)

plot_review_length_vs_rating(
    df,
    length_col='review_length',
    rating_col='rating',
    output_dir=figures_dir,
)


3.1 Rating Distribution by Drug Category...
Creating rating distribution plots by drugName...
Saved plot: results\figures\rating_distribution_boxplot_drugName.png
Saved plot: results\figures\rating_distribution_histogram_drugName.png

3.2 Rating Distribution by Condition...
Creating rating distribution plots by condition...
Saved plot: results\figures\rating_distribution_boxplot_condition.png
Saved plot: results\figures\rating_distribution_histogram_condition.png

3.3 Word Clouds...
Creating word clouds...
Saved plot: results\figures\wordclouds_positive_vs_negative.png

3.4 Review Length vs Rating...
Creating review length vs rating plot...
Saved plot: results\figures\review_length_vs_rating.png


## Step 4: Feature Engineering


In [None]:
# NOTE(review): features are built from the full frame here, before the train/test
# split in Step 5. If prepare_features derives per-drug or per-condition rating
# aggregates, or fits TF-IDF on all rows, the test split can leak into training
# features -- confirm and, if so, fit those on the training rows only.
feature_options = dict(
    text_col='review_processed',
    include_tfidf=True,
    max_tfidf_features=500,
    categorical_cols=['drugName', 'condition'],
)
X, feature_info = prepare_features(df, **feature_options)

# Targets: the effectiveness label for classification, the raw rating for regression.
y_classification, y_regression = df['is_effective'], df['rating']

print(
    f"Feature matrix shape: {X.shape}",
    f"Classification target shape: {y_classification.shape}",
    f"Regression target shape: {y_regression.shape}",
    sep="\n",
)


Preparing features for modeling...
Creating derived features...
  Created drug_popularity feature
  Created condition_popularity feature
  Created drug_avg_rating feature
  Created condition_avg_rating feature
  Created review_length_category feature
Derived features created successfully.
Encoding categorical features...
  Encoded drugName: 5711 unique values
  Encoded condition: 7798 unique values
Creating TF-IDF features (max_features=500, ngram_range=(1, 1))...
Created 500 TF-IDF features (sparse=True).
Prepared 520 features for modeling.

Feature matrix shape: (366815, 520)
Classification target shape: (366815,)
Regression target shape: (366815,)


In [None]:
# Keep only columns whose dtype is exactly int64 or float64; any other dtype is
# left out of the heatmap. Only the first 30 such columns are plotted.
numeric_cols = [name for name, dtype in X.dtypes.items() if dtype in (np.int64, np.float64)]
top_numeric = numeric_cols[:30]

# Put the selected features and the regression target side by side so the
# plotting helper can find the 'rating' column next to the features.
heatmap_frame = pd.concat([X[top_numeric], y_regression], axis=1)
plot_correlation_heatmap(
    heatmap_frame,
    feature_cols=top_numeric,
    target_col='rating',
    output_dir='results/figures',
)


4.1 Correlation Heatmap...
Creating correlation heatmap...
Saved plot: results\figures\correlation_heatmap.png


## Step 5: Train-Test Split


In [None]:
# Split the review frame, then reuse its index labels to slice the feature
# matrix and both targets. This relies on X and the targets carrying the same
# index labels as df.
train_df, test_df = prepare_train_test_split(df, test_size=0.2, random_state=42)
train_indices, test_indices = train_df.index, test_df.index

X_train, X_test = X.loc[train_indices], X.loc[test_indices]
y_train_class, y_test_class = y_classification.loc[train_indices], y_classification.loc[test_indices]
y_train_reg, y_test_reg = y_regression.loc[train_indices], y_regression.loc[test_indices]

print(
    f"Train features shape: {X_train.shape}",
    f"Test features shape: {X_test.shape}",
    sep="\n",
)


Train set: 293452 rows (80.0%)
Test set: 73363 rows (20.0%)
Train features shape: (293452, 520)
Test features shape: (73363, 520)


## Step 6: Model Training - Classification


In [None]:
# Train the classification models on the training split and evaluate them on the
# test split. Later cells read `classification_results` as a mapping of
# model name -> metrics dict (keys used downstream: 'accuracy', 'f1_score',
# 'roc_auc', 'predictions').
# NOTE(review): the saved output of this cell ends in a MemoryError during Random
# Forest, while later cells show results for all three models. The stored outputs
# therefore appear to come from different runs -- re-run the notebook end to end
# before relying on them.
trainer = ModelTrainer(random_state=42)
classification_results = trainer.train_classification_models(
    X_train, y_train_class,
    X_test, y_test_class
)


Training classification models...

Training Logistic Regression...
  Accuracy: 0.7369
  Precision: 0.7554
  Recall: 0.8331
  F1-Score: 0.7923
  ROC-AUC: 0.7977

Training Random Forest...


MemoryError: Unable to allocate 1.09 GiB for an array with shape (500, 293452) and data type float64

In [None]:
def _model_selection_score(metrics):
    """Return the score used to rank one classifier's metrics dict.

    ROC-AUC is preferred. Accuracy is used only when ROC-AUC is absent or None.
    The check is an explicit `is not None` so that a legitimate ROC-AUC of 0.0
    is not mistaken for "missing" and silently replaced by accuracy.
    """
    roc_auc = metrics.get('roc_auc')
    return roc_auc if roc_auc is not None else metrics['accuracy']


# `best_class_model` is a (model_name, metrics) pair, as yielded by .items().
best_class_model = max(
    classification_results.items(),
    key=lambda item: _model_selection_score(item[1]),
)
print(f"Best Classification Model: {best_class_model[0]}")

# Ask the trainer for the winning model's importances, labelled with the
# feature-matrix column names.
feature_importance = trainer.get_feature_importance(
    best_class_model[0],
    list(X.columns),
    model_type='classification'
)

# Skip the plot when the trainer returns nothing for this model.
if feature_importance:
    plot_feature_importance(feature_importance, top_n=15, output_dir='results/figures')



Best Classification Model: XGBoost

6.1 Feature Importance...
Creating feature importance plot...
Saved plot: results/figures\feature_importance.png


In [None]:
figures_dir = 'results/figures'

# One confusion matrix per trained classifier, from its stored test predictions.
for model_name, model_metrics in classification_results.items():
    plot_confusion_matrix(
        y_test_class,
        model_metrics['predictions'],
        model_name=model_name,
        output_dir=figures_dir,
    )

# Draw ROC curves only when the trainer recorded non-empty ROC data.
roc_data = trainer.results['roc_data'] if 'roc_data' in trainer.results else None
if roc_data:
    plot_roc_curves(roc_data, output_dir=figures_dir)


6.2 Confusion Matrices...
Creating confusion matrix for Logistic Regression...
Saved plot: results/figures\confusion_matrix_logistic_regression.png
Creating confusion matrix for Random Forest...
Saved plot: results/figures\confusion_matrix_random_forest.png
Creating confusion matrix for XGBoost...
Saved plot: results/figures\confusion_matrix_xgboost.png

6.3 ROC Curves...
Creating ROC curves...
Saved plot: results/figures\roc_curves_comparison.png


## Step 7: Model Training - Regression


In [None]:
# Train the regression models on the rating target and evaluate them on the test split.
regression_results = trainer.train_regression_models(X_train, y_train_reg, X_test, y_test_reg)

# The model with the lowest RMSE wins; keep it as a (model_name, metrics) pair.
best_reg_name = min(regression_results, key=lambda name: regression_results[name]['rmse'])
best_reg_model = (best_reg_name, regression_results[best_reg_name])
print(f"Best Regression Model: {best_reg_model[0]}")


Training regression models...

Training Linear Regression...
  RMSE: 2.7652
  MAE: 2.2529
  R² Score: 0.3723

Training Random Forest...
  Converting sparse matrix to dense for Random Forest (this may take a moment)...


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 20 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:  3.3min finished
[Parallel(n_jobs=20)]: Using backend ThreadingBackend with 20 concurrent workers.
[Parallel(n_jobs=20)]: Done  10 tasks      | elapsed:    0.1s
[Parallel(n_jobs=20)]: Done  50 out of  50 | elapsed:    0.3s finished


  RMSE: 2.7414
  MAE: 2.1993
  R² Score: 0.3830

Training XGBoost...
  RMSE: 2.6267
  MAE: 2.0847
  R² Score: 0.4336

Best Regression Model: XGBoost


## Step 8: Topic Modeling (LDA)


In [None]:
# Tokenize each preprocessed review exactly once (whitespace split) and keep only
# reviews with more than five tokens. Missing reviews become empty strings and
# are dropped by the length filter.
tokenized_reviews = (text.split() for text in df['review_processed'].fillna('').astype(str))
documents = [tokens for tokens in tokenized_reviews if len(tokens) > 5]

# Build the vocabulary, dropping tokens that appear in fewer than 5 documents or
# in more than 50% of them, then convert every document to bag-of-words form.
dictionary = corpora.Dictionary(documents)
dictionary.filter_extremes(no_below=5, no_above=0.5)
corpus = [dictionary.doc2bow(doc) for doc in documents]

# Fixed random_state so the topics are reproducible across runs.
lda_model = LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=10,
    random_state=42,
    passes=10,
    alpha='auto',
    per_word_topics=True
)

plot_topic_modeling_results(lda_model, dictionary=dictionary, top_n_words=10, 
                           output_dir='results/figures')

# Topic ids are zero-based in the model; they are printed one-based for readers.
print("Top words per topic:")
for idx, topic in lda_model.print_topics(num_words=10):
    print(f"Topic {idx + 1}: {topic}")


Preparing documents for LDA...
Creating dictionary and corpus...
Training LDA model...
8.1 Topic Modeling Visualization...
Creating topic modeling visualization...
Saved plot: results/figures\topic_modeling_results.png

LDA Model trained with 10 topics

Top words per topic:
Topic 1: 0.066*"effect" + 0.064*"side" + 0.024*"sleep" + 0.023*"taking" + 0.018*"anxiety" + 0.015*"help" + 0.014*"also" + 0.013*"helped" + 0.012*"take" + 0.011*"depression"
Topic 2: 0.059*"period" + 0.041*"month" + 0.039*"control" + 0.028*"cramp" + 0.026*"birth" + 0.025*"cream" + 0.019*"bleeding" + 0.016*"skyla" + 0.016*"mood" + 0.016*"inserted"
Topic 3: 0.084*"day" + 0.040*"week" + 0.031*"first" + 0.026*"taking" + 0.022*"took" + 0.021*"started" + 0.020*"time" + 0.019*"hour" + 0.015*"two" + 0.015*"felt"
Topic 4: 0.134*"infection" + 0.062*"yeast" + 0.052*"monistat" + 0.048*"antibiotic" + 0.037*"uti" + 0.034*"hair" + 0.033*"burning" + 0.022*"treatment" + 0.019*"ear" + 0.015*"cough"
Topic 5: 0.050*"work" + 0.025*"well"

## Step 9: NLP Visualizations


In [None]:
# Plot how the sentiment score in `vader_compound` is distributed per condition.
# NOTE(review): `vader_compound` is presumably added by extract_text_features in
# Step 2 -- confirm the column exists before running this cell on a fresh kernel.
plot_sentiment_distribution_by_condition(
    df,
    sentiment_col='vader_compound',
    condition_col='condition',
    output_dir='results/figures'
)


9.1 Sentiment Distribution by Condition...
Creating sentiment distribution plot...
Saved plot: results/figures\sentiment_distribution_by_condition.png


## Step 10: Insight Visualizations


In [None]:
figures_dir = 'results/figures'

# Top-rated drugs per condition.
plot_top_effective_drugs_by_condition(
    df,
    top_n=10,
    condition_col='condition',
    drug_col='drugName',
    rating_col='rating',
    output_dir=figures_dir,
)

# Side-effect analysis reads the raw review text, not the preprocessed column.
plot_side_effects_by_category(
    df,
    text_col='review',
    category_col='drugName',
    output_dir=figures_dir,
)

# The time-series plot needs a date column, so it is skipped when the frame has none.
has_date_column = 'date' in df.columns
if has_date_column:
    plot_time_series_ratings(
        df,
        date_col='date',
        drug_col='drugName',
        rating_col='rating',
        top_n_drugs=10,
        output_dir=figures_dir,
    )


10.1 Top Effective Drugs by Condition...
Creating top effective drugs plot...
Saved plot: results/figures\top_effective_drugs_by_condition.png

10.2 Side Effects by Category...
Creating side effects analysis plot...
Saved plot: results/figures\side_effects_by_category.png


## Step 11: Save Results


In [None]:
# Persist the metric tables first, then every trained model grouped by task.
trainer.save_results(output_dir='results/metrics')

saved_tasks = (
    ('classification', classification_results),
    ('regression', regression_results),
)
for task_name, results_by_model in saved_tasks:
    for model_name in results_by_model.keys():
        trainer.save_model(model_name, task_name, output_dir='models')


Saved classification results: results/metrics\classification_results.csv
Saved regression results: results/metrics\regression_results.csv
Saving models...
Saved model: models\classification\logistic_regression.pkl
Saved model: models\classification\random_forest.pkl
Saved model: models\classification\xgboost.pkl
Saved model: models\regression\linear_regression.pkl
Saved model: models\regression\random_forest.pkl
Saved model: models\regression\xgboost.pkl

Saving processed data...
Saved processed data: data/processed/medVe_data_processed.csv


## Step 12: Summary


In [None]:
# Dataset-level recap.
print(f"Total reviews: {len(df):,}")
print(f"Unique drugs: {df['drugName'].nunique():,}")
print(f"Unique conditions: {df['condition'].nunique():,}")
print(f"Average rating: {df['rating'].mean():.2f}")
print(f"Effective reviews (>=7): {df['is_effective'].sum():,} ({df['is_effective'].mean()*100:.1f}%)")

print("\nClassification Models:")
for name, results in classification_results.items():
    # ROC-AUC may be absent or None for a model. Formatting None with ':.4f'
    # would raise TypeError, and printing 0.0000 for a missing score would be
    # misleading, so show "N/A" instead.
    roc_auc = results.get('roc_auc')
    roc_auc_text = f"{roc_auc:.4f}" if roc_auc is not None else "N/A"
    print(f"  {name}: Accuracy={results['accuracy']:.4f}, F1={results['f1_score']:.4f}, ROC-AUC={roc_auc_text}")

print("\nRegression Models:")
for name, results in regression_results.items():
    print(f"  {name}: RMSE={results['rmse']:.4f}, MAE={results['mae']:.4f}, R²={results['r2_score']:.4f}")

print("\nResults saved to: results/figures/, results/metrics/, models/")



Total reviews analyzed: 366815
Unique drugs: 5711
Unique conditions: 7798
Average rating: 6.52
Effective reviews (>=7): 220975 (60.2%)

Classification Models:
  Logistic Regression:
    Accuracy: 0.7347
    F1-Score: 0.7923
    ROC-AUC: 0.7953
  Random Forest:
    Accuracy: 0.7612
    F1-Score: 0.8162
    ROC-AUC: 0.8340
  XGBoost:
    Accuracy: 0.7858
    F1-Score: 0.8282
    ROC-AUC: 0.8628

Regression Models:
  Linear Regression:
    RMSE: 2.7652
    MAE: 2.2529
    R² Score: 0.3723
  Random Forest:
    RMSE: 2.7414
    MAE: 2.1993
    R² Score: 0.3830
  XGBoost:
    RMSE: 2.6267
    MAE: 2.0847
    R² Score: 0.4336

EDA AND MODELING COMPLETE!

All visualizations saved to: results/figures/
All metrics saved to: results/metrics/
All models saved to: models/
