In [None]:
### Notebook 3: Feature Engineering
### Project: Churn Prevention System
### This notebook creates advanced features for better predictions

import sys
import os

### Make the sibling Dashboards folder importable. The path is resolved against
### the current working directory, so the notebook must be run from its own folder.
dashboards_path = os.path.abspath(os.path.join(os.pardir, 'Dashboards'))
sys.path[:0] = [dashboards_path]

# sys.path.append('..')  # Go up to project root

from feature_engineering import engineer_features

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, LabelEncoder
import warnings
warnings.filterwarnings('ignore')

np.random.seed(42)

### Title banner for the whole run
banner_rule = "=" * 70
print(banner_rule)
print("FEATURE ENGINEERING FOR CHURN PREDICTION")
print(banner_rule)

### =============================================================================
### PART 1: LOAD DATA
### =============================================================================
### Reads the raw customer table into `df`; every later section adds columns to it.

print(f"\n{banner_rule}")
print("PART 1: LOADING DATA")
print(banner_rule)

df = pd.read_csv('../Datasets/customer_churn_data.csv')
record_count, column_count = df.shape
print(f"✅ Loaded {record_count:,} customer records")
print(f"   Original features: {column_count}")

### =============================================================================
### PART 1.5: ENGINEERING FEATURES
### =============================================================================
### Runs the shared `engineer_features` helper from the Dashboards folder and
### previews its result.
### NOTE(review): `df_processed` is not referenced again in the visible part of
### this notebook, and whether the helper modifies `df` in place cannot be seen
### from here — confirm against feature_engineering.py.

df_processed = engineer_features(df)
engineered_shape = df_processed.shape
print(f"Features engineered: {engineered_shape}")
print(df_processed.head())

### =============================================================================
### PART 2: ENGAGEMENT FEATURES
### =============================================================================
### Adds five engagement columns to `df`, derived from the raw activity columns
### plus one simulated trend column.

print("\n" + "=" * 70, "PART 2: CREATING ENGAGEMENT FEATURES", "=" * 70, sep="\n")

print("\nEngineering composite engagement metrics...")

### Raw activity columns reused by several metrics below
logins = df['logins_30d']
feature_count = df['features_used']
session_avg = df['session_duration_avg']
power_usage = df['power_feature_usage']

### 1. Overall Engagement Score
### Weighted sum of the four activity columns (weights 0.3 / 0.3 / 0.2 / 0.2).
### The inputs are not rescaled first, so columns with larger magnitudes dominate.
df['engagement_score'] = (
    0.3 * logins +
    0.3 * feature_count +
    0.2 * session_avg +
    0.2 * power_usage
)
print("✅ Created: engagement_score (weighted composite)")

### 2. Activity Recency Score
### Reciprocal of (days since last login + 1); the +1 avoids division by zero.
days_idle_plus_one = df['days_since_last_login'] + 1
df['activity_recency'] = 1 / days_idle_plus_one
print("✅ Created: activity_recency (recency metric)")

### 3. Usage Efficiency
### Features used per login, with +1 in the denominator to avoid division by zero.
df['usage_efficiency'] = feature_count / (logins + 1)
print("✅ Created: usage_efficiency (features per login)")

### 4. Power User Score
### Power-feature usage times login count, divided by (tenure in 30-day units + 1).
tenure_periods_plus_one = df['tenure_days'] / 30 + 1
df['power_user_score'] = (power_usage * logins) / tenure_periods_plus_one
print("✅ Created: power_user_score (advanced usage metric)")

### 5. Engagement Velocity
### Simulated trend: uniform noise in [-0.3, 0.3) for every row, then rows with
### churned == 1 are overwritten with draws from [-0.5, -0.1).
### WARNING: the overwrite reads the target column, so this feature encodes the
### label itself. Any correlation or model that uses it is affected by leakage.
np.random.seed(42)
df['engagement_velocity'] = np.random.uniform(-0.3, 0.3, len(df))
churned_mask = df['churned'] == 1
df.loc[churned_mask, 'engagement_velocity'] = np.random.uniform(
    -0.5, -0.1, churned_mask.sum()
)
print("✅ Created: engagement_velocity (trend indicator)")

### =============================================================================
### PART 3: CUSTOMER HEALTH FEATURES
### =============================================================================
### Adds a composite health score, five binary risk flags plus one combined
### flag, and a tenure-based lifecycle stage to `df`.

print("\n" + "=" * 70)
print("PART 3: CREATING CUSTOMER HEALTH FEATURES")
print("=" * 70)

print("\nEngineering customer health scores...")

### 1. Overall Health Score
### Component weights add up to 100 (30 + 20 + 20 + 15 + 15). The first three
### components are scaled by the column maximum found in the loaded data.
### NOTE(review): the 0-100 range assumes ticket_sentiment lies in [-1, 1] and
### net_promoter_score in [0, 10] — confirm against the dataset.
df['health_score'] = (
    ### Engagement component (30 points)
    (df['logins_30d'] / df['logins_30d'].max() * 30) +
    ### Session quality (20 points)
    (df['session_duration_avg'] / df['session_duration_avg'].max() * 20) +
    ### Feature adoption (20 points)
    (df['features_used'] / df['features_used'].max() * 20) +
    ### Sentiment (15 points)
    ((df['ticket_sentiment'] + 1) / 2 * 15) +
    ### Loyalty (15 points)
    (df['net_promoter_score'] / 10 * 15)
)
print("✅ Created: health_score (0-100 composite)")

### 2. Risk Flags
### Binary (0/1) indicators produced by thresholding single columns
df['dormancy_risk'] = (df['days_since_last_login'] > 14).astype(int)
df['low_engagement_risk'] = (df['logins_30d'] < 5).astype(int)
df['support_risk'] = (df['support_tickets_30d'] > 3).astype(int)
df['payment_risk'] = (df['payment_failures'] > 0).astype(int)
df['sentiment_risk'] = (df['ticket_sentiment'] < 0).astype(int)

### Combined risk flag: 1 when any of the four flags below is set.
### sentiment_risk is not part of the combination.
df['high_risk_flag'] = (
    df['dormancy_risk'] |
    df['low_engagement_risk'] |
    df['support_risk'] |
    df['payment_risk']
).astype(int)

print("✅ Created: 6 risk flag features")

### 3. Customer Lifecycle Stage
### pd.cut intervals are right-closed and, by default, exclude the lowest edge,
### so tenure_days == 0 would become NaN; include_lowest=True puts day-0
### customers in 'new'. The open upper edge keeps very long tenures in 'loyal'
### instead of turning them into NaN.
df['lifecycle_stage'] = pd.cut(
    df['tenure_days'],
    bins=[0, 30, 90, 180, 365, np.inf],
    labels=['new', 'onboarding', 'growing', 'mature', 'loyal'],
    include_lowest=True
)
print("✅ Created: lifecycle_stage (categorical)")

### =============================================================================
### PART 4: SUPPORT & SATISFACTION FEATURES
### =============================================================================
### Adds ticket-volume, sentiment and NPS derived columns to `df`.

print("\n" + "=" * 70)
print("PART 4: CREATING SUPPORT & SATISFACTION FEATURES")
print("=" * 70)

print("\nEngineering support interaction features...")

### 1. Support Intensity
### 30-day ticket count divided by (tenure in 30-day units + 1)
df['support_intensity'] = df['support_tickets_30d'] / ((df['tenure_days'] / 30) + 1)
print("✅ Created: support_intensity (tickets per month of tenure)")

### 2. Support Quality Score
### Equal-weight blend of "few tickets" and "positive sentiment", scaled by 100.
### Intensity is clipped to [0, 1] so heavy ticket volume cannot push the first
### term below zero.
df['support_quality_score'] = (
    (1 - df['support_intensity'].clip(0, 1)) * 0.5 +  # Lower tickets = better
    ((df['ticket_sentiment'] + 1) / 2) * 0.5  # Higher sentiment = better
) * 100
print("✅ Created: support_quality_score (0-100)")

### 3. NPS Category
### Right-closed bins: (-1, 6] detractor, (6, 8] passive, (8, 11] promoter
df['net_promoter_score_category'] = pd.cut(
    df['net_promoter_score'],
    bins=[-1, 6, 8, 11],
    labels=['detractor', 'passive', 'promoter']
)
### The message names the column that was actually created
print("✅ Created: net_promoter_score_category (promoter/passive/detractor)")

### 4. Customer Satisfaction Index
### 60% NPS (divided by 10) plus 40% sentiment (shifted from [-1, 1] to [0, 1]),
### scaled by 100
df['satisfaction_index'] = (
    (df['net_promoter_score'] / 10 * 0.6) +
    ((df['ticket_sentiment'] + 1) / 2 * 0.4)
) * 100
print("✅ Created: satisfaction_index (0-100)")

### =============================================================================
### PART 5: USAGE & VALUE FEATURES
### =============================================================================
### Adds a usage bucket plus value, revenue-risk and lifetime-value estimates.
### Reads engagement_score, high_risk_flag and health_score created earlier.

print("\n" + "=" * 70)
print("PART 5: CREATING USAGE & VALUE FEATURES")
print("=" * 70)

print("\nEngineering usage and value features...")

### 1. Usage Ratio Categories
### pd.cut intervals are right-closed and, by default, exclude the lowest edge,
### so usage_vs_plan == 0 would become NaN; include_lowest=True puts zero usage
### in 'low'. The open upper edge keeps usage above 1.0 in 'high' instead of
### turning it into NaN.
df['usage_category'] = pd.cut(
    df['usage_vs_plan'],
    bins=[0, 0.3, 0.6, np.inf],
    labels=['low', 'medium', 'high'],
    include_lowest=True
)
print("✅ Created: usage_category (low/medium/high)")

### 2. Value Realization Score
### Product of plan usage ratio and the engagement score
df['value_realization'] = df['usage_vs_plan'] * df['engagement_score']
print("✅ Created: value_realization (usage × engagement)")

### 3. Revenue Risk Score
### Monthly revenue weighted by a 50/50 blend of the combined risk flag and the
### health-score shortfall (1 - health_score / 100)
df['revenue_risk_score'] = df['monthly_reoccuring_revenue'] * (
    df['high_risk_flag'] * 0.5 +
    (1 - df['health_score'] / 100) * 0.5
)
print("✅ Created: revenue_risk_score (Monthly Reoccuring Revenue-weighted risk)")

### 4. Customer Lifetime Value (Estimated)
### Monthly revenue times an assumed lifetime, discounted by 30% when the
### combined risk flag is set
avg_customer_lifetime_months = 24  # assumption
df['estimated_ltv'] = df['monthly_reoccuring_revenue'] * avg_customer_lifetime_months * (
    1 - (df['high_risk_flag'] * 0.3)  # Discount for high risk
)
print("✅ Created: estimated_ltv (projected lifetime value)")

### =============================================================================
### PART 6: INTERACTION FEATURES
### =============================================================================
### Multiplies pairs of existing columns to form four interaction columns.

print("\n" + "=" * 70, "PART 6: CREATING INTERACTION FEATURES", "=" * 70, sep="\n")

print("\nEngineering feature interactions...")

### Operands shared by several products below
engagement = df['engagement_score']
sentiment_unit = (df['ticket_sentiment'] + 1) / 2  # sentiment shifted by +1, then halved

### 1. Engagement × Tenure (tenure passed through log1p)
log_tenure = np.log1p(df['tenure_days'])
df['engagement_tenure'] = engagement * log_tenure
print("✅ Created: engagement_tenure (engagement × log(tenure))")

### 2. Usage × Satisfaction
df['usage_satisfaction'] = df['usage_vs_plan'] * sentiment_unit
print("✅ Created: usage_satisfaction (usage × sentiment)")

### 3. Logins × Features
df['login_feature_interaction'] = df['logins_30d'].mul(df['features_used'])
print("✅ Created: login_feature_interaction (logins × features)")

### 4. Subscription tier × Engagement
### The tier is first mapped to an ordinal number. Series.map yields NaN for any
### tier value missing from the mapping, which then propagates into the product.
subscription_tier_encoding = dict(free=0, basic=1, premium=2)
tier_rank = df['subscription_tier'].map(subscription_tier_encoding)
df['subscription_tier_numeric'] = tier_rank
df['subscription_tier_engagement'] = df['subscription_tier_numeric'] * engagement
print("✅ Created: subscription_tier_engagement (subscription_tier × engagement)")

### =============================================================================
### PART 7: TIME-BASED FEATURES
### =============================================================================
### Adds a granular tenure bucket, an engagement-per-day ratio and two
### login-recency flags to `df`.

print("\n" + "=" * 70)
print("PART 7: CREATING TIME-BASED FEATURES")
print("=" * 70)

print("\nEngineering temporal features...")

### 1. Tenure Segments (more granular)
### pd.cut intervals are right-closed and, by default, exclude the lowest edge,
### so tenure_days == 0 would become NaN; include_lowest=True puts day-0
### customers in '0-1mo'. The open upper edge keeps very long tenures in '2yr+'
### instead of turning them into NaN.
df['tenure_segment'] = pd.cut(
    df['tenure_days'],
    bins=[0, 30, 60, 90, 180, 365, 730, np.inf],
    labels=['0-1mo', '1-2mo', '2-3mo', '3-6mo', '6-12mo', '1-2yr', '2yr+'],
    include_lowest=True
)
print("✅ Created: tenure_segment (7 categories)")

### 2. Tenure Velocity
### Engagement score per day of tenure; +1 avoids division by zero on day 0
df['tenure_velocity'] = df['engagement_score'] / (df['tenure_days'] + 1)
print("✅ Created: tenure_velocity (engagement per day)")

### 3. Recent Activity Flag: last login within 7 days
df['recently_active'] = (df['days_since_last_login'] <= 7).astype(int)
print("✅ Created: recently_active (binary flag)")

### 4. Stale Account Flag: last login more than 30 days ago
df['stale_account'] = (df['days_since_last_login'] > 30).astype(int)
print("✅ Created: stale_account (binary flag)")

### =============================================================================
### PART 8: ENCODE CATEGORICAL FEATURES
### =============================================================================
### Label-encodes six categorical columns into integer columns on `df` and
### keeps each fitted encoder under its own name.

print("\n" + "=" * 70)
print("PART 8: ENCODING CATEGORICAL FEATURES")
print("=" * 70)

print("\nEncoding categorical variables for modeling...")


def _encode_labels(frame, source_col, target_col, show_mapping=False):
    """Label-encode ``frame[source_col]`` into ``frame[target_col]``.

    Args:
        frame: DataFrame that is modified in place.
        source_col: Name of the categorical column to encode.
        target_col: Name of the integer column to create.
        show_mapping: When True, also print the class → code mapping.

    Returns:
        The fitted LabelEncoder.
    """
    encoder = LabelEncoder()
    frame[target_col] = encoder.fit_transform(frame[source_col])
    print(f"✅ Encoded: {source_col} → {target_col}")
    if show_mapping:
        print(f"   Mapping: {dict(zip(encoder.classes_, encoder.transform(encoder.classes_)))}")
    return encoder


### NOTE(review): LabelEncoder numbers classes in sorted order, so the codes of
### lifecycle_stage, net_promoter_score_category and usage_category do not
### follow the order of their pd.cut labels.

### 1. Subscription Tier (the message now names the column actually created)
le_tier = _encode_labels(df, 'subscription_tier', 'subscription_tier_encoded', show_mapping=True)

### 2. Company Size
le_size = _encode_labels(df, 'company_size', 'size_encoded', show_mapping=True)

### 3. Industry
le_industry = _encode_labels(df, 'industry', 'industry_encoded', show_mapping=True)

### 4. Lifecycle Stage
le_lifecycle = _encode_labels(df, 'lifecycle_stage', 'lifecycle_encoded')

### 5. NPS Category
le_nps = _encode_labels(df, 'net_promoter_score_category', 'net_promoter_score_category_encoded')

### 6. Usage Category
le_usage = _encode_labels(df, 'usage_category', 'usage_category_encoded')

### =============================================================================
### PART 9: FEATURE SUMMARY
### =============================================================================
### Separates raw columns from engineered ones and prints the engineered
### columns grouped into reporting categories. The groups are for display only
### and may overlap, because most of them are matched by substring.

print("\n" + "=" * 70)
print("PART 9: FEATURE ENGINEERING SUMMARY")
print("=" * 70)

### Columns expected in the raw CSV; everything else on `df` counts as new
original_features = [
    'customer_id', 'signup_date', 'tenure_days', 'subscription_tier',
    'monthly_reoccuring_revenue', 'company_size', 'industry', 'logins_30d', 'session_duration_avg',
    'features_used', 'power_feature_usage', 'days_since_last_login',
    'support_tickets_30d', 'ticket_sentiment', 'net_promoter_score',
    'payment_failures', 'usage_vs_plan', 'churned'
]

new_features = [col for col in df.columns if col not in original_features]

print(f"\n📊 Feature Engineering Results:")
print(f"   • Original features: {len(original_features)}")
print(f"   • New features created: {len(new_features)}")
print(f"   • Total features: {len(df.columns)}")

print(f"\n✨ New Features by Category:")

engagement_features = [f for f in new_features if 'engagement' in f or 'activity' in f or 'usage_efficiency' in f or 'power_user' in f]
health_features = [f for f in new_features if 'health' in f or 'risk' in f]
support_features = [f for f in new_features if 'support' in f or 'satisfaction' in f or 'net_promoter_score' in f]
value_features = [f for f in new_features if 'value' in f or 'revenue' in f or 'ltv' in f]

### The interaction columns are listed explicitly (the four products created in
### PART 6). Counting underscores matched most engineered columns while missing
### 'engagement_tenure' and 'usage_satisfaction', which have a single underscore.
interaction_feature_names = {
    'engagement_tenure',
    'usage_satisfaction',
    'login_feature_interaction',
    'subscription_tier_engagement',
}
interaction_features = [f for f in new_features if f in interaction_feature_names]

category_report = [
    ("Engagement Features", engagement_features),
    ("Health & Risk Features", health_features),
    ("Support & Satisfaction Features", support_features),
    ("Value Features", value_features),
    ("Interaction Features", interaction_features),
]
for category_title, category_members in category_report:
    print(f"\n   {category_title} ({len(category_members)}):")
    for feature_name in category_members:
        print(f"   • {feature_name}")

### =============================================================================
### PART 10: FEATURE CORRELATIONS WITH TARGET
### =============================================================================
### Ranks every numeric column on `df` by the absolute value of its correlation
### with `churned` and prints the 20 strongest.
### NOTE(review): the ranking includes engagement_velocity, which PART 2 derives
### from `churned` itself, so its correlation reflects label leakage.

print("\n" + "=" * 70, "PART 10: ANALYZING NEW FEATURE CORRELATIONS", "=" * 70, sep="\n")

### All numeric columns except the target
numeric_features = list(df.select_dtypes(include=[np.number]).columns)
numeric_features.remove('churned')

### Signed correlation of each feature with churn, then ranked by magnitude
churn_corr_matrix = df[[*numeric_features, 'churned']].corr()
correlations = churn_corr_matrix['churned'].drop('churned')
correlations_sorted = correlations.abs().sort_values(ascending=False)

print("\n🎯 Top 20 Features by Correlation with Churn:")
print("-" * 70)
for rank, feature in enumerate(correlations_sorted.head(20).index, start=1):
    actual_corr = correlations[feature]
    magnitude = abs(actual_corr)
    direction = "↑" if actual_corr > 0 else "↓"
    ### Colour marks strength: red above 0.5, yellow above 0.3, green otherwise
    if magnitude > 0.5:
        strength = "🔴"
    elif magnitude > 0.3:
        strength = "🟡"
    else:
        strength = "🟢"
    is_new = "✨ NEW" if feature in new_features else ""
    print(f"{rank:2d}. {strength} {feature:35s} {actual_corr:6.3f} {direction} {is_new}")

### =============================================================================
### PART 11: VISUALIZATIONS
### =============================================================================
### Draws a 3×3 grid of diagnostic plots for the engineered features and saves
### it as a PNG. Reads `df` and `correlations_sorted` from the earlier sections.

print("\n" + "=" * 70)
print("PART 11: CREATING FEATURE VISUALIZATIONS")
print("=" * 70)


def _plot_score_distribution(frame, ax, column, title, xlabel):
    """Overlay retained vs churned histograms of ``frame[column]`` on ``ax``."""
    for churn_value, label, color in ((0, 'Retained', '#4ECDC4'), (1, 'Churned', '#FF6B6B')):
        frame.loc[frame['churned'] == churn_value, column].hist(
            bins=50, alpha=0.6, label=label, ax=ax, color=color
        )
    ax.set_title(title, fontsize=12, fontweight='bold')
    ax.set_xlabel(xlabel)
    ax.legend()


def _plot_churn_rate_bars(frame, ax, group_col, title, xlabel, color):
    """Bar-plot the churn rate (%) per ``group_col`` value on ``ax``.

    Returns the plotted Series (group → churn rate in percent).
    """
    churn_rate = frame.groupby(group_col)['churned'].mean() * 100
    churn_rate.plot(kind='bar', ax=ax, color=color)
    ax.set_title(title, fontsize=12, fontweight='bold')
    ax.set_ylabel('Churn Rate (%)')
    ax.set_xlabel(xlabel)
    return churn_rate


fig, axes = plt.subplots(3, 3, figsize=(18, 15))

### 1. Engagement Score Distribution
ax1 = axes[0, 0]
_plot_score_distribution(df, ax1, 'engagement_score',
                         'Engagement Score Distribution', 'Engagement Score')

### 2. Health Score Distribution
ax2 = axes[0, 1]
_plot_score_distribution(df, ax2, 'health_score',
                         'Health Score Distribution', 'Health Score (0-100)')

### 3. Risk Flag Impact
ax3 = axes[0, 2]
risk_impact = _plot_churn_rate_bars(df, ax3, 'high_risk_flag', 'Churn Rate by Risk Flag',
                                    'High Risk Flag', ['#4ECDC4', '#FF6B6B'])
### Tick labels come from the groups actually present, so a dataset containing
### only one flag value does not hit a label-count mismatch.
risk_labels = {0: 'No Risk', 1: 'High Risk'}
ax3.set_xticklabels([risk_labels.get(flag, str(flag)) for flag in risk_impact.index], rotation=0)

### 4. Lifecycle Stage Impact
ax4 = axes[1, 0]
lifecycle_churn = _plot_churn_rate_bars(df, ax4, 'lifecycle_stage',
                                        'Churn Rate by Lifecycle Stage',
                                        'Lifecycle Stage', '#033052')
ax4.set_xticklabels(ax4.get_xticklabels(), rotation=45)

### 5. Net Promoter Score Category Impact
ax5 = axes[1, 1]
net_promoter_score_churn = _plot_churn_rate_bars(
    df, ax5, 'net_promoter_score_category',
    'Churn Rate by Net Promoter Score Category',
    'Net Promoter Score Category',  # axis label typo ("Promot") corrected
    ['#FF6B6B', '#FFA07A', '#4ECDC4']
)
ax5.set_xticklabels(ax5.get_xticklabels(), rotation=45)

### 6. Usage Category Impact
ax6 = axes[1, 2]
usage_churn = _plot_churn_rate_bars(df, ax6, 'usage_category',
                                    'Churn Rate by Usage Category',
                                    'Usage Category', ['#FF6B6B', '#FFA07A', '#4ECDC4'])
ax6.set_xticklabels(ax6.get_xticklabels(), rotation=45)

### 7. Engagement vs Health Score (colored by churn)
ax7 = axes[2, 0]
scatter = ax7.scatter(df['engagement_score'], df['health_score'],
                      c=df['churned'], cmap='RdYlGn_r', alpha=0.5, s=20)
ax7.set_title('Engagement vs Health Score', fontsize=12, fontweight='bold')
ax7.set_xlabel('Engagement Score')
ax7.set_ylabel('Health Score')
plt.colorbar(scatter, ax=ax7, label='Churned')

### 8. Top 10 Features Correlation Heatmap (ten strongest features plus the target)
ax8 = axes[2, 1]
top_10 = correlations_sorted.head(10).index.tolist() + ['churned']
corr_matrix = df[top_10].corr()
sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='RdYlGn_r',
            center=0, ax=ax8, cbar_kws={'shrink': 0.8})
ax8.set_title('Top 10 Features Correlation', fontsize=12, fontweight='bold')

### 9. Feature Importance (by absolute correlation)
ax9 = axes[2, 2]
top_features = correlations_sorted.head(10)
y_pos = np.arange(len(top_features))
ax9.barh(y_pos, top_features.values, color='#033052')
ax9.set_yticks(y_pos)
ax9.set_yticklabels(top_features.index, fontsize=8)
ax9.set_xlabel('Absolute Correlation')
ax9.set_title('Top 10 Features by Correlation', fontsize=12, fontweight='bold')
ax9.invert_yaxis()  # strongest feature at the top

plt.tight_layout()
plt.savefig('../Datasets/feature_engineering_analysis.png', dpi=300, bbox_inches='tight')
print("\n✅ Saved visualization: ../Datasets/feature_engineering_analysis.png")

### =============================================================================
### PART 12: SAVE ENGINEERED DATASET
### =============================================================================
### Writes the engineered frame to CSV and pickles the feature-name lists that
### were built in the earlier sections.

print("\n" + "=" * 70)
print("PART 12: SAVING ENGINEERED DATASET")
print("=" * 70)

### Save full dataset with all features
output_file = '../Datasets/customer_churn_engineered.csv'
df.to_csv(output_file, index=False)
print(f"\n✅ Saved engineered dataset: {output_file}")
print(f"   Shape: {df.shape[0]:,} rows × {df.shape[1]} columns")
### "Size" is the in-memory footprint of the frame, not the size of the CSV on disk
print(f"   Size: {df.memory_usage(deep=True).sum() / 1024 / 1024:.2f} MB")

### Save feature list for modeling
modeling_features = [f for f in numeric_features if f != 'customer_id']
feature_info = {
    'all_features': list(df.columns),
    'modeling_features': modeling_features,
    'categorical_features': ['subscription_tier', 'company_size', 'industry',
                            'lifecycle_stage', 'net_promoter_score_category', 'usage_category'],
    'encoded_features': ['subscription_tier_encoded', 'size_encoded', 'industry_encoded',
                        'lifecycle_encoded', 'net_promoter_score_category_encoded', 'usage_category_encoded'],
    'engineered_features': new_features,
    'top_features': correlations_sorted.head(20).index.tolist()
}

import pickle

### One path variable feeds both the write and the message, so the reported
### location is the location actually written.
feature_info_file = '../Datasets/feature_info.pkl'
with open(feature_info_file, 'wb') as f:
    pickle.dump(feature_info, f)
print(f"✅ Saved feature metadata: {feature_info_file}")

### =============================================================================
### PART 13: FEATURE RECOMMENDATIONS
### =============================================================================
### Prints four candidate feature sets: top 10 and top 20 by absolute
### correlation, every numeric modeling feature, and a hand-picked short list.


def _print_numbered(names):
    """Print each name on its own line, prefixed with its 1-based position."""
    for position, name in enumerate(names, start=1):
        print(f"   {position:2d}. {name}")


print("\n" + "=" * 70, "PART 13: FEATURE SELECTION RECOMMENDATIONS", "=" * 70, sep="\n")

print("\n📋 Recommended Feature Sets:")

print("\n1️⃣  MINIMAL SET (Top 10 - Quick Model):")
minimal_features = correlations_sorted.index[:10].tolist()
_print_numbered(minimal_features)

print("\n2️⃣  STANDARD SET (Top 20 - Balanced):")
standard_features = correlations_sorted.index[:20].tolist()
standard_count = len(standard_features)
print(f"   {standard_count} features including engagement, health, and risk metrics")

print("\n3️⃣  COMPREHENSIVE SET (All Numeric - Maximum Performance):")
comprehensive_count = len(modeling_features)
print(f"   {comprehensive_count} features - all engineered numeric features")

print("\n4️⃣  INTERPRETABLE SET (Easy to Explain):")
### Hand-picked columns; not derived from the correlation ranking
interpretable = [
    'health_score', 'engagement_score', 'high_risk_flag',
    'days_since_last_login', 'logins_30d', 'support_tickets_30d',
    'net_promoter_score', 'usage_vs_plan', 'payment_failures', 'tenure_days',
]
_print_numbered(interpretable)

### =============================================================================
### SUMMARY
### =============================================================================
### Prints the closing report from the counts and rankings built earlier.
### `df` still holds the engineered frame when the cell finishes.

print("\n" + "=" * 70)
print("✅ FEATURE ENGINEERING COMPLETE!")
print("=" * 70)

print(f"""
Feature Engineering Summary:
  • Original features: {len(original_features)}
  • New features created: {len(new_features)}
  • Total features: {len(df.columns)}
  • Numeric features for modeling: {len(modeling_features)}
  
Feature Categories Created:
  • Engagement metrics: {len(engagement_features)}
  • Health & risk indicators: {len(health_features)}
  • Support & satisfaction: {len(support_features)}
  • Value metrics: {len(value_features)}
  • Interaction features: {len(interaction_features)}
  
Top Predictors Identified:
  1. {correlations_sorted.index[0]}: {correlations_sorted.values[0]:.3f}
  2. {correlations_sorted.index[1]}: {correlations_sorted.values[1]:.3f}
  3. {correlations_sorted.index[2]}: {correlations_sorted.values[2]:.3f}
  
Files Created:
  1. customer_churn_engineered.csv (full dataset)
  2. feature_info.pkl (feature metadata)
  3. feature_engineering_analysis.png (9 visualizations)
""")

print("\n🚀 Ready for Model Training!")
print("=" * 70)

### A pasted copy of the PART 1 loader used to follow here. It re-read the raw
### CSV into `df`, which discarded every engineered column from the kernel and
### printed a second "PART 1: LOADING DATA" banner, so it was removed.

FEATURE ENGINEERING FOR CHURN PREDICTION

PART 1: LOADING DATA
✅ Loaded 5,000 customer records
   Original features: 18
Features engineered: (5000, 35)
  customer_id signup_date  tenure_days subscription_tier  \
0  cust_00000  2025-04-22          132             basic   
1  cust_00001  2025-05-03          121              free   
2  cust_00002  2024-07-19          409              free   
3  cust_00003  2024-01-27          583              free   
4  cust_00004  2023-11-09          662             basic   

   monthly_reoccuring_revenue company_size industry  logins_30d  \
0                          49         1-10   retail          23   
1                           0         1-10  finance          21   
2                           0         1-10     tech          19   
3                           0         1-10    other          23   
4                          49       51-200     tech          15   

   session_duration_avg  features_used  ...  sentiment_risk  \
0                  33