In [71]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import re
from textblob import TextBlob
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import warnings
warnings.filterwarnings('ignore')

In [72]:
df = pd.read_csv('resturent.csv')


In [None]:
class RestaurantFeatureEngineer:
    def __init__(self, df):
        self.df = df.copy()
        self.vader = SentimentIntensityAnalyzer()
        
    def prepare_data(self):
        """Basic data preparation"""
        # Convert dates
        self.df['date'] = pd.to_datetime(self.df['date'])
        self.df['yelping_since'] = pd.to_datetime(self.df['yelping_since'])
        
        # Parse JSON-like columns (assuming they're stored as strings)
        self._parse_json_columns()
        
        return self.df
    
    def _parse_json_columns(self):
        """Parse JSON-like columns if they exist"""
        # This is a placeholder method
        # Add specific JSON parsing logic if needed for your data
        pass
    
    def create_temporal_features(self):
        """Create time-based features"""
        print("üïê Engineering temporal features...")
        
        # Business age features
        self.df['business_age_days'] = (self.df['date'].max() - self.df['yelping_since']).dt.days
        self.df['user_tenure_days'] = (self.df['date'].max() - self.df['yelping_since']).dt.days
        
        # Review timing features
        self.df['review_day_of_week'] = self.df['date'].dt.dayofweek
        self.df['review_month'] = self.df['date'].dt.month
        self.df['review_quarter'] = self.df['date'].dt.quarter
        self.df['is_weekend'] = self.df['review_day_of_week'].isin([5, 6]).astype(int)
        
        # Time since last review (per business)
        self.df['days_since_last_review'] = self.df.groupby('business_id')['date'].diff().dt.days
        
        # Seasonal features
        self.df['is_holiday_season'] = self.df['review_month'].isin([11, 12]).astype(int)
        self.df['is_summer'] = self.df['review_month'].isin([6, 7, 8]).astype(int)
        
        return self.df
    
    def create_trend_features(self):
        """Create moving averages and trend indicators"""
        print("üìà Creating trend features...")
        
        # Sort by date first
        self.df = self.df.sort_values(['business_id', 'date']).reset_index(drop=True)
        
        # Rolling averages using transform to avoid merge issues
        self.df['rolling_30d_stars_user'] = self.df.groupby('business_id')['stars_user'].transform(
            lambda x: x.rolling(window=30, min_periods=1).mean()
        )
        self.df['rolling_30d_useful_review'] = self.df.groupby('business_id')['useful_review'].transform(
            lambda x: x.rolling(window=30, min_periods=1).sum()
        )
        self.df['rolling_30d_cool_review'] = self.df.groupby('business_id')['cool_review'].transform(
            lambda x: x.rolling(window=30, min_periods=1).sum()
        )
        self.df['rolling_30d_funny_review'] = self.df.groupby('business_id')['funny_review'].transform(
            lambda x: x.rolling(window=30, min_periods=1).sum()
        )
        
        # Growth rates
        self.df['review_growth_30d'] = self.df.groupby('business_id')['review_count_business'].pct_change(periods=30)
        
        return self.df
    
    def create_user_engagement_features(self):
        """Create user behavior and engagement features"""
        print("üë• Engineering user engagement features...")
        
        # User influence score (composite metric)
        self.df['user_influence_score'] = (
            np.log1p(self.df['review_count_user']) * 0.3 +
            np.log1p(self.df['fans']) * 0.3 +
            np.log1p(self.df['useful_user'] + self.df['funny_user'] + self.df['cool_user']) * 0.2 +
            (self.df['average_stars'] / 5) * 0.2
        )
        
        # Elite user classification
        compliment_columns = [col for col in self.df.columns if 'compliment' in col]
        self.df['total_compliments'] = self.df[compliment_columns].sum(axis=1)
        self.df['is_elite_user'] = (
            (self.df['review_count_user'] > self.df['review_count_user'].quantile(0.75)) |
            (self.df['fans'] > self.df['fans'].quantile(0.75)) |
            (self.df['total_compliments'] > self.df['total_compliments'].quantile(0.75))
        ).astype(int)
        
        # Friend network features
        self.df['friends_count'] = self.df['friends'].apply(
            lambda x: len(str(x).split(',')) if pd.notna(x) and x != 'None' else 0
        )
        self.df['has_friends'] = (self.df['friends_count'] > 0).astype(int)
        
        # User engagement diversity
        self.df['engagement_diversity'] = (
            self.df['useful_user'] + self.df['funny_user'] + self.df['cool_user']
        ) / np.maximum(1, self.df['review_count_user'])
        
        return self.df
    
    def create_sentiment_momentum(self):
        """Calculate sentiment trends over time"""
        print("üìä Creating sentiment momentum features...")
        
        # First, we need to calculate sentiment (will be done in NLP section)
        # This is a placeholder for the momentum calculation
        self.df = self.df.sort_values(['business_id', 'date'])
        
        # Sentiment rolling averages (assuming 'review_sentiment' exists)
        if 'review_sentiment' in self.df.columns:
            self.df['sentiment_rolling_7d'] = self.df.groupby('business_id')['review_sentiment'].transform(
                lambda x: x.rolling(7, min_periods=1).mean()
            )
            self.df['sentiment_trend'] = self.df.groupby('business_id')['review_sentiment'].transform(
                lambda x: x.rolling(14, min_periods=2).apply(
                    lambda y: np.polyfit(range(len(y)), y, 1)[0] if len(y) > 1 else 0
                )
            )
        
        return self.df
    
    def create_nlp_features(self):
        """Create comprehensive NLP-based features from reviews"""
        print("üìù Engineering NLP features...")
        
        # Basic text statistics
        self.df['review_length'] = self.df['review'].str.len()
        self.df['word_count'] = self.df['review'].str.split().str.len()
        self.df['avg_word_length'] = self.df['review_length'] / np.maximum(1, self.df['word_count'])
        self.df['exclamation_count'] = self.df['review'].str.count('!')
        self.df['question_count'] = self.df['review'].str.count('\?')
        
        # Sentiment Analysis with multiple methods
        self._calculate_sentiment_scores()
        
        # Topic Modeling features
        self._extract_topic_features()
        
        # Keyword-based features
        self._extract_keyword_features()
        
        # Readability and complexity
        self._calculate_readability_scores()
        
        return self.df
    
    def _calculate_sentiment_scores(self):
        """Calculate multiple sentiment scores"""
        print("   üé≠ Calculating sentiment scores...")
        
        # VADER Sentiment (optimized for social media)
        sentiment_scores = self.df['review'].apply(
            lambda x: self.vader.polarity_scores(str(x))
        )
        self.df['vader_compound'] = sentiment_scores.apply(lambda x: x['compound'])
        self.df['vader_positive'] = sentiment_scores.apply(lambda x: x['pos'])
        self.df['vader_negative'] = sentiment_scores.apply(lambda x: x['neg'])
        self.df['vader_neutral'] = sentiment_scores.apply(lambda x: x['neu'])
        
        # TextBlob Sentiment
        self.df['textblob_polarity'] = self.df['review'].apply(
            lambda x: TextBlob(str(x)).sentiment.polarity
        )
        self.df['textblob_subjectivity'] = self.df['review'].apply(
            lambda x: TextBlob(str(x)).sentiment.subjectivity
        )
        
        # Combined sentiment score
        self.df['review_sentiment'] = (
            self.df['vader_compound'] * 0.6 + 
            self.df['textblob_polarity'] * 0.4
        )
    
    def _extract_topic_features(self):
        """Extract topic modeling features"""
        print("   üóÇÔ∏è Extracting topic features...")
        
        # Simple topic categories based on keywords
        topic_keywords = {
            'food_quality': ['delicious', 'tasty', 'flavor', 'fresh', 'quality', 'cooked'],
            'service': ['service', 'waitress', 'waiter', 'staff', 'friendly', 'attentive'],
            'ambiance': ['atmosphere', 'ambiance', 'decor', 'music', 'lighting', 'clean'],
            'price': ['price', 'expensive', 'cheap', 'worth', 'value', 'affordable'],
            'wait_time': ['wait', 'busy', 'crowded', 'reservation', 'line', 'queue']
        }
        
        for topic, keywords in topic_keywords.items():
            pattern = '|'.join(keywords)
            self.df[f'topic_{topic}'] = self.df['review'].str.lower().str.count(pattern)
            self.df[f'topic_{topic}_ratio'] = self.df[f'topic_{topic}'] / np.maximum(1, self.df['word_count'])
    
    def _extract_keyword_features(self):
        """Extract specific business-related keywords"""
        print("   üîç Extracting keyword features...")
        
        keyword_categories = {
            'positive_food': ['amazing', 'delicious', 'perfect', 'best', 'great', 'excellent', 'fantastic'],
            'negative_food': ['bad', 'terrible', 'awful', 'disgusting', 'overcooked', 'undercooked'],
            'service_positive': ['friendly', 'attentive', 'prompt', 'helpful', 'knowledgeable'],
            'service_negative': ['rude', 'slow', 'ignored', 'unprofessional', 'inattentive'],
            'recommendation': ['recommend', 'suggest', 'try', 'must have', 'favorite']
        }
        
        for category, keywords in keyword_categories.items():
            pattern = '|'.join(keywords)
            self.df[f'keyword_{category}'] = self.df['review'].str.lower().str.count(pattern)
    
    def _calculate_readability_scores(self):
        """Calculate text complexity scores"""
        print("   üìö Calculating readability scores...")
        
        # Simple readability proxy (higher = more complex)
        self.df['readability_score'] = (
            self.df['avg_word_length'] * 0.4 +
            (self.df['word_count'] / 100) * 0.3 +
            (self.df['sentence_count'] / 10) * 0.3
            if 'sentence_count' in self.df.columns else 0
        )
        
        # Emoji and special character analysis
        emoji_pattern = re.compile(
            "["
            "\U0001F600-\U0001F64F"  # emoticons
            "\U0001F300-\U0001F5FF"  # symbols & pictographs
            "\U0001F680-\U0001F6FF"  # transport & map symbols
            "\U0001F1E0-\U0001F1FF"  # flags (iOS)
            "]+", flags=re.UNICODE
        )
        
        self.df['emoji_count'] = self.df['review'].apply(
            lambda x: len(emoji_pattern.findall(str(x)))
        )
    
    def create_business_health_indicators(self):
        """Create comprehensive business health metrics"""
        print("üè¢ Engineering business health indicators...")
        
        # Review velocity and trends
        self.df['reviews_per_day'] = self.df.groupby('business_id')['date'].transform(
            lambda x: x.count() / (x.max() - x.min()).days if (x.max() - x.min()).days > 0 else 0
        )
        
        # Star rating volatility (business consistency)
        self.df['rating_volatility'] = self.df.groupby('business_id')['stars_user'].transform('std')
        
        # Response rate to negative reviews (improved version)
        negative_reviews = self.df[self.df['stars_user'] <= 2].groupby('business_id').size()
        total_reviews = self.df.groupby('business_id').size()
        self.df['negative_response_rate'] = (negative_reviews / total_reviews).fillna(0)
        
        # Checkin/review ratio
        self.df['checkin_review_ratio'] = self.df['checkin_count'] / np.maximum(1, self.df['review_count_business'])
        
        # Tip engagement rate
        self.df['tip_engagement_rate'] = self.df['tip_count'] / np.maximum(1, self.df['review_count_business'])
        
        # Compliment diversity score
        compliment_cols = [col for col in self.df.columns if 'compliment_' in col]
        if compliment_cols:
            self.df['compliment_diversity'] = self.df[compliment_cols].std(axis=1) / np.maximum(1, self.df[compliment_cols].mean(axis=1))
        
        # NEW: Review sentiment volatility
        if 'review_sentiment' in self.df.columns:
            self.df['sentiment_volatility'] = self.df.groupby('business_id')['review_sentiment'].transform('std')
        
        # NEW: User retention rate (simplified)
        user_repeat_visits = self.df.groupby('user_id')['business_id'].nunique()
        self.df['user_retention_proxy'] = self.df['user_id'].map(
            lambda x: 1 if user_repeat_visits.get(x, 0) > 1 else 0
        )
        
        # Engagement ratio (used in feature interactions)
        self.df['engagement_ratio'] = (
            self.df['useful_review'] + self.df['funny_review'] + self.df['cool_review']
        ) / np.maximum(1, self.df['review_count_user'])
        
        return self.df
    
    def create_competitive_features(self):
        """Create market and competitive landscape features"""
        print("üè™ Engineering competitive features...")
        
        # Category-based features
        if 'categories' in self.df.columns:
            self.df['category_count'] = self.df['categories'].str.split(',').str.len()
            
            # Category saturation (how many businesses in same category)
            category_counts = self.df['categories'].value_counts()
            self.df['category_saturation'] = self.df['categories'].map(category_counts)
            
            # NEW: Category diversity (unique categories per business)
            self.df['category_diversity'] = self.df['categories'].apply(
                lambda x: len(set(str(x).split(','))) if pd.notna(x) else 0
            )
        
        # Market position indicators
        business_stats = self.df.groupby('business_id').agg({
            'stars_business': 'first',
            'review_count_business': 'first'
        })
        
        # Rating percentile
        business_stats['rating_percentile'] = business_stats['stars_business'].rank(pct=True)
        business_stats['review_count_percentile'] = business_stats['review_count_business'].rank(pct=True)
        
        # Map back to main dataframe
        self.df['rating_percentile'] = self.df['business_id'].map(business_stats['rating_percentile'])
        self.df['review_count_percentile'] = self.df['business_id'].map(business_stats['review_count_percentile'])
        
        # NEW: Competitive pressure score
        self.df['competitive_pressure'] = (
            (1 - self.df['rating_percentile']) * 0.6 + 
            (1 - self.df['review_count_percentile']) * 0.4
        )
        
        # Unique attribute combinations
        attribute_columns = [col for col in self.df.columns if 'attribute' in col.lower()]
        if attribute_columns:
            self.df['unique_attributes_count'] = self.df[attribute_columns].notna().sum(axis=1)
            self.df['attribute_diversity'] = self.df[attribute_columns].nunique(axis=1)
        
        return self.df
    
    def create_graph_features(self):
        """Create social graph and network features"""
        print("üï∏Ô∏è Engineering graph features...")
        
        # User network strength (based on friends)
        self.df['network_strength'] = np.log1p(self.df['friends_count']) * np.log1p(self.df['fans'])
        
        # User centrality proxy
        high_influence_friends_threshold = self.df['friends_count'].quantile(0.75)
        self.df['high_influence_friends_ratio'] = (
            self.df['friends_count'] > high_influence_friends_threshold
        ).astype(int)
        
        # Business co-visitation pattern
        user_business_count = self.df.groupby('user_id')['business_id'].nunique()
        self.df['user_exploration_score'] = self.df['user_id'].map(user_business_count)
        
        # NEW: User network density (simplified)
        self.df['network_density'] = self.df['friends_count'] / np.maximum(1, self.df['fans'])
        
        # NEW: Business similarity score (based on categories)
        if 'categories' in self.df.columns:
            # Create a simple similarity proxy - businesses with same categories are "connected"
            category_business_count = self.df.groupby('categories')['business_id'].nunique()
            self.df['business_similarity_network'] = self.df['categories'].map(category_business_count)
        
        # NEW: Friend recommendation strength (proxy)
        self.df['friend_recommendation_strength'] = (
            self.df['friends_count'] * self.df['user_influence_score']
        )
        
        return self.df
    
    def create_feature_interactions(self):
        """Create interaction features between important variables"""
        print("‚ö° Creating feature interactions...")
        
        # User-Business interaction features
        self.df['user_business_affinity'] = (
            self.df['user_influence_score'] * self.df['rating_percentile']
        )
        
        # Sentiment-Rating discrepancy
        self.df['sentiment_rating_gap'] = (
            self.df['review_sentiment'] - (self.df['stars_user'] / 5)
        )
        
        # Engagement-Sentiment interaction
        self.df['high_engagement_positive'] = (
            (self.df['engagement_ratio'] > self.df['engagement_ratio'].quantile(0.75)) &
            (self.df['review_sentiment'] > 0)
        ).astype(int)
        
        # Time-Engagement interactions
        self.df['weekend_engagement'] = self.df['is_weekend'] * self.df['engagement_ratio']
        
        return self.df
    
    def execute_full_feature_engineering(self):
        """Execute the complete feature engineering pipeline"""
        print("üöÄ Starting Complete Feature Engineering Pipeline...")
        print("=" * 60)
        
        # Step 1: Data Preparation
        self.prepare_data()
        
        # Step 2: Temporal Features
        self.create_temporal_features()
        
        # Step 3: User Engagement Features
        self.create_user_engagement_features()
        
        # Step 4: NLP Features
        self.create_nlp_features()
        
        # Step 5: Business Health Indicators
        self.create_business_health_indicators()
        
        # Step 6: Competitive Features
        self.create_competitive_features()
        
        # Step 7: Graph Features
        self.create_graph_features()
        
        # Step 8: Feature Interactions
        self.create_feature_interactions()
        
        # Step 9: Trend Features
        self.create_trend_features()
        
        print("=" * 60)
        print("‚úÖ Feature Engineering Complete!")
        print(f"üìä Original features: {37}")
        print(f"üìà New features created: {len(self.df.columns) - 37}")
        print(f"üéØ Total features: {len(self.df.columns)}")
        
        return self.df
    
    def get_feature_summary(self):
        """Get summary of engineered features"""
        feature_categories = {
            'Temporal Features': [col for col in self.df.columns if any(x in col for x in 
                                ['day', 'month', 'quarter', 'weekend', 'holiday', 'rolling', 'growth'])],
            'User Features': [col for col in self.df.columns if any(x in col for x in 
                              ['influence', 'elite', 'friends', 'engagement', 'compliment'])],
            'NLP Features': [col for col in self.df.columns if any(x in col for x in 
                             ['sentiment', 'topic_', 'keyword_', 'vader', 'textblob', 'readability'])],
            'Business Features': [col for col in self.df.columns if any(x in col for x in 
                                  ['volatility', 'health', 'ratio', 'saturation', 'percentile'])],
            'Interaction Features': [col for col in self.df.columns if any(x in col for x in 
                                     ['affinity', 'gap', 'interaction'])]
        }
        
        for category, features in feature_categories.items():
            print(f"{category}: {len(features)} features")
        
        return feature_categories

In [80]:
# Cell 9: Run the pipeline ONCE and get results
print("üöÄ LAUNCHING FEATURE ENGINEERING PIPELINE...")

# Load your data
df = pd.read_csv('resturent.csv')
print(f"üìÅ Original data shape: {df.shape}")

# Initialize and run pipeline ONCE
feature_engineer = RestaurantFeatureEngineer(df)
enhanced_df = feature_engineer.execute_full_feature_engineering()

print(f"üéØ Enhanced data shape: {enhanced_df.shape}")
print(f"‚ú® New features created: {enhanced_df.shape[1] - df.shape[1]}")

# Save the results
enhanced_df.to_csv('restaurant_data_engineered.csv', index=False)
print("üíæ Saved as 'restaurant_data_engineered.csv'")

üöÄ LAUNCHING FEATURE ENGINEERING PIPELINE...


üìÅ Original data shape: (23555, 104)


AttributeError: 'RestaurantFeatureEngineer' object has no attribute 'execute_full_feature_engineering'

In [None]:
# Cell 10: Complete Insights Analysis
print("üîç COMPREHENSIVE FEATURE ANALYSIS")
print("=" * 50)

# 1. Basic Dataset Info
print("üìä DATASET OVERVIEW:")
print(f"   Original shape: {df.shape}")
print(f"   Enhanced shape: {enhanced_df.shape}")
print(f"   New features: {enhanced_df.shape[1] - df.shape[1]}")

# 2. Feature Categories Breakdown
feature_categories = {
    'Temporal Features': [col for col in enhanced_df.columns if any(x in col for x in 
                            ['day', 'month', 'quarter', 'weekend', 'holiday', 'rolling', 'growth'])],
    'User Features': [col for col in enhanced_df.columns if any(x in col for x in 
                          ['influence', 'elite', 'friends', 'engagement', 'compliment'])],
    'NLP Features': [col for col in enhanced_df.columns if any(x in col for x in 
                         ['sentiment', 'topic_', 'keyword_', 'vader', 'textblob', 'readability'])],
    'Business Health': [col for col in enhanced_df.columns if any(x in col for x in 
                            ['volatility', 'health', 'ratio', 'response', 'retention', 'diversity'])],
    'Competitive': [col for col in enhanced_df.columns if any(x in col for x in 
                      ['saturation', 'percentile', 'pressure', 'competitive', 'attribute'])],
    'Graph Features': [col for col in enhanced_df.columns if any(x in col for x in 
                       ['network', 'exploration', 'similarity', 'recommendation'])]
}

print("\nüìà FEATURE CATEGORIES BREAKDOWN:")
for category, features in feature_categories.items():
    print(f"   {category}: {len(features)} features")

# 3. Key Feature Statistics
print("\nüìä KEY FEATURE STATISTICS:")
key_features = [
    'user_influence_score', 'review_sentiment', 'business_age_days',
    'rating_volatility', 'engagement_ratio', 'rating_percentile',
    'category_saturation', 'network_strength'
]

for feature in key_features:
    if feature in enhanced_df.columns:
        stats = enhanced_df[feature].describe()
        print(f"   {feature}:")
        print(f"      Mean: {stats['mean']:.3f}, Std: {stats['std']:.3f}")
        print(f"      Min: {stats['min']:.3f}, Max: {stats['max']:.3f}")

# 4. Missing Blueprint Features Check
print("\nüîç MISSING BLUEPRINT FEATURES CHECK:")
blueprint_features = {
    'Business Health': ['rating_volatility', 'negative_response_rate', 'checkin_review_ratio', 
                       'tip_engagement_rate', 'compliment_diversity', 'sentiment_volatility'],
    'Competitive': ['category_saturation', 'rating_percentile', 'competitive_pressure', 
                   'unique_attributes_count'],
    'Graph': ['network_strength', 'user_exploration_score', 'friend_recommendation_strength']
}

for category, features in blueprint_features.items():
    missing = [f for f in features if f not in enhanced_df.columns]
    if missing:
        print(f"   ‚ùå {category} missing: {missing}")
    else:
        print(f"   ‚úÖ {category}: All implemented!")

# 5. Sample of Most Important New Features
print("\nüéØ TOP 10 NEW FEATURES (Sample):")
new_features = [col for col in enhanced_df.columns if col not in df.columns]
print([feature for feature in new_features[:10]])

In [None]:
# Cell 11: Get detailed feature summary
print("üìã DETAILED FEATURE SUMMARY")
print("=" * 50)

feature_summary = feature_engineer.get_feature_summary()

# Show all new features
print("\nüîç ALL NEW FEATURES CREATED:")
new_features = [col for col in enhanced_df.columns if col not in df.columns]
print(f"Total new features: {len(new_features)}")
for i, feature in enumerate(new_features, 1):
    print(f"{i:2d}. {feature}")

print(f"\n‚úÖ PIPELINE EXECUTION COMPLETE!")
print(f"üéØ Ready for Phase 3: Preprocessing & Modeling!")

In [68]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import re
from textblob import TextBlob
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import warnings
warnings.filterwarnings('ignore')

class RestaurantFeatureEngineer:
    def __init__(self, df):
        self.df = df.copy()
        self.vader = SentimentIntensityAnalyzer()
        
    def prepare_data(self):
        """Basic data preparation"""
        # Convert dates
        self.df['date'] = pd.to_datetime(self.df['date'])
        self.df['yelping_since'] = pd.to_datetime(self.df['yelping_since'])
        
        # Parse JSON-like columns (assuming they're stored as strings)
        self._parse_json_columns()
        
        return self.df
    
    def _parse_json_columns(self):
        """Parse JSON-like columns if they exist"""
        # This is a placeholder method
        # Add specific JSON parsing logic if needed for your data
        pass
    
    def create_temporal_features(self):
        """Create time-based features"""
        print("üïê Engineering temporal features...")
        
        # Business age features
        self.df['business_age_days'] = (self.df['date'].max() - self.df['yelping_since']).dt.days
        self.df['user_tenure_days'] = (self.df['date'].max() - self.df['yelping_since']).dt.days
        
        # Review timing features
        self.df['review_day_of_week'] = self.df['date'].dt.dayofweek
        self.df['review_month'] = self.df['date'].dt.month
        self.df['review_quarter'] = self.df['date'].dt.quarter
        self.df['is_weekend'] = self.df['review_day_of_week'].isin([5, 6]).astype(int)
        
        # Time since last review (per business)
        self.df['days_since_last_review'] = self.df.groupby('business_id')['date'].diff().dt.days
        
        # Seasonal features
        self.df['is_holiday_season'] = self.df['review_month'].isin([11, 12]).astype(int)
        self.df['is_summer'] = self.df['review_month'].isin([6, 7, 8]).astype(int)
        
        return self.df
    
    def create_trend_features(self):
        """Create moving averages and trend indicators"""
        print("üìà Creating trend features...")
        
        # Sort by date first
        self.df = self.df.sort_values(['business_id', 'date']).reset_index(drop=True)
        
        # Rolling averages using transform to avoid merge issues
        self.df['rolling_30d_stars_user'] = self.df.groupby('business_id')['stars_user'].transform(
            lambda x: x.rolling(window=30, min_periods=1).mean()
        )
        self.df['rolling_30d_useful_review'] = self.df.groupby('business_id')['useful_review'].transform(
            lambda x: x.rolling(window=30, min_periods=1).sum()
        )
        self.df['rolling_30d_cool_review'] = self.df.groupby('business_id')['cool_review'].transform(
            lambda x: x.rolling(window=30, min_periods=1).sum()
        )
        self.df['rolling_30d_funny_review'] = self.df.groupby('business_id')['funny_review'].transform(
            lambda x: x.rolling(window=30, min_periods=1).sum()
        )
        
        # Growth rates
        self.df['review_growth_30d'] = self.df.groupby('business_id')['review_count_business'].pct_change(periods=30)
        
        return self.df
    
    def create_user_engagement_features(self):
        """Create user behavior and engagement features"""
        print("üë• Engineering user engagement features...")
        
        # User influence score (composite metric)
        self.df['user_influence_score'] = (
            np.log1p(self.df['review_count_user']) * 0.3 +
            np.log1p(self.df['fans']) * 0.3 +
            np.log1p(self.df['useful_user'] + self.df['funny_user'] + self.df['cool_user']) * 0.2 +
            (self.df['average_stars'] / 5) * 0.2
        )
        
        # Elite user classification
        compliment_columns = [col for col in self.df.columns if 'compliment' in col]
        self.df['total_compliments'] = self.df[compliment_columns].sum(axis=1)
        self.df['is_elite_user'] = (
            (self.df['review_count_user'] > self.df['review_count_user'].quantile(0.75)) |
            (self.df['fans'] > self.df['fans'].quantile(0.75)) |
            (self.df['total_compliments'] > self.df['total_compliments'].quantile(0.75))
        ).astype(int)
        
        # Friend network features
        self.df['friends_count'] = self.df['friends'].apply(
            lambda x: len(str(x).split(',')) if pd.notna(x) and x != 'None' else 0
        )
        self.df['has_friends'] = (self.df['friends_count'] > 0).astype(int)
        
        # User engagement diversity
        self.df['engagement_diversity'] = (
            self.df['useful_user'] + self.df['funny_user'] + self.df['cool_user']
        ) / np.maximum(1, self.df['review_count_user'])
        
        return self.df
    
    def create_sentiment_momentum(self):
        """Calculate sentiment trends over time"""
        print("üìä Creating sentiment momentum features...")
        
        # First, we need to calculate sentiment (will be done in NLP section)
        # This is a placeholder for the momentum calculation
        self.df = self.df.sort_values(['business_id', 'date'])
        
        # Sentiment rolling averages (assuming 'review_sentiment' exists)
        if 'review_sentiment' in self.df.columns:
            self.df['sentiment_rolling_7d'] = self.df.groupby('business_id')['review_sentiment'].transform(
                lambda x: x.rolling(7, min_periods=1).mean()
            )
            self.df['sentiment_trend'] = self.df.groupby('business_id')['review_sentiment'].transform(
                lambda x: x.rolling(14, min_periods=2).apply(
                    lambda y: np.polyfit(range(len(y)), y, 1)[0] if len(y) > 1 else 0
                )
            )
        
        return self.df
    
    def create_nlp_features(self):
        """Create comprehensive NLP-based features from reviews"""
        print("üìù Engineering NLP features...")
        
        # Basic text statistics
        self.df['review_length'] = self.df['review'].str.len()
        self.df['word_count'] = self.df['review'].str.split().str.len()
        self.df['avg_word_length'] = self.df['review_length'] / np.maximum(1, self.df['word_count'])
        self.df['exclamation_count'] = self.df['review'].str.count('!')
        self.df['question_count'] = self.df['review'].str.count('\?')
        
        # Sentiment Analysis with multiple methods
        self._calculate_sentiment_scores()
        
        # Topic Modeling features
        self._extract_topic_features()
        
        # Keyword-based features
        self._extract_keyword_features()
        
        # Readability and complexity
        self._calculate_readability_scores()
        
        return self.df
    
    
     
    def _calculate_sentiment_scores(self):
        """Calculate multiple sentiment scores"""
        print("   üé≠ Calculating sentiment scores...")
        
        # VADER Sentiment (optimized for social media)
        sentiment_scores = self.df['review'].apply(
            lambda x: self.vader.polarity_scores(str(x))
        )
        self.df['vader_compound'] = sentiment_scores.apply(lambda x: x['compound'])
        self.df['vader_positive'] = sentiment_scores.apply(lambda x: x['pos'])
        self.df['vader_negative'] = sentiment_scores.apply(lambda x: x['neg'])
        self.df['vader_neutral'] = sentiment_scores.apply(lambda x: x['neu'])
        
        # TextBlob Sentiment
        self.df['textblob_polarity'] = self.df['review'].apply(
            lambda x: TextBlob(str(x)).sentiment.polarity
        )
        self.df['textblob_subjectivity'] = self.df['review'].apply(
            lambda x: TextBlob(str(x)).sentiment.subjectivity
        )
        
        # Combined sentiment score
        self.df['review_sentiment'] = (
            self.df['vader_compound'] * 0.6 + 
            self.df['textblob_polarity'] * 0.4
        )
    
    def _extract_topic_features(self):
        """Extract topic modeling features"""
        print("   üóÇÔ∏è Extracting topic features...")
        
        # Simple topic categories based on keywords
        topic_keywords = {
            'food_quality': ['delicious', 'tasty', 'flavor', 'fresh', 'quality', 'cooked'],
            'service': ['service', 'waitress', 'waiter', 'staff', 'friendly', 'attentive'],
            'ambiance': ['atmosphere', 'ambiance', 'decor', 'music', 'lighting', 'clean'],
            'price': ['price', 'expensive', 'cheap', 'worth', 'value', 'affordable'],
            'wait_time': ['wait', 'busy', 'crowded', 'reservation', 'line', 'queue']
        }
        
        for topic, keywords in topic_keywords.items():
            pattern = '|'.join(keywords)
            self.df[f'topic_{topic}'] = self.df['review'].str.lower().str.count(pattern)
            self.df[f'topic_{topic}_ratio'] = self.df[f'topic_{topic}'] / np.maximum(1, self.df['word_count'])
    
    def _extract_keyword_features(self):
        """Extract specific business-related keywords"""
        print("   üîç Extracting keyword features...")
        
        keyword_categories = {
            'positive_food': ['amazing', 'delicious', 'perfect', 'best', 'great', 'excellent', 'fantastic'],
            'negative_food': ['bad', 'terrible', 'awful', 'disgusting', 'overcooked', 'undercooked'],
            'service_positive': ['friendly', 'attentive', 'prompt', 'helpful', 'knowledgeable'],
            'service_negative': ['rude', 'slow', 'ignored', 'unprofessional', 'inattentive'],
            'recommendation': ['recommend', 'suggest', 'try', 'must have', 'favorite']
        }
        
        for category, keywords in keyword_categories.items():
            pattern = '|'.join(keywords)
            self.df[f'keyword_{category}'] = self.df['review'].str.lower().str.count(pattern)
    
    def _calculate_readability_scores(self):
        """Calculate text complexity scores"""
        print("   üìö Calculating readability scores...")
        
        # Simple readability proxy (higher = more complex)
        self.df['readability_score'] = (
            self.df['avg_word_length'] * 0.4 +
            (self.df['word_count'] / 100) * 0.3 +
            (self.df['sentence_count'] / 10) * 0.3
            if 'sentence_count' in self.df.columns else 0
        )
        
        # Emoji and special character analysis
        emoji_pattern = re.compile(
            "["
            "\U0001F600-\U0001F64F"  # emoticons
            "\U0001F300-\U0001F5FF"  # symbols & pictographs
            "\U0001F680-\U0001F6FF"  # transport & map symbols
            "\U0001F1E0-\U0001F1FF"  # flags (iOS)
            "]+", flags=re.UNICODE
        )
        
        self.df['emoji_count'] = self.df['review'].apply(
            lambda x: len(emoji_pattern.findall(str(x)))
        )
    
    def create_business_health_indicators(self):
        """Create comprehensive business health metrics"""
        print("üè¢ Engineering business health indicators...")
        
        # Review velocity and trends
        self.df['reviews_per_day'] = self.df.groupby('business_id')['date'].transform(
            lambda x: x.count() / (x.max() - x.min()).days if (x.max() - x.min()).days > 0 else 0
        )
        
        # Rating volatility (business consistency)
        self.df['rating_volatility'] = self.df.groupby('business_id')['stars_user'].transform('std')
        
        # Engagement ratios
        self.df['engagement_ratio'] = (
            self.df['useful_review'] + self.df['funny_review'] + self.df['cool_review']
        ) / np.maximum(1, self.df['review_count_user'])
        
        # Check-in to review ratio
        self.df['checkin_review_ratio'] = self.df['checkin_count'] / np.maximum(1, self.df['review_count_business'])
        
        # Tip engagement metrics
        self.df['tip_engagement'] = self.df['tip_count'] / np.maximum(1, self.df['review_count_business'])
        
        # Compliment diversity score
        compliment_cols = [col for col in self.df.columns if 'compliment_' in col]
        if compliment_cols:
            self.df['compliment_diversity'] = self.df[compliment_cols].std(axis=1) / np.maximum(1, self.df[compliment_cols].mean(axis=1))
        
        # Response rate proxy (assuming negative reviews get responses)
        negative_reviews = self.df[self.df['stars_user'] <= 2].groupby('business_id').size()
        total_reviews = self.df.groupby('business_id').size()
        response_rate = (negative_reviews / total_reviews).fillna(0)
        self.df['response_rate_proxy'] = self.df['business_id'].map(response_rate)
        
        return self.df
    
    def create_competitive_features(self):
        """Create market and competitive landscape features"""
        print("üè™ Engineering competitive features...")
        
        # Category-based features
        if 'categories' in self.df.columns:
            self.df['category_count'] = self.df['categories'].str.split(',').str.len()
            
            # Calculate category popularity (how many businesses in same category)
            category_counts = self.df.groupby('categories').size()
            self.df['category_saturation'] = self.df['categories'].map(category_counts)
        
        # Market position indicators
        business_avg_rating = self.df.groupby('business_id')['stars_business'].first()
        self.df['rating_percentile'] = self.df['business_id'].map(
            business_avg_rating.rank(pct=True)
        )
        
        # Unique selling proposition score
        attribute_columns = [col for col in self.df.columns if 'attribute' in col.lower()]
        if attribute_columns:
            self.df['unique_attributes_count'] = self.df[attribute_columns].notna().sum(axis=1)
        
        return self.df
    
    def create_graph_features(self):
        """Create social graph and network features"""
        print("üï∏Ô∏è Engineering graph features...")
        
        # User network strength (based on friends)
        self.df['network_strength'] = np.log1p(self.df['friends_count']) * np.log1p(self.df['fans'])
        
        # User centrality proxy (users with many friends who also have many friends)
        # This is simplified - in production you'd use NetworkX
        high_influence_friends_threshold = self.df['friends_count'].quantile(0.75)
        self.df['high_influence_friends_ratio'] = (
            self.df['friends_count'] > high_influence_friends_threshold
        ).astype(int)
        
        # Business co-visitation pattern (simplified)
        user_business_count = self.df.groupby('user_id')['business_id'].nunique()
        self.df['user_exploration_score'] = self.df['user_id'].map(user_business_count)
        
        return self.df
    
    def create_feature_interactions(self):
        """Create interaction features between important variables"""
        print("‚ö° Creating feature interactions...")
        
        # User-Business interaction features
        self.df['user_business_affinity'] = (
            self.df['user_influence_score'] * self.df['rating_percentile']
        )
        
        # Sentiment-Rating discrepancy
        self.df['sentiment_rating_gap'] = (
            self.df['review_sentiment'] - (self.df['stars_user'] / 5)
        )
        
        # Engagement-Sentiment interaction
        self.df['high_engagement_positive'] = (
            (self.df['engagement_ratio'] > self.df['engagement_ratio'].quantile(0.75)) &
            (self.df['review_sentiment'] > 0)
        ).astype(int)
        
        # Time-Engagement interactions
        self.df['weekend_engagement'] = self.df['is_weekend'] * self.df['engagement_ratio']
        
        return self.df
    
    def execute_full_feature_engineering(self):
        """Execute the complete feature engineering pipeline"""
        print("üöÄ Starting Complete Feature Engineering Pipeline...")
        print("=" * 60)
        
        # Step 1: Data Preparation
        self.prepare_data()
        
        # Step 2: Temporal Features
        self.create_temporal_features()
        
        # Step 3: User Engagement Features
        self.create_user_engagement_features()
        
        # Step 4: NLP Features
        self.create_nlp_features()
        
        # Step 5: Business Health Indicators
        self.create_business_health_indicators()
        
        # Step 6: Competitive Features
        self.create_competitive_features()
        
        # Step 7: Graph Features
        self.create_graph_features()
        
        # Step 8: Feature Interactions
        self.create_feature_interactions()
        
        # Step 9: Trend Features
        self.create_trend_features()
        
        print("=" * 60)
        print("‚úÖ Feature Engineering Complete!")
        print(f"üìä Original features: {37}")
        print(f"üìà New features created: {len(self.df.columns) - 37}")
        print(f"üéØ Total features: {len(self.df.columns)}")
        
        return self.df
    
    def get_feature_summary(self):
        """Get summary of engineered features"""
        feature_categories = {
            'Temporal Features': [col for col in self.df.columns if any(x in col for x in 
                                ['day', 'month', 'quarter', 'weekend', 'holiday', 'rolling', 'growth'])],
            'User Features': [col for col in self.df.columns if any(x in col for x in 
                              ['influence', 'elite', 'friends', 'engagement', 'compliment'])],
            'NLP Features': [col for col in self.df.columns if any(x in col for x in 
                             ['sentiment', 'topic_', 'keyword_', 'vader', 'textblob', 'readability'])],
            'Business Features': [col for col in self.df.columns if any(x in col for x in 
                                  ['volatility', 'health', 'ratio', 'saturation', 'percentile'])],
            'Interaction Features': [col for col in self.df.columns if any(x in col for x in 
                                     ['affinity', 'gap', 'interaction'])]
        }
        
        for category, features in feature_categories.items():
            print(f"{category}: {len(features)} features")
        
        return feature_categories

In [69]:
import pandas as pd

In [70]:
# How to use the complete pipeline
def run_feature_engineering_pipeline():
    # Load your data
    df = pd.read_csv('resturent.csv')
    
    # Initialize the feature engineer
    feature_engineer = RestaurantFeatureEngineer(df)
    
    # Execute complete pipeline
    enhanced_df = feature_engineer.execute_full_feature_engineering()
    
    # Get feature summary
    feature_summary = feature_engineer.get_feature_summary()
    
    # Save enhanced dataset
    enhanced_df.to_csv('resturent.csv', index=False)
    
    return enhanced_df

# Run the pipeline
final_df = run_feature_engineering_pipeline()

üöÄ Starting Complete Feature Engineering Pipeline...
üïê Engineering temporal features...
üë• Engineering user engagement features...
üìù Engineering NLP features...
üìù Engineering NLP features...
   üé≠ Calculating sentiment scores...
   üé≠ Calculating sentiment scores...
   üóÇÔ∏è Extracting topic features...
   üóÇÔ∏è Extracting topic features...
   üîç Extracting keyword features...
   üîç Extracting keyword features...
   üìö Calculating readability scores...
   üìö Calculating readability scores...
üè¢ Engineering business health indicators...
üè¢ Engineering business health indicators...
üè™ Engineering competitive features...
üï∏Ô∏è Engineering graph features...
‚ö° Creating feature interactions...
üìà Creating trend features...
üè™ Engineering competitive features...
üï∏Ô∏è Engineering graph features...
‚ö° Creating feature interactions...
üìà Creating trend features...
‚úÖ Feature Engineering Complete!
üìä Original features: 37
üìà New features creat

In [None]:
# Cell 11: Initialize the feature engineer and run the complete pipeline
print("üöÄ INITIALIZING FEATURE ENGINEERING PIPELINE...")

# Load your data
df = pd.read_csv('resturent.csv')
feature_engineer = RestaurantFeatureEngineer(df)

# Execute the full pipeline
enhanced_df = feature_engineer.execute_full_feature_engineering()

print("\n‚úÖ PIPELINE EXECUTION COMPLETE!")
print(f"üìä Original dataset shape: {df.shape}")
print(f"üìà Enhanced dataset shape: {enhanced_df.shape}")
print(f"üéØ Total new features created: {enhanced_df.shape[1] - df.shape[1]}")

üöÄ INITIALIZING FEATURE ENGINEERING PIPELINE...
üöÄ Starting Complete Feature Engineering Pipeline...
üïê Engineering temporal features...
üë• Engineering user engagement features...
üìù Engineering NLP features...
üöÄ Starting Complete Feature Engineering Pipeline...
üïê Engineering temporal features...
üë• Engineering user engagement features...
üìù Engineering NLP features...
   üé≠ Calculating sentiment scores...
   üé≠ Calculating sentiment scores...
   üóÇÔ∏è Extracting topic features...
   üóÇÔ∏è Extracting topic features...
   üîç Extracting keyword features...
   üîç Extracting keyword features...
   üìö Calculating readability scores...
   üìö Calculating readability scores...
üè¢ Engineering business health indicators...
üè¢ Engineering business health indicators...
üè™ Engineering competitive features...
üï∏Ô∏è Engineering graph features...
‚ö° Creating feature interactions...
üìà Creating trend features...
üè™ Engineering competitive features...
üï

In [None]:
df.head()

Unnamed: 0,user_id,business_id,stars_user,useful_review,funny_review,cool_review,review,date,name_user,review_count_user,...,user_exploration_score,user_business_affinity,sentiment_rating_gap,high_engagement_positive,weekend_engagement,rolling_30d_stars_user,rolling_30d_useful_review,rolling_30d_cool_review,rolling_30d_funny_review,review_growth_30d
0,cb4-kLOBtPmh1GGNT8ZtTg,--ZVrH2X2QXBFdCilbirsw,5.0,0,0,0,Delicious. FRESH! Good prices. Now my one and...,2016-02-12 21:40:09,RICK,5,...,1,0.803743,-0.29552,0,0.0,5.0,0.0,0.0,0.0,
1,Yy8JcvtMoNajJJW7k-y4MA,--ZVrH2X2QXBFdCilbirsw,5.0,1,0,0,These are the best hoagies and pizza in Ardmor...,2017-07-31 16:56:21,Brad,63,...,1,2.414928,-0.241451,0,0.0,5.0,1.0,0.0,0.0,
2,7iCjHZY74yCEF-Eajx5sIA,--ZVrH2X2QXBFdCilbirsw,5.0,0,0,0,This place is sadly perm closed. I was hoping ...,2018-02-24 00:53:41,John,25,...,1,1.603088,-1.12,0,0.0,5.0,1.0,0.0,0.0,
3,HpDFWnX-56Dpgmsw4ddohA,-0Ym1Wg3bXd_TDz8JtvOQg,4.0,0,0,0,Cute little shop with unique ice cream flavour...,2018-08-24 22:43:03,Janna,147,...,1,2.904498,-0.262773,0,0.0,4.0,0.0,0.0,0.0,
4,U-dNFjVZ907wxEFiOElOmw,-0fvhILrC9UsQ6gLNpZlTQ,5.0,0,0,0,Everyone was so sweet the second we walked in....,2018-03-18 21:41:13,Allisun,2,...,1,0.478876,-0.244075,0,0.0,5.0,0.0,0.0,0.0,


In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23555 entries, 0 to 23554
Columns: 104 entries, user_id to review_growth_30d
dtypes: float64(40), int64(52), object(12)
memory usage: 18.7+ MB
