In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from textblob import TextBlob
from wordcloud import WordCloud
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# Set plotting style

sns.set_palette("husl")

# Suppress warnings
import warnings
warnings.filterwarnings('ignore')

import nltk



In [None]:
def preprocess_data(file_path):
    """Load the gzipped JSON-lines review file and reduce it to one product.

    The function reads the file, lowercases every review, removes the custom
    stop words, discards reviews that are empty after cleaning, keeps the
    ``asin``, ``reviewText`` and ``overall`` columns, and finally keeps only
    the rows belonging to the most frequently reviewed ``asin``.

    Args:
        file_path: Path handed to ``pd.read_json(..., lines=True,
            compression='gzip')``.

    Returns:
        A new DataFrame with the columns ``asin``, ``reviewText`` and
        ``overall`` for the most reviewed product only.
    """
    # Load data
    print("Loading data...")
    df = pd.read_json(file_path, lines=True, compression='gzip')

    # Print initial info
    print("\nInitial Dataset Info:")
    print("-" * 50)
    print(f"Total number of reviews: {len(df)}")
    print(f"Number of unique products: {df['asin'].nunique()}")

    # Custom stop words specific to Kindle app reviews
    custom_stops = {
        'kindle', 'amazon', 'app', 'book', 'books', 'read', 'reading',
        'reader', 'version', 'using', 'use', 'used', 'one', 'get',
        'got', 'would', 'could', 'cant', 'cannot', 'im', 'ive',
        'device', 'time', 'like', 'really', 'way', 'even'
    }

    def clean_text(text):
        """Lowercase ``text`` and drop whitespace-delimited custom stop words.

        Non-string values (e.g. missing reviews) become the empty string.
        """
        if not isinstance(text, str):
            return ''
        # Filtering tokens (instead of replacing ' word ' substrings) also
        # removes stop words at the start/end of the review and stop words
        # that directly follow each other, and collapses extra whitespace.
        kept = [token for token in text.lower().split() if token not in custom_stops]
        return ' '.join(kept)

    # Clean reviewText
    print("\nCleaning review text...")
    df['reviewText'] = df['reviewText'].apply(clean_text)

    # clean_text never returns NaN and never leaves surrounding whitespace,
    # so comparing against the empty string is the only filter required.
    df = df[df['reviewText'] != '']

    # Keep essential columns
    df = df[['asin', 'reviewText', 'overall']]

    # Get most reviewed product (Kindle app)
    most_reviewed_asin = df['asin'].value_counts().index[0]
    df = df[df['asin'] == most_reviewed_asin].copy()

    print(f"\nNumber of reviews after preprocessing: {len(df)}")

    # Print sample of cleaned reviews
    print("\nSample of cleaned reviews:")
    print("-" * 50)
    print(df['reviewText'].head())

    return df


In [None]:

def perform_sentiment_analysis(df):
    """Score every review with TextBlob, bucket the scores, plot and summarise.

    Two columns are added to ``df`` in place:

    * ``sentiment_score`` - ``TextBlob(text).sentiment.polarity`` of
      ``reviewText``.
    * ``sentiment_category`` - ``'Positive'`` for a score above 0,
      ``'Negative'`` for a score below 0, otherwise ``'Neutral'``.

    A bar chart of the category counts is shown and a per-category summary
    (count, share, mean score) is printed.

    Args:
        df: DataFrame with a string ``reviewText`` column.

    Returns:
        The same DataFrame object, now carrying the two sentiment columns.
    """
    def get_sentiment(text):
        return TextBlob(text).sentiment.polarity

    def categorize_sentiment(score):
        if score > 0:
            return 'Positive'
        elif score < 0:
            return 'Negative'
        return 'Neutral'

    df['sentiment_score'] = df['reviewText'].apply(get_sentiment)
    df['sentiment_category'] = df['sentiment_score'].apply(categorize_sentiment)

    # Visualize sentiment distribution
    plt.figure(figsize=(12, 6))
    sentiment_counts = df['sentiment_category'].value_counts()

    colors = {'Positive': '#2ecc71', 'Negative': '#e74c3c', 'Neutral': '#95a5a6'}
    bars = plt.bar(sentiment_counts.index, sentiment_counts.values,
                   color=[colors[cat] for cat in sentiment_counts.index])

    # Add percentage labels (bars only exist when there is at least one review,
    # so the division by ``total`` inside the loop is safe)
    total = len(df)
    for bar in bars:
        height = bar.get_height()
        plt.text(bar.get_x() + bar.get_width()/2., height,
                 f'{height/total*100:.1f}%\n({int(height)})',
                 ha='center', va='bottom')

    plt.title('Sentiment Distribution in Kindle App Reviews', fontsize=14, pad=20)
    plt.xlabel('Sentiment Category', fontsize=12)
    plt.ylabel('Number of Reviews', fontsize=12)
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.show()

    # Print sentiment statistics
    print("\nSentiment Analysis Summary:")
    print("-" * 50)
    for category in ['Positive', 'Negative', 'Neutral']:
        # value_counts() omits categories with no reviews; treat them as zero
        # instead of failing with a KeyError.
        count = int(sentiment_counts.get(category, 0))
        percentage = (count / total) * 100 if total else 0.0
        # The mean of an empty selection is NaN and is printed as "nan".
        avg_score = df.loc[df['sentiment_category'] == category, 'sentiment_score'].mean()
        print(f"{category}: {count} reviews ({percentage:.1f}%), Average score: {avg_score:.3f}")

    return df

In [None]:
def perform_topic_modeling(df, n_topics=5, max_iter=100, random_state=42):
    """Fit an LDA topic model on the ``reviewText`` column.

    Each review is lowercased, tokenized with ``word_tokenize``, stripped of
    non-alphabetic tokens and stop words, and lemmatized. The cleaned documents
    are vectorized with ``CountVectorizer`` and fed to
    ``LatentDirichletAllocation``.

    Args:
        df: DataFrame with a ``reviewText`` column.
        n_topics: Number of LDA components (default 5).
        max_iter: Maximum number of LDA iterations (default 100).
        random_state: Seed passed to the LDA model (default 42).

    Returns:
        Tuple ``(lda_model, vectorizer, lda_output)`` where ``lda_output`` is
        the result of ``lda_model.fit_transform`` on the document-term matrix.
    """
    # Build the lemmatizer and the stop-word set once; they do not depend on
    # the individual document, so recreating them per review is wasted work.
    lemmatizer = WordNetLemmatizer()
    stop_words = set(stopwords.words('english'))
    stop_words.update({'kindle', 'amazon', 'app', 'book', 'books'})

    def preprocess_text(text):
        tokens = word_tokenize(str(text).lower())
        tokens = [lemmatizer.lemmatize(word) for word in tokens
                  if word.isalpha() and word not in stop_words]
        return ' '.join(tokens)

    print("Preprocessing text for topic modeling...")
    processed_docs = df['reviewText'].apply(preprocess_text)

    # Create document-term matrix
    vectorizer = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')
    doc_term_matrix = vectorizer.fit_transform(processed_docs)

    # Perform LDA
    lda_model = LatentDirichletAllocation(n_components=n_topics,
                                          random_state=random_state,
                                          max_iter=max_iter)
    lda_output = lda_model.fit_transform(doc_term_matrix)

    return lda_model, vectorizer, lda_output

In [None]:
def interpret_topics(lda_model, vectorizer, df):
    """Label each LDA topic from its top words and plot the word weights.

    For every row of ``lda_model.components_`` the highest-weighted words (up
    to 10) are compared with a fixed keyword dictionary; the label whose
    keywords overlap most with the top words is chosen. When no keyword
    overlaps, the label falls back to ``'Topic <n>'``. Ties keep the label
    that appears first in the dictionary. Several topics may end up with the
    same label.

    Args:
        lda_model: Fitted model exposing ``components_``.
        vectorizer: Fitted vectorizer exposing ``get_feature_names_out()``.
        df: Unused; kept so existing callers keep working.

    Returns:
        Dict mapping topic index (0-based) to the chosen label.
    """
    feature_names = vectorizer.get_feature_names_out()

    topic_keywords_dict = {
        'Installation & Setup': ['download', 'install', 'version', 'window', 'computer', 'pc', 'device', 'update'],
        'Reading Experience': ['read', 'page', 'screen', 'text', 'font', 'size', 'view', 'zoom'],
        'Technical Performance': ['work', 'issue', 'problem', 'slow', 'crash', 'bug', 'fix', 'error'],
        'Features & Usage': ['feature', 'use', 'easy', 'option', 'function', 'library', 'collection'],
        'Customer Support': ['support', 'help', 'service', 'customer', 'contact', 'response', 'update']
    }

    topic_labels = {}
    n_words = 10

    # Size the subplot grid from the number of topics instead of assuming
    # that at most six topics exist (five topics still yield a 3x2 grid).
    n_topics = len(lda_model.components_)
    n_cols = 2
    n_rows = max(1, (n_topics + n_cols - 1) // n_cols)

    # Plot topics
    plt.figure(figsize=(15, 10))
    for topic_idx, topic in enumerate(lda_model.components_):
        top_words_idx = topic.argsort()[:-n_words-1:-1]
        top_words = [feature_names[i] for i in top_words_idx]
        top_weights = [topic[i] for i in top_words_idx]

        # Determine topic label
        max_overlap = 0
        best_label = f'Topic {topic_idx + 1}'
        for label, keywords in topic_keywords_dict.items():
            overlap = len(set(top_words) & set(keywords))
            if overlap > max_overlap:
                max_overlap = overlap
                best_label = label

        topic_labels[topic_idx] = best_label

        # Plot topic; use the actual number of words found so a vocabulary
        # smaller than n_words does not cause a length mismatch in plt.bar.
        positions = range(len(top_words))
        plt.subplot(n_rows, n_cols, topic_idx + 1)
        plt.bar(positions, top_weights)
        plt.xticks(positions, top_words, rotation=45, ha='right')
        plt.title(f'{best_label}', fontsize=12, pad=10)
        plt.grid(True, alpha=0.3)

    plt.tight_layout()

    plt.show()

    return topic_labels

In [None]:
def analyze_topic_sentiment(df, topic_labels):
    """Plot and print how sentiment categories are distributed per topic.

    Args:
        df: DataFrame with ``topic_label`` and ``sentiment_category`` columns.
        topic_labels: Unused; kept so existing callers keep working.

    Returns:
        None. A stacked bar chart is shown and a textual breakdown is printed.
    """
    sentiments = ['Positive', 'Negative', 'Neutral']

    # Calculate topic-sentiment distribution
    topic_sentiment = pd.crosstab(df['topic_label'], df['sentiment_category'])
    topic_sentiment_pct = topic_sentiment.div(topic_sentiment.sum(axis=1), axis=0) * 100

    # Visualize. DataFrame.plot opens a new figure unless it is given an axes,
    # so create the axes explicitly and pass it in; otherwise a blank 12x6
    # figure is left behind.
    fig, ax = plt.subplots(figsize=(12, 6))
    colors = {'Positive': '#2ecc71', 'Negative': '#e74c3c', 'Neutral': '#95a5a6'}
    topic_sentiment_pct.plot(kind='bar', stacked=True, ax=ax,
                             color=[colors[cat] for cat in topic_sentiment_pct.columns])

    plt.title('Sentiment Distribution Across Topics', fontsize=14, pad=20)
    plt.xlabel('Topics', fontsize=12)
    plt.ylabel('Percentage of Reviews', fontsize=12)
    plt.legend(title='Sentiment')
    plt.xticks(rotation=45, ha='right')
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.show()

    # A sentiment that never occurs has no crosstab column; add zero-filled
    # columns for the report so the lookups below cannot raise KeyError.
    counts_full = topic_sentiment.reindex(columns=sentiments, fill_value=0)
    pct_full = topic_sentiment_pct.reindex(columns=sentiments, fill_value=0.0)

    # Print detailed analysis
    print("\nTopic-Sentiment Analysis:")
    print("-" * 50)
    for topic in counts_full.index:
        print(f"\n{topic}:")
        print(f"Total reviews: {counts_full.loc[topic].sum()}")
        print("Sentiment distribution:")
        for sentiment in sentiments:
            pct = pct_full.loc[topic, sentiment]
            count = counts_full.loc[topic, sentiment]
            print(f"  {sentiment}: {pct:.1f}% ({count} reviews)")

In [None]:
def generate_wordclouds(df):
    """Show one word cloud per sentiment category.

    For each of 'Positive', 'Negative' and 'Neutral' the matching
    ``reviewText`` values are joined into one string and rendered with
    ``WordCloud``. Categories without any text are skipped with a message.

    Args:
        df: DataFrame with ``sentiment_category`` and ``reviewText`` columns.
    """
    for sentiment in ['Positive', 'Negative', 'Neutral']:
        text = ' '.join(df[df['sentiment_category'] == sentiment]['reviewText'])

        # WordCloud.generate raises ValueError when it receives no words, so a
        # category without reviews must be skipped rather than plotted.
        if not text.strip():
            print(f"No {sentiment} reviews available - skipping word cloud.")
            continue

        wordcloud = WordCloud(width=800, height=400,
                              background_color='white',
                              max_words=150,
                              contour_width=3,
                              contour_color='steelblue').generate(text)

        plt.figure(figsize=(15, 8))
        plt.imshow(wordcloud, interpolation='bilinear')
        plt.axis('off')
        plt.title(f'Word Cloud - {sentiment} Reviews', fontsize=14, pad=20)
        plt.show()


In [None]:

    # Load the gzipped review dump and preprocess it; preprocess_data returns
    # only the rows of the most reviewed product.
df = preprocess_data('Software.json.gz')
    
   


In [None]:
 # Perform sentiment analysis (adds sentiment_score / sentiment_category to df)
df = perform_sentiment_analysis(df)

    

In [None]:
 # Perform topic modeling; lda_output is reused below to pick each review's
 # dominant topic.
lda_model, vectorizer, lda_output = perform_topic_modeling(df)
    
   

In [None]:
 # Interpret topics: plots the top words per topic and returns a
 # {topic index: label} mapping.
topic_labels = interpret_topics(lda_model, vectorizer, df)
    
  

In [None]:
# Assign topics to reviews: argmax over each row of lda_output gives the index
# of the highest-valued topic, which is then translated to its label.
# NOTE(review): this relies on lda_output rows being in the same order as the
# rows of df - confirm df was not filtered between the two cells.
df['dominant_topic'] = lda_output.argmax(axis=1)
df['topic_label'] = df['dominant_topic'].map(topic_labels)
    


# Analyze topic-sentiment relationship (needs the topic_label column set above
# and the sentiment_category column)
analyze_topic_sentiment(df, topic_labels)
    
    
    
 


In [None]:
# Generate one word cloud per sentiment category (requires the
# sentiment_category column on df)
generate_wordclouds(df)

In [None]:
def analyze_key_terms(df):
    """Count and chart predefined key terms found in the review text.

    The reviews are lowercased and split on whitespace. For four fixed term
    groups (product features, usability, performance, quality) a horizontal
    bar chart of the mention counts is drawn, followed by a printed summary
    listing, for every term, the number of mentions and the share of reviews
    that contain the term at least once.

    Args:
        df: DataFrame with a string ``reviewText`` column.
    """
    # One token set per review, used for the "% of reviews" figure.
    review_tokens = [set(text.lower().split()) for text in df['reviewText']]
    n_reviews = len(review_tokens)

    # Split reviews into words and get frequency
    words = ' '.join(df['reviewText']).lower().split()
    word_freq = pd.Series(words).value_counts()

    # Categorize terms
    feature_terms = ['screen', 'interface', 'library', 'font', 'page', 'search', 'sync', 'bookmark']
    usability_terms = ['easy', 'simple', 'intuitive', 'difficult', 'complicated', 'confusing']
    performance_terms = ['fast', 'slow', 'crash', 'freeze', 'smooth', 'responsive']
    quality_terms = ['great', 'excellent', 'poor', 'terrible', 'awesome', 'bad']

    categories = {
        'Product Features': feature_terms,
        'Usability': usability_terms,
        'Performance': performance_terms,
        'Quality': quality_terms
    }

    # Create visualization for each category
    plt.figure(figsize=(15, 10))

    for idx, (category, terms) in enumerate(categories.items(), 1):
        plt.subplot(2, 2, idx)

        # Get frequencies for terms in this category
        cat_freq = word_freq[word_freq.index.isin(terms)]
        if len(cat_freq) > 0:
            # Sort by frequency
            cat_freq = cat_freq.sort_values(ascending=True)

            # Create horizontal bar chart
            bars = plt.barh(range(len(cat_freq)), cat_freq.values)

            # Add value labels
            for bar in bars:
                width = bar.get_width()
                plt.text(width, bar.get_y() + bar.get_height()/2,
                         f'{int(width):,}',
                         ha='left', va='center', fontsize=10)

            # Customize appearance
            plt.yticks(range(len(cat_freq)), cat_freq.index)
            plt.title(f'{category} - Key Terms', pad=20)
            plt.xlabel('Frequency in Reviews')

    plt.tight_layout()

    plt.show()

    # Create summary table
    print("\nKey Terms Analysis Summary")
    print("-" * 50)
    for category, terms in categories.items():
        print(f"\n{category}:")
        cat_freq = word_freq[word_freq.index.isin(terms)].sort_values(ascending=False)
        for term, freq in cat_freq.items():
            # Share of reviews containing the term at least once. Dividing
            # the raw mention count by the number of reviews would overstate
            # it (and can exceed 100%) when a review repeats the term.
            containing = sum(term in tokens for tokens in review_tokens)
            percentage = (containing / n_reviews) * 100
            print(f"{term}: {freq:,} mentions ({percentage:.1f}% of reviews)")

def create_sentiment_summary(df):
    """Plot the monthly share of each sentiment category as a stacked area.

    A ``month`` column (monthly period derived from ``reviewTime``) is added
    to ``df`` in place, as in the original implementation.

    Args:
        df: DataFrame with ``reviewTime`` (parseable by ``pd.to_datetime``)
            and ``sentiment_category`` columns.

    Raises:
        KeyError: If ``df`` has no ``reviewTime`` column.
    """
    if 'reviewTime' not in df.columns:
        raise KeyError(
            "create_sentiment_summary needs a 'reviewTime' column; "
            "it is not present in the supplied DataFrame"
        )

    # Calculate sentiment patterns over time
    df['month'] = pd.to_datetime(df['reviewTime']).dt.to_period('M')
    monthly_sentiment = (
        df.groupby('month')['sentiment_category']
        .value_counts(normalize=True)
        .unstack()
        .fillna(0)  # months lacking a category contribute a zero share
    )

    # Pick colours by column name: unstack() orders the columns
    # alphabetically, so a positional colour list would paint 'Negative'
    # green and 'Positive' red.
    colors = {'Positive': '#2ecc71', 'Negative': '#e74c3c', 'Neutral': '#95a5a6'}

    # Create stacked area chart on an explicit axes so that no blank extra
    # figure is produced.
    fig, ax = plt.subplots(figsize=(12, 6))
    monthly_sentiment.plot(kind='area', stacked=True, ax=ax,
                           color=[colors[cat] for cat in monthly_sentiment.columns])

    plt.title('Sentiment Trends Over Time', pad=20)
    plt.xlabel('Time Period')
    plt.ylabel('Proportion of Reviews')
    plt.legend(title='Sentiment', bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.grid(True, alpha=0.3)

    plt.tight_layout()

    plt.show()

def create_key_metrics_dashboard(df):
    """Show a two-panel text dashboard with headline review metrics.

    The left panel lists the review count, mean ``overall`` rating, share of
    positive sentiment, and the number of reviews whose text contains any of
    the issue terms / positive terms (case-insensitive substring match). The
    right panel shows a fixed list of recommendations.

    Args:
        df: DataFrame with ``overall``, ``sentiment_category`` and
            ``reviewText`` columns.
    """
    # Calculate key metrics
    total_reviews = len(df)
    avg_rating = df['overall'].mean()
    sentiment_dist = df['sentiment_category'].value_counts(normalize=True)
    # value_counts() has no 'Positive' entry when no review is positive.
    positive_share = sentiment_dist.get('Positive', 0.0)

    # Common issues mentioned
    issue_terms = ['crash', 'bug', 'error', 'slow', 'difficult', 'confusing']
    issue_mentions = int(df['reviewText'].str.contains('|'.join(issue_terms), case=False).sum())

    # Positive aspects
    positive_terms = ['great', 'excellent', 'love', 'perfect', 'awesome']
    positive_mentions = int(df['reviewText'].str.contains('|'.join(positive_terms), case=False).sum())

    # Create summary visualization
    plt.figure(figsize=(15, 8))

    # Key metrics panel
    plt.subplot(1, 2, 1)
    metrics = [
        f'Total Reviews: {total_reviews:,}',
        f'Average Rating: {avg_rating:.1f}/5.0',
        f'Positive Sentiment: {positive_share*100:.1f}%',
        f'Issue Mentions: {issue_mentions:,}',
        f'Positive Mentions: {positive_mentions:,}'
    ]

    plt.axis('off')
    plt.title('Key Performance Metrics', pad=20, fontsize=14)
    for idx, metric in enumerate(metrics):
        plt.text(0.1, 0.8 - (idx * 0.15), metric, fontsize=12)

    # Recommendations panel (static text, not derived from the data)
    plt.subplot(1, 2, 2)
    plt.axis('off')
    plt.title('Key Insights & Recommendations', pad=20, fontsize=14)

    recommendations = [
        'Focus Areas:',
        '• Performance optimization',
        '• User interface simplification',
        '• Feature enhancement',
        'Success Metrics:',
        '• User satisfaction trending up',
        '• Decreased error reports'
    ]

    for idx, rec in enumerate(recommendations):
        plt.text(0.1, 0.8 - (idx * 0.1), rec, fontsize=12)

    plt.tight_layout()

    plt.show()

# Use all visualizations
def create_business_analysis(df):
    """Run the key-term, sentiment-trend and metrics-dashboard reports on ``df``.

    The sentiment-trend report reads ``df['reviewTime']``, so that column must
    be present in addition to the review text and sentiment columns.
    """
    reports = (
        analyze_key_terms,
        create_sentiment_summary,
        create_key_metrics_dashboard,
    )
    for report in reports:
        report(df)

In [None]:
def analyze_key_terms(df):
    """Count and chart predefined key terms found in the review text.

    The reviews are lowercased and split on whitespace. For four fixed term
    groups (product features, usability, performance, quality) a horizontal
    bar chart of the mention counts is drawn, followed by a printed summary
    listing, for every term, the number of mentions and the share of reviews
    that contain the term at least once.

    Args:
        df: DataFrame with a string ``reviewText`` column.
    """
    # One token set per review, used for the "% of reviews" figure.
    review_tokens = [set(text.lower().split()) for text in df['reviewText']]
    n_reviews = len(review_tokens)

    # Split reviews into words and get frequency
    words = ' '.join(df['reviewText']).lower().split()
    word_freq = pd.Series(words).value_counts()

    # Categorize terms
    feature_terms = ['screen', 'interface', 'library', 'font', 'page', 'search', 'sync', 'bookmark']
    usability_terms = ['easy', 'simple', 'intuitive', 'difficult', 'complicated', 'confusing']
    performance_terms = ['fast', 'slow', 'crash', 'freeze', 'smooth', 'responsive']
    quality_terms = ['great', 'excellent', 'poor', 'terrible', 'awesome', 'bad']

    categories = {
        'Product Features': feature_terms,
        'Usability': usability_terms,
        'Performance': performance_terms,
        'Quality': quality_terms
    }

    # Create visualization for each category
    plt.figure(figsize=(15, 10))

    for idx, (category, terms) in enumerate(categories.items(), 1):
        plt.subplot(2, 2, idx)

        # Get frequencies for terms in this category
        cat_freq = word_freq[word_freq.index.isin(terms)]
        if len(cat_freq) > 0:
            # Sort by frequency
            cat_freq = cat_freq.sort_values(ascending=True)

            # Create horizontal bar chart with custom colors
            bars = plt.barh(range(len(cat_freq)), cat_freq.values,
                            color=plt.cm.Blues(np.linspace(0.3, 0.9, len(cat_freq))))

            # Add value labels
            for bar in bars:
                width = bar.get_width()
                plt.text(width, bar.get_y() + bar.get_height()/2,
                         f'{int(width):,}',
                         ha='left', va='center', fontsize=10,
                         bbox=dict(facecolor='white', alpha=0.8, edgecolor='none'))

            # Customize appearance
            plt.yticks(range(len(cat_freq)), cat_freq.index)
            plt.title(f'{category} - Mentioned in Reviews', pad=20, fontsize=12)
            plt.xlabel('Number of Mentions')
            plt.grid(True, alpha=0.2)

    plt.tight_layout()
    plt.show()

    # Print summary table
    print("\nKey Terms Analysis Summary")
    print("-" * 50)
    for category, terms in categories.items():
        print(f"\n{category}:")
        cat_freq = word_freq[word_freq.index.isin(terms)].sort_values(ascending=False)
        if len(cat_freq) > 0:
            for term, freq in cat_freq.items():
                # Share of reviews containing the term at least once.
                # Dividing the raw mention count by the number of reviews
                # would overstate it (and can exceed 100%) when a review
                # repeats the term.
                containing = sum(term in tokens for tokens in review_tokens)
                percentage = (containing / n_reviews) * 100
                print(f"{term}: {freq:,} mentions ({percentage:.1f}% of reviews)")

def create_key_metrics_dashboard(df):
    """Show a three-panel dashboard and print headline review metrics.

    Panels: a pie chart of the sentiment categories, a bar chart with the
    number of reviews containing each issue term, and a bar chart with the
    number of reviews containing each positive term (case-insensitive
    substring match on ``reviewText``). Afterwards the review count, mean
    ``overall`` rating and sentiment percentages are printed.

    Args:
        df: DataFrame with ``overall``, ``sentiment_category`` and
            ``reviewText`` columns.
    """
    # Calculate key metrics
    total_reviews = len(df)
    avg_rating = df['overall'].mean()

    # Calculate sentiment distribution
    sentiment_dist = df['sentiment_category'].value_counts()
    sentiment_pct = sentiment_dist / len(df) * 100

    # Common issues and positive aspects
    issue_terms = ['crash', 'bug', 'error', 'slow', 'difficult', 'confusing']
    positive_terms = ['great', 'excellent', 'love', 'perfect', 'awesome']

    # Number of reviews whose text contains each term
    issue_counts = {term: int(df['reviewText'].str.contains(term, case=False).sum())
                    for term in issue_terms}
    positive_counts = {term: int(df['reviewText'].str.contains(term, case=False).sum())
                       for term in positive_terms}

    # Create visualization
    plt.figure(figsize=(16, 8))

    # 1. Sentiment Distribution Pie Chart
    plt.subplot(1, 3, 1)
    colors = {'Positive': '#2ecc71', 'Negative': '#e74c3c', 'Neutral': '#95a5a6'}
    # The percentages are already part of the wedge labels, so autopct is
    # left at its default. Passing autopct='' makes matplotlib evaluate
    # '' % value, which raises TypeError.
    plt.pie(sentiment_pct,
            labels=[f'{cat}\n{pct:.1f}%'
                    for cat, pct in zip(sentiment_dist.index, sentiment_pct)],
            colors=[colors[cat] for cat in sentiment_dist.index],
            startangle=90)
    plt.title('Sentiment Distribution', pad=20)

    # 2. Common Issues Bar Chart
    plt.subplot(1, 3, 2)
    issue_data = pd.Series(issue_counts).sort_values(ascending=True)
    bars = plt.barh(range(len(issue_data)), issue_data.values,
                    color=plt.cm.Reds(np.linspace(0.3, 0.7, len(issue_data))))
    plt.yticks(range(len(issue_data)), issue_data.index)
    plt.title('Common Issues Mentioned', pad=20)

    # Add value labels
    for bar in bars:
        width = bar.get_width()
        plt.text(width, bar.get_y() + bar.get_height()/2,
                 f'{int(width):,}',
                 ha='left', va='center')

    # 3. Positive Aspects Bar Chart
    plt.subplot(1, 3, 3)
    positive_data = pd.Series(positive_counts).sort_values(ascending=True)
    bars = plt.barh(range(len(positive_data)), positive_data.values,
                    color=plt.cm.Greens(np.linspace(0.3, 0.7, len(positive_data))))
    plt.yticks(range(len(positive_data)), positive_data.index)
    plt.title('Positive Aspects Mentioned', pad=20)

    # Add value labels
    for bar in bars:
        width = bar.get_width()
        plt.text(width, bar.get_y() + bar.get_height()/2,
                 f'{int(width):,}',
                 ha='left', va='center')

    plt.tight_layout()
    plt.show()

    # Print summary statistics
    print("\nKey Business Metrics Summary:")
    print("-" * 50)
    print(f"Total Reviews Analyzed: {total_reviews:,}")
    print(f"Average Rating: {avg_rating:.2f}/5.0")
    print("\nSentiment Distribution:")
    for cat, pct in sentiment_pct.items():
        print(f"{cat}: {pct:.1f}%")

# Function to run all analyses
def run_business_analysis(df):
    """Run the key-term and dashboard reports, then print recommendations.

    The "improve overall satisfaction" recommendation is printed only when
    fewer than 60% of the reviews are categorised as 'Positive'. The
    recommendation sections are numbered consecutively, so the list always
    starts at 1 regardless of which sections are shown.

    Args:
        df: DataFrame with ``reviewText``, ``overall`` and
            ``sentiment_category`` columns.
    """
    print("Business Analysis of Kindle App Reviews")
    print("=" * 50)

    # Run analyses
    analyze_key_terms(df)
    create_key_metrics_dashboard(df)

    # Print recommendations
    print("\nKey Recommendations:")
    print("-" * 50)
    total = len(df)
    # Guard against an empty frame instead of dividing by zero.
    positive_ratio = (df['sentiment_category'] == 'Positive').sum() / total if total else 0.0

    sections = []
    if positive_ratio < 0.6:
        sections.append((
            "Focus on improving overall user satisfaction:",
            ["Address common issues in user feedback",
             "Enhance features that receive positive mentions"],
        ))
    sections.append((
        "Priority Areas for Improvement:",
        ["Technical Performance: Address crash reports and speed issues",
         "User Interface: Simplify complex features",
         "Feature Enhancement: Focus on most requested features"],
    ))
    sections.append((
        "Success Metrics to Track:",
        ["User satisfaction trends",
         "Issue report frequency",
         "Feature usage statistics"],
    ))

    for number, (heading, bullets) in enumerate(sections, 1):
        if number > 1:
            print()
        print(f"{number}. {heading}")
        for bullet in bullets:
            print(f"   - {bullet}")

# Run the business analysis on the preprocessed frame; df must already carry
# the sentiment_category column.
run_business_analysis(df)