# AI-Powered Intelligence for Agile Teams
### 1. Data Understanding

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.tree import DecisionTreeRegressor, plot_tree
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from textblob import TextBlob

In [None]:
# Load the dataset from the Excel workbook into a DataFrame.
# (An earlier revision read "GFG_FINAL.csv" via pd.read_csv instead.)
excel_path = "../data/GFG_FINAL-modified.xlsx"
readfile = pd.read_excel(io=excel_path)

In [None]:
# Display the loaded DataFrame as the cell output
readfile

In [None]:
# Show every column name found in the loaded data
list(readfile.columns)

In [None]:
# Number of columns in the loaded data
readfile.shape[1]

In [None]:
# Number of rows in the loaded data
readfile.shape[0]

In [None]:
# Work on a copy holding the issue fields plus the base 'Comment' column and
# its numbered siblings 'Comment.1' ... 'Comment.84'.
base_fields = ['Summary', 'Issue key', 'Issue Type', 'Status', 'Created', 'Resolved']
comment_fields = ['Comment'] + [f'Comment.{n}' for n in range(1, 85)]
new = readfile[base_fields + comment_fields].copy()
new

In [None]:
# Bar chart: number of rows per value of the 'Issue Type' column
issue_type_counts = new['Issue Type'].value_counts()

fig, ax = plt.subplots(figsize=(10, 6))
issue_type_counts.plot.bar(ax=ax)
ax.set_title('Issue Type Distribution')
ax.set_xlabel('Issue Type')
ax.set_ylabel('Count')
# Tilt the category labels so longer names stay readable
plt.setp(ax.get_xticklabels(), rotation=45)
fig.tight_layout()
plt.show()

### 2. Sentiment Analysis on Comment Columns


In [None]:
# Identify the raw comment columns: names that start with "Comment", excluding
# the derived "<col>_polarity" / "<col>_subjectivity" / "<col>_sentiment"
# columns that the sentiment step appends to `new`. Without the exclusion,
# re-running this cell after that step would treat the derived columns as
# comments too.
derived_suffixes = ('_polarity', '_subjectivity', '_sentiment')
comment_columns = [
    col for col in new.columns
    if col.startswith('Comment') and not col.endswith(derived_suffixes)
]
print(f"Found {len(comment_columns)} comment columns:")
print(comment_columns)


In [None]:
# Function to analyze sentiment of a text
def analyze_sentiment(text):
    """
    Analyze the sentiment of a single text value with TextBlob.

    Args:
        text: The value to score. It is converted with ``str()`` before
            analysis; missing values (``pd.isna``) and blank strings are
            not analyzed.

    Returns:
        A ``(polarity, subjectivity, sentiment_label)`` tuple:
        - polarity: ranges from -1 (negative) to 1 (positive)
        - subjectivity: ranges from 0 (objective) to 1 (subjective)
        - sentiment_label: 'positive' (polarity > 0.1), 'negative'
          (polarity < -0.1), otherwise 'neutral'

        Missing/blank input, or any error raised while scoring, yields
        ``(np.nan, np.nan, 'neutral')``.
    """
    if pd.isna(text) or str(text).strip() == '':
        return (np.nan, np.nan, 'neutral')

    try:
        blob = TextBlob(str(text))
        polarity = blob.sentiment.polarity
        subjectivity = blob.sentiment.subjectivity
    except Exception:
        # Best-effort scoring: one bad value must not abort a whole column.
        # Catching Exception (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate, so a long-running analysis can be interrupted.
        return (np.nan, np.nan, 'neutral')

    # Classify sentiment
    if polarity > 0.1:
        sentiment_label = 'positive'
    elif polarity < -0.1:
        sentiment_label = 'negative'
    else:
        sentiment_label = 'neutral'

    return (polarity, subjectivity, sentiment_label)


In [None]:
# Perform sentiment analysis on all comment columns
print("Performing sentiment analysis on comment columns...")
sentiment_results = {}
new_columns_dict = {}

for col in comment_columns:
    print(f"Processing {col}...")
    # Each cell becomes a (polarity, subjectivity, label) tuple
    sentiments = new[col].apply(analyze_sentiment)

    polarity = sentiments.apply(lambda x: x[0])
    subjectivity = sentiments.apply(lambda x: x[1])
    labels = sentiments.apply(lambda x: x[2])

    # Collect the derived columns in a dictionary; they are attached to `new`
    # in one concat after the loop rather than one assignment at a time.
    new_columns_dict[f'{col}_polarity'] = polarity
    new_columns_dict[f'{col}_subjectivity'] = subjectivity
    new_columns_dict[f'{col}_sentiment'] = labels

    # analyze_sentiment labels missing/blank cells as 'neutral', so the label
    # counts are restricted to rows that actually contain comment text.
    # Otherwise neutral_count would be inflated by every empty cell.
    has_text = new[col].notna() & (new[col].astype(str).str.strip() != '')
    labels_with_text = labels[has_text]

    # Store summary statistics
    sentiment_results[col] = {
        'total_comments': new[col].notna().sum(),
        'non_empty_comments': has_text.sum(),
        'avg_polarity': polarity.mean(),
        'avg_subjectivity': subjectivity.mean(),
        'positive_count': (labels_with_text == 'positive').sum(),
        'negative_count': (labels_with_text == 'negative').sum(),
        'neutral_count': (labels_with_text == 'neutral').sum()
    }

# Concatenate all new columns at once to avoid DataFrame fragmentation.
# Same-named columns left over from a previous run are dropped first, so
# re-running this cell does not create duplicate derived columns.
new = pd.concat(
    [
        new.drop(columns=list(new_columns_dict), errors='ignore'),
        pd.DataFrame(new_columns_dict),
    ],
    axis=1,
)

print("Sentiment analysis completed!")

In [None]:
# Create a summary dataframe for all comment columns: one row per comment
# column, one column per statistic. Building it with orient='index' lets each
# statistic keep its own dtype; constructing column-wise and transposing would
# mix counts and averages in one column and turn the counts into floats.
sentiment_summary = pd.DataFrame.from_dict(sentiment_results, orient='index')
# Most-populated comment columns first
sentiment_summary = sentiment_summary.sort_values('non_empty_comments', ascending=False)
print("Sentiment Analysis Summary for All Comment Columns:")
print(sentiment_summary)


In [None]:
# Aggregate all comments into a single analysis
# Combine all comment columns into one list for overall analysis
all_comments = []
for col in comment_columns:
    non_empty = new[col].dropna()
    # Drop cells that contain only whitespace
    non_empty = non_empty[non_empty.astype(str).str.strip() != '']
    all_comments.extend(non_empty.tolist())

print(f"Total non-empty comments across all columns: {len(all_comments)}")

# Analyze overall sentiment
overall_sentiments = [analyze_sentiment(comment) for comment in all_comments]
# NaN scores (comments that could not be analyzed) are left out of the
# numeric lists; every comment still contributes a label.
overall_polarities = [s[0] for s in overall_sentiments if not np.isnan(s[0])]
overall_subjectivities = [s[1] for s in overall_sentiments if not np.isnan(s[1])]
overall_labels = [s[2] for s in overall_sentiments]

total_labels = len(overall_labels)
if total_labels == 0:
    # Without this guard the percentage lines below divide by zero
    print("\nNo non-empty comments found; skipping overall sentiment statistics.")
else:
    print(f"\nOverall Sentiment Statistics:")
    print(f"Average Polarity: {np.mean(overall_polarities):.4f}")
    print(f"Average Subjectivity: {np.mean(overall_subjectivities):.4f}")
    print(f"\nSentiment Distribution:")
    for label in ('positive', 'negative', 'neutral'):
        label_count = overall_labels.count(label)
        print(f"{label.capitalize()}: {label_count} ({label_count/total_labels*100:.2f}%)")


In [None]:
# Collect sample comments from each sentiment category
print("="*80)
print("TOP COMMENTS FROM EACH SENTIMENT CATEGORY")
print("="*80)

# Build one record per non-empty comment that has both a label and a polarity.
# Walking the three aligned columns with zip avoids a separate `new.loc[...]`
# lookup for every cell.
comments_with_sentiment = []
for col in comment_columns:
    scored_cells = zip(new[col], new[f'{col}_sentiment'], new[f'{col}_polarity'])
    for comment, sentiment, polarity in scored_cells:
        if pd.isna(comment) or str(comment).strip() == '':
            continue
        if pd.isna(sentiment) or pd.isna(polarity):
            continue
        comments_with_sentiment.append({
            'comment': str(comment),
            'sentiment': sentiment,
            'polarity': polarity,
            'column': col
        })

# Explicit columns keep the filters below valid even when no comment qualified
# (a DataFrame built from an empty list would otherwise have no columns).
comments_df = pd.DataFrame(
    comments_with_sentiment,
    columns=['comment', 'sentiment', 'polarity', 'column'],
)
# Keep polarity numeric even for an empty frame so sorting and abs() work
comments_df['polarity'] = comments_df['polarity'].astype(float)

# Number of top comments to show per category
num_samples = 15


def _print_top_comments(ranked, empty_message, limit, max_chars=600):
    """Print the first `limit` rows of an already-sorted comment frame.

    Each comment is cut to `max_chars` characters; `empty_message` is printed
    instead when `ranked` has no rows.
    """
    if len(ranked) == 0:
        print(empty_message)
        return
    for i, (_, row) in enumerate(ranked.head(limit).iterrows(), 1):
        print(f"\n[{i}] Polarity: {row['polarity']:.4f} | Column: {row['column']}")
        print(f"Comment: {row['comment'][:max_chars]}")  # Limit the shown text
        if len(row['comment']) > max_chars:
            print("... (truncated)")


print("\n" + "="*80)
print(f"TOP {num_samples} POSITIVE COMMENTS (Highest Polarity)")
print("="*80)
positive_comments = comments_df[comments_df['sentiment'] == 'positive'].sort_values('polarity', ascending=False)
_print_top_comments(positive_comments, "No positive comments found.", num_samples)

print("\n" + "="*80)
print(f"TOP {num_samples} NEGATIVE COMMENTS (Lowest Polarity)")
print("="*80)
negative_comments = comments_df[comments_df['sentiment'] == 'negative'].sort_values('polarity', ascending=True)
_print_top_comments(negative_comments, "No negative comments found.", num_samples)

print("\n" + "="*80)
print(f"TOP {num_samples} NEUTRAL COMMENTS (Closest to Zero)")
print("="*80)
neutral_comments = comments_df[comments_df['sentiment'] == 'neutral'].copy()
# Rank neutral comments by distance from zero polarity
neutral_comments['abs_polarity'] = neutral_comments['polarity'].abs()
neutral_comments = neutral_comments.sort_values('abs_polarity', ascending=True)
_print_top_comments(neutral_comments, "No neutral comments found.", num_samples)

print("\n" + "="*80)


In [None]:
# Header for the detailed per-sentiment listing
banner = "=" * 80
print("\n" + banner)
print("DETAILED TOP COMMENTS BY SENTIMENT CATEGORY")
print(banner)

# How many top samples to display for each sentiment category
num_samples_detailed = 15

# Function to gather the top comments of one sentiment type across all columns
def get_top_comments(sentiment_type, num_samples=15):
    """
    Return the top comments carrying the given sentiment label.

    Reads the module-level `new` DataFrame and `comment_columns` list. For
    every comment column, rows whose "<col>_sentiment" equals `sentiment_type`
    and whose comment is not missing are collected; blank comments and rows
    without a polarity are skipped.

    Args:
        sentiment_type: Label to select ('positive' sorts by highest
            polarity, 'negative' by lowest, any other value by absolute
            polarity closest to zero).
        num_samples: Maximum number of rows to return.

    Returns:
        DataFrame with columns 'comment' (stripped text), 'polarity',
        'column' and 'index' (row label in `new`), plus 'abs_polarity' for
        the closest-to-zero ordering. An empty DataFrame when nothing matches.
    """
    collected = []
    for col in comment_columns:
        label_col = f'{col}_sentiment'
        score_col = f'{col}_polarity'

        # Rows of this column that carry the requested label and a comment
        is_match = (new[label_col] == sentiment_type) & new[col].notna()
        candidates = new.loc[is_match, [col, score_col]]

        for row_label, record in candidates.iterrows():
            text = str(record[col]).strip()
            score = record[score_col]
            if not text or pd.isna(score):
                continue
            collected.append({
                'comment': text,
                'polarity': score,
                'column': col,
                'index': row_label
            })

    if not collected:
        return pd.DataFrame()

    frame = pd.DataFrame(collected)

    if sentiment_type in ('positive', 'negative'):
        # Positive: most positive first; negative: most negative first
        ranked = frame.sort_values('polarity', ascending=(sentiment_type == 'negative'))
    else:
        # Neutral: closest to zero polarity first
        frame['abs_polarity'] = frame['polarity'].abs()
        ranked = frame.sort_values('abs_polarity', ascending=True)

    return ranked.head(num_samples)

def _show_samples(samples, empty_message, max_chars=800):
    """Print each sampled comment with its polarity and source column.

    Comments longer than `max_chars` are cut and suffixed with '...';
    `empty_message` is printed when `samples` has no rows.
    """
    if len(samples) == 0:
        print(empty_message)
        return
    for position, (_, sample) in enumerate(samples.iterrows(), start=1):
        print(f"\n[Sample {position}]")
        print(f"  Polarity Score: {sample['polarity']:.4f}")
        print(f"  Source Column: {sample['column']}")
        print(f"  Comment:")
        text = sample['comment']
        # Long comments are truncated rather than wrapped
        shown = f"{text[:max_chars]}..." if len(text) > max_chars else text
        print(f"  {shown}")


rule = "-" * 80

# Positive comments, most positive first
print("\n" + rule)
print(f"TOP {num_samples_detailed} POSITIVE COMMENTS (Highest Polarity)")
print(rule)
pos_samples = get_top_comments('positive', num_samples_detailed)
_show_samples(pos_samples, "No positive comments found.")

# Negative comments, most negative first
print("\n" + rule)
print(f"TOP {num_samples_detailed} NEGATIVE COMMENTS (Lowest Polarity)")
print(rule)
neg_samples = get_top_comments('negative', num_samples_detailed)
_show_samples(neg_samples, "No negative comments found.")

# Neutral comments, closest to zero polarity first
print("\n" + rule)
print(f"TOP {num_samples_detailed} NEUTRAL COMMENTS (Closest to Zero Polarity)")
print(rule)
neu_samples = get_top_comments('neutral', num_samples_detailed)
_show_samples(neu_samples, "No neutral comments found.")

print("\n" + "=" * 80)


In [None]:
# Visualization 1: bar chart of label counts over all analyzed comments
sentiment_counts = pd.Series(overall_labels).value_counts()
colors = {'positive': 'green', 'negative': 'red', 'neutral': 'gray'}
# Labels without a configured colour fall back to blue
bar_colors = [colors.get(label, 'blue') for label in sentiment_counts.index]

fig, ax = plt.subplots(figsize=(10, 6))
sentiment_counts.plot.bar(ax=ax, color=bar_colors)
ax.set_title('Overall Sentiment Distribution Across All Comments')
ax.set_xlabel('Sentiment')
ax.set_ylabel('Count')
# Keep the category labels horizontal
plt.setp(ax.get_xticklabels(), rotation=0)
fig.tight_layout()
plt.show()


In [None]:
# Visualization 2: histogram of polarity scores with a marker at zero
fig, ax = plt.subplots(figsize=(12, 6))
ax.hist(overall_polarities, bins=50, edgecolor='black', alpha=0.7)
# Dashed reference line at polarity 0
ax.axvline(x=0, color='red', linestyle='--', linewidth=2, label='Neutral')
ax.set_title('Distribution of Sentiment Polarity Scores')
ax.set_xlabel('Polarity Score (Negative to Positive)')
ax.set_ylabel('Frequency')
ax.legend()
ax.grid(True, alpha=0.3)
fig.tight_layout()
plt.show()


In [None]:
# Visualization 3: average polarity for the first 20 rows of the summary
# (the 20 comment columns with the most comments)
top_columns = sentiment_summary.head(20)
avg_polarities = top_columns['avg_polarity']
bar_positions = range(len(top_columns))

# Green above zero, red below zero, gray otherwise
bar_colors = []
for value in avg_polarities:
    if value > 0:
        bar_colors.append('green')
    elif value < 0:
        bar_colors.append('red')
    else:
        bar_colors.append('gray')

fig, ax = plt.subplots(figsize=(14, 8))
ax.barh(bar_positions, avg_polarities, color=bar_colors)
ax.set_yticks(bar_positions)
ax.set_yticklabels(top_columns.index)
ax.set_xlabel('Average Polarity Score')
ax.set_title('Average Sentiment Polarity by Comment Column (Top 20 by Comment Count)')
ax.axvline(x=0, color='black', linestyle='--', linewidth=1)
fig.tight_layout()
plt.show()


In [None]:
# Visualization 4: Sentiment Distribution by Comment Column (Top 10)
top_10_columns = sentiment_summary.head(10).index
fig, axes = plt.subplots(2, 5, figsize=(20, 10))
axes = axes.flatten()

# Define color mapping for sentiment labels
sentiment_colors = {'positive': 'green', 'negative': 'red', 'neutral': 'gray'}

for idx, col in enumerate(top_10_columns):
    sentiment_col = f'{col}_sentiment'
    # analyze_sentiment labels missing/blank cells as 'neutral'; count labels
    # only for rows that contain comment text so the bars match the comment
    # count shown in the title.
    has_text = new[col].notna() & (new[col].astype(str).str.strip() != '')
    sentiment_dist = new.loc[has_text, sentiment_col].value_counts()
    if not sentiment_dist.empty:
        colors = [sentiment_colors.get(x, 'blue') for x in sentiment_dist.index]
        sentiment_dist.plot(kind='bar', ax=axes[idx], color=colors, rot=45)
    # int() keeps the count free of a trailing ".0" if it is stored as a float
    comment_count = int(sentiment_summary.loc[col, "non_empty_comments"])
    axes[idx].set_title(f'{col}\n({comment_count} comments)')
    axes[idx].set_xlabel('')
    axes[idx].set_ylabel('Count')

# Hide panels that have no column to show (fewer than 10 comment columns)
for unused_ax in axes[len(top_10_columns):]:
    unused_ax.set_visible(False)

plt.suptitle('Sentiment Distribution for Top 10 Comment Columns', fontsize=16, y=1.02)
plt.tight_layout()
plt.show()


In [None]:
# Visualization 5: scatter of polarity against subjectivity
max_points = 5000
total_points = len(overall_polarities)

if total_points <= max_points:
    sample_polarities = overall_polarities
    sample_subjectivities = overall_subjectivities
else:
    # Too many points to read comfortably: plot a random subset, using the
    # same positions in both lists so each point keeps its own pair of scores
    sample_indices = np.random.choice(total_points, max_points, replace=False)
    sample_polarities = [overall_polarities[pos] for pos in sample_indices]
    sample_subjectivities = [overall_subjectivities[pos] for pos in sample_indices]

fig, ax = plt.subplots(figsize=(12, 8))
ax.scatter(sample_polarities, sample_subjectivities, alpha=0.5, s=10)
ax.set_xlabel('Polarity (Negative to Positive)')
ax.set_ylabel('Subjectivity (Objective to Subjective)')
ax.set_title('Sentiment Analysis: Polarity vs Subjectivity')
# Reference lines at subjectivity 0.5 and polarity 0
ax.axhline(y=0.5, color='gray', linestyle='--', linewidth=1, alpha=0.5)
ax.axvline(x=0, color='gray', linestyle='--', linewidth=1, alpha=0.5)
ax.grid(True, alpha=0.3)
fig.tight_layout()
plt.show()


In [None]:
# Detailed Summary Report
def _print_score_stats(metric_name, scores):
    """Print the mean, range and standard deviation of a list of scores."""
    if len(scores) == 0:
        # np.min / np.max raise ValueError on an empty sequence
        print(f"Average {metric_name}: n/a (no scored comments)")
        return
    print(f"Average {metric_name}: {np.mean(scores):.4f}")
    print(f"  - Range: [{np.min(scores):.4f}, {np.max(scores):.4f}]")
    print(f"  - Standard Deviation: {np.std(scores):.4f}")


print("="*80)
print("SENTIMENT ANALYSIS SUMMARY REPORT")
print("="*80)
print(f"\nTotal Comment Columns Analyzed: {len(comment_columns)}")
print(f"Total Non-Empty Comments: {len(all_comments)}")

print("\n" + "-"*80)
print("OVERALL SENTIMENT METRICS")
print("-"*80)
_print_score_stats("Polarity", overall_polarities)
print()
_print_score_stats("Subjectivity", overall_subjectivities)

print("\n" + "-"*80)
print("SENTIMENT LABEL DISTRIBUTION")
print("-"*80)
label_counts = pd.Series(overall_labels).value_counts()
for label, count in label_counts.items():
    percentage = (count / len(overall_labels)) * 100
    print(f"{label.capitalize()}: {count:,} ({percentage:.2f}%)")

print("\n" + "-"*80)
print("TOP 10 COMMENT COLUMNS BY COMMENT COUNT")
print("-"*80)
top_10_summary = sentiment_summary.head(10)
for col in top_10_summary.index:
    # Counts are cast with int() so they print as whole numbers even when the
    # summary frame stores them as floats
    non_empty = int(top_10_summary.loc[col, 'non_empty_comments'])
    positive = int(top_10_summary.loc[col, 'positive_count'])
    negative = int(top_10_summary.loc[col, 'negative_count'])
    neutral = int(top_10_summary.loc[col, 'neutral_count'])
    print(f"\n{col}:")
    print(f"  - Non-empty comments: {non_empty}")
    print(f"  - Average polarity: {top_10_summary.loc[col, 'avg_polarity']:.4f}")
    print(f"  - Positive: {positive}, "
          f"Negative: {negative}, "
          f"Neutral: {neutral}")

print("\n" + "="*80)


In [None]:
# Export sentiment summary to CSV
from pathlib import Path

output_path = '../results/sentiment_analysis_summary.csv'
# to_csv does not create missing folders, so make sure the target exists
Path(output_path).parent.mkdir(parents=True, exist_ok=True)
sentiment_summary.to_csv(output_path)
print(f"Sentiment analysis summary exported to '{output_path}'")
