# Bug Reports Analysis Notebook

This notebook analyzes bug reports from a multilingual dataset. It includes:
- Language distribution analysis
- Bug reports over time visualization
- Common labels analysis
- Top bigrams and trigrams extraction
- Word count comparison between original and translated bug reports

Ensure that the dataset `multilingual_labelled_translated.csv` is available in the working directory before running the notebook. The analysis reads the columns `src_lang`, `created_at`, `labels`, `translation`, and `body`.


In [None]:
import matplotlib.pyplot as plt
import pandas as pd
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
import string
import warnings

# Silence every warning category so library notices do not clutter cell output
warnings.filterwarnings("ignore")

# Common English function words, passed to CountVectorizer as its stop-word
# list when extracting n-grams. Kept as one whitespace-separated string and
# split into a list so the vocabulary is easy to scan and extend.
custom_stopwords = (
    "the is at which on for in to and a an it this that with "
    "as of by from or be can not but if are will so what "
    "when where how have has we i you they their was were do "
    "does did about after before just like now only"
).split()


In [None]:
def simple_tokenize(text):
    """Split *text* into lowercase, punctuation-free, whitespace-separated tokens.

    Args:
        text: The value to tokenize. Anything that is not a ``str`` (for
            example ``None`` or a float NaN) is treated as having no tokens.

    Returns:
        A list of tokens; empty when *text* is not a string or contains no
        non-whitespace characters once ASCII punctuation is removed.
    """
    if not isinstance(text, str):
        return []

    # Deletion table: every character in string.punctuation maps to "nothing"
    strip_punctuation = str.maketrans("", "", string.punctuation)
    lowered = text.lower()
    return lowered.translate(strip_punctuation).split()


In [None]:
def graph_lang_distribution(df):
    """Show a bar chart of how many bug reports exist per source language.

    Args:
        df: DataFrame with a ``src_lang`` column. Rows whose language is
            missing are counted too (``dropna=False``) and get their own bar.
    """
    reports_per_language = df['src_lang'].value_counts(dropna=False)

    plt.figure(figsize=(12, 6))
    # Series.plot draws onto the figure that was just created
    reports_per_language.plot(kind='bar')

    plt.title("Distribution of Bug Reports by Language")
    plt.ylabel("Number of Bug Reports")
    plt.xlabel("Language")

    plt.xticks(rotation=45)
    # Horizontal guide lines only, to make bar heights easier to compare
    plt.grid(axis='y', linestyle='--', alpha=0.7)
    plt.show()


In [None]:
def bug_reports_over_time(df):
    """Show a line chart of the number of bug reports created per calendar day.

    The caller's DataFrame is left untouched: timestamps are parsed into a
    local Series instead of overwriting ``df['created_at']``. Parsing uses
    ``errors='coerce'``, which is lossy (unparseable values become NaT), so
    writing the result back would silently destroy the raw values.

    Args:
        df: DataFrame with a ``created_at`` column holding timestamps or
            timestamp-like strings.
    """
    # Unparseable values are coerced to NaT rather than raising
    created_at = pd.to_datetime(df['created_at'], errors='coerce')

    # Group on the calendar date so all reports from one day share a bucket;
    # groupby skips missing (NaT) keys by default.
    reports_per_day = created_at.groupby(created_at.dt.date).size()

    plt.figure(figsize=(12, 6))
    reports_per_day.plot(kind='line', marker='o', linestyle='-', color='green', alpha=0.7)
    plt.title("Bug Reports Over Time")
    plt.xlabel("Date")
    plt.ylabel("Number of Reports")
    plt.xticks(rotation=45)
    plt.grid(True)
    plt.show()


In [None]:
def top_20_labels(df):
    """Show a horizontal bar chart of the 20 most frequent labels.

    Each entry of ``df['labels']`` is expected to be a colon-separated string
    of label names (e.g. ``"bug:ui"``). Missing entries are ignored, and so
    are entries that are not strings.

    Args:
        df: DataFrame with a ``labels`` column.
    """
    # Count labels in a single pass. Concatenating the per-row lists with
    # ``.sum()`` copies the growing list on every addition (quadratic), and
    # yields the int 0 - which Counter rejects - when there are no labels.
    label_counts = Counter(
        label
        for entry in df['labels'].dropna()
        if isinstance(entry, str)
        for label in entry.split(':')
    )

    # most_common returns (label, count) pairs ordered by descending count
    top_labels = label_counts.most_common(20)
    names = [label for label, _ in top_labels]
    counts = [count for _, count in top_labels]

    plt.figure(figsize=(12, 6))
    plt.barh(names, counts, color='purple', alpha=0.7)
    plt.xlabel("Count")
    plt.ylabel("Labels")
    plt.title("Top 20 Most Common Labels in Bug Reports")
    # barh draws the first item at the bottom; flip so the top label is first
    plt.gca().invert_yaxis()
    plt.show()


In [None]:
def get_top_ngrams(corpus, ngram_range=(2, 2), top_n=20):
    """Return the most frequent n-grams in *corpus* as a DataFrame.

    Args:
        corpus: Iterable of text documents handed to ``CountVectorizer``.
        ngram_range: ``(min_n, max_n)`` n-gram sizes to extract; the default
            ``(2, 2)`` extracts bigrams only.
        top_n: Maximum number of n-grams to return.

    Returns:
        DataFrame with columns ``N-gram`` and ``Count``, ordered by
        descending count and holding at most *top_n* rows.
    """
    counter = CountVectorizer(stop_words=custom_stopwords, ngram_range=ngram_range)
    doc_term_matrix = counter.fit_transform(corpus)

    # Summing over axis 0 totals each n-gram across all documents; the result
    # is a single-row 2-D structure, hence the [0] after tolist().
    totals = doc_term_matrix.sum(axis=0).tolist()[0]
    vocabulary = counter.get_feature_names_out()

    ranked = list(zip(vocabulary, totals))
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return pd.DataFrame(ranked[:top_n], columns=['N-gram', 'Count'])


In [None]:
def _plot_top_ngrams(ngram_df, ylabel, title, color):
    """Draw one horizontal bar chart of n-gram counts, first row on top."""
    plt.figure(figsize=(12, 6))
    plt.barh(ngram_df['N-gram'], ngram_df['Count'], color=color, alpha=0.7)
    plt.xlabel("Count")
    plt.ylabel(ylabel)
    plt.title(title)
    # barh draws the first row at the bottom; flip so the top n-gram is first
    plt.gca().invert_yaxis()
    plt.show()


def top_bigrams_and_trigrams(df):
    """Show bar charts of the 20 most frequent bigrams and trigrams.

    The text comes from ``df['translation']``: it is lowercased and every
    character other than ``a-z`` and whitespace is removed before counting.
    The cleaned text is stored in a ``clean_text`` column on ``df``; the
    ``translation`` column itself is not modified.

    Args:
        df: DataFrame with a ``translation`` column.
    """
    # Fill missing values BEFORE casting to str: astype(str) turns NaN into
    # the literal string "nan", which would then be counted as a real word.
    translations = df['translation'].fillna('').astype(str)
    df['clean_text'] = translations.str.lower().str.replace(r'[^a-z\s]', '', regex=True)

    # Compute both tables before plotting so a failure shows no partial output
    top_bigrams = get_top_ngrams(df['clean_text'], ngram_range=(2, 2), top_n=20)
    top_trigrams = get_top_ngrams(df['clean_text'], ngram_range=(3, 3), top_n=20)

    _plot_top_ngrams(
        top_bigrams,
        ylabel="Bigrams",
        title="Top 20 Bigrams in Translated Bug Reports (After Removing Stopwords)",
        color='blue',
    )
    _plot_top_ngrams(
        top_trigrams,
        ylabel="Trigrams",
        title="Top 20 Trigrams in Translated Bug Reports (After Removing Stopwords)",
        color='green',
    )


In [None]:
def word_count(df):
    """Print summary statistics of word counts for original vs. translated reports.

    Words are whitespace-separated tokens. Two columns are added to ``df``:
    ``original_word_count`` (from ``body``) and ``translated_word_count``
    (from ``translation``). The ``body`` and ``translation`` columns
    themselves are not modified, and a missing text counts as zero words.

    Args:
        df: DataFrame with ``body`` and ``translation`` columns.
    """
    def count_words(column):
        # Fill missing values BEFORE casting to str: astype(str) alone turns
        # NaN into the literal "nan", which would be counted as one word.
        as_text = column.fillna('').astype(str)
        return as_text.apply(lambda text: len(text.split()))

    df['original_word_count'] = count_words(df['body'])
    df['translated_word_count'] = count_words(df['translation'])

    # describe() yields count/mean/std/min/quartiles/max, one column per series
    word_count_stats = pd.DataFrame({
        'Original Bug Reports': df['original_word_count'].describe(),
        'Translated Bug Reports': df['translated_word_count'].describe()
    })
    print(word_count_stats)


In [None]:
# Load the dataset and run every analysis step in order.
file_path = "multilingual_labelled_translated.csv"  # Update the filename accordingly
df = pd.read_csv(file_path)

# A bare `df.head()` is only rendered when it is the last expression of a
# cell, so print the preview explicitly to make it actually appear.
print(df.head())

graph_lang_distribution(df)
bug_reports_over_time(df)
top_20_labels(df)
top_bigrams_and_trigrams(df)
word_count(df)
