In [None]:
pip install httpx

In [None]:
import httpx
import asyncio
from transformers import pipeline, AutoTokenizer
import sqlite3
import os
import nest_asyncio
from datetime import datetime
import torch
nest_asyncio.apply()


GITHUB_API_URL = "https://api.github.com/repos/SeldonIO/seldon-core/issues"
# Prefer a token supplied through the GITHUB_TOKEN environment variable so the
# credential does not have to live in the notebook; the placeholder remains the
# fallback when the variable is not set.
GITHUB_TOKEN = os.getenv('GITHUB_TOKEN', 'XXXXX')

# Headers sent with every GitHub request made by fetch_issues().
HEADERS = {
    'Authorization': f'token {GITHUB_TOKEN}',
    'User-Agent': 'score'
}

# Hugging Face Token

HF_TOKEN = os.getenv('xxxxxx')
# Device index handed to the pipelines: 0 when CUDA is available, otherwise -1.
device = 0 if torch.cuda.is_available() else -1

# Load Models
issue_type_model = pipeline("zero-shot-classification", model="facebook/bart-large-mnli",device=device)
# Issue-type and severity classification use the same checkpoint and pass their
# candidate labels on every call, so one pipeline instance serves both instead of
# loading the model weights twice.
severity_model = issue_type_model

async def fetch_issues(session, state='open', per_page=100):
    """Download every issue of the configured repository, page by page.

    Pages are requested in batches of ``max_concurrent_requests`` concurrent
    calls. Paging stops as soon as any page of a batch comes back with fewer
    than ``per_page`` items (the end of the listing), or when every request of
    a batch failed, so a persistent error cannot loop forever.

    Args:
        session: An ``httpx.AsyncClient``-like object exposing an awaitable ``get``.
        state: Issue state filter sent to the API ('open' by default).
        per_page: Number of items requested per page.

    Returns:
        A list with the decoded JSON items of all pages that were fetched
        successfully. Failed pages are reported on stdout and skipped.
    """
    issues = []
    page = 1
    max_concurrent_requests = 10

    async def fetch_page(page_number):
        # Returns the decoded page, or None when the request failed, so the
        # caller can tell "failed" apart from "empty page".
        params = {'page': page_number, 'per_page': per_page, 'state': state}
        try:
            response = await session.get(GITHUB_API_URL, headers=HEADERS, params=params)
            response.raise_for_status()
            return response.json()
        except httpx.RequestError as exc:
            print(f"An error occurred while requesting {exc.request.url!r}: {exc}")
            return None
        except httpx.HTTPStatusError as exc:
            print(f"Error response {exc.response.status_code} while requesting {exc.request.url!r}: {exc}")
            return None

    while True:
        results = await asyncio.gather(
            *(fetch_page(page + offset) for offset in range(max_concurrent_requests))
        )

        reached_last_page = False
        for result in results:
            if result is None:
                continue
            issues.extend(result)
            # A short (or empty) page means there is nothing after it.
            if len(result) < per_page:
                reached_last_page = True

        if reached_last_page or all(result is None for result in results):
            break

        page += max_concurrent_requests

    return issues

def identify_issue_type(issue):
    """Decide whether an issue is a bug, a feature request or something else.

    Labels take precedence, then keywords in the title; only when neither gives
    an answer is the zero-shot classifier asked.

    Args:
        issue: Issue dict with optional 'labels' (list of dicts with a 'name')
            and 'title' entries.

    Returns:
        A ``(issue_type, labels)`` tuple where ``labels`` are the lower-cased
        label names of the issue.
    """
    labels = []
    for entry in issue.get('labels', []):
        labels.append(entry['name'].lower())

    raw_title = issue.get('title')
    title = raw_title.lower() if raw_title else ''

    if 'bug' in labels:
        issue_type = 'Bug'
    elif 'feature request' in labels or 'enhancement' in labels:
        issue_type = 'Feature Request'
    elif 'bug' in title:
        issue_type = 'Bug'
    elif 'feature' in title:
        issue_type = 'Feature Request'
    else:
        # Nothing conclusive in labels or title: fall back to the model and
        # take its highest-ranked label.
        prediction = issue_type_model(
            title,
            candidate_labels=["Bug", "Feature Request", "Other"],
        )
        issue_type = prediction['labels'][0]

    return issue_type, labels

def classify_severity(issue, severity_model):
    """Assign a severity ('Critical', 'High', 'Medium' or 'Low') to an issue.

    Order of precedence: the word 'critical' in the (truncated) body, then a
    severity word inside any label name, then the zero-shot model.

    Args:
        issue: Issue dict with optional 'labels' and 'body' entries. A missing
            or null body is treated as an empty string.
        severity_model: Callable invoked as
            ``severity_model(text, candidate_labels=[...])`` that returns a dict
            whose 'labels' list is ordered best-first.

    Returns:
        The capitalized severity name.
    """
    labels = [label['name'].lower() for label in issue.get('labels') or []]
    # The body can be present but null; `or ''` covers that as well as a missing key.
    body = issue.get('body') or ''

    # Keep only the first 512 *characters* of the body. This is a cheap bound on
    # the text handed to the model, not a token count.
    max_length = 512
    truncated_body = body[:max_length]

    # Check for severity-specific keywords first
    if 'critical' in truncated_body.lower():
        return 'Critical'

    # Check the labels for severity (substring match inside each label name)
    for label in labels:
        if 'critical' in label:
            return 'Critical'
        elif 'high' in label:
            return 'High'
        elif 'medium' in label:
            return 'Medium'
        elif 'low' in label:
            return 'Low'

    # Use the zero-shot classification model if no severity label is found
    candidate_labels = ["Critical", "High", "Medium", "Low"]
    severity_result = severity_model(truncated_body, candidate_labels=candidate_labels)
    severity_label = severity_result['labels'][0].lower()  # Get the top predicted label

    return severity_label.capitalize()

def store_issues_in_db(issues, label, db_path='issues.db'):
    """Classify the fetched issues and (re)build the ``issues`` table with them.

    Issues with an empty body are skipped, and only issues classified as 'Bug'
    or 'Feature Request' are stored. The existing ``issues`` table is dropped
    and recreated on every call.

    Args:
        issues: Iterable of issue dicts as returned by the GitHub API.
        label: Value written to the ``repo_name`` column of every row.
        db_path: Path of the SQLite database file.
    """
    # Classify everything first: if a model call fails, the table that is
    # already on disk is left untouched.
    issue_data = []
    for issue in issues:
        title = (issue.get('title') or '').strip()
        body = (issue.get('body') or '').strip()
        if not body:  # Skip issues with empty bodies
            continue

        issue_type, labels = identify_issue_type(issue)
        if issue_type in ['Bug', 'Feature Request']:
            severity = classify_severity(issue, severity_model)
            issue_data.append((
                label,
                issue['id'],
                title,
                body,
                ','.join(labels),
                issue['state'],
                issue['created_at'],
                issue['updated_at'],
                issue['comments'],
                issue['user']['login'],
                issue_type,
                severity
            ))

    conn = sqlite3.connect(db_path)
    try:
        c = conn.cursor()
        c.execute('DROP TABLE IF EXISTS issues')
        c.execute('''
        CREATE TABLE IF NOT EXISTS issues (
            repo_name TEXT,
            id INTEGER,
            title TEXT,
            body TEXT,
            labels TEXT,
            state TEXT,
            created_at TEXT,
            updated_at TEXT,
            comments INTEGER,
            user TEXT,
            issue_type TEXT,
            severity TEXT,
            PRIMARY KEY (repo_name, id)
        )
    ''')
        c.executemany('''
        INSERT OR IGNORE INTO issues (repo_name, id, title, body, labels, state, created_at, updated_at, comments, user, issue_type, severity)
        VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
    ''', issue_data)
        conn.commit()
    finally:
        # Release the database handle even when a statement fails.
        conn.close()

async def main():
    """Fetch the repository's issues and store the classified ones under 'score'."""
    async with httpx.AsyncClient() as http_client:
        fetched_issues = await fetch_issues(http_client)
        store_issues_in_db(fetched_issues, 'score')

# Run the main function
asyncio.run(main())

def print_filtered_issues(db_path='issues.db'):
    """Print id, title, type and labels of every stored Bug / Feature Request.

    Args:
        db_path: Path of the SQLite database file holding the ``issues`` table.
    """
    conn = sqlite3.connect(db_path)
    try:
        # Select the columns by name so the output does not depend on the
        # physical column order of the table.
        rows = conn.execute(
            "SELECT id, title, issue_type, labels FROM issues "
            "WHERE issue_type IN ('Bug', 'Feature Request')"
        ).fetchall()
    finally:
        # Close the handle even when the query fails (e.g. missing table).
        conn.close()

    for issue_id, title, issue_type, labels in rows:
        print(f"Issue ID: {issue_id}, Title: {title}, Type: {issue_type}, Labels: {labels}")

# List the Bug / Feature Request rows currently stored in the default issues.db.
print_filtered_issues()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt

def load_issues_from_db(db_path='issues.db'):
    """Load the stored Bug / Feature Request rows into a DataFrame.

    Args:
        db_path: Path of the SQLite database file holding the ``issues`` table.

    Returns:
        A DataFrame with every column of the matching rows.
    """
    conn = sqlite3.connect(db_path)
    try:
        return pd.read_sql_query(
            "SELECT * FROM issues WHERE issue_type IN ('Bug', 'Feature Request')", conn
        )
    finally:
        # Close the handle even when the query fails (e.g. the table is missing).
        conn.close()

# Load the stored issues into the kernel-global df_issues and dump the frame.
df_issues = load_issues_from_db()
print(df_issues)


# Bar chart: number of issues per issue type.
plt.figure(figsize=(10, 5))
df_issues['issue_type'].value_counts().plot(kind='bar', color=['blue', 'orange'])
plt.title('Distribution of Issues by Type')
plt.xlabel('Issue Type')
plt.ylabel('Count')
plt.show()



# Fixed colour per severity level so the bars are coloured by meaning, not by position.
severity_color_map = {
   'Critical': '#FF0000',
   'High': '#FF8C00',  # Dark orange
   'Medium': '#FFD700',
   'Low': '#008000'
}

severity_counts = df_issues['severity'].value_counts()

# One colour per bar, in the order value_counts() returned the levels.
# A severity value missing from severity_color_map raises KeyError here.
colors = [severity_color_map[severity] for severity in severity_counts.index]

# Bar chart: number of issues per severity level.
plt.figure(figsize=(10, 5))
severity_counts.plot(kind='bar', color=colors)
plt.title('Distribution of Severity Levels')
plt.xlabel('Severity Level')
plt.ylabel('Count')
plt.show()



In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load the data from the database
def load_issues_from_db(db_path='issues.db'):
    """Load the stored Bug / Feature Request rows into a DataFrame.

    Args:
        db_path: Path of the SQLite database file holding the ``issues`` table.

    Returns:
        A DataFrame with every column of the matching rows.
    """
    conn = sqlite3.connect(db_path)
    try:
        return pd.read_sql_query(
            "SELECT * FROM issues WHERE issue_type IN ('Bug', 'Feature Request')", conn
        )
    finally:
        # Close the handle even when the query fails (e.g. the table is missing).
        conn.close()

# Reload the stored issues into the kernel-global df_issues.
df_issues = load_issues_from_db()

# Report whether the 'severity' column is present (diagnostic print only;
# execution continues either way).
if 'severity' in df_issues.columns:
    print("Severity column exists")
else:
    print("Severity column is missing")

print(df_issues)



# Fixed colour per severity level, used as the seaborn hue palette below.
severity_palette = {
    'Critical': '#FF0000',
    'High': '#FF8C00',  # Dark orange
    'Medium': '#FFD700',
    'Low': '#008000'
}

# Grouped count plot: one group per issue type, one bar per severity level.
plt.figure(figsize=(14, 7))
sns.countplot(
    data=df_issues,
    x='issue_type',
    hue='severity',
    palette=severity_palette  # Use the custom color palette
)
plt.title('Distribution of Issue Types and Severities')
plt.xlabel('Issue Type')
plt.ylabel('Count')
plt.legend(title='Severity')
plt.show()

In [None]:
# Titles of the issues rated 'Critical'. df_issues is not loaded in this cell;
# it is whatever an earlier cell left bound in the kernel.
df_critical_titles = df_issues[df_issues['severity'] == 'Critical']['title']

print(df_critical_titles)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

def load_issues_from_db(db_path='issues.db'):
    """Load the stored Bug / Feature Request rows into a DataFrame.

    Args:
        db_path: Path of the SQLite database file holding the ``issues`` table.

    Returns:
        A DataFrame with every column of the matching rows.
    """
    conn = sqlite3.connect(db_path)
    try:
        return pd.read_sql_query(
            "SELECT * FROM issues WHERE issue_type IN ('Bug', 'Feature Request')", conn
        )
    finally:
        # Close the handle even when the query fails (e.g. the table is missing).
        conn.close()

df_issues = load_issues_from_db()

# Convert created_at column to datetime
df_issues['created_at'] = pd.to_datetime(df_issues['created_at'])

# Add a month column
df_issues['month'] = df_issues['created_at'].dt.to_period('M')

# Group by month and issue type, then count the number of issues
df_grouped = df_issues.groupby(['month', 'issue_type']).size().unstack().fillna(0)

# Plotting
# DataFrame.plot() opens its own figure when no `ax` is passed, so a preceding
# plt.figure(figsize=...) would only leave an empty figure behind and its size
# would not apply to the chart. The size is therefore given to plot() directly.
df_grouped.plot(marker='o', figsize=(14, 7))
plt.title('Trend of Issues Over Time')
plt.xlabel('Date')
plt.ylabel('Number of Issues')
plt.legend(title='Issue Type')
plt.grid(True)
plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import sqlite3

# Load the issues data from the SQLite database
def load_issues_from_db(db_path='issues.db'):
    """Load the stored Bug / Feature Request rows into a DataFrame.

    Args:
        db_path: Path of the SQLite database file holding the ``issues`` table.

    Returns:
        A DataFrame with every column of the matching rows.
    """
    conn = sqlite3.connect(db_path)
    try:
        return pd.read_sql_query(
            "SELECT * FROM issues WHERE issue_type IN ('Bug', 'Feature Request')", conn
        )
    finally:
        # Close the handle even when the query fails (e.g. the table is missing).
        conn.close()

# Categorize and prioritize issues
def categorize_and_prioritize_issues(df):
    """Print and chart issue counts by type and severity, plus the most repeated titles.

    Args:
        df: Issues frame with 'issue_type', 'severity' and 'title' columns.
    """
    # Rows = issue type, columns = severity, cells = number of issues.
    counts_by_type_and_severity = (
        df.groupby(['issue_type', 'severity'])
        .size()
        .unstack(fill_value=0)
    )

    # The ten titles that occur most often.
    most_repeated_titles = df['title'].value_counts().head(10)

    print("Issue Counts by Type and Severity:")
    print(counts_by_type_and_severity)

    print("\nTop 10 Frequent Issues:")
    print(most_repeated_titles)

    # Stacked bars: one bar per issue type, segmented by severity.
    counts_by_type_and_severity.plot(kind='bar', stacked=True)
    plt.title('Issue Counts by Type and Severity')
    plt.xlabel('Issue Type')
    plt.ylabel('Count')
    plt.show()

# Temporal trends: Plot issues over time
def plot_issues_over_time(df):
    """Plot the number of issues created per month as a line chart.

    The caller's frame is not modified: the timestamps are parsed into a
    temporary copy instead of overwriting ``df['created_at']``.

    Args:
        df: Issues frame with a 'created_at' column parseable by ``pd.to_datetime``.
    """
    created_at = pd.to_datetime(df['created_at'])
    monthly_counts = (
        df.assign(created_at=created_at)
        .set_index('created_at')
        .resample('M')
        .size()
    )
    monthly_counts.plot(kind='line')
    plt.title('Issues Over Time')
    plt.xlabel('Month')
    plt.ylabel('Number of Issues')
    plt.show()



def plot_severity_distribution(df):
    """Draw a pie chart of how the issues split across the four severity levels.

    Args:
        df: Issues frame with a 'severity' column.
    """
    # Severity levels in display order, each paired with its fixed colour.
    severity_palette = [
        ('Critical', '#FF0000'),
        ('High', '#FF8C00'),  # Dark orange
        ('Medium', '#FFD700'),
        ('Low', '#008000'),
    ]
    severity_order = [level for level, _ in severity_palette]
    colors = [colour for _, colour in severity_palette]

    # Levels that do not occur are kept with a count of 0 so the order is stable.
    severity_counts = (
        df['severity']
        .value_counts()
        .reindex(severity_order, fill_value=0)
    )

    severity_counts.plot(
        kind='pie',
        autopct='%1.1f%%',
        startangle=140,
        colors=colors
    )
    plt.title('Severity Distribution')
    plt.ylabel('')
    plt.show()


# Analyze feature requests
def analyze_feature_requests(df):
    """Print and chart the ten most repeated feature-request titles.

    Args:
        df: Issues frame with 'issue_type' and 'title' columns.
    """
    is_feature_request = df['issue_type'] == 'Feature Request'
    top_features = df.loc[is_feature_request, 'title'].value_counts().head(10)

    print("Top 10 Feature Requests:")
    print(top_features)

    # One bar per title, height = how often that title occurs.
    top_features.plot(kind='bar')
    plt.title('Top 10 Feature Requests')
    plt.xlabel('Feature Request')
    plt.ylabel('Count')
    plt.show()

def analyze_issues(db_path='issues.db'):
    """Load the stored issues and run the four analyses of this cell in order.

    Args:
        db_path: Path of the SQLite database file to read.
    """
    issues = load_issues_from_db(db_path)

    analysis_steps = (
        categorize_and_prioritize_issues,
        plot_issues_over_time,
        plot_severity_distribution,
        analyze_feature_requests,
    )
    for run_step in analysis_steps:
        run_step(issues)

# Run the analysis
analyze_issues()


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

def plot_frequency_severity_heatmap(df):
    """Show issue counts per (issue type, severity) pair as an annotated heatmap.

    Args:
        df: Issues frame with 'issue_type' and 'severity' columns.
    """
    # Rows = issue type, columns = severity, cells = number of issues (0 when absent).
    counts = df.pivot_table(
        index='issue_type',
        columns='severity',
        aggfunc='size',
        fill_value=0,
    )
    sns.heatmap(counts, annot=True, fmt='d', cmap='YlGnBu')
    plt.title('Frequency and Severity Heatmap')
    plt.xlabel('Severity')
    plt.ylabel('Issue Type')
    plt.show()

# df_issues is not loaded in this cell; it is whatever an earlier cell left bound.
plot_frequency_severity_heatmap(df_issues)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sqlite3
from wordcloud import WordCloud
from statsmodels.tsa.seasonal import seasonal_decompose
from textblob import TextBlob


def plot_severity_boxplot(df, metric='comments'):
    """Box-plot a numeric column of the issues, grouped by severity.

    Args:
        df: Issues frame with a 'severity' column and the ``metric`` column.
        metric: Name of the column plotted on the y axis.
    """
    metric_label = metric.capitalize()
    sns.boxplot(data=df, x='severity', y=metric)
    plt.title(f'{metric_label} by Severity')
    plt.xlabel('Severity')
    plt.ylabel(metric_label)
    plt.show()





# Sentiment Analysis
def perform_sentiment_analysis(df):
    """Score every issue body with TextBlob polarity and box-plot it per issue type.

    Note: a 'sentiment' column is added to ``df`` in place.

    Args:
        df: Issues frame with 'body' (text) and 'issue_type' columns.
    """
    def polarity_of(text):
        return TextBlob(text).sentiment.polarity

    df['sentiment'] = df['body'].apply(polarity_of)

    sns.boxplot(data=df, x='issue_type', y='sentiment')
    plt.title('Sentiment Analysis of Issues')
    plt.xlabel('Issue Type')
    plt.ylabel('Sentiment Polarity')
    plt.show()

def analyze_issues(db_path='issues.db'):
    """Load the stored issues and draw the comment box plot and the sentiment plot.

    The issues are read from ``db_path`` (as the other ``analyze_issues``
    variants in this notebook do) instead of relying on a ``df_issues`` global
    left behind by an earlier cell, so the parameter is honoured.

    Args:
        db_path: Path of the SQLite database file to read.
    """
    issues = load_issues_from_db(db_path)

    plot_severity_boxplot(issues, metric='comments')

    perform_sentiment_analysis(issues)

analyze_issues()


In [None]:
import pandas as pd
import sqlite3

def fetch_and_print_issues(db_path='issues.db'):
    """Print type, title and severity of the stored Bug / Feature Request rows.

    The rows are shown as a DataFrame with the columns 'type', 'detail' and
    'severity', followed by the row count.

    Args:
        db_path: Path of the SQLite database file holding the ``issues`` table.
    """
    conn = sqlite3.connect(db_path)
    try:
        c = conn.cursor()
        c.execute("SELECT issue_type, title, severity FROM issues WHERE issue_type IN ('Bug', 'Feature Request')")
        rows = c.fetchall()
    finally:
        # Close the handle even when the query fails (e.g. missing table).
        conn.close()

    df = pd.DataFrame(rows, columns=['type', 'detail', 'severity'])

    print(df)
    print(f"Number of rows: {len(df)}")

# Print the type / title / severity table for the default issues.db.
fetch_and_print_issues()


In [None]:

import pandas as pd
import sqlite3
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Download the two NLTK resources this cell uses: the stop-word lists and WordNet.
nltk.download('stopwords')
nltk.download('wordnet')




# English stop words extended with a handful of extra words.
# NOTE(review): nothing in the visible notebook reads `custom_stopwords`;
# preprocess_text() falls back to stopwords.words('english') on its own — confirm
# whether this set was meant to be used there.
custom_stopwords = set(stopwords.words('english')).union({
    'adding', 'protects', 'code', 'implementation', 'check', 'endpoint', 'key', 'function'
})



def load_issues_from_db(db_path='issues.db'):
    """Load the stored Bug / Feature Request rows into a DataFrame.

    Args:
        db_path: Path of the SQLite database file holding the ``issues`` table.

    Returns:
        A DataFrame with every column of the matching rows.
    """
    conn = sqlite3.connect(db_path)
    try:
        return pd.read_sql_query(
            "SELECT * FROM issues WHERE issue_type IN ('Bug', 'Feature Request')", conn
        )
    finally:
        # Close the handle even when the query fails (e.g. the table is missing).
        conn.close()

# Lazily built defaults shared by all preprocess_text() calls, so the stop-word
# list is read and the lemmatizer is constructed once instead of once per row.
_PREPROCESS_DEFAULTS = {}


def preprocess_text(text, stop_words=None, lemmatizer=None):
    """Normalize free text for keyword analysis.

    Steps: drop every character that is not an ASCII letter or whitespace,
    lower-case, split on whitespace, remove stop words, lemmatize each
    remaining word and join the result with single spaces.

    Args:
        text: Raw text. Non-string input (None, NaN) yields an empty string.
        stop_words: Optional collection of words to drop. Defaults to NLTK's
            English stop-word list.
        lemmatizer: Optional object with a ``lemmatize(word)`` method. Defaults
            to a ``WordNetLemmatizer``.

    Returns:
        The cleaned text as a single space-separated string.
    """
    if not isinstance(text, str):
        return ''

    if stop_words is None:
        if 'stop_words' not in _PREPROCESS_DEFAULTS:
            _PREPROCESS_DEFAULTS['stop_words'] = frozenset(stopwords.words('english'))
        stop_words = _PREPROCESS_DEFAULTS['stop_words']

    if lemmatizer is None:
        if 'lemmatizer' not in _PREPROCESS_DEFAULTS:
            _PREPROCESS_DEFAULTS['lemmatizer'] = WordNetLemmatizer()
        lemmatizer = _PREPROCESS_DEFAULTS['lemmatizer']

    # Note: this also strips punctuation such as '.', so the result carries no
    # sentence boundaries.
    letters_only = re.sub(r'[^A-Za-z\s]', '', text).lower()

    # Stop words are filtered before lemmatization, on the raw lower-cased word.
    words = [
        lemmatizer.lemmatize(word)
        for word in letters_only.split()
        if word not in stop_words
    ]

    return ' '.join(words)

def preprocess_issues(df):
    """Add a 'processed_body' column holding the cleaned text of each issue body.

    The column is written into ``df`` itself, and that same frame is returned.
    """
    cleaned_bodies = df['body'].map(preprocess_text)
    df['processed_body'] = cleaned_bodies
    return df

# Reload the issues and attach the cleaned 'processed_body' column; later cells
# read this kernel-global df_issues.
df_issues = load_issues_from_db()
df_issues = preprocess_issues(df_issues)

# Compare raw and cleaned text for the first rows.
print(df_issues[['body', 'processed_body']].head())


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

def extract_keywords(df, n_keywords=10):
    """Fit TF-IDF on 'processed_body' and add one score column per kept term.

    The vectorizer keeps at most ``n_keywords`` terms. Each term becomes a
    column of ``df`` (written in place) holding that term's TF-IDF score per
    issue. NOTE(review): a term equal to an existing column name would
    overwrite that column — confirm this cannot happen with the real data.

    Returns:
        ``(df, keywords)`` — the same frame and the array of kept terms.
    """
    vectorizer = TfidfVectorizer(max_features=n_keywords)
    score_matrix = vectorizer.fit_transform(df['processed_body']).toarray()
    keywords = vectorizer.get_feature_names_out()

    # Column i of the dense matrix belongs to keywords[i].
    for column_index in range(len(keywords)):
        df[keywords[column_index]] = score_matrix[:, column_index]

    return df, keywords

# extract_keywords() writes the TF-IDF columns into the frame it is given and
# returns it, so df_issues is rebound to the extended frame.
df_issues, keywords = extract_keywords(df_issues)

print(df_issues.head())
print("Extracted Keywords:", keywords)


In [None]:
# Keyword lists per category. Dict order matters: categorize_issue() returns the
# first category with a matching keyword.
categories = {
    'Bug Report': [
        'bug', 'error', 'failure', 'issue', 'crash', 'exception', 'fault',
        'defect', 'glitch', 'malfunction', 'problem', 'breakdown', 'hang',
        'freeze', 'stop', 'wrong', 'unexpected', 'terminate', 'shutdown', 'abort',
        'buggy', 'unresponsive', 'deadlock', 'incorrect'
    ],
    'Performance Issue': [
        'performance', 'slow', 'latency', 'lag', 'delay', 'inefficient',
        'sluggish', 'unresponsive', 'hang', 'freeze', 'optimization', 'speed',
        'throughput', 'memory leak', 'resource', 'bottleneck', 'load', 'scalability',
        'high CPU', 'high memory', 'overhead', 'resource utilization'
    ],
    'Compatibility Problem': [
        'compatibility', 'support', 'platform', 'integration', 'incompatible',
        'conflict', 'portability', 'version', 'dependency', 'interop', 'environment',
        'OS', 'system', 'architecture', 'framework', 'library', 'API', 'browser',
        'device', 'hardware', 'software', 'backward', 'forward', 'compliance',
        'adaptation', 'adoption', 'standard', 'specification'
    ],
    'Enhancement Request': [
        'enhancement', 'feature', 'request', 'improve', 'add', 'upgrade',
        'update', 'expand', 'extend', 'new functionality', 'suggestion', 'proposal',
        'better', 'enhance', 'more', 'increase', 'boost', 'develop', 'modify',
        'refine', 'revamp', 'strengthen', 'superior', 'augmented', 'additional',
        'extra', 'integrate', 'customize', 'personalize', 'streamline', 'innovate'
    ]
}


def categorize_issue(body):
    """Return the first category whose keyword list matches the text, else 'Other'.

    Matching is case-insensitive on both sides (so 'OS', 'API' and 'high CPU'
    can match the lower-cased text) and a keyword has to start at a word
    boundary: 'add' matches 'adding', but 'hang' no longer matches 'change'
    and 'os' does not match 'post'.

    Args:
        body: Text to categorize; None or empty text yields 'Other'.

    Returns:
        One of the keys of ``categories`` or 'Other'.
    """
    import re  # local import: this cell does not import `re` itself

    text = (body or '').lower()
    for category, keywords in categories.items():
        for keyword in keywords:
            # `re` caches compiled patterns, so repeated calls stay cheap.
            if re.search(r'\b' + re.escape(keyword.lower()), text):
                return category
    return 'Other'

# Categorize every issue from its cleaned body text and store the result in a
# new 'category' column of the kernel-global df_issues.
df_issues['category'] = df_issues['processed_body'].apply(categorize_issue)

print(df_issues[['body', 'processed_body', 'category']].head())


In [None]:
def save_processed_issues_to_db(df, db_path='issues.db'):
    """Write ``df`` to the ``issues`` table, replacing the table if it exists.

    Because the table is replaced, its columns afterwards are exactly the
    columns of ``df`` (the index is not written).

    Args:
        df: Frame to persist.
        db_path: Path of the SQLite database file.
    """
    conn = sqlite3.connect(db_path)
    try:
        df.to_sql('issues', conn, if_exists='replace', index=False)
    finally:
        # Close the handle even when the write fails.
        conn.close()

def preprocess_and_save_issues(db_path='issues.db'):
    """Load the issues, add 'processed_body' and 'category', and write them back.

    The ``issues`` table in ``db_path`` is replaced by the enriched frame.
    """
    enriched = preprocess_issues(load_issues_from_db(db_path))
    enriched['category'] = enriched['processed_body'].apply(categorize_issue)
    save_processed_issues_to_db(enriched, db_path)

preprocess_and_save_issues()

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

def extract_top_keywords(df, category, n_keywords=10):
    """Rank the TF-IDF terms of one category by their summed score.

    Args:
        df: Issues frame with 'category' and 'processed_body' columns.
        category: Category value to select.
        n_keywords: Maximum number of terms the vectorizer keeps.

    Returns:
        A list of ``(term, summed_score)`` tuples, highest score first.
    """
    in_category = df['category'] == category
    documents = df[in_category]['processed_body']

    vectorizer = TfidfVectorizer(max_features=n_keywords)
    tfidf_matrix = vectorizer.fit_transform(documents)

    # Sum each term's score over all documents of the category.
    summed_scores = tfidf_matrix.toarray().sum(axis=0)
    terms = vectorizer.get_feature_names_out()

    ranked = sorted(
        zip(terms, summed_scores),
        key=lambda pair: pair[1],
        reverse=True,
    )
    return ranked

def plot_top_keywords(keywords, title):
    """Draw a horizontal bar chart of ``(keyword, score)`` pairs.

    Args:
        keywords: Non-empty sequence of ``(keyword, score)`` pairs, drawn top to
            bottom in the given order.
        title: Chart title.
    """
    terms, scores = zip(*keywords)

    plt.figure(figsize=(10, 6))
    plt.barh(terms, scores, color='skyblue')
    # barh draws the first entry at the bottom; flip so it ends up on top.
    plt.gca().invert_yaxis()
    plt.title(title)
    plt.xlabel('TF-IDF Score')
    plt.ylabel('Keywords')
    plt.show()

def analyze_trending_issues(df):
    """Print and chart the top TF-IDF keywords of enhancement requests and bug reports.

    Args:
        df: Issues frame with 'category' and 'processed_body' columns.
    """
    reports = [
        ('Top Keywords in Feature Requests', extract_top_keywords(df, 'Enhancement Request')),
        ('Top Keywords in Bug Reports', extract_top_keywords(df, 'Bug Report')),
    ]

    # Print both rankings first, then draw both charts.
    for heading, ranked_keywords in reports:
        print(heading + ":")
        print(ranked_keywords)

    for heading, ranked_keywords in reports:
        plot_top_keywords(ranked_keywords, heading)

def analyze_issues(db_path='issues.db'):
    """Load the stored issues, plot the severity split and the trending keywords.

    Args:
        db_path: Path of the SQLite database file to read. Its ``issues`` table
            must already contain the 'category' and 'processed_body' columns
            that analyze_trending_issues() reads.
    """
    df_issues = load_issues_from_db(db_path)

    plot_severity_distribution(df_issues)

    analyze_trending_issues(df_issues)

analyze_issues()


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import defaultdict
import pandas as pd
import re

def extract_keywords_and_phrases(df, category, n_keywords=10):
    """Rank a category's TF-IDF terms and collect the sentences that mention them.

    The terms are computed from 'processed_body'. The example sentences are
    taken from the raw 'body' column when it exists: 'processed_body' has all
    punctuation removed, so it contains no '.' to delimit sentences and would
    never yield a phrase. Matching is case-insensitive, and the last sentence
    of a text may end without a period.

    Args:
        df: Issues frame with 'category' and 'processed_body' (and usually 'body').
        category: Category value to select.
        n_keywords: Maximum number of terms the vectorizer keeps.

    Returns:
        ``(sorted_keywords, keyword_phrases)`` where ``sorted_keywords`` is a
        list of ``(term, summed_score)`` tuples, highest first, and
        ``keyword_phrases`` maps a term to the list of sentences containing it
        (terms without any sentence are absent).
    """
    category_issues = df[df['category'] == category]
    vectorizer = TfidfVectorizer(max_features=n_keywords, stop_words='english')
    X = vectorizer.fit_transform(category_issues['processed_body'])

    tfidf_scores = X.toarray().sum(axis=0)
    keywords = vectorizer.get_feature_names_out()

    keyword_scores = dict(zip(keywords, tfidf_scores))
    sorted_keywords = sorted(keyword_scores.items(), key=lambda x: x[1], reverse=True)

    # Prefer the raw text, which still has sentence punctuation.
    text_column = 'body' if 'body' in category_issues.columns else 'processed_body'
    texts = [text for text in category_issues[text_column] if isinstance(text, str)]

    keyword_phrases = defaultdict(list)

    for keyword, _ in sorted_keywords:
        sentence_with_keyword = re.compile(
            r'([^.]*?\b' + re.escape(keyword) + r'\b[^.]*(?:\.|$))',
            re.IGNORECASE,
        )
        for text in texts:
            phrases = sentence_with_keyword.findall(text)
            if phrases:
                keyword_phrases[keyword].extend(phrases)

    return sorted_keywords, keyword_phrases


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import re

def extract_top_phrases_with_titles(df, category, n_phrases=10):
    """Rank a category's TF-IDF trigrams and list the issue titles containing each.

    Args:
        df: Issues frame with 'category', 'processed_body' and 'title' columns.
        category: Category value to select.
        n_phrases: Maximum number of trigrams the vectorizer keeps.

    Returns:
        ``(sorted_phrases, phrase_titles)`` where ``sorted_phrases`` is a list
        of ``(trigram, summed_score)`` tuples, highest first, and
        ``phrase_titles`` maps a trigram to the titles of the issues whose
        processed body contains it verbatim.
    """
    # Imported here because this cell does not import defaultdict itself; it
    # previously worked only when an earlier cell had already been run.
    from collections import defaultdict

    category_issues = df[df['category'] == category]

    vectorizer = TfidfVectorizer(max_features=n_phrases, stop_words='english', ngram_range=(3, 3))
    X = vectorizer.fit_transform(category_issues['processed_body'])

    tfidf_scores = X.toarray().sum(axis=0)
    phrases = vectorizer.get_feature_names_out()

    phrase_scores = dict(zip(phrases, tfidf_scores))
    sorted_phrases = sorted(phrase_scores.items(), key=lambda x: x[1], reverse=True)

    phrase_titles = defaultdict(list)

    for phrase, _ in sorted_phrases:
        for body, title in zip(category_issues['processed_body'], category_issues['title']):
            if phrase in body:
                phrase_titles[phrase].append(title)

    return sorted_phrases, phrase_titles


In [None]:
import plotly.express as px
import pandas as pd

def visualize_phrases_with_titles(phrases, phrase_titles, title):
    """Show phrases and their scores as an interactive horizontal bar chart.

    Args:
        phrases: Non-empty sequence of ``(phrase, score)`` pairs.
        phrase_titles: Mapping from phrase to a list of issue titles; up to
            three of them are shown when hovering over the bar.
        title: Chart title.
    """
    phrase_labels, phrase_scores = zip(*phrases)

    # Limiting to top 3 titles for clarity
    hover_titles = []
    for phrase in phrase_labels:
        hover_titles.append("<br>".join(phrase_titles[phrase][:3]))

    chart_data = pd.DataFrame({
        'Phrase': phrase_labels,
        'Score': phrase_scores,
        'Titles': hover_titles,
    })

    fig = px.bar(
        chart_data,
        x='Score',
        y='Phrase',
        orientation='h',
        hover_data={'Titles': True},
        labels={'Score': 'TF-IDF Score', 'Phrase': 'Phrases'},
        title=title,
    )

    layout_options = dict(
        yaxis=dict(tickmode='linear'),
        hovermode="closest",
        height=600,
        margin=dict(l=200, r=20, t=60, b=20),
    )
    fig.update_layout(**layout_options)

    fig.show()

def analyze_phrases_and_titles(df, category, title):
    """Extract the top phrases of ``category`` from ``df`` and chart them under ``title``."""
    ranked_phrases, titles_by_phrase = extract_top_phrases_with_titles(df, category)
    visualize_phrases_with_titles(ranked_phrases, titles_by_phrase, title)

def load_issues_from_db(db_path='issues.db'):
    """Load the stored Bug / Feature Request rows into a DataFrame.

    Args:
        db_path: Path of the SQLite database file holding the ``issues`` table.

    Returns:
        A DataFrame with every column of the matching rows.
    """
    conn = sqlite3.connect(db_path)
    try:
        return pd.read_sql_query(
            "SELECT * FROM issues WHERE issue_type IN ('Bug', 'Feature Request')", conn
        )
    finally:
        # Close the handle even when the query fails (e.g. the table is missing).
        conn.close()

def analyze_issues(db_path='issues.db'):
    """Load the stored issues and chart the top phrases of two categories.

    Args:
        db_path: Path of the SQLite database file to read.
    """
    issues = load_issues_from_db(db_path)

    chart_specs = (
        ('Enhancement Request', 'Top Phrases and Titles in Feature Requests'),
        ('Bug Report', 'Top Phrases and Titles in Bug Reports'),
    )
    for category, chart_title in chart_specs:
        analyze_phrases_and_titles(issues, category, chart_title)

analyze_issues()


In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import Counter

def get_top_phrases_and_titles(df_issues, top_n=10):
    """Pair the top TF-IDF phrases of the Critical issues with the titles that contain them.

    Bigrams and trigrams are scored over the bodies of the issues whose
    severity is 'Critical'. For each of the ``top_n`` best phrases, one row is
    produced per Critical issue whose body contains the phrase. The
    vectorizer's phrases are lower-case, so the comparison is done against the
    lower-cased body.

    Args:
        df_issues: Issues frame with 'severity', 'body' and 'title' columns.
        top_n: Number of phrases to keep.

    Returns:
        A DataFrame with the columns 'Phrase' and 'Issue Title'. It is empty
        (but still has both columns) when there are no Critical issues or no
        phrase occurs verbatim in a body.
    """
    result_columns = ['Phrase', 'Issue Title']

    critical_issues = df_issues[df_issues['severity'] == 'Critical']
    if critical_issues.empty:
        # Nothing to fit the vectorizer on; it would raise on an empty corpus.
        return pd.DataFrame(columns=result_columns)

    bodies = critical_issues['body'].tolist()

    vectorizer = TfidfVectorizer(ngram_range=(2, 3), stop_words='english')  # Bigrams and Trigrams
    tfidf_matrix = vectorizer.fit_transform(bodies)
    feature_names = vectorizer.get_feature_names_out()

    # Total score of every phrase across all Critical bodies.
    phrase_scores = tfidf_matrix.sum(axis=0).A1
    phrase_scores_dict = dict(zip(feature_names, phrase_scores))

    top_phrases = Counter(phrase_scores_dict).most_common(top_n)

    # Lower-case each body once instead of once per phrase.
    lowered_bodies_with_titles = [
        (body.lower(), title)
        for body, title in zip(critical_issues['body'], critical_issues['title'])
    ]

    top_phrases_data = [
        {'Phrase': phrase, 'Issue Title': title}
        for phrase, _ in top_phrases
        for lowered_body, title in lowered_bodies_with_titles
        if phrase in lowered_body
    ]

    return pd.DataFrame(top_phrases_data, columns=result_columns)

# df_issues is not loaded in this cell; it is whatever an earlier cell left
# bound in the kernel.
top_phrases_df = get_top_phrases_and_titles(df_issues)

print(top_phrases_df)


In [None]:
import pandas as pd

# List of key phrases
key_phrases = [
    "LLM", "Large Language Model", "Language Model", "Transformer Model", "GPT", "BERT",
    "OpenAI", "Hugging Face", "Model Training", "Model Inference", "Prompt Engineering",
    "Model Fine-tuning", "Inference Latency", "Model Deployment", "Tokenization Issues",
    "Attention Mechanism", "Model Scaling", "Context Length", "Model Performance",
    "Natural Language Processing", "Pretrained Model", "Model Weights", "Parameter Tuning",
    "Model Outputs", "Bias in Models", "Model Interpretability",
    "Issues with LLM deployment", "Model inference time too high",
    "Error during model fine-tuning", "Unexpected output from GPT model",
    "LLM tokenization error", "Memory issue with large models"
]

def filter_issues_by_phrases(df_issues, key_phrases):
    """Return the issues whose title or body mentions any of the key phrases.

    Matching is a case-insensitive substring test. Missing titles or bodies
    (None/NaN) never match and do not raise.

    Args:
        df_issues: Issues frame with 'title' and 'body' columns.
        key_phrases: Iterable of phrases to look for.

    Returns:
        The matching rows of ``df_issues``, original index preserved.
    """
    # Lower-case the phrases once instead of once per row.
    needles = [phrase.lower() for phrase in key_phrases]

    def mentions_phrase(text):
        if not isinstance(text, str):
            return False
        haystack = text.lower()
        return any(needle in haystack for needle in needles)

    is_relevant = (
        df_issues['title'].map(mentions_phrase).astype(bool)
        | df_issues['body'].map(mentions_phrase).astype(bool)
    )
    return df_issues[is_relevant]

# Apply the phrase filter to the kernel-global df_issues (not loaded in this cell).
filtered_df = filter_issues_by_phrases(df_issues, key_phrases)

print(filtered_df[['title', 'body']])


In [None]:
import pandas as pd

key_phrases = [
    "LLM", "Large Language Model", "Language Model", "Transformer Model", "GPT", "BERT",
    "OpenAI", "Hugging Face", "Model Training", "Model Inference", "Prompt Engineering",
    "Model Fine-tuning", "Inference Latency", "Model Deployment", "Tokenization Issues",
    "Attention Mechanism", "Model Scaling", "Context Length", "Model Performance",
    "Natural Language Processing", "Pretrained Model", "Model Weights", "Parameter Tuning",
    "Model Outputs", "Bias in Models", "Model Interpretability",
    "Issues with LLM deployment", "Model inference time too high",
    "Error during model fine-tuning", "Unexpected output from GPT model",
    "LLM tokenization error", "Memory issue with large models"
]

def filter_issues_by_phrases_and_severity(df_issues, key_phrases):
    """Return High/Critical issues whose title or body mentions a key phrase.

    Matching is a case-insensitive substring test. Missing titles or bodies
    (None/NaN) never match and do not raise.

    Args:
        df_issues: Issues frame with 'severity', 'title' and 'body' columns.
        key_phrases: Iterable of phrases to look for.

    Returns:
        The matching rows of ``df_issues``, original index preserved.
    """
    # Lower-case the phrases once instead of once per row.
    needles = [phrase.lower() for phrase in key_phrases]

    def mentions_phrase(text):
        if not isinstance(text, str):
            return False
        haystack = text.lower()
        return any(needle in haystack for needle in needles)

    is_severe = df_issues['severity'].isin(['High', 'Critical'])
    is_relevant = (
        df_issues['title'].map(mentions_phrase).astype(bool)
        | df_issues['body'].map(mentions_phrase).astype(bool)
    )
    return df_issues[is_severe & is_relevant]

# Apply the phrase + severity filter to the kernel-global df_issues (not loaded in this cell).
filtered_df = filter_issues_by_phrases_and_severity(df_issues, key_phrases)

print(filtered_df[['title', 'severity', 'body']])


In [None]:
import pandas as pd

# List of key phrases
key_phrases = [
  "High GPU usage", "Computational overhead", "Hardware limitations", "GPU/TPU requirement",
    "Data preprocessing", "Data quality issues", "Training data imbalance", "Data bias",
    "Scalability challenges", "Infrastructure scaling", "Inference latency", "Resource optimization",
    "Model fine-tuning", "Prompt engineering", "Quantization", "Model compression",
    "High operational cost", "Cost of inference", "API throttling", "Energy consumption",
    "Bias in LLMs", "Ethical AI deployment", "Fairness in AI", "Model transparency",
    "Security vulnerabilities", "Adversarial attacks", "Data privacy", "Model security",
    "Black box model", "Interpretability challenges", "Explainability", "Model transparency",
    "Compliance with GDPR", "AI regulations", "Data protection laws", "Legal risks",
    "Inference accuracy", "Model hallucination", "Response consistency", "Performance degradation"
]

def filter_issues_by_phrases_and_severity(df_issues, key_phrases):
    """Return High/Critical/Medium issues whose title or body mentions a key phrase.

    Unlike the other variants of this function in the notebook, 'Medium'
    severity is included here. Matching is a case-insensitive substring test.
    Missing titles or bodies (None/NaN) never match and do not raise.

    Args:
        df_issues: Issues frame with 'severity', 'title' and 'body' columns.
        key_phrases: Iterable of phrases to look for.

    Returns:
        The matching rows of ``df_issues``, original index preserved.
    """
    # Lower-case the phrases once instead of once per row.
    needles = [phrase.lower() for phrase in key_phrases]

    def mentions_phrase(text):
        if not isinstance(text, str):
            return False
        haystack = text.lower()
        return any(needle in haystack for needle in needles)

    is_severe = df_issues['severity'].isin(['High', 'Critical', 'Medium'])
    is_relevant = (
        df_issues['title'].map(mentions_phrase).astype(bool)
        | df_issues['body'].map(mentions_phrase).astype(bool)
    )
    return df_issues[is_severe & is_relevant]

# Apply the phrase + severity filter to the kernel-global df_issues (not loaded in this cell).
filtered_df = filter_issues_by_phrases_and_severity(df_issues, key_phrases)

print(filtered_df[['title', 'severity', 'body']])



In [None]:
import pandas as pd

# List of key phrases
key_phrases = [
    "LLM", "Large Language Model", "Language Model", "Transformer Model", "GPT", "BERT",
    "OpenAI", "Hugging Face", "Model Training", "Model Inference", "Prompt Engineering",
    "Model Fine-tuning", "Inference Latency", "Model Deployment", "Tokenization Issues",
    "Attention Mechanism", "Model Scaling", "Context Length", "Model Performance",
    "Natural Language Processing", "Pretrained Model", "Model Weights", "Parameter Tuning",
    "Model Outputs", "Bias in Models", "Model Interpretability",
    "Issues with LLM deployment", "Model inference time too high",
    "Error during model fine-tuning", "Unexpected output from GPT model",
    "LLM tokenization error", "Memory issue with large models"
]

def filter_issues_by_phrases_and_severity(df_issues, key_phrases):
    """Return High/Critical issues whose title or body mentions a key phrase.

    Matching is a case-insensitive substring test. Missing titles or bodies
    (None/NaN) never match and do not raise.

    Args:
        df_issues: Issues frame with 'severity', 'title' and 'body' columns.
        key_phrases: Iterable of phrases to look for.

    Returns:
        The matching rows of ``df_issues``, original index preserved.
    """
    # Lower-case the phrases once instead of once per row.
    needles = [phrase.lower() for phrase in key_phrases]

    def mentions_phrase(text):
        if not isinstance(text, str):
            return False
        haystack = text.lower()
        return any(needle in haystack for needle in needles)

    is_severe = df_issues['severity'].isin(['High', 'Critical'])
    is_relevant = (
        df_issues['title'].map(mentions_phrase).astype(bool)
        | df_issues['body'].map(mentions_phrase).astype(bool)
    )
    return df_issues[is_severe & is_relevant]

# Show full cell contents instead of truncating long text. This is a global
# pandas option and stays in effect for the rest of the session.
pd.set_option('display.max_colwidth', None)

# Apply the filter to the kernel-global df_issues (not loaded in this cell).
filtered_df = filter_issues_by_phrases_and_severity(df_issues, key_phrases)

print(filtered_df[['title', 'severity', 'body']])


In [None]:
import pandas as pd

key_phrases = [
    "reducing hallucinations",
    "Adversarial attacks",
    "mitigating hallucinations in LLMs",
    "reward functions to penalize hallucinations",
    "context-aware generation",
    "chain-of-thought prompting",
    "optimizing context length",
    "retrieval-augmented generation (RAG)",
    "prompt engineering",
    "efficiency in context processing",
    "multimodal LLMs",
    "text and image integration",
    "multimodal data in LLMs",
    "cost reduction in LLMs",
    "efficient LLMs",
    "LLM memory optimization",
    "faster LLM models",
    "alternative LLM architectures",
    "transformer alternatives",
    "scalable LLM architectures",
    "self-attention in transformers",
    "improving transformer efficiency",
    "LLM hallucination detection",
    "context learning in LLMs",
    "scaling LLMs efficiently",
    "improving LLM reliability",
    "long-context LLM processing",
    "modality fusion in AI",
    "multimodal embeddings",
    "LLM interpretability",
    "distributed LLM training",
    "flash attention",
    "watermarking in LLMs",
    "LLM bias mitigation"
]


def filter_issues_by_phrases_and_severity(df_issues, key_phrases):
    """Return High/Critical issues whose title or body mentions a key phrase.

    Matching is a case-insensitive substring test. Missing titles or bodies
    (None/NaN) never match and do not raise.

    Args:
        df_issues: Issues frame with 'severity', 'title' and 'body' columns.
        key_phrases: Iterable of phrases to look for.

    Returns:
        The matching rows of ``df_issues``, original index preserved.
    """
    # Lower-case the phrases once instead of once per row.
    needles = [phrase.lower() for phrase in key_phrases]

    def mentions_phrase(text):
        if not isinstance(text, str):
            return False
        haystack = text.lower()
        return any(needle in haystack for needle in needles)

    is_severe = df_issues['severity'].isin(['High', 'Critical'])
    is_relevant = (
        df_issues['title'].map(mentions_phrase).astype(bool)
        | df_issues['body'].map(mentions_phrase).astype(bool)
    )
    return df_issues[is_severe & is_relevant]

# Show full cell contents instead of truncating long text. This is a global
# pandas option and stays in effect for the rest of the session.
pd.set_option('display.max_colwidth', None)

# Apply the filter to the kernel-global df_issues (not loaded in this cell).
filtered_df = filter_issues_by_phrases_and_severity(df_issues, key_phrases)

print(filtered_df[['title', 'severity', 'body']])


In [None]:
import pandas as pd

key_phrases = [
    "reducing hallucinations",
    "mitigating hallucinations in LLMs",
    "reward functions to penalize hallucinations",
    "context-aware generation",
    "chain-of-thought prompting",
    "optimizing context length",
    "retrieval-augmented generation (RAG)",
    "prompt engineering",
    "efficiency in context processing",
    "multimodal LLMs",
    "text and image integration",
    "multimodal data in LLMs",
    "cost reduction in LLMs",
    "efficient LLMs",
    "LLM memory optimization",
    "faster LLM models",
    "alternative LLM architectures",
    "transformer alternatives",
    "scalable LLM architectures",
    "self-attention in transformers",
    "improving transformer efficiency",
    "LLM hallucination detection",
    "context learning in LLMs",
    "scaling LLMs efficiently",
    "improving LLM reliability",
    "long-context LLM processing",
    "modality fusion in AI",
    "multimodal embeddings",
    "LLM interpretability",
    "distributed LLM training",
    "flash attention",
    "watermarking in LLMs",
    "LLM bias mitigation"
]


def filter_issues_by_phrases_and_severity(df_issues, key_phrases):
    """Return High/Critical issues whose title or body mentions a key phrase.

    Matching is a case-insensitive substring test. Missing titles or bodies
    (None/NaN) never match and do not raise.

    Args:
        df_issues: Issues frame with 'severity', 'title' and 'body' columns.
        key_phrases: Iterable of phrases to look for.

    Returns:
        The matching rows of ``df_issues``, original index preserved.
    """
    # Lower-case the phrases once instead of once per row.
    needles = [phrase.lower() for phrase in key_phrases]

    def mentions_phrase(text):
        if not isinstance(text, str):
            return False
        haystack = text.lower()
        return any(needle in haystack for needle in needles)

    is_severe = df_issues['severity'].isin(['High', 'Critical'])
    is_relevant = (
        df_issues['title'].map(mentions_phrase).astype(bool)
        | df_issues['body'].map(mentions_phrase).astype(bool)
    )
    return df_issues[is_severe & is_relevant]

# Show full cell contents instead of truncating long text. This is a global
# pandas option and stays in effect for the rest of the session.
pd.set_option('display.max_colwidth', None)

# Apply the filter to the kernel-global df_issues (not loaded in this cell).
filtered_df = filter_issues_by_phrases_and_severity(df_issues, key_phrases)

print(filtered_df[['title', 'severity', 'body']])


In [None]:
import pandas as pd

key_phrases = [
  "prompt caching",
  "LLM prompt optimization",
  "prompt reuse",
  "cached prompts",
  "inference caching",
  "query caching",
  "LLM response caching",
  "prompt result caching",
  "prompt-response pairs",
  "cached embeddings",
  "model prompt cache",
  "token cache",
  "memory-efficient prompt handling",
  "cached LLM output",
  "zero-shot prompt caching",
  "prompt retrieval optimization",
  "preprocessed prompt cache",
  "inference optimization",
  "prompt persistence",
  "request-response caching"
]

# Function to filter issues by key phrases and severity
def filter_issues_by_phrases_and_severity(df_issues, key_phrases):
    """Return High/Critical issues whose title or body mentions a key phrase.

    Matching is a case-insensitive substring test. Missing titles or bodies
    (None/NaN) never match and do not raise.

    Args:
        df_issues: Issues frame with 'severity', 'title' and 'body' columns.
        key_phrases: Iterable of phrases to look for.

    Returns:
        The matching rows of ``df_issues``, original index preserved.
    """
    # Lower-case the phrases once instead of once per row.
    needles = [phrase.lower() for phrase in key_phrases]

    def mentions_phrase(text):
        if not isinstance(text, str):
            return False
        haystack = text.lower()
        return any(needle in haystack for needle in needles)

    is_severe = df_issues['severity'].isin(['High', 'Critical'])
    is_relevant = (
        df_issues['title'].map(mentions_phrase).astype(bool)
        | df_issues['body'].map(mentions_phrase).astype(bool)
    )
    return df_issues[is_severe & is_relevant]

# Show full cell contents instead of truncating long text. This is a global
# pandas option and stays in effect for the rest of the session.
pd.set_option('display.max_colwidth', None)

# Apply the filter to the kernel-global df_issues (not loaded in this cell).
filtered_df = filter_issues_by_phrases_and_severity(df_issues, key_phrases)

print(filtered_df[['title', 'severity', 'body']])
