# Issue First Response


This notebook will act as a starter if you want to create visualizations in a notebook environment before putting in dash app or for an easier dev environment

In [1]:
import psycopg2
import pandas as pd 
import sqlalchemy as salc
import json
import os
import datetime
import plotly.express as px
import datetime as dt
import plotly
import plotly.graph_objects as go


paths = ["../../comm_cage.json", "comm_cage.json", "../../config.json", "../config.json", "config.json", "../../copy_cage-padres.json"]

for path in paths:
    if os.path.exists(path):
        with open(path) as config_file:
            config = json.load(config_file)
        break
else:
    raise FileNotFoundError(f"None of the config files found: {paths}")



In [2]:

database_connection_string = 'postgresql+psycopg2://{}:{}@{}:{}/{}'.format(config['user'], config['password'], config['host'], config['port'], config['database'])

dbschema='augur_data'
engine = salc.create_engine(
    database_connection_string,
    connect_args={'options': '-csearch_path={}'.format(dbschema)})

This will allow you to get specific repo_ids. A few are listed for ease of use

In [3]:
repo_urls = ['https://github.com/chaoss/augur']

url_query = str(repo_urls)
url_query = url_query[1:-1]

repo_query = salc.sql.text(f"""
        SET SCHEMA 'augur_data';
        SELECT DISTINCT
            r.repo_id,
            r.repo_name
        FROM
            repo r
        JOIN repo_groups rg 
        ON r.repo_group_id = rg.repo_group_id
        WHERE
            r.repo_git in({url_query})
        """)


engine = salc.create_engine(
    database_connection_string,
    connect_args={'options': '-csearch_path={}'.format(dbschema)})

with engine.connect() as conn:
    rows = conn.execute(repo_query)
    
# t = engine.execute(repo_query)
results = rows.all()
repo_ids = [ row[0] for row in results]
repo_names = [ row[1] for row in results]
print(repo_ids)
print(repo_names)

[1]
['augur']


Below is the query used in the callback. Can copy and paste any query here

In [4]:
repo_statement = str(repo_ids)
repo_statement = repo_statement[1:-1]

query = salc.sql.text(f"""
                SELECT 
                    i.issue_id,
                    i.repo_id AS ID,
                    i.cntrb_id  AS cntrb_id,
                    M.msg_timestamp,
                    M.msg_cntrb_id,
                    i.created_at ,
                    i.closed_at
                FROM
                    issues i 
                LEFT OUTER JOIN 
                    (
                        SELECT 
                            imr.issue_id AS issue_id ,
                            m.msg_timestamp AS msg_timestamp,
                            m.cntrb_id AS msg_cntrb_id
                        FROM 
                            issue_message_ref imr,
                            issues i,
                            message m
                        WHERE 
                            i.issue_id = imr.issue_id AND 
                            imr.msg_id = m.msg_id
                    ) M
                    ON 
                        M.issue_id = i.issue_id
                WHERE 
                    i.repo_id in ({repo_statement})
                """)
df = pd.read_sql(query, con=engine)

df = df.reset_index()
df.drop("index", axis=1, inplace=True)

In [5]:
df

Unnamed: 0,issue_id,id,cntrb_id,msg_timestamp,msg_cntrb_id,created_at,closed_at
0,303640636,1,,NaT,,2025-04-30 20:11:24,NaT
1,303640642,1,,NaT,,2025-04-14 14:41:01,NaT
2,303640641,1,,2025-04-15 22:24:32,0102d4dd-e200-0000-0000-000000000000,2025-04-15 22:01:28,NaT
3,303640641,1,,2025-04-16 13:41:24,010005cb-c700-0000-0000-000000000000,2025-04-15 22:01:28,NaT
4,303640641,1,,2025-04-23 18:14:10,010005cb-c700-0000-0000-000000000000,2025-04-15 22:01:28,NaT
...,...,...,...,...,...,...,...
2286,233582926,1,,2024-11-04 16:17:29,01000ca1-8a00-0000-0000-000000000000,2024-10-08 15:53:44,NaT
2287,233582926,1,,2024-11-04 17:01:54,0102d4dd-e200-0000-0000-000000000000,2024-10-08 15:53:44,NaT
2288,233582926,1,,2024-11-04 17:05:59,01000ca1-8a00-0000-0000-000000000000,2024-10-08 15:53:44,NaT
2289,233582926,1,,2024-11-19 10:11:42,01000ca1-8a00-0000-0000-000000000000,2024-10-08 15:53:44,NaT


In [6]:
num_days = 2

In [7]:
import pandas as pd

# ---------- Helper ----------
def get_open_response(df_issues: pd.DataFrame, day, num_days: int):
    """
    For a given 'day', return:
      Open      = # issues open on that day
      Response  = # of those open issues that received a first response
                  within num_days of opening.

    Assumes df_issues has one row per issue with:
      created_at (datetime, tz-aware)
      closed_at  (datetime or NaT, tz-aware)
      msg_timestamp (first non-creator response timestamp, tz-aware)
    """
    day = pd.to_datetime(day, utc=True).normalize()

    # Open if created <= day and not closed before day
    open_mask = (df_issues["created_at"] <= day) & (
        df_issues["closed_at"].isna() | (df_issues["closed_at"] >= day)
    )
    open_count = int(open_mask.sum())

    # Responded within num_days of opening
    within_window = df_issues["msg_timestamp"] <= (
        df_issues["created_at"] + pd.to_timedelta(num_days, unit="D")
    )
    responded_count = int((open_mask & within_window).sum())

    return open_count, responded_count


# ---------- Prepare the issue-level frame (one row per issue) ----------
# Ensure datetimelike (tz-aware) columns
df["msg_timestamp"] = pd.to_datetime(df["msg_timestamp"], utc=True, errors="coerce")
df["created_at"]    = pd.to_datetime(df["created_at"],    utc=True, errors="coerce")
df["closed_at"]     = pd.to_datetime(df["closed_at"],     utc=True, errors="coerce")

# Keep only non-creator messages, then pick earliest response per issue
df_issues = (
    df[df["cntrb_id"] != df["msg_cntrb_id"]]
      .sort_values("msg_timestamp")
      .drop_duplicates(subset="issue_id", keep="first")
      .copy()
)

# Time bounds (handle NaT in closed_at safely)
earliest = df_issues["created_at"].min()
latest   = pd.concat([df_issues["created_at"], df_issues["closed_at"]]).max()

if pd.isna(earliest) or pd.isna(latest):
    raise ValueError("Unable to determine date range (check created_at/closed_at values).")

# ---------- Build the daily range and compute metrics ----------
dates = pd.date_range(start=earliest, end=latest, freq="D", inclusive="both")
df_responses = pd.DataFrame({"Date": dates})

num_days = 7  # response window
vals = [get_open_response(df_issues, d, num_days) for d in df_responses["Date"]]
df_responses["Open"], df_responses["Response"] = map(list, zip(*vals))

# ---------- (Optional) format Date as string YYYY-MM-DD ----------
# Force-cast in case this cell is re-run and Date is already strings
df_responses["Date"] = pd.to_datetime(df_responses["Date"], utc=True, errors="coerce")
df_responses["Date"] = df_responses["Date"].dt.tz_convert(None).dt.strftime("%Y-%m-%d")

# df_responses now has columns: Date (YYYY-MM-DD string), Open, Response

In [8]:
df_responses

Unnamed: 0,Date,Open,Response
0,2017-01-20,0,0
1,2017-01-21,3,1
2,2017-01-22,3,1
3,2017-01-23,3,1
4,2017-01-24,4,1
...,...,...,...
3062,2025-06-09,160,48
3063,2025-06-10,154,46
3064,2025-06-11,154,46
3065,2025-06-12,154,46


In [9]:
def get_open_response(df, date, num_days):
    """
    This function takes a date and determines how many
    issues in that time interval are opened and if they have a response within num_days.

    Args:
    -----
        df : Pandas Dataframe
            Dataframe with issues and their messages

        date : Datetime Timestamp
            Timestamp of the date

        num_days : int
            number of days that a response should be within

    Returns:
    --------
        int, int: Number of opened and responded to issues within num_days on the day
    """
    # drop rows that are more recent than the date limit
    df_created = df[df["created_at"] <= date]

    # drops rows that have been closed after date
    df_open = df_created[df_created["closed_at"] > date]

    # include issues that have not been close yet
    df_open = pd.concat([df_open, df_created[df_created.closed_at.isnull()]])

    # column to hold date num_days after the issue_creation date for comparision
    df_open["response_by"] = df_open["created_at"] + pd.DateOffset(days=num_days)

    # Inlcude only the issues that msg timestamp is before the responded by time
    df_response = df_open[df_open["msg_timestamp"] < df_open["response_by"]]

    # generates number of columns ie open issues
    num_open = df_open.shape[0]

    # number of issues that had response in time interval
    num_response = df_response.shape[0]
    return num_open, num_response

In [10]:
fig = go.Figure(
        [
            go.Scatter(
                name="Issues Open",
                x=df_responses["Date"],
                y=df_responses["Open"],
                mode="lines",
                showlegend=True,
                hovertemplate="Issues Open: %{y}<br>%{x|%b %d, %Y} <extra></extra>",
                #marker=dict(color=color_seq[1]),
            ),
            go.Scatter(
                name="Response <" + str(num_days) + " days",
                x=df_responses["Date"],
                y=df_responses["Response"],
                mode="lines",
                showlegend=True,
                hovertemplate="Issues: %{y}<br>%{x|%b %d, %Y} <extra></extra>",
                #marker=dict(color=color_seq[5]),
            ),
        ]
    )

fig.update_layout(
        xaxis_title="Time",
        yaxis_title="Number of Issues",
        font=dict(size=14),
        title = "Issue First Response"
    )