In [1]:
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler, RobustScaler
from sklearn.decomposition import PCA

Matplotlib is building the font cache; this may take a moment.


In [2]:
# --- Notebook configuration -------------------------------------------------
# Input: CSV of commits, read below with a `commit_date` column parsed as dates.
commit_data = "commit_data_average.csv"
# Output: destination CSV for the per-repository monthly engagement table.
export_csv_name = "test.csv"

In [3]:
# Load the commit log and order it so that, within each (repository, developer)
# pair, commits are consecutive and chronological -- required for the diff below.
df = pd.read_csv(commit_data, parse_dates=['commit_date']).sort_values(
    ['url', 'developer', 'commit_date']
)

# Hours elapsed since the same developer's previous commit in the same
# repository. A developer's first commit has no predecessor, so its gap is 0.
commit_gap = df.groupby(['url', 'developer'])['commit_date'].diff()
df['time_since_last_h'] = (commit_gap.dt.total_seconds() / 3600).fillna(0)

# Calendar-month bucket, stored as the timestamp of the first day of the month.
df['month'] = df['commit_date'].dt.to_period('M').dt.to_timestamp()

# One row per (repository, month) with raw activity totals.
monthly_spec = {
    'churn_sum':      ('churn',             'sum'),      # total churn in the month
    'commit_count':   ('commit_date',       'count'),    # number of commits
    'dev_count':      ('developer',         'nunique'),  # distinct developers
    'sum_interval_h': ('time_since_last_h', 'sum'),      # summed inter-commit gaps (hours)
}
agg = df.groupby(['url', 'month']).agg(**monthly_spec).reset_index()

# Derived ratios. The 1e-6 keeps the denominator non-zero when every gap in
# the month is 0 (gaps are filled with 0 for a developer's first commit).
agg = agg.assign(
    churn_per_dev=agg['churn_sum'] / agg['dev_count'],
    commit_rate=agg['commit_count'] / (agg['sum_interval_h'] + 1e-6),
)

# Columns that feed the engagement model.
features = ['churn_sum','commit_count','dev_count','churn_per_dev','commit_rate']

# Step 1 -- global robust scaling, fitted across all repositories at once.
# NOTE(review): the per-repository min-max in step 2 is unaffected by a positive
# affine rescaling of a column, so this step should not change the final values;
# it is kept so `rscaler` and `X_robust` stay available -- confirm whether needed.
rscaler = RobustScaler()
X_robust = rscaler.fit_transform(agg[features])
agg[features] = X_robust

# Step 2 -- min-max scale every feature to [0, 1] *within each repository*.
# Implemented with groupby(...).transform instead of groupby(...).apply:
#   * apply() on a frame that still contains the grouping column emits the
#     pandas deprecation warning and, where grouping columns are excluded,
#     would drop `url` and break the later groupby('url');
#   * transform() keeps the original row order and index, and avoids refitting
#     a shared scaler object once per (repository, feature).
per_url = agg.groupby('url')[features]
feat_min = per_url.transform('min')
feat_span = per_url.transform('max') - feat_min
# A feature that is constant within a repository has zero span; dividing by 1
# instead maps it to 0, which is what MinMaxScaler does for constant columns.
agg[features] = (agg[features] - feat_min) / feat_span.where(feat_span != 0, 1.0)

# Collapse the scaled features onto their first principal component; that
# single coordinate is the raw (unnormalised) engagement value per row.
pca = PCA(n_components=1)
first_component = pca.fit_transform(agg[features])[:, 0]
agg['eng_raw'] = first_component

# Report how much each feature contributes to the component.
print("PCA feature weights (component 1):")
loadings = pca.components_[0]
for name, loading in zip(features, loadings):
    print(f"  {name:15s}: {loading: .4f}")

# Rescale the raw PCA coordinate to [0, 1] within each repository.
eng_by_url = agg.groupby('url')['eng_raw']
eng_min = eng_by_url.transform('min')
eng_span = eng_by_url.transform('max') - eng_min
# Guard against a zero span: a repository whose eng_raw is constant (for
# example, only one month of data) would otherwise compute 0/0 and export NaN.
# Dividing by 1 yields a score of 0.0 for such repositories.
agg['engagement_score'] = (agg['eng_raw'] - eng_min) / eng_span.where(eng_span != 0, 1.0)

# Persist the full monthly table (scaled features, eng_raw, engagement_score).
agg.to_csv(export_csv_name, index=False)


PCA feature weights (component 1):
  churn_sum      :  0.4571
  commit_count   :  0.5172
  dev_count      :  0.5771
  churn_per_dev  :  0.4078
  commit_rate    : -0.1556


  agg = agg.groupby('url', group_keys=False).apply(lambda g: g.assign(**{
  X_transformed = X @ self.components_.T
  X_transformed = X @ self.components_.T
  X_transformed = X @ self.components_.T
