In [1]:
import math
import os

import pandas as pd
import numpy as np

from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn import linear_model
from sklearn import metrics
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import f1_score


from rpy2.robjects.packages import importr
from rpy2.robjects.conversion import localconverter
import rpy2.robjects as ro
from rpy2.robjects import pandas2ri

import matplotlib.pyplot as plt

import SMOTE
import bellwether

In [2]:
# Classifier handed to the project-local bellwether helper.
clf = GaussianNB()

# Directory holding the commit_guru data. The original notebook hard-coded an
# absolute path on one developer's machine; set BELLWETHER_DATA_PATH to point
# somewhere else. The original location is kept as the default so existing
# setups keep working unchanged.
data_path = os.environ.get(
    'BELLWETHER_DATA_PATH',
    '/Users/suvodeepmajumder/Documents/AI4SE/bellwether_comminity/data/commit_guru',
)

# `bell` is used by later cells (`bell.projects`, `bell.get_baseline()`).
bell = bellwether.bellwether(data_path, clf)

In [None]:
def data_processing(path):
    """Load one project's CSV and build a train/test split for the 'fix' label.

    Steps:
      1. Read the CSV at ``path`` and keep only the 13 metric columns plus the
         ``fix`` label.
      2. Drop rows that have a missing value in any of those columns.
      3. Split 67% / 33% with ``random_state=42``.
      4. Oversample the *training* part only, using the project's SMOTE helper.

    The split happens before oversampling on purpose: running SMOTE on the
    whole dataset first lets synthetic rows derived from test rows end up in
    the training set (and synthetic rows end up in the test set), which
    inflates the measured score.

    Parameters
    ----------
    path : str or path-like
        Anything ``pd.read_csv`` accepts.

    Returns
    -------
    train_X, test_X, train_y, test_y
        Feature frames and boolean label series; ``train_*`` is oversampled,
        ``test_*`` contains only original rows.

    Raises
    ------
    KeyError
        If the CSV lacks one of the required columns.
    """
    feature_cols = ['ns', 'nd', 'nf', 'entropy', 'la', 'ld', 'lt', 'ndev', 'age',
                    'nuc', 'exp', 'rexp', 'sexp']
    label_col = 'fix'
    columns = feature_cols + [label_col]

    # Select the needed columns directly instead of first dropping a fixed
    # list of metadata columns; NaNs in unused columns no longer discard rows.
    df = pd.read_csv(path)[columns].dropna()

    train_df, test_df = train_test_split(df, test_size=0.33, random_state=42)

    # Hand SMOTE a frame with a clean 0..n-1 index; the label stays the last
    # column, exactly as in the frame the original code passed in.
    smt = SMOTE.smote(train_df.reset_index(drop=True))
    train_df = smt.run()
    # The original code re-applied the column names after run(); presumably
    # run() returns a frame without them (project helper, not verified here).
    train_df.columns = columns

    train_X = train_df.drop(labels=[label_col], axis=1)
    train_y = train_df[label_col].astype('bool')
    test_X = test_df.drop(labels=[label_col], axis=1)
    test_y = test_df[label_col].astype('bool')
    return train_X, test_X, train_y, test_y

In [None]:
def model_building(train_X,test_X,train_y,test_y):
    """Fit an L1-penalised logistic regression and score it on the test split.

    Parameters
    ----------
    train_X, train_y
        Training features and labels passed to ``fit``.
    test_X, test_y
        Held-out features and labels used for scoring.

    Returns
    -------
    float
        Weighted-average F1 score of the predictions on ``test_X``.
    """
    # The solver is named explicitly: scikit-learn's current default ('lbfgs')
    # rejects penalty='l1' with a ValueError, while 'liblinear' (the former
    # default) supports it, so this keeps the originally intended model.
    clf = LogisticRegression(penalty='l1', solver='liblinear')
    clf.fit(train_X, train_y)
    predicted = clf.predict(test_X)
    return f1_score(test_y, predicted, average='weighted')

In [None]:
# Evaluate every project listed by the bellwether helper and collect
# [project, f1] rows in `results` (consumed by the export cell below).
results = []
for s_project in bell.projects:
    try:
        train_X, test_X, train_y, test_y = data_processing(s_project)
        f1 = model_building(train_X, test_X, train_y, test_y)
    except Exception as exc:
        # Still best-effort: one bad project must not abort the whole run.
        # Unlike a bare `except:`, this lets KeyboardInterrupt/SystemExit
        # through and leaves a visible trace of what was skipped and why.
        print(s_project, 'skipped:', repr(exc))
        continue
    print(s_project, f1)
    results.append([s_project, f1])

In [None]:
# Tabulate the collected [project, f1] rows and persist them next to the
# notebook. The DataFrame index is written as the first CSV column
# (pandas' default).
df = pd.DataFrame(data=results, columns=['file', 'f1'])
df.to_csv(path_or_buf='results.csv')

In [3]:
# Run the baseline evaluation implemented by the project-local bellwether
# helper. The numbers recorded below this cell are the output of this call
# (presumably one score per project -- confirm in bellwether.get_baseline).
# NOTE(review): one recorded value is `nan`; worth checking which project
# produces it.
bell.get_baseline()

0.5851648351648351
0.5678596908807991
0.5294758692267774
0.5972222222222222
0.4995302981925637
0.4999887960067757
0.5118283881315157
0.608999125057756
0.6152455311280051
0.588153181626633
0.5107540426930292
0.4230769230769231
0.4990215264187867
0.501601514158841
0.515746292542409
0.5
0.5077559061225123
0.6830357142857143
0.5879032258064516
0.5722222222222222
0.6006640666174765
0.4791666666666667
0.5009684139257657
0.5083734404693051
0.526914570603891
0.5116497406716205
0.498812351543943
0.6737037037037037
0.5298453533747651
0.49952335557673977
0.5102960102960103
0.5104761904761905
0.7482758620689656
0.6333333333333334
0.487012987012987
0.554177005789909
0.5
0.4642857142857143
0.5
0.5279503105590062
0.5
0.3888888888888889
0.5
0.5390593047034764
0.5
0.5473484848484848
0.499213217938631
0.5222222222222223
0.6832807249770899
0.5216429911551863
0.4988876529477197
0.6957303370786517
0.6861167002012072
0.5008437677796578
nan
0.44696969696969696
0.8035714285714285
0.5180277349768876
0.50510204