**This is essentailly the same logistic regression model used on the promise dataset, but using the GHRP dataset, since the preproccessing was different we just have them in different ipynb files**

In [106]:
%%capture
%pip install numpy as np
%pip install pandas as pd
%pip install sklearn


In [107]:
import numpy as np 
import pandas as pd
import sklearn
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split
df = pd.read_csv('dataset/GHRP_dataset_csv/baseline.csv')

In [108]:
raw_features = df.iloc[:, 1:-1].values.astype(float)
y = df.iloc[:,-1].values
test_size = 0.2
x_train, x_test,y_train,y_test = train_test_split(raw_features,y,test_size=test_size,random_state=42,stratify=y)

scaler = StandardScaler()
X_train = scaler.fit_transform(x_train)
X_test = scaler.transform(x_test)



In [109]:

def sigmoid(z):
    return np.where(z >= 0, 
                    1 / (1 + np.exp(-z)), 
                    np.exp(z) / (1 + np.exp(z)))


def train_model(X,y,learning_rate,theta,theta_zero,epochs,reg):
    n, d = X.shape
    y = y.reshape(-1,1)
    

    for i in range(epochs):
        z = np.dot(X,theta) + theta_zero
        p = sigmoid(z)

        error = p-y


        d_theta = (1/n)*np.dot(X.T, error) + reg*np.sign(theta)
        d_theta_zero = (1/n)*np.sum(p-y)

        theta -= learning_rate*d_theta
        theta_zero -= learning_rate*d_theta_zero
    return theta, theta_zero

## Training

In [None]:
def calculate_accuracy(X_test,y_test,theta,theta_zero):
    z = np.dot(X_test,theta) + theta_zero
    y_test=y_test.reshape(-1,1)
    predictions = (sigmoid(z)>=0.5)
    accuracy = (predictions==y_test).mean()



    return accuracy


In [111]:
n, d = X_train.shape

eta = 0.2
theta = np.zeros(d).reshape(-1,1)
theta_zero = 0 
l1_reg = 0.001
epochs = 50000

theta,theta_zero = train_model(X_train,y_train,eta,theta,theta_zero,epochs,l1_reg)

In [112]:
accuracy = calculate_accuracy(X_test,y_test,theta,theta_zero)
print("Accuracy:", round(accuracy*100,2),"%") 

Percentage of predicted 1s: 63.25%
Accuracy: 71.68 %


In [None]:
from IPython.display import display, Markdown


theta_flat = theta.flatten()

md_table = '### Feature Importance for Defect Prediction \n'
md_table += 'Metric Name | Weight Magnitude | Impact \n'
md_table += ' | :--- | :--- | :--- \n'

feature_names = df.columns[:-1]


importance = sorted(list(zip(feature_names, abs(theta_flat), np.sign(theta_flat))), key=lambda x: abs(x[1]), reverse=True)

for name, weight, sign in importance :

    impact = '✖ Little Impact' if (weight < 1e-3) else ('⬆ Probability' if sign>0 else '⬇ Probability')
    md_table += f"| {name} | {weight:.4f} | {impact}\n"

display(Markdown(md_table))

### Feature Importance for Defect Prediction 
Metric Name | Weight Magnitude | Impact 
 | :--- | :--- | :--- 
| totalFields | 2.1487 | ⬇ Probability
| wmc | 0.5788 | ⬇ Probability
| SHA | 0.5759 | ⬆ Probability
| dit | 0.3825 | ⬇ Probability
| nosi | 0.2674 | ⬆ Probability
| loc | 0.2404 | ⬆ Probability
| rfc | 0.1813 | ⬇ Probability
| assignmentsQty | 0.1390 | ⬆ Probability
| variablesQty | 0.1297 | ⬆ Probability
| loopQty | 0.1032 | ⬇ Probability
| parenthesizedExpsQty | 0.0565 | ⬆ Probability
| comparisonsQty | 0.0415 | ⬆ Probability
| tryCatchQty | 0.0372 | ⬆ Probability
| stringLiteralsQty | 0.0352 | ⬆ Probability
| lcom | 0.0279 | ⬇ Probability
| maxNestedBlocks | 0.0258 | ⬆ Probability
| mathOperationsQty | 0.0197 | ⬆ Probability
| totalMethods | 0.0101 | ⬇ Probability
| returnQty | 0.0002 | ✖ Little Impact
| cbo | 0.0001 | ✖ Little Impact
| numbersQty | 0.0001 | ✖ Little Impact
