In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score


In [3]:
df = pd.read_csv('SoftwareDefect.csv')
print(df.shape)
df.head()


(10885, 22)


Unnamed: 0,loc,v(g),ev(g),iv(g),n,v,l,d,i,e,...,lOCode,lOComment,lOBlank,locCodeAndComment,uniq_Op,uniq_Opnd,total_Op,total_Opnd,branchCount,defects
0,1.1,1.4,1.4,1.4,1.3,1.3,1.3,1.3,1.3,1.3,...,2,2,2,2,1.2,1.2,1.2,1.2,1.4,False
1,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1,1,1,1,1.0,1.0,1.0,1.0,1.0,True
2,72.0,7.0,1.0,6.0,198.0,1134.13,0.05,20.31,55.85,23029.1,...,51,10,8,1,17.0,36.0,112.0,86.0,13.0,True
3,190.0,3.0,1.0,3.0,600.0,4348.76,0.06,17.06,254.87,74202.67,...,129,29,28,2,17.0,135.0,329.0,271.0,5.0,True
4,37.0,4.0,1.0,4.0,126.0,599.12,0.06,17.19,34.86,10297.3,...,28,1,6,0,11.0,16.0,76.0,50.0,7.0,True


In [5]:
print(df.info())
print(df.describe())
print(df.isnull().sum())

#print(df['target'].value_counts())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10885 entries, 0 to 10884
Data columns (total 22 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   loc                10885 non-null  float64
 1   v(g)               10885 non-null  float64
 2   ev(g)              10885 non-null  float64
 3   iv(g)              10885 non-null  float64
 4   n                  10885 non-null  float64
 5   v                  10885 non-null  float64
 6   l                  10885 non-null  float64
 7   d                  10885 non-null  float64
 8   i                  10885 non-null  float64
 9   e                  10885 non-null  float64
 10  b                  10885 non-null  float64
 11  t                  10885 non-null  float64
 12  lOCode             10885 non-null  int64  
 13  lOComment          10885 non-null  int64  
 14  lOBlank            10885 non-null  int64  
 15  locCodeAndComment  10885 non-null  int64  
 16  uniq_Op            108

In [8]:
plt.figure(figsize=(5,4))
sns.countplot(x='defective', data=df)
plt.show()

plt.figure(figsize=(12,10))
sns.heatmap(df.corr(), cmap='coolwarm')
plt.show()


ValueError: Could not interpret value `defective` for `x`. An entry with this name does not appear in `data`.

<Figure size 500x400 with 0 Axes>

In [9]:
df.dropna(inplace=True)

In [10]:
X = df.drop('defective', axis=1)
y = df['defective']

KeyError: "['defective'] not found in axis"

In [None]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)


In [None]:
X_train, X_val, y_train, y_val = train_test_split(X_scaled, y, test_size=0.2, random_state=42, stratify=y)

In [11]:
models = {
    'Logistic Regression': LogisticRegression(class_weight='balanced', max_iter=2000),
    'Random Forest': RandomForestClassifier(n_estimators=100, class_weight='balanced'),
    'Gradient Boosting': GradientBoostingClassifier(n_estimators=100)
}

for name, model in models.items():
    model.fit(X_train, y_train)
    preds = model.predict(X_val)
    print(name, accuracy_score(y_val, preds), roc_auc_score(y_val, model.predict_proba(X_val)[:,1]))


NameError: name 'RandomForestClassifier' is not defined

In [None]:
param_grid = {
    'n_estimators': [100,200],
    'max_depth': [None,10,20]
}
grid = GridSearchCV(RandomForestClassifier(class_weight='balanced'), param_grid, cv=5, scoring='roc_auc')
grid.fit(X_train, y_train)
print(grid.best_params_)


In [None]:
best_model = grid.best_estimator_

In [None]:
y_pred = best_model.predict(X_val)
print(classification_report(y_val, y_pred))
print('ROC AUC:', roc_auc_score(y_val, best_model.predict_proba(X_val)[:,1]))

In [None]:
best_model.fit(X_scaled, y)