In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from scipy.stats import boxcox
pd.set_option('display.max_columns', None)
from statsmodels.stats.outliers_influence import variance_inflation_factor
import statsmodels.api as sm
from sklearn.model_selection import train_test_split,cross_val_score, KFold,GridSearchCV,StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix,classification_report
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score,make_scorer
from imblearn.over_sampling import SMOTE


In [2]:
df = pd.read_csv('hiii/final_breast_cancer.csv')

In [20]:
X = df.drop('Status', axis = 1)
y = df['Status']
# f1_scorer = make_scorer(f1_score, average='macro')
model = LogisticRegression(max_iter=10000, penalty = 'l2', C = 10, solver = 'liblinear', random_state=101)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=101, stratify=y)
scaler = StandardScaler()

poly_converter = PolynomialFeatures(degree=1, include_bias=False, interaction_only=True)
X_train_poly = poly_converter.fit_transform(X_train)
X_test_poly = poly_converter.transform(X_test)
X_train_scaled = scaler.fit_transform(X_train_poly)
X_test_scaled = scaler.transform(X_test_poly)


model.fit(X_train_scaled, y_train)
preds = model.predict(X_test_scaled)
accuracy_score(y_test, preds)

0.9031055900621118

In [21]:
print(classification_report(y_test,preds))


              precision    recall  f1-score   support

           0       0.92      0.97      0.94       682
           1       0.75      0.55      0.64       123

    accuracy                           0.90       805
   macro avg       0.84      0.76      0.79       805
weighted avg       0.90      0.90      0.90       805



In [22]:
accuracy_score(y_test,preds)

0.9031055900621118

In [14]:
precision_score(y_test, preds, average = 'macro')

np.float64(0.8351109674639087)

# SMOTE

In [17]:
X = df.drop('Status', axis = 1)
y = df['Status']
smote = SMOTE(random_state=101)
model = LogisticRegression(max_iter=10000, penalty = 'l2', C = 10, solver = 'liblinear', random_state=101, class_weight='balanced')

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=101, stratify=y)
scaler = StandardScaler()

poly_converter = PolynomialFeatures(degree=2, include_bias=False, interaction_only=True)
X_train_poly = poly_converter.fit_transform(X_train)
X_test_poly = poly_converter.transform(X_test)
X_train_poly, y_train_resampled = smote.fit_resample(X_train_poly, y_train)

X_train_scaled = scaler.fit_transform(X_train_poly)
X_test_scaled = scaler.transform(X_test_poly)


model.fit(X_train_scaled, y_train_resampled)
preds = model.predict(X_test_scaled)
accuracy_score(y_test, preds)

0.7875776397515528

In [16]:
print(classification_report(y_test,preds))


              precision    recall  f1-score   support

           0       0.93      0.81      0.87       682
           1       0.39      0.67      0.49       123

    accuracy                           0.79       805
   macro avg       0.66      0.74      0.68       805
weighted avg       0.85      0.79      0.81       805

