In [71]:
# Import Necessary Libraries
# Standard library
import pickle
import random

# Third-party: numerics, data handling, plotting
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Third-party: statistics
import statsmodels.api as sm
import statsmodels.formula.api as smf
from scipy import stats
from scipy.special import expit as logistic_sigmoid
from scipy.stats import ks_2samp
from scipy.stats import multivariate_normal, bernoulli, beta, norm

# Third-party: machine learning and resampling
# SMOTE is used by the resampling cells below and must be imported here so the
# notebook runs from a fresh kernel.
from imblearn.over_sampling import SMOTE
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss
from sklearn.metrics import brier_score_loss
from sklearn.metrics import balanced_accuracy_score, brier_score_loss, accuracy_score, roc_auc_score, roc_curve, auc
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler

# Seed NumPy's global random number generator so that any NumPy-based
# randomness in later cells is repeatable between runs.
np.random.seed(42)

In [72]:
# Load the train and test splits from CSV files in the working directory.
YX_train, YX_test = (
    pd.read_csv(csv_path)
    for csv_path in ('train_dataset.csv', 'test_dataset.csv')
)

# Separate the 'fake_binary' target column (Y) from the remaining columns (X).
X = YX_train.drop(columns=['fake_binary'])
Y = YX_train['fake_binary']

In [73]:
# Rebuild a single frame (feature columns first, 'fake_binary' appended as the
# last column) so that features and target can be resampled together.
df_combined = X.assign(fake_binary=Y)

In [74]:
# Oversample with SMOTE using 'party_detailed' as the class label, so that the
# party groups end up equally represented in df_resampled.
# NOTE(review): the whole of df_combined is passed as the feature matrix, i.e.
# it still contains 'party_detailed' itself and the 'fake_binary' target, so
# both are treated as ordinary features when rows are synthesised — confirm
# this is intended.
party_labels = df_combined['party_detailed']
smote = SMOTE(random_state=42)
df_resampled, target_resampled = smote.fit_resample(df_combined, party_labels)


In [75]:
# Sanity check: class counts of 'party_detailed' in the resampled frame.
df_resampled['party_detailed'].value_counts()

party_detailed
1    800
0    800
Name: count, dtype: int64

In [76]:
# Class counts of 'fake_binary' in the resampled frame, to see how the model
# target is distributed after resampling.
df_resampled['fake_binary'].value_counts()

fake_binary
0    1174
1     426
Name: count, dtype: int64

In [77]:
# Fit a logistic regression of 'fake_binary' on every other column of
# df_resampled, print its summary, and persist the fitted result to disk.
# NOTE(review): fit_regularized is called without alpha; statsmodels documents
# the default as alpha=0 (no L1 penalty) — confirm that is intended.
# NOTE(review): no intercept column is added in this cell; presumably the data
# already carries one or it is deliberately omitted — verify.
model = sm.Logit(
    df_resampled['fake_binary'],
    df_resampled.drop(columns=['fake_binary']),
).fit_regularized(method='l1')
print("SMOTE Model Summary:")
print(model.summary())

# Write through a context manager so the file handle is flushed and closed
# deterministically rather than being left open until garbage collection.
with open('regression_model_Bias.sav', 'wb') as model_file:
    pickle.dump(model, model_file)

Optimization terminated successfully    (Exit mode 0)
            Current function value: 0.48877103289532753
            Iterations: 191
            Function evaluations: 191
            Gradient evaluations: 191
SMOTE Model Summary:
                           Logit Regression Results                           
Dep. Variable:            fake_binary   No. Observations:                 1600
Model:                          Logit   Df Residuals:                     1564
Method:                           MLE   Df Model:                           35
Date:                Sun, 02 Jun 2024   Pseudo R-squ.:                  0.1566
Time:                        16:35:26   Log-Likelihood:                -782.03
converged:                       True   LL-Null:                       -927.19
Covariance Type:            nonrobust   LLR p-value:                 5.615e-42
                              coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------

In [78]:
# Balance the classes of the target Y by oversampling (X, Y) with SMOTE.
smote = SMOTE(random_state=42)
X_balanced, y_balanced = smote.fit_resample(X, Y)

# Re-wrap the outputs as pandas objects carrying the original feature column
# labels, for compatibility with statsmodels.
X_resampled = pd.DataFrame(X_balanced, columns=X.columns)
y_resampled = pd.Series(y_balanced)

In [79]:
# Sanity check: class counts of the resampled target.
y_resampled.value_counts()

fake_binary
1    1055
0    1055
Name: count, dtype: int64

In [80]:
# Fit a logistic regression of y_resampled on X_resampled, print its summary,
# and persist the fitted result to disk.
# NOTE(review): fit_regularized is called without alpha; statsmodels documents
# the default as alpha=0 (no L1 penalty) — confirm that is intended.
model_smote = sm.Logit(y_resampled, X_resampled).fit_regularized(method='l1')
print("SMOTE Model Summary:")
print(model_smote.summary())

# Write through a context manager so the file handle is flushed and closed
# deterministically rather than being left open until garbage collection.
with open('regression_model_Bias2.sav', 'wb') as model_file:
    pickle.dump(model_smote, model_file)

Optimization terminated successfully    (Exit mode 0)
            Current function value: 0.5700030904332745
            Iterations: 219
            Function evaluations: 219
            Gradient evaluations: 219
SMOTE Model Summary:
                           Logit Regression Results                           
Dep. Variable:            fake_binary   No. Observations:                 2110
Model:                          Logit   Df Residuals:                     2074
Method:                           MLE   Df Model:                           35
Date:                Sun, 02 Jun 2024   Pseudo R-squ.:                  0.1777
Time:                        16:35:30   Log-Likelihood:                -1202.7
converged:                       True   LL-Null:                       -1462.5
Covariance Type:            nonrobust   LLR p-value:                 1.241e-87
                              coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------