In [None]:
import pandas as pd
import numpy as np

# Load the household records; every later cell reads from (and mutates) `dfr`.
source_csv = 'households_complex.csv'
dfr = pd.read_csv(source_csv)

# Bare last expression so the notebook renders the frame.
dfr

In [None]:
from matplotlib import pyplot as plt
import seaborn as sns
# Visual map of missing cells: one row per record, one column per field.
missing_mask = dfr.isna()
plt.figure(figsize=(20, 12))
sns.heatmap(missing_mask, cbar=False)

In [None]:
# Remove exact duplicate rows before any encoding.
dfr.drop_duplicates(inplace=True)

# Every object-dtype column gets replaced by ordinal codes below.
# NOTE(review): if household_id or the target column is stored as text it is
# encoded here as well — confirm that is intended.
toLabel = list(dfr.select_dtypes(include="object").columns)

from sklearn.preprocessing import OrdinalEncoder

# NOTE(review): the encoder is fitted on the whole frame, i.e. before the
# train/test split performed later in the notebook.
encoder = OrdinalEncoder()
encoded_values = encoder.fit_transform(dfr[toLabel])
dfr[toLabel] = encoded_values

In [None]:
# Name of the label column; later cells refer to it through `Target`.
Target = "vulnerability_flag"

# Spearman rank correlation of every column against the target, strongest
# positive correlation first.
spearman_corr = dfr.corr(method="spearman")
target_corr = spearman_corr[[Target]].sort_values(by=Target, ascending=False)

plt.figure(figsize=(4, 12))
sns.heatmap(target_corr, annot=True, cbar=True)
plt.show()

In [None]:
# The summary below is computed but not rendered: only a cell's last
# expression is displayed, and this is not the last one.
dfr.describe()

# Drop every row that has at least one missing value (mutates `dfr`).
dfr.dropna(axis="index", inplace=True)

# Show the cleaned frame.
dfr

In [None]:
# Same missing-value map as earlier, drawn again on the current `dfr`.
fig, ax = plt.subplots(figsize=(20, 12))
sns.heatmap(dfr.isna(), cbar=False, ax=ax)

In [None]:
# Class balance of the target: bar chart of absolute counts, then the
# relative frequencies as the cell's displayed value.
target_values = dfr[Target]
target_values.value_counts().plot(kind="bar")
target_values.value_counts(normalize=True)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# Columns that must not be used as model inputs:
#   - the label itself (taken from `Target` so the two can never drift apart),
#   - household_id and pmt_score (excluded by the original analysis),
#   - the prediction columns a later cell writes back into `dfr`; without this,
#     re-running this cell after scoring would feed model outputs back in as
#     features.
excluded_columns = {
    Target,
    "household_id",
    "pmt_score",
    "predicted_pmt_score",
    "predicted_eligible",
}
xcolumns = [c for c in dfr.columns if c not in excluded_columns]
X = dfr[xcolumns]
y = dfr[Target]

# 75/25 hold-out split with a fixed seed for reproducibility.
# NOTE(review): the split is not stratified; consider stratify=y if the class
# balance shown earlier is skewed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=0)


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score,classification_report
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.pipeline import Pipeline

# Feature selection is a step of the pipeline so that it is learned from the
# training split only, and the classifier sees exactly the selected columns.
final_model = Pipeline([
    ("select", SelectKBest(f_classif, k=8)),
    ("clf", LogisticRegression())
])
final_model.fit(X_train, y_train)

# Read the selected features from the fitted pipeline itself. Fitting a second
# selector on the full X, y (as before) could pick different columns than the
# model uses, and it also looked at the test rows.
selector = final_model.named_steps["select"]
selected_features_mask = selector.get_support()
selected_feature_names = X_train.columns[selected_features_mask].tolist()
print(selected_feature_names)

print()
pred = final_model.predict(X_test)

# Hold-out metrics; precision/recall/F1 are averaged across classes weighted
# by class support.
accuracy = accuracy_score(y_test, pred)
precision = precision_score(y_test, pred, average='weighted')
recall = recall_score(y_test, pred, average='weighted')
f1 = f1_score(y_test, pred, average='weighted')

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)
print(classification_report(y_test, pred))


In [None]:
from sklearn.metrics import ConfusionMatrixDisplay, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
# Evaluate on the held-out split only, so the matrix matches the test-set
# metrics reported for this model. Using the full X, y would mix in the rows
# the model was trained on.
ConfusionMatrixDisplay.from_estimator(final_model, X_test, y_test)
plt.show()

In [None]:
from sklearn.model_selection import learning_curve
# 5-fold cross-validated learning curve on the training split, evaluated at
# ten training-set fractions from 10% to 100%.
train_sizes, train_scores, test_scores = learning_curve(
    final_model,
    X_train,
    y_train,
    cv=5,
    n_jobs=-1,
    train_sizes=np.linspace(0.1, 1.0, 10),
)

# Aggregate across folds (axis 1) for each training-set size.
train_scores_mean = train_scores.mean(axis=1)
train_scores_std = train_scores.std(axis=1)
test_scores_mean = test_scores.mean(axis=1)
test_scores_std = test_scores.std(axis=1)

fig, ax = plt.subplots(figsize=(10, 6))
curves = (
    (train_scores_mean, train_scores_std, "Training score", "blue"),
    (test_scores_mean, test_scores_std, "Cross-validation score", "red"),
)
for curve_mean, curve_std, curve_label, curve_color in curves:
    # Mean line plus a +/- one standard deviation band.
    ax.plot(train_sizes, curve_mean, label=curve_label, color=curve_color, marker='o')
    ax.fill_between(train_sizes, curve_mean - curve_std,
                    curve_mean + curve_std, alpha=0.1, color=curve_color)

ax.set_title("Learning Curve")
ax.set_xlabel("Training Examples")
ax.set_ylabel("Score")
ax.legend(loc="best")
ax.grid(True)
plt.show()

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import learning_curve

# Learning curve of the fitted pipeline at 30%, 60% and 90% of the data.
# The unused DecisionTreeClassifier instance that used to be built here was
# removed: it was never passed to learning_curve, which evaluates final_model.
# learning_curve works on clones of the estimator, so final_model itself is
# not refitted by this cell.
train_size_abs, train_scores, test_scores = learning_curve(
    final_model, X, y, train_sizes=[0.3, 0.6, 0.9]
)
for train_size, cv_train_scores, cv_test_scores in zip(
    train_size_abs, train_scores, test_scores
):
    print(f"{train_size} samples were used to train the model")
    print(f"The average train accuracy is {cv_train_scores.mean():.2f}")
    print(f"The average test accuracy is {cv_test_scores.mean():.2f}")

In [None]:
# Pair each coefficient with the feature the classifier actually received.
# The names come from the selector inside the fitted pipeline; a mask from a
# separately fitted selector may pick different columns and mislabel the
# coefficients.
fitted_selector = final_model.named_steps["select"]
fitted_clf = final_model.named_steps["clf"]

coef_df = pd.DataFrame({
    "feature": X.columns[fitted_selector.get_support()],
    "coef": fitted_clf.coef_[0],
    # Scalar intercept, repeated on every row of the table.
    "intercept": fitted_clf.intercept_[0],
}).sort_values(by="coef", ascending=False)

coef_df

In [None]:
# Score every household with the fitted pipeline and write the results back
# into `dfr`.
# NOTE(review): column 1 of predict_proba is taken as the "vulnerable" class —
# confirm against final_model.classes_.
class_probabilities = final_model.predict_proba(X)
dfr["predicted_pmt_score"] = class_probabilities[:, 1]

# A household is flagged eligible when its score reaches 0.60.
dfr["predicted_eligible"] = dfr["predicted_pmt_score"].ge(0.60).astype(int)

In [None]:
# Keep only the identifier and the two prediction columns for the export.
# NOTE(review): if household_id was among the object columns encoded earlier,
# these are ordinal codes rather than the original identifiers — confirm.
output_columns = ["household_id", "predicted_pmt_score", "predicted_eligible"]
eligibility_list = dfr.loc[:, output_columns]
eligibility_list

In [None]:
# Write the eligibility table to disk without the DataFrame index column.
output_csv = "eligibility_output.csv"
eligibility_list.to_csv(output_csv, index=False)