In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the preprocessed dataset
data = pd.read_csv("census_preprocessed_dataset.csv")

# Drop rows whose target is missing. The evaluation cells further down apply
# this same filter before splitting with the same test_size/random_state, so
# filtering here keeps the saved CSVs aligned with the split that is actually
# evaluated and keeps unlabelled rows out of the target files.
data = data.dropna(subset=["income"])

# Separate features (X) and target variable (y)
X = data.drop(columns="income")
y = data["income"]

# Split the dataset into training and testing sets (70% training, 30% testing).
# random_state is fixed so the split is reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Sanity check: every labelled row ends up in exactly one of the two sets.
assert len(X_train) + len(X_test) == len(data)
assert len(X_train) == len(y_train) and len(X_test) == len(y_test)

# Save the split datasets to CSV files (index dropped so the files contain
# only the feature/target columns).
X_train.to_csv("census_train_features.csv", index=False)
X_test.to_csv("census_test_features.csv", index=False)
y_train.to_csv("census_train_target.csv", index=False)
y_test.to_csv("census_test_target.csv", index=False)

# Print the shapes of the resulting datasets
print("Training set:")
print("Features:", X_train.shape)
print("Target:", y_train.shape)
print("\nTesting set:")
print("Features:", X_test.shape)
print("Target:", y_test.shape)


Training set:
Features: (11454, 109)
Target: (11454,)

Testing set:
Features: (4909, 109)
Target: (4909,)


In [2]:
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split

# Read the preprocessed census data, keeping only the rows that carry a label
data = pd.read_csv("census_preprocessed_dataset.csv").dropna(subset=["income"])

# "income" is the target; every remaining column is a feature
y = data["income"]
X = data.drop(columns=["income"])

# Hold out 30% of the rows for testing; the seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Fill missing feature values with column means learned from the training
# rows only, then apply those same means to both partitions
imputer = SimpleImputer(strategy="mean")
imputer.fit(X_train)
X_train_imputed = imputer.transform(X_train)
X_test_imputed = imputer.transform(X_test)

# Helper that trains one model and scores it on the held-out data
def evaluate_model(model, X_train, X_test, y_train, y_test):
    """Fit ``model`` on the training data and score its test predictions.

    Parameters:
        model: object exposing ``fit(X, y)`` and ``predict(X)``; it is fitted
            in place.
        X_train, y_train: features and labels passed to ``model.fit``.
        X_test, y_test: features passed to ``model.predict`` and the labels
            the predictions are compared against.

    Returns:
        tuple: ``(accuracy, precision, recall, f1)``, each computed by the
        matching ``sklearn.metrics`` function called with its default
        arguments.
    """
    model.fit(X_train, y_train)
    predicted = model.predict(X_test)

    # Evaluated in the same order as the returned tuple.
    scorers = (accuracy_score, precision_score, recall_score, f1_score)
    return tuple(scorer(y_test, predicted) for scorer in scorers)

# Initialize models.
# RandomForestClassifier is randomised; its seed is pinned so that re-running
# the notebook reproduces the same metrics.
models = {
    "Logistic Regression": LogisticRegression(),
    "Random Forest": RandomForestClassifier(random_state=42),
    "Support Vector Machine": SVC()
}

# Evaluate each model.
# Every model is scored on the same pre-imputed arrays. The imputer above was
# fit on the training rows only, so wrapping just the random forest in its own
# imputation pipeline produced the same inputs and was redundant.
metric_names = ["Accuracy", "Precision", "Recall", "F1 Score"]
results = {}
for name, model in models.items():
    print(f"Evaluating {name}...")
    scores = evaluate_model(model, X_train_imputed, X_test_imputed, y_train, y_test)
    # evaluate_model returns the four metrics in the order of metric_names.
    results[name] = dict(zip(metric_names, scores))

# NOTE(review): the recorded run of this cell shows perfect scores for two of
# the three models. That usually points at a feature that encodes the target
# (e.g. a column derived from "income") rather than genuine performance.
# TODO confirm the preprocessed feature columns before trusting these numbers.

# Print results
print("\nResults:")
results_df = pd.DataFrame.from_dict(results, orient='index')
print(results_df)


Evaluating Logistic Regression...
Evaluating Random Forest...
Evaluating Support Vector Machine...

Results:
                        Accuracy  Precision    Recall  F1 Score
Logistic Regression     1.000000   1.000000  1.000000  1.000000
Random Forest           1.000000   1.000000  1.000000  1.000000
Support Vector Machine  0.990366   0.998316  0.961102  0.979356


In [3]:
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score, KFold
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline

# Load the preprocessed dataset
data = pd.read_csv("census_preprocessed_dataset.csv")

# Drop rows with missing target values
data = data.dropna(subset=["income"])

# Separate features (X) and target variable (y)
X = data.drop("income", axis=1)
y = data["income"]

# Missing feature values are NOT imputed up front. Fitting the imputer on the
# full X would let each validation fold influence the column means used for
# training (data leakage). Instead each model is wrapped in a pipeline below,
# so the means are re-learned from the training portion of every fold.

# Define models.
# RandomForestClassifier is randomised; its seed is pinned for reproducibility.
models = {
    "Logistic Regression": LogisticRegression(),
    "Random Forest": RandomForestClassifier(random_state=42),
    "Support Vector Machine": SVC()
}

# Define K-fold cross-validation (shuffled, seeded so folds are repeatable)
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Evaluate each model using cross-validation
results = {}
for name, model in models.items():
    print(f"Evaluating {name}...")
    # Same treatment for every model: mean-impute, then fit, inside each fold.
    model_pipeline = make_pipeline(SimpleImputer(strategy="mean"), model)
    scores = cross_val_score(model_pipeline, X, y, cv=kf, scoring="accuracy")
    results[name] = {
        "Mean Accuracy": scores.mean(),
        "Standard Deviation": scores.std(),
        "Accuracy Scores": scores
    }

# NOTE(review): the recorded run shows (near-)perfect accuracy on every fold.
# That usually indicates a feature that encodes the target rather than real
# predictive skill. TODO confirm the preprocessed feature columns.

# Print results summary
print("\nResults Summary:")
for name, result in results.items():
    print(f"\n{name}:")
    print(f"Mean Accuracy: {result['Mean Accuracy']:.4f}")
    print(f"Standard Deviation: {result['Standard Deviation']:.4f}")
    print("Accuracy Scores:", result["Accuracy Scores"])


Evaluating Logistic Regression...
Evaluating Random Forest...
Evaluating Support Vector Machine...

Results Summary:

Logistic Regression:
Mean Accuracy: 1.0000
Standard Deviation: 0.0000
Accuracy Scores: [1. 1. 1. 1. 1.]

Random Forest:
Mean Accuracy: 0.9999
Standard Deviation: 0.0001
Accuracy Scores: [0.99973918 1.         1.         1.         1.        ]

Support Vector Machine:
Mean Accuracy: 0.9888
Standard Deviation: 0.0006
Accuracy Scores: [0.98852374 0.98904538 0.98956431 0.98773806 0.98930342]


In [4]:
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import cross_validate, KFold
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline

# Load the preprocessed dataset
data = pd.read_csv("census_preprocessed_dataset.csv")

# Drop rows with missing target values
data = data.dropna(subset=["income"])

# Separate features (X) and target variable (y)
X = data.drop("income", axis=1)
y = data["income"]

# Missing feature values are NOT imputed up front. Fitting the imputer on the
# full X would let each validation fold influence the column means used for
# training (data leakage). Instead each model is wrapped in a pipeline below,
# so the means are re-learned from the training portion of every fold.

# Define models.
# RandomForestClassifier is randomised; its seed is pinned for reproducibility.
models = {
    "Logistic Regression": LogisticRegression(),
    "Random Forest": RandomForestClassifier(random_state=42),
    "Support Vector Machine": SVC()
}

# Define K-fold cross-validation (shuffled, seeded so folds are repeatable)
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Define metrics to calculate
scoring = ['accuracy', 'precision', 'recall', 'f1']

# Report label -> key under which cross_validate returns the per-fold scores
report_keys = {
    "Accuracy": 'test_accuracy',
    "Precision": 'test_precision',
    "Recall": 'test_recall',
    "F1 Score": 'test_f1'
}

# Evaluate each model using cross-validation
results = {}
for name, model in models.items():
    print(f"Evaluating {name}...")
    # Same treatment for every model: mean-impute, then fit, inside each fold.
    model_pipeline = make_pipeline(SimpleImputer(strategy="mean"), model)
    cv_results = cross_validate(model_pipeline, X, y, cv=kf, scoring=scoring)
    results[name] = {label: cv_results[key] for label, key in report_keys.items()}

# NOTE(review): the recorded run shows perfect scores with zero variance for
# two of the three models. That usually indicates a feature that encodes the
# target rather than real predictive skill. TODO confirm the feature columns.

# Print results summary
print("\nResults Summary:")
for name, result in results.items():
    print(f"\n{name}:")
    for metric, values in result.items():
        print(f"{metric}: Mean={values.mean():.4f}, Std Dev={values.std():.4f}")


Evaluating Logistic Regression...
Evaluating Random Forest...
Evaluating Support Vector Machine...

Results Summary:

Logistic Regression:
Accuracy: Mean=1.0000, Std Dev=0.0000
Precision: Mean=1.0000, Std Dev=0.0000
Recall: Mean=1.0000, Std Dev=0.0000
F1 Score: Mean=1.0000, Std Dev=0.0000

Random Forest:
Accuracy: Mean=1.0000, Std Dev=0.0000
Precision: Mean=1.0000, Std Dev=0.0000
Recall: Mean=1.0000, Std Dev=0.0000
F1 Score: Mean=1.0000, Std Dev=0.0000

Support Vector Machine:
Accuracy: Mean=0.9958, Std Dev=0.0005
Precision: Mean=0.9975, Std Dev=0.0004
Recall: Mean=0.9940, Std Dev=0.0009
F1 Score: Mean=0.9958, Std Dev=0.0005
