In [9]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import gradio as gr

# Read the raw table from disk.
data = pd.read_csv("adult.csv")

# The file uses "?" as its missing-value marker: turn it into NaN and keep
# only fully populated rows. The trailing copy() hands back an independent
# frame, so the per-column assignments below act on `data` itself just as
# they did after the in-place calls.
data = data.replace('?', np.nan).dropna().copy()

# Label-encode every object-dtype column. One fitted encoder per column is
# kept (keyed by column name) so raw labels can be mapped to codes later.
categorical_cols = data.select_dtypes(include=['object']).columns
label_encoders = {name: LabelEncoder() for name in categorical_cols}
for name, encoder in label_encoders.items():
    data[name] = encoder.fit_transform(data[name])

# Separate the 'income' target from the remaining predictor columns.
y = data['income']
X = data.drop(columns='income')

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit a 100-tree random forest on the training portion only.
model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)


# Column averages over every row of X (train and test alike). predict_salary
# uses them as default values for the numeric features the form does not collect.
(
    mean_fnlwgt,
    mean_educational_num,
    mean_capital_gain,
    mean_capital_loss,
) = (
    X[column].mean()
    for column in ('fnlwgt', 'educational-num', 'capital-gain', 'capital-loss')
)


# For interface: define categorical options from label encoders
def get_options(col):
    """Return the category labels known for column `col` as a plain list.

    The labels are read from the `classes_` attribute of the encoder stored
    under `col` in the module-level `label_encoders` dict. A column without a
    stored encoder raises KeyError.
    """
    return list(label_encoders[col].classes_)

# Prediction function
def predict_salary(age, workclass, education, marital_status, occupation, relationship, race, gender, hours_per_week, native_country):
    """Predict the income class for one profile submitted through the form.

    Parameters
    ----------
    age, hours_per_week : number
        Numeric inputs; converted with int() before being passed to the model.
    workclass, education, marital_status, occupation, relationship, race,
    gender, native_country : str
        Raw category labels. Each must be one of the labels known to the
        matching encoder in `label_encoders`.

    Returns
    -------
    str
        ">50K" when the model predicts class 1, otherwise "<=50K".

    Raises
    ------
    gr.Error
        When an input is blank, is not a usable number, or is not a known
        category. Raising gr.Error lets Gradio show the message in the UI
        instead of an opaque failure.
    """
    numeric_inputs = {'age': age, 'hours-per-week': hours_per_week}
    categorical_inputs = {
        'workclass': workclass,
        'education': education,
        'marital-status': marital_status,
        'occupation': occupation,
        'relationship': relationship,
        'race': race,
        'gender': gender,
        'native-country': native_country,
    }

    input_dict = {}

    # A Number field left empty arrives as None; int() also rejects NaN/inf.
    for name, value in numeric_inputs.items():
        try:
            input_dict[name] = int(value)
        except (TypeError, ValueError, OverflowError):
            raise gr.Error(f"Please enter a valid number for '{name}'.") from None

    # A Dropdown left unselected arrives as None, and callers that bypass the
    # UI can send labels the encoder never saw; check before transform().
    for name, value in categorical_inputs.items():
        encoder = label_encoders[name]
        if value is None or value not in list(encoder.classes_):
            raise gr.Error(f"Please choose a valid option for '{name}'.")
        input_dict[name] = encoder.transform([value])[0]

    # Features the form does not collect fall back to dataset averages.
    input_dict['fnlwgt'] = mean_fnlwgt
    input_dict['capital-gain'] = mean_capital_gain
    input_dict['capital-loss'] = mean_capital_loss

    # Derive 'educational-num' from the rows that share the selected education
    # code rather than the global mean, so the two education features given to
    # the model do not contradict each other.
    # NOTE(review): assumes 'educational-num' tracks 'education', as it does in
    # the standard Adult census data - confirm for this file.
    same_education = X.loc[X['education'] == input_dict['education'], 'educational-num']
    if same_education.empty:
        input_dict['educational-num'] = mean_educational_num
    else:
        input_dict['educational-num'] = same_education.median()

    # Single-row frame, with columns put in the order used for training.
    input_df = pd.DataFrame([input_dict])[X.columns]

    prediction = model.predict(input_df)[0]
    return ">50K" if prediction == 1 else "<=50K"

# Gradio interface
# Input widgets, created in the same order as predict_salary's parameters.
form_inputs = [gr.Number(label="Age")]
form_inputs += [
    gr.Dropdown(get_options(column), label=caption)
    for column, caption in (
        ('workclass', "Workclass"),
        ('education', "Education"),
        ('marital-status', "Marital Status"),
        ('occupation', "Occupation"),
        ('relationship', "Relationship"),
        ('race', "Race"),
        ('gender', "Gender"),
    )
]
form_inputs.append(gr.Number(label="Hours per Week"))
form_inputs.append(gr.Dropdown(get_options('native-country'), label="Native Country"))

# Wire the form to predict_salary and show its return value as text.
interface = gr.Interface(
    fn=predict_salary,
    inputs=form_inputs,
    outputs=gr.Text(label="Predicted Salary Class"),
    title="Employee Salary Prediction",
    description="Predict whether an employee earns >50K or <=50K based on their profile.",
)

interface.launch()

It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://53e12bbcd6cfde54da.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




In [10]:
# Score the currently bound `model` on the held-out rows.
y_pred = model.predict(X_test)

# Share of test rows whose predicted class equals the true class.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Model Accuracy: {accuracy}")

Model Accuracy: 0.8560530679933664


In [11]:
from sklearn.model_selection import GridSearchCV

# Candidate hyper-parameter values; every combination is tried
# (2 * 3 * 2 * 2 = 24 candidates).
param_grid = dict(
    n_estimators=[100, 200],
    max_depth=[None, 10, 20],
    min_samples_split=[2, 5],
    min_samples_leaf=[1, 2],
)

# 3-fold cross-validated search over the grid, run on the training split only.
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=3,
    n_jobs=-1,
    verbose=2,
).fit(X_train, y_train)

print(f"Best parameters found: {grid_search.best_params_}")

# Rebind `model` to the best estimator found, so code that looks up `model`
# afterwards (including predict_salary) uses the tuned forest.
model = grid_search.best_estimator_

Fitting 3 folds for each of 24 candidates, totalling 72 fits
Best parameters found: {'max_depth': 20, 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 200}


In [12]:
# Re-score on the held-out rows; `model` now refers to the tuned estimator.
y_pred = model.predict(X_test)

# Share of test rows whose predicted class equals the true class.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Model Accuracy after tuning: {accuracy}")

Model Accuracy after tuning: 0.867551133222775
