In [1]:
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv("income.csv")

In [3]:
df

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,40,Self-emp-not-inc,223881,Prof-school,15,Married-civ-spouse,Prof-specialty,Husband,White,Male,99999,0,70,United-States,>50K
1,30,Private,149118,HS-grad,9,Divorced,Craft-repair,Not-in-family,White,Female,0,0,40,United-States,<=50K
2,46,Private,109209,Some-college,10,Married-civ-spouse,Adm-clerical,Husband,White,Male,0,0,40,United-States,>50K
3,32,Private,229566,Assoc-voc,11,Married-civ-spouse,Other-service,Husband,White,Male,0,0,60,United-States,>50K
4,54,?,148657,Preschool,1,Married-civ-spouse,?,Wife,White,Female,0,0,40,Mexico,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24995,40,Private,130834,Some-college,10,Never-married,Adm-clerical,Not-in-family,White,Female,0,0,40,United-States,<=50K
24996,31,Local-gov,33124,Bachelors,13,Never-married,Prof-specialty,Not-in-family,White,Female,0,0,50,United-States,<=50K
24997,38,Federal-gov,190895,Bachelors,13,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,40,?,>50K
24998,23,Private,420973,Bachelors,13,Never-married,Prof-specialty,Not-in-family,White,Female,0,0,40,United-States,<=50K


In [7]:
df.dtypes

age                int64
workclass         object
fnlwgt             int64
education         object
education.num      int64
marital.status    object
occupation        object
relationship      object
race              object
sex               object
capital.gain       int64
capital.loss       int64
hours.per.week     int64
native.country    object
income            object
dtype: object

In [20]:
columns_to_convert = ['workclass', 'education', 'marital.status', 'occupation', 'relationship', 'race', 'sex', 'native.country', 'income']

# Convert the object columns to string data type
df[columns_to_convert] = df[columns_to_convert].applymap(str)

# Check the data types after conversion
print(df.dtypes)

age                int64
workclass         object
fnlwgt             int64
education         object
education.num      int64
marital.status    object
occupation        object
relationship      object
race              object
sex               object
capital.gain       int64
capital.loss       int64
hours.per.week     int64
native.country    object
income            object
dtype: object


In [37]:
(df == '?').sum()

age                  0
workclass         1429
fnlwgt               0
education            0
education.num        0
marital.status       0
occupation        1434
relationship         0
race                 0
sex                  0
capital.gain         0
capital.loss         0
hours.per.week       0
native.country     437
income               0
dtype: int64

In [41]:
columns_with_question_mark = ['workclass', 'occupation', 'native.country']

# Create a copy of the dataset with rows containing '?' removed
df_copy = df.drop(df[df[columns_with_question_mark].isin(['?']).any(axis=1)].index).copy()

# Check the number of rows in the original and copy datasets
print("Number of rows in the original dataset:", len(df))
print("Number of rows in the copy dataset:", len(df_copy))

Number of rows in the original dataset: 25000
Number of rows in the copy dataset: 23149


In [40]:
23149/25000

0.92596

In [44]:
df['workclass'].unique()

array(['Self-emp-not-inc', 'Private', '?', 'Local-gov', 'Self-emp-inc',
       'State-gov', 'Federal-gov', 'Without-pay', 'Never-worked'],
      dtype=object)

### Code below shows removing the missing values has little effect on changing the message of the df

In [49]:
def calculate_percentage(counts):
    total = counts.sum()
    return (counts / total * 100).round(2)

# List of columns that are of string data type
string_columns = ['workclass', 'education', 'marital.status', 'occupation', 'relationship', 'race', 'sex', 'native.country', 'income']

unique_values_info = []
for column in string_columns:
    unique_values_df = df[column].value_counts()
    unique_values_df_copy = df_copy[column].value_counts()

    percentages_df = calculate_percentage(unique_values_df)
    percentages_df_copy = calculate_percentage(unique_values_df_copy)

    unique_values_info.append((column, unique_values_df, unique_values_df_copy, percentages_df, percentages_df_copy))

# Display the results
for column, unique_values_df, unique_values_df_copy, percentages_df, percentages_df_copy in unique_values_info:
    print(f"{column}")
    for value, count_df, count_df_copy, percent_df, percent_df_copy in zip(unique_values_df.index, unique_values_df, unique_values_df_copy, percentages_df, percentages_df_copy):
        print(f"{value}: {count_df} : {count_df_copy}   ({percent_df}% : {percent_df_copy}%)")
    print("=" * 50)

# Calculate the mean difference in percentages
mean_percent_diff = sum(abs(percentages_df - percentages_df_copy)) / len(percentages_df)
print("Mean Percentage Difference:", mean_percent_diff)


workclass
Private: 17471 : 17157   (69.88% : 74.12%)
Self-emp-not-inc: 1935 : 1908   (7.74% : 8.24%)
Local-gov: 1553 : 1535   (6.21% : 6.63%)
?: 1429 : 992   (5.72% : 4.29%)
State-gov: 1004 : 820   (4.02% : 3.54%)
Self-emp-inc: 851 : 725   (3.4% : 3.13%)
Federal-gov: 740 : 12   (2.96% : 0.05%)
education
HS-grad: 8025 : 7520   (32.1% : 32.49%)
Some-college: 5621 : 5144   (22.48% : 22.22%)
Bachelors: 4104 : 3873   (16.42% : 16.73%)
Masters: 1301 : 1231   (5.2% : 5.32%)
Assoc-voc: 1063 : 1008   (4.25% : 4.35%)
11th: 905 : 805   (3.62% : 3.48%)
Assoc-acdm: 809 : 760   (3.24% : 3.28%)
10th: 736 : 647   (2.94% : 2.79%)
7th-8th: 476 : 420   (1.9% : 1.81%)
Prof-school: 449 : 410   (1.8% : 1.77%)
9th: 403 : 351   (1.61% : 1.52%)
12th: 344 : 301   (1.38% : 1.3%)
Doctorate: 328 : 298   (1.31% : 1.29%)
5th-6th: 265 : 228   (1.06% : 0.98%)
1st-4th: 135 : 121   (0.54% : 0.52%)
Preschool: 36 : 32   (0.14% : 0.14%)
marital.status
Married-civ-spouse: 11518 : 10830   (46.07% : 46.78%)
Never-married: 820

In [51]:
df = df_copy

In [52]:
df

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,40,Self-emp-not-inc,223881,Prof-school,15,Married-civ-spouse,Prof-specialty,Husband,White,Male,99999,0,70,United-States,>50K
1,30,Private,149118,HS-grad,9,Divorced,Craft-repair,Not-in-family,White,Female,0,0,40,United-States,<=50K
2,46,Private,109209,Some-college,10,Married-civ-spouse,Adm-clerical,Husband,White,Male,0,0,40,United-States,>50K
3,32,Private,229566,Assoc-voc,11,Married-civ-spouse,Other-service,Husband,White,Male,0,0,60,United-States,>50K
5,63,Private,111963,Some-college,10,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,16,United-States,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24994,70,Private,278139,HS-grad,9,Married-civ-spouse,Sales,Husband,White,Male,3432,0,40,United-States,<=50K
24995,40,Private,130834,Some-college,10,Never-married,Adm-clerical,Not-in-family,White,Female,0,0,40,United-States,<=50K
24996,31,Local-gov,33124,Bachelors,13,Never-married,Prof-specialty,Not-in-family,White,Female,0,0,50,United-States,<=50K
24998,23,Private,420973,Bachelors,13,Never-married,Prof-specialty,Not-in-family,White,Female,0,0,40,United-States,<=50K


In [54]:
df['capital.gain'].value_counts()

capital.gain
0        21174
15024      265
7688       209
7298       176
99999      110
         ...  
2538         1
1173         1
2387         1
22040        1
6097         1
Name: count, Length: 116, dtype: int64

In [55]:
df['capital.loss'].value_counts()

capital.loss
0       22049
1902      147
1887      125
1977      119
1848       40
        ...  
1411        1
155         1
1755        1
2080        1
2467        1
Name: count, Length: 88, dtype: int64

In [56]:
columns_to_remove = ['capital.gain', 'capital.loss']
df = df.drop(columns=columns_to_remove)


In [57]:
df

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,hours.per.week,native.country,income
0,40,Self-emp-not-inc,223881,Prof-school,15,Married-civ-spouse,Prof-specialty,Husband,White,Male,70,United-States,>50K
1,30,Private,149118,HS-grad,9,Divorced,Craft-repair,Not-in-family,White,Female,40,United-States,<=50K
2,46,Private,109209,Some-college,10,Married-civ-spouse,Adm-clerical,Husband,White,Male,40,United-States,>50K
3,32,Private,229566,Assoc-voc,11,Married-civ-spouse,Other-service,Husband,White,Male,60,United-States,>50K
5,63,Private,111963,Some-college,10,Married-civ-spouse,Prof-specialty,Husband,White,Male,16,United-States,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...
24994,70,Private,278139,HS-grad,9,Married-civ-spouse,Sales,Husband,White,Male,40,United-States,<=50K
24995,40,Private,130834,Some-college,10,Never-married,Adm-clerical,Not-in-family,White,Female,40,United-States,<=50K
24996,31,Local-gov,33124,Bachelors,13,Never-married,Prof-specialty,Not-in-family,White,Female,50,United-States,<=50K
24998,23,Private,420973,Bachelors,13,Never-married,Prof-specialty,Not-in-family,White,Female,40,United-States,<=50K


In [58]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [59]:
df['income'].unique()

array(['>50K', '<=50K'], dtype=object)

In [64]:
def map_income_hours(row):
    if row['hours.per.week'] < 40 and row['income'] == '<=50K':
        return 0
    elif row['hours.per.week'] > 40 and row['income'] == '<=50K':
        return 1
    elif row['hours.per.week'] < 40 and row['income'] == '>50K':
        return 2
    elif row['hours.per.week'] > 40 and row['income'] == '>50K':
        return 4
    else:
        return -1  # Default value when none of the conditions are met

# Create the new column using the apply method
df['new_column'] = df.apply(map_income_hours, axis=1)

In [74]:
df['hours.per.week/education.num'] = df['hours.per.week']/df['education.num']

In [76]:
df['age/education.num'] = df['age']/df['education.num']

In [77]:
df

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,hours.per.week,native.country,income,new_column,hours.per.week/education.num,age/education.num
0,40,Self-emp-not-inc,223881,Prof-school,15,Married-civ-spouse,Prof-specialty,Husband,White,Male,70,United-States,>50K,4,4.666667,2.666667
1,30,Private,149118,HS-grad,9,Divorced,Craft-repair,Not-in-family,White,Female,40,United-States,<=50K,-1,4.444444,3.333333
2,46,Private,109209,Some-college,10,Married-civ-spouse,Adm-clerical,Husband,White,Male,40,United-States,>50K,-1,4.000000,4.600000
3,32,Private,229566,Assoc-voc,11,Married-civ-spouse,Other-service,Husband,White,Male,60,United-States,>50K,4,5.454545,2.909091
5,63,Private,111963,Some-college,10,Married-civ-spouse,Prof-specialty,Husband,White,Male,16,United-States,<=50K,0,1.600000,6.300000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24994,70,Private,278139,HS-grad,9,Married-civ-spouse,Sales,Husband,White,Male,40,United-States,<=50K,-1,4.444444,7.777778
24995,40,Private,130834,Some-college,10,Never-married,Adm-clerical,Not-in-family,White,Female,40,United-States,<=50K,-1,4.000000,4.000000
24996,31,Local-gov,33124,Bachelors,13,Never-married,Prof-specialty,Not-in-family,White,Female,50,United-States,<=50K,1,3.846154,2.384615
24998,23,Private,420973,Bachelors,13,Never-married,Prof-specialty,Not-in-family,White,Female,40,United-States,<=50K,-1,3.076923,1.769231


In [78]:
label_encoder = LabelEncoder()
categorical_columns = ['workclass', 'education', 'marital.status', 'occupation', 'relationship', 'race', 'sex', 'native.country', 'income', 'new_column', 'hours.per.week/education.num',	'age/education.num']

In [79]:
train_df = df.copy()

In [80]:
for column in categorical_columns:
    train_df[column] = label_encoder.fit_transform(df[column])

In [81]:
train_df.dtypes

age                             int64
workclass                       int32
fnlwgt                          int64
education                       int32
education.num                   int64
marital.status                  int32
occupation                      int32
relationship                    int32
race                            int32
sex                             int32
hours.per.week                  int64
native.country                  int32
income                          int32
new_column                      int64
hours.per.week/education.num    int64
age/education.num               int64
dtype: object

In [82]:
X = train_df.drop('income', axis=1)
y = train_df['income']

In [83]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [84]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [85]:
model = LinearRegression()

# Step 7: Train the model on the training data
model.fit(X_train, y_train)

# Step 8: Make predictions on the testing data
y_pred = model.predict(X_test)

# Step 9: Evaluate the model's performance
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error (MSE): {mse}")
print(f"R-squared (R2): {r2}")

Mean Squared Error (MSE): 0.11780506638000192
R-squared (R2): 0.3686038951019539


In [86]:
from sklearn.ensemble import RandomForestRegressor

In [88]:
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)

# Step 7: Train the Random Forest model on the training data
rf_model.fit(X_train, y_train)

# Step 8: Make predictions on the testing data
y_pred_rf = rf_model.predict(X_test)

# Step 9: Evaluate the Random Forest model's performance
mse_rf = mean_squared_error(y_test, y_pred_rf)
r2_rf = r2_score(y_test, y_pred_rf)

print(f"Random Forest Regression:")
print(f"Mean Squared Error (MSE): {mse_rf}")
print(f"R-squared (R2): {r2_rf}")

Random Forest Regression:
Mean Squared Error (MSE): 0.058928033603311725
R-squared (R2): 0.6841652737886977


In [89]:
from sklearn.ensemble import GradientBoostingRegressor

In [90]:
gb_model = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, random_state=42)

# Step 7: Train the Gradient Boosting model on the training data
gb_model.fit(X_train, y_train)

# Step 8: Make predictions on the testing data
y_pred_gb = gb_model.predict(X_test)

# Step 9: Evaluate the Gradient Boosting model's performance
mse_gb = mean_squared_error(y_test, y_pred_gb)
r2_gb = r2_score(y_test, y_pred_gb)

print(f"Gradient Boosting Regression:")
print(f"Mean Squared Error (MSE): {mse_gb}")
print(f"R-squared (R2): {r2_gb}")

Gradient Boosting Regression:
Mean Squared Error (MSE): 0.05399232608907543
R-squared (R2): 0.7106190299400023


In [94]:
def predict_income(age, workclass, fnlwgt, education, education_num, marital_status, occupation,
                   relationship, race, sex, hours_per_week, native_country, label_encoder):
    # Encode categorical features
    workclass_encoded = label_encoder.transform([workclass])[0]
    education_encoded = label_encoder.transform([education])[0]
    marital_status_encoded = label_encoder.transform([marital_status])[0]
    occupation_encoded = label_encoder.transform([occupation])[0]
    relationship_encoded = label_encoder.transform([relationship])[0]
    race_encoded = label_encoder.transform([race])[0]
    sex_encoded = label_encoder.transform([sex])[0]
    native_country_encoded = label_encoder.transform([native_country])[0]

    # Create a DataFrame with the input values
    input_data = pd.DataFrame({
        'age': [age],
        'workclass': [workclass_encoded],
        'fnlwgt': [fnlwgt],
        'education': [education_encoded],
        'education.num': [education_num],
        'marital.status': [marital_status_encoded],
        'occupation': [occupation_encoded],
        'relationship': [relationship_encoded],
        'race': [race_encoded],
        'sex': [sex_encoded],
        'hours.per.week': [hours_per_week],
        'native.country': [native_country_encoded]
    })

    # Make prediction using the Gradient Boosting model
    predicted_income = gb_model.predict(input_data)[0]
    return predicted_income


In [92]:
df

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,hours.per.week,native.country,income,new_column,hours.per.week/education.num,age/education.num
0,40,Self-emp-not-inc,223881,Prof-school,15,Married-civ-spouse,Prof-specialty,Husband,White,Male,70,United-States,>50K,4,4.666667,2.666667
1,30,Private,149118,HS-grad,9,Divorced,Craft-repair,Not-in-family,White,Female,40,United-States,<=50K,-1,4.444444,3.333333
2,46,Private,109209,Some-college,10,Married-civ-spouse,Adm-clerical,Husband,White,Male,40,United-States,>50K,-1,4.000000,4.600000
3,32,Private,229566,Assoc-voc,11,Married-civ-spouse,Other-service,Husband,White,Male,60,United-States,>50K,4,5.454545,2.909091
5,63,Private,111963,Some-college,10,Married-civ-spouse,Prof-specialty,Husband,White,Male,16,United-States,<=50K,0,1.600000,6.300000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24994,70,Private,278139,HS-grad,9,Married-civ-spouse,Sales,Husband,White,Male,40,United-States,<=50K,-1,4.444444,7.777778
24995,40,Private,130834,Some-college,10,Never-married,Adm-clerical,Not-in-family,White,Female,40,United-States,<=50K,-1,4.000000,4.000000
24996,31,Local-gov,33124,Bachelors,13,Never-married,Prof-specialty,Not-in-family,White,Female,50,United-States,<=50K,1,3.846154,2.384615
24998,23,Private,420973,Bachelors,13,Never-married,Prof-specialty,Not-in-family,White,Female,40,United-States,<=50K,-1,3.076923,1.769231
