## Linear Regression on `adult.csv`

In [1]:
import pandas as pd

df = pd.read_csv('adult.csv')
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,target
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [2]:
df.dtypes

age                int64
workclass         object
fnlwgt             int64
education         object
education-num      int64
marital-status    object
occupation        object
relationship      object
race              object
sex               object
capital-gain       int64
capital-loss       int64
hours-per-week     int64
native-country    object
target            object
dtype: object

### Explore the data

In [3]:
df['age'].hist(backend='plotly', title='Age Distribution')

In [4]:
df['target'].value_counts().plot(kind='bar', backend='plotly', title='Target Distribution')

In [5]:
df['workclass'].value_counts().plot(kind='bar', backend='plotly', title='Workclass Distribution')

In [6]:
df['workclass'].unique()

array([' State-gov', ' Self-emp-not-inc', ' Private', ' Federal-gov',
       ' Local-gov', ' ?', ' Self-emp-inc', ' Without-pay',
       ' Never-worked'], dtype=object)

### Remove the Missing Values

In [7]:
# replace ' ?' with 'Unknown'
df['workclass'] = df['workclass'].replace(' ?', 'Unknown')
df['workclass'].unique()

array([' State-gov', ' Self-emp-not-inc', ' Private', ' Federal-gov',
       ' Local-gov', 'Unknown', ' Self-emp-inc', ' Without-pay',
       ' Never-worked'], dtype=object)

### Splitting features and target label

In [8]:
x = df.drop('target', axis=1)
y = df['target']

print(f"Shape of x = {x.shape} and y = {y.shape}")

Shape of x = (32561, 14) and y = (32561,)


### Splitting training and testing data - 75 % for training 25 % for testing

In [9]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Encode the target variable
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

x_train, x_test, y_train, y_test = train_test_split(x, y_encoded, test_size=0.25)

print(f"x_train: {x_train.shape}")
print(f"x_test: {x_test.shape}")
print(f"y_train: {y_train.shape}")
print(f"y_test: {y_test.shape}")

x_train: (24420, 14)
x_test: (8141, 14)
y_train: (24420,)
y_test: (8141,)


In [10]:
x_train.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country
28149,43,Private,219307,9th,5,Divorced,Transport-moving,Not-in-family,Black,Male,0,0,40,United-States
30878,39,Private,230356,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Female,0,0,40,United-States
32242,33,Private,356015,HS-grad,9,Separated,Craft-repair,Not-in-family,Amer-Indian-Eskimo,Male,0,0,35,Hong
22928,17,Unknown,332666,10th,6,Never-married,?,Own-child,White,Female,0,0,4,United-States
2482,18,Private,230875,11th,7,Never-married,Adm-clerical,Not-in-family,White,Male,0,0,40,United-States


In [11]:
x_test.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country
16411,55,Private,176904,10th,6,Married-civ-spouse,Transport-moving,Husband,White,Male,0,0,50,United-States
16178,56,Private,101436,HS-grad,9,Divorced,Adm-clerical,Other-relative,Amer-Indian-Eskimo,Female,0,0,35,United-States
8667,36,Private,214604,Bachelors,13,Married-civ-spouse,Sales,Husband,White,Male,0,0,42,United-States
32475,22,Private,67234,HS-grad,9,Never-married,Handlers-cleaners,Unmarried,White,Male,0,0,45,United-States
7779,17,Private,287160,11th,7,Never-married,Other-service,Own-child,White,Female,0,0,15,United-States


### Standardize the data

In [12]:
# Column Transformation
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

numerical_attributes = x_train.select_dtypes(include=['int64', 'float64']).columns
categorical_attributes = x_train.select_dtypes(include=['object']).columns

ct = ColumnTransformer([
    ('standar_sclaer', StandardScaler(), numerical_attributes),
    ('one_hot_encoder', OneHotEncoder(handle_unknown='ignore'), categorical_attributes)
]).fit(x_train)

transformed_x_train = ct.transform(x_train)
transformed_x_test = ct.transform(x_test)

transformed_x_train, transformed_x_test

(<Compressed Sparse Row sparse matrix of dtype 'float64'
 	with 341880 stored elements and shape (24420, 108)>,
 <Compressed Sparse Row sparse matrix of dtype 'float64'
 	with 113974 stored elements and shape (8141, 108)>)

### Dimensionality Reduction using PCA

In [13]:
from sklearn.decomposition import PCA

pca = PCA(n_components=2)
pca.fit(transformed_x_train)

x_train_pca = pca.transform(transformed_x_train)
x_test_pca = pca.transform(transformed_x_test)

In [14]:
x_train_pca, x_test_pca

(array([[-0.91695242, -1.47637228],
        [-0.19097434,  1.28210442],
        [-1.03023059, -0.06339946],
        ...,
        [-1.4110567 , -0.53185563],
        [ 0.80690903, -0.4547174 ],
        [ 1.07613514,  0.4101416 ]], shape=(24420, 2)),
 array([[ 0.65880022, -2.0109056 ],
        [-0.16166635, -1.18238412],
        [ 0.82470584,  0.72859655],
        ...,
        [-0.92446571,  0.22673239],
        [-1.77512904,  0.95834461],
        [ 1.03356507, -1.82320902]], shape=(8141, 2)))

### Linear and Logistic Regression

In [15]:
# Import the necessary libraries
from sklearn.linear_model import LinearRegression, LogisticRegression

# Initialize the Linear Regression model
linear_regression = LinearRegression()
logistic_regression = LogisticRegression()

# Fit the model to your training data
linear_regression.fit(transformed_x_train, y_train)
logistic_regression.fit(transformed_x_train, y_train)

# Make predictions using the fitted model
y_test_pred_linear = linear_regression.predict(transformed_x_test)
y_test_pred_logistic = logistic_regression.predict(transformed_x_test)

In [16]:
print(f"Score for Linear Regression: {linear_regression.score(transformed_x_test, y_test)}")
print(f"Score for Logistic Regression: {logistic_regression.score(transformed_x_test, y_test)}")

Score for Linear Regression: 0.370347773304061
Score for Logistic Regression: 0.8554231666871391


In [17]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

mae = mean_absolute_error(y_test, y_test_pred_linear)
mse = mean_squared_error(y_test, y_test_pred_linear)
r2 = r2_score(y_test, y_test_pred_linear)

print(f"Mean Absolute Error (MAE): {mae}")
print(f"Mean Squared Error (MSE): {mse}")
print(f"R-squared (R²): {r2}")

Mean Absolute Error (MAE): 0.2644938803087496
Mean Squared Error (MSE): 0.11719464438915464
R-squared (R²): 0.370347773304061


In [18]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Evaluate the model
accuracy = accuracy_score(y_test, y_test_pred_logistic)
conf_matrix = confusion_matrix(y_test, y_test_pred_logistic)
class_report = classification_report(y_test, y_test_pred_logistic)

print(f"Accuracy: {accuracy}")
print(f"Confusion Matrix:\n{conf_matrix}")
print(f"Classification Report:\n{class_report}")

Accuracy: 0.8554231666871391
Confusion Matrix:
[[5730  398]
 [ 779 1234]]
Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.94      0.91      6128
           1       0.76      0.61      0.68      2013

    accuracy                           0.86      8141
   macro avg       0.82      0.77      0.79      8141
weighted avg       0.85      0.86      0.85      8141



### Naive Bayes

In [19]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Initialize the Naive Bayes model
naive_bayes = GaussianNB()

# Fit the model to your training data
naive_bayes.fit(transformed_x_train.toarray(), y_train)

# Make predictions using the fitted model
y_test_pred_naive_bayes = naive_bayes.predict(transformed_x_test.toarray())

In [20]:
# Evaluate the model
accuracy = accuracy_score(y_test, y_test_pred_naive_bayes)
conf_matrix = confusion_matrix(y_test, y_test_pred_naive_bayes)
class_report = classification_report(y_test, y_test_pred_naive_bayes)

print(f"Accuracy: {accuracy}")
print("Confusion Matrix:")
print(conf_matrix)
print("Classification Report:")
print(class_report)

Accuracy: 0.5609875936617124
Confusion Matrix:
[[2661 3467]
 [ 107 1906]]
Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.43      0.60      6128
           1       0.35      0.95      0.52      2013

    accuracy                           0.56      8141
   macro avg       0.66      0.69      0.56      8141
weighted avg       0.81      0.56      0.58      8141



### SVM

In [21]:
from sklearn.svm import SVC

# Initialize the SVM classifier
svm_classifier = SVC()

# Fit the classifier to your transformed training data
svm_classifier.fit(transformed_x_train, y_train)

# Make predictions using the fitted classifier
y_test_pred_svm = svm_classifier.predict(transformed_x_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_test_pred_svm)
conf_matrix = confusion_matrix(y_test, y_test_pred_svm)
class_report = classification_report(y_test, y_test_pred_svm)

print(f"Accuracy: {accuracy}")
print("Confusion Matrix:")
print(conf_matrix)
print("Classification Report:")
print(class_report)

Accuracy: 0.8597223928264341
Confusion Matrix:
[[5800  328]
 [ 814 1199]]
Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.95      0.91      6128
           1       0.79      0.60      0.68      2013

    accuracy                           0.86      8141
   macro avg       0.83      0.77      0.79      8141
weighted avg       0.85      0.86      0.85      8141



### Comparing the performance of different models

In [22]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

# Initialize the models
linear_regression = LinearRegression()
logistic_regression = LogisticRegression()
naive_bayes = GaussianNB()
svm_classifier = SVC()
decisition_tree_classifiiers = DecisionTreeClassifier()
knn_classifier = KNeighborsClassifier()

# Fit the models
linear_regression.fit(transformed_x_train, y_train)
logistic_regression.fit(transformed_x_train, y_train)
naive_bayes.fit(transformed_x_train.toarray(), y_train)
svm_classifier.fit(transformed_x_train, y_train)
decisition_tree_classifiiers.fit(transformed_x_train, y_train)
knn_classifier.fit(transformed_x_train, y_train)

# Make predictions
y_test_pred_linear = linear_regression.predict(transformed_x_test)
y_test_pred_logistic = logistic_regression.predict(transformed_x_test)
y_test_pred_naive_bayes = naive_bayes.predict(transformed_x_test.toarray())
y_test_pred_svm = svm_classifier.predict(transformed_x_test)
y_test_pred_decision_tree = decisition_tree_classifiiers.predict(transformed_x_test)
y_test_pred_knn = knn_classifier.predict(transformed_x_test)

y_test_pred_liner_binary = (y_test_pred_linear > 0.5).astype(int)

# Evaluate the models
accuracy_linear = accuracy_score(y_test, y_test_pred_liner_binary)
accuracy_logistic = accuracy_score(y_test, y_test_pred_logistic)
accuracy_naive_bayes = accuracy_score(y_test, y_test_pred_naive_bayes)
accuracy_svm = accuracy_score(y_test, y_test_pred_svm)
accuracy_decision_tree = accuracy_score(y_test, y_test_pred_decision_tree)
accuracy_knn = accuracy_score(y_test, y_test_pred_knn)

In [None]:
# Plot the accuracies
import plotly.express as px

models = [
    "Linear Regression",
    "Logistic Regression",
    "Naive Bayes",
    "SVM",
    "Decision Tree",
    "KNN",
]
accuracies = list(
    map(
        lambda x: round(x * 100, 2),
        [
            accuracy_linear,
            accuracy_logistic,
            accuracy_naive_bayes,
            accuracy_svm,
            accuracy_decision_tree,
            accuracy_knn,
        ],
    )
)

fig = px.bar(
    x=models,
    y=accuracies,
    color=models,
    title="Model Comparison",
)
fig.update_layout(
    xaxis_title="Model",
    yaxis_title="Accuracy",
)
fig.show()