# Assignment 6 - 24/01/2024

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

### 1. Data Preparation: 

In [2]:
file_path = "C:/Users/belki/OneDrive - marmara.edu.tr/Masaüstü/Doktora/Advanced Data Analysis/Datasets/UCI Adult/adult.data" 
# get data
column_names = [
    "age", "work class", "final weight", "education level", "education years", 
    "marital status", "occupation", "relationship", "race", "sex", 
    "capital gain", "capital loss", "working hours per week", "native country", "income"
]
# rename variables

df = pd.read_csv(file_path, header=None, names=column_names, skipinitialspace=True) # skipinitialspace=True -> help to clean unnecessary spaces

print(df.head())
# check data if it works

df.replace('?', np.nan, inplace=True) # I realized that missings are coded as ? so I remove them and make real NaNs.

print(df.isnull().sum()) 
# check missings

categorical_columns = df.select_dtypes(include=['object']).columns
numerical_columns = df.select_dtypes(include=['int64', 'float64']).columns
# recode categorical to numeric

scaler = StandardScaler()
df[numerical_columns] = scaler.fit_transform(df[numerical_columns])
# normalization

df['income'] = df['income'].map({'<=50K': 0, '>50K': 1})
# turn target variable into a binary one

X = df.drop('income', axis=1)  # set predictors except target
y = df['income']  # and target variable
X = pd.get_dummies(X, drop_first=True) # transform catrgoric variables

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=45)
# split data as test and train ( 70% for training, 30% for test)


   age        work class  final weight education level  education years  \
0   39         State-gov         77516       Bachelors               13   
1   50  Self-emp-not-inc         83311       Bachelors               13   
2   38           Private        215646         HS-grad                9   
3   53           Private        234721            11th                7   
4   28           Private        338409       Bachelors               13   

       marital status         occupation   relationship   race     sex  \
0       Never-married       Adm-clerical  Not-in-family  White    Male   
1  Married-civ-spouse    Exec-managerial        Husband  White    Male   
2            Divorced  Handlers-cleaners  Not-in-family  White    Male   
3  Married-civ-spouse  Handlers-cleaners        Husband  Black    Male   
4  Married-civ-spouse     Prof-specialty           Wife  Black  Female   

   capital gain  capital loss  working hours per week native country income  
0          2174           

### 2. Model Implementation: 

##### Logistic Regression Model

In [61]:
log_reg = LogisticRegression(max_iter=1000, random_state=45)
log_reg.fit(X_train, y_train)
# model training

y_pred_logreg = log_reg.predict(X_test)
#prediction 

y_train_pred = log_reg.predict(X_train)
train_accuracy = accuracy_score(y_train, y_train_pred)
print("Training performance (Logistic Regression):\n", classification_report(y_train, y_train_pred))
#check the performance of training

# Test seti performansı
y_test_pred = log_reg.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred)
print("Test performance (Logistic Regression):\n", classification_report(y_test, y_test_pred))
#check the performance of test

Training performance (Logistic Regression):
               precision    recall  f1-score   support

           0       0.88      0.93      0.91     17305
           1       0.74      0.60      0.66      5487

    accuracy                           0.85     22792
   macro avg       0.81      0.77      0.78     22792
weighted avg       0.85      0.85      0.85     22792

Test performance (Logistic Regression):
               precision    recall  f1-score   support

           0       0.88      0.94      0.91      7415
           1       0.74      0.59      0.66      2354

    accuracy                           0.85      9769
   macro avg       0.81      0.76      0.78      9769
weighted avg       0.84      0.85      0.85      9769



I print a classification report for the test set of the model. To check if there is an overfitting problem in the model, I also generate a report for the training set and compare the values. 

The accuracy value of the model is 0.85, which shows that the model correctly classified 85% of the test data. When we look at the class differences, the model is correct in 88% of its predictions for the income class of 50k and below (precision: 0.88). In addition, it really predicted 94% of the income class of 50k and below (recall: 0.94). On the other hand, the model is correct in 74% of its predictions for the income class of over 50k (precision: 0.74). Also, it predicted 59% of the income class of over 50k (recall: 0.59). The low precision and recall values ​​for the class above 50k caused the F1 score of this class (.066) to be lower than the class below 50k (0.91).

Finally, the very small difference in training accuracy (85.21%) and test accuracy (85.16%) shows that there is no overfitting problem in the model.

In [48]:
param_grid = {'C': [0.01, 0.1, 1, 10, 100]}
grid = GridSearchCV(LogisticRegression(max_iter=1000, random_state=45), param_grid, cv=5, scoring='f1_weighted')
grid.fit(X_train, y_train)
# optimize C hyperparameter

print("Which one is better?", grid.best_params_)
# see which one is better

Which one is better? {'C': 0.1}


In [56]:
best_C = grid.best_params_['C']
optimized_log_reg = LogisticRegression(C=best_C, max_iter=1000, random_state=45)
optimized_log_reg.fit(X_train, y_train)
y_test_pred = optimized_log_reg.predict(X_test)
# run the test again with the best C parameter

print("Optimized test performance:\n", classification_report(y_test, y_test_pred))
# see the results

Optimized test performance:
               precision    recall  f1-score   support

           0       0.87      0.94      0.91      7415
           1       0.75      0.58      0.65      2354

    accuracy                           0.85      9769
   macro avg       0.81      0.76      0.78      9769
weighted avg       0.84      0.85      0.84      9769



After optimizing the C hyperparameter of logistic regression, for 50k and above, precision (0.75) increases slightly, while recall (0.58) and F1 score (0.65) decrease slightly. The optimization seems to have not worked very well :(

##### Decision Tree Classifier

In [71]:
dt_model = DecisionTreeClassifier(random_state=45)
dt_model.fit(X_train, y_train) 
# train model

y_train_pred = dt_model.predict(X_train)
y_test_pred = dt_model.predict(X_test)
# predict training and test sets

print("Training performance (Decision Tree):\n", classification_report(y_train, y_train_pred))
print("Test performance (Decision Tree):\n", classification_report(y_test, y_test_pred))
# check the performances

Training performance (Decision Tree):
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     17305
           1       1.00      1.00      1.00      5487

    accuracy                           1.00     22792
   macro avg       1.00      1.00      1.00     22792
weighted avg       1.00      1.00      1.00     22792

Test performance (Decision Tree):
               precision    recall  f1-score   support

           0       0.88      0.88      0.88      7415
           1       0.62      0.61      0.62      2354

    accuracy                           0.82      9769
   macro avg       0.75      0.75      0.75      9769
weighted avg       0.82      0.82      0.82      9769



I print a classification report for both the training and test sets of the Decision tree model to evaluate its performance and detect potential overfitting.

The training set accuracy is 1.00, indicating that the model perfectly classified 100% of the training data. The precision, recall, and F1-score values for both classes (50k and below and over 50K) are also 1.00, meaning the model memorized the training data entirely. 

On the test set, the accuracy drops to 0.82, meaning the model correctly classified 82% of the test data. When we examine the class-wise performance we see that income class of 50k and below, the precision, recall, and F1-score values are 0.88, indicating consistent performance in correctly identifying this majority class. For the income class of over 50k, the precision is 0.62, meaning the model is correct in 62% of its predictions. The recall is 0.61, showing that it correctly identified 61% of the actual over 50k. These relatively low precision and recall values for the minority class resulted in an F1-score of 0.62, significantly lower than the F1-score for the 50k and below (0.88).

The difference between the training accuracy (100%) and test accuracy (82%) highlights that the model is overfitting the training data. 

In [74]:
dt_model = DecisionTreeClassifier(
    max_depth=5,              
    min_samples_split=15,     
    min_samples_leaf=5,       
    random_state=45
)
# some hyperparameter trials

dt_model.fit(X_train, y_train) 
# train again

y_train_pred_opt = dt_model.predict(X_train)
y_test_pred_opt = dt_model.predict(X_test)
# and performances


print("Optimized training performance (Decision Tree):\n", classification_report(y_train, y_train_pred_opt))
print("Optimized training performance (Decision Tree):\n", classification_report(y_test, y_test_pred_opt))

Optimized training performance (Decision Tree):
               precision    recall  f1-score   support

           0       0.86      0.96      0.91     17305
           1       0.79      0.51      0.62      5487

    accuracy                           0.85     22792
   macro avg       0.82      0.73      0.76     22792
weighted avg       0.84      0.85      0.84     22792

Optimized training performance (Decision Tree):
               precision    recall  f1-score   support

           0       0.85      0.96      0.90      7415
           1       0.78      0.49      0.60      2354

    accuracy                           0.84      9769
   macro avg       0.82      0.72      0.75      9769
weighted avg       0.84      0.84      0.83      9769



After applying hyperparameter tuning to the Decision tree model, I checked the result if the tuning worked. The optimized model shows clear improvements in terms of generalization and reduced overfitting.

The training set accuracy of the optimized model is 0.85, compared to the previous overfitted model's perfect score of 1.00. This indicates that the model no longer memorizes the training data but instead learns more general patterns. For the income class 50k and below, the model achieves a precision of 0.86, a recall of 0.96, and an F1 score of 0.91, showing strong and consistent performance. However, for the income class of over 50k, the model’s performance drops, with a precision of 0.79, a recall of 0.51, and an F1 score of 0.62. 

On the test set, the optimized model achieves an accuracy of 0.84, only slightly lower than its training accuracy, indicating that the model generalizes well. For the 50k and below class, the precision (0.85), recall (0.96), and F1 score (0.90) values remain high, confirming the model’s ability to consistently classify the majority class. For the over 50k class, the precision is 0.78, the recall is 0.49, and the F1 score is 0.60, which aligns closely with the training results and confirms that overfitting has been reduced.

#####  Random Forest

In [4]:
rf_model = RandomForestClassifier(random_state=45, n_estimators=100)  
# set the model with 100 trees
rf_model.fit(X_train, y_train)  
#train the model

y_train_pred_forest = rf_model.predict(X_train)
y_test_pred_forest = rf_model.predict(X_test)
# and predictions

print("Training performance (Random Forest):\n", classification_report(y_train, y_train_pred_forest))
print("Test performance (Random Forest):\n", classification_report(y_test, y_test_pred_forest))
# now performances

Training performance (Random Forest):
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     17305
           1       1.00      1.00      1.00      5487

    accuracy                           1.00     22792
   macro avg       1.00      1.00      1.00     22792
weighted avg       1.00      1.00      1.00     22792

Test performance (Random Forest):
               precision    recall  f1-score   support

           0       0.89      0.93      0.91      7415
           1       0.74      0.62      0.68      2354

    accuracy                           0.86      9769
   macro avg       0.81      0.78      0.79      9769
weighted avg       0.85      0.86      0.85      9769



The Random forest model achieves again perfect performance on the training set, with an accuracy of 1.00 and precision, recall, and F1 scores of 1.00 for both classes. On the test set, the accuracy drops to 0.86, showing that the model generalizes better than the unoptimized Decision tree. For the income class of 50k and below, the model performs well, with an F1 score of 0.91. However, for the over 50k class, the precision is 0.74, recall is 0.62, and F1 score is 0.68, reflecting a moderate imbalance in the model’s performance across classes. The gap between training and test performance shows overfitting.

In [7]:
param_grid = {
    'n_estimators': [50, 100, 200],         
    'max_depth': [0, 10, 20, 30],        
    'min_samples_split': [2, 5, 10],        
    'min_samples_leaf': [1, 2, 4]            
}
#set parameters

grid = GridSearchCV(RandomForestClassifier(random_state=45), param_grid, cv=5, n_jobs=-1)
grid.fit(X_train, y_train)
# model optimization with gridsearchCV 

print("Which one is better:", grid.best_params_)
# find better parameters 

optimized_rf_model = grid.best_estimator_  # get the best one
y_train_pred_opt_2 = optimized_rf_model.predict(X_train)
y_test_pred_opt_2 = optimized_rf_model.predict(X_test)
#testing the model after optimization

print("Optimized training performance (Random Forest):\n", classification_report(y_train, y_train_pred_opt_2))
print("Optimized training performance (Random Forest):\n", classification_report(y_test, y_test_pred_opt_2))

135 fits failed out of a total of 540.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
135 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\belki\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 895, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\belki\anaconda3\Lib\site-packages\sklearn\base.py", line 1467, in wrapper
    estimator._validate_params()
  File "C:\Users\belki\anaconda3\Lib\site-packages\sklearn\base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "C:\Users\belki\anaconda3\Lib\site-packages\sklearn\utils\_param_validation.py", line 95, in validate_parameter_constraints
    raise InvalidParame

Which one is better: {'max_depth': 30, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 50}
Optimized training performance (Random Forest):
               precision    recall  f1-score   support

           0       0.93      0.98      0.95     17305
           1       0.91      0.75      0.82      5487

    accuracy                           0.92     22792
   macro avg       0.92      0.86      0.89     22792
weighted avg       0.92      0.92      0.92     22792

Optimized training performance (Random Forest):
               precision    recall  f1-score   support

           0       0.88      0.95      0.91      7415
           1       0.78      0.60      0.68      2354

    accuracy                           0.86      9769
   macro avg       0.83      0.77      0.79      9769
weighted avg       0.86      0.86      0.86      9769



After optimization, the Random Forest model shows improved generalization, with the training accuracy dropping to 0.92 (from 1.00) and the test accuracy remaining at 0.86, indicating reduced overfitting. The performance for the over 50k class has slightly improved, with an F1 score of 0.68, and class balance is better reflected. The optimized hyperparameters (max_depth:30, n_estimators:50) effectively balance the model's complexity, making it more generalizable.

##### Conclusion

Among the three models, Random forest performs the best overall due to its higher test accuracy (0.86) and balanced performance across classes. While Logistic regression showed no overfitting, its recall for the  over 50k class was lower (0.59) compared to Random Forest (0.62). The Decision tree model overfit heavily and had worse generalization compared to Random forest. When I evaluate all the results together, Random forest provided better learning and prediction for the data.