In [1]:
import pandas as pd

In [28]:
train_data = pd.read_csv("train.csv")

In [29]:
train_data

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39995,53,technician,married,tertiary,no,395,no,no,cellular,3,jun,107,1,-1,0,unknown,no
39996,30,management,single,tertiary,no,3340,no,no,cellular,3,jun,238,3,-1,0,unknown,yes
39997,54,admin.,divorced,secondary,no,200,no,no,cellular,3,jun,170,1,-1,0,unknown,yes
39998,34,management,married,tertiary,no,1047,no,no,cellular,3,jun,342,1,90,7,other,no


In [30]:
train_data.isnull().sum()

age          0
job          0
marital      0
education    0
default      0
balance      0
housing      0
loan         0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
y            0
dtype: int64

In [31]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40000 entries, 0 to 39999
Data columns (total 17 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   age        40000 non-null  int64 
 1   job        40000 non-null  object
 2   marital    40000 non-null  object
 3   education  40000 non-null  object
 4   default    40000 non-null  object
 5   balance    40000 non-null  int64 
 6   housing    40000 non-null  object
 7   loan       40000 non-null  object
 8   contact    40000 non-null  object
 9   day        40000 non-null  int64 
 10  month      40000 non-null  object
 11  duration   40000 non-null  int64 
 12  campaign   40000 non-null  int64 
 13  pdays      40000 non-null  int64 
 14  previous   40000 non-null  int64 
 15  poutcome   40000 non-null  object
 16  y          40000 non-null  object
dtypes: int64(7), object(10)
memory usage: 5.2+ MB


In [32]:
test_data = pd.read_csv("test.csv")

In [33]:
test_data

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome
0,57,retired,married,secondary,no,718,no,no,cellular,3,jun,170,1,-1,0,unknown
1,50,management,married,secondary,no,268,no,no,cellular,3,jun,74,2,90,2,failure
2,46,technician,single,secondary,no,4367,yes,no,cellular,3,jun,65,1,96,1,failure
3,56,admin.,married,secondary,no,3554,no,no,cellular,3,jun,151,1,-1,0,unknown
4,32,services,single,tertiary,no,785,no,no,cellular,3,jun,273,1,-1,0,unknown
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5206,51,technician,married,tertiary,no,825,no,no,cellular,17,nov,977,3,-1,0,unknown
5207,71,retired,divorced,primary,no,1729,no,no,cellular,17,nov,456,2,-1,0,unknown
5208,72,retired,married,secondary,no,5715,no,no,cellular,17,nov,1127,5,184,3,success
5209,57,blue-collar,married,secondary,no,668,no,no,telephone,17,nov,508,4,-1,0,unknown


In [34]:
test_data["job"].value_counts().unique

<bound method Series.unique of job
management       1292
retired           827
technician        745
admin.            688
student           414
blue-collar       349
services          244
unemployed        199
self-employed     165
housemaid         153
entrepreneur       82
unknown            53
Name: count, dtype: int64>

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Data Preprocessing and feature enginnering

In [None]:
# 1. Encode categorical variables using LabelEncoder
categorical_columns = train_data.select_dtypes(include=['object']).columns
label_encoders = {}
for col in categorical_columns:
    le = LabelEncoder()
    train_data[col] = le.fit_transform(train_data[col])
    label_encoders[col] = le

# 2. Separate features (X) and target variable (y)
X = train_data.drop(columns=['y'])
y = train_data['y']

# 3. Split data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# 4. Normalize numeric features
scaler = StandardScaler()
numeric_columns = X.select_dtypes(include=['float64', 'int64']).columns
X_train[numeric_columns] = scaler.fit_transform(X_train[numeric_columns])
X_val[numeric_columns] = scaler.transform(X_val[numeric_columns])

# Preprocessing is complete
X_train.shape, X_val.shape, y_train.shape, y_val.shape

In [35]:
# Perform one-hot encoding for categorical variables
train_encoded = pd.get_dummies(train_data, columns=['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'poutcome'], drop_first=True)

# Ensure the same columns exist in the test dataset
test_encoded = pd.get_dummies(test_data, columns=['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'poutcome'], drop_first=True)

# Align the columns in train and test data to ensure consistency
train_encoded, test_encoded = train_encoded.align(test_encoded, join='inner', axis=1)

# Adding back the target variable 'y' to train_encoded
train_encoded['y'] = train_data['y'].apply(lambda x: 1 if x == 'yes' else 0)

# Check the processed data
train_encoded_overview = train_encoded.head()
train_encoded_overview


Unnamed: 0,age,balance,day,duration,campaign,pdays,previous,job_blue-collar,job_entrepreneur,job_housemaid,...,month_jul,month_jun,month_mar,month_may,month_nov,month_oct,poutcome_other,poutcome_success,poutcome_unknown,y
0,58,2143,5,261,1,-1,0,False,False,False,...,False,False,False,True,False,False,False,False,True,0
1,44,29,5,151,1,-1,0,False,False,False,...,False,False,False,True,False,False,False,False,True,0
2,33,2,5,76,1,-1,0,False,True,False,...,False,False,False,True,False,False,False,False,True,0
3,47,1506,5,92,1,-1,0,True,False,False,...,False,False,False,True,False,False,False,False,True,0
4,33,1,5,198,1,-1,0,False,False,False,...,False,False,False,True,False,False,False,False,True,0


In [82]:
print(train_encoded.dtypes)

age                    int64
balance                int64
day                    int64
duration               int64
campaign               int64
pdays                  int64
previous               int64
job_blue-collar         bool
job_entrepreneur        bool
job_housemaid           bool
job_management          bool
job_retired             bool
job_self-employed       bool
job_services            bool
job_student             bool
job_technician          bool
job_unemployed          bool
job_unknown             bool
marital_married         bool
marital_single          bool
education_secondary     bool
education_tertiary      bool
education_unknown       bool
default_yes             bool
housing_yes             bool
loan_yes                bool
contact_telephone       bool
contact_unknown         bool
month_aug               bool
month_dec               bool
month_feb               bool
month_jan               bool
month_jul               bool
month_jun               bool
month_mar     

In [83]:
print(test_encoded.dtypes)

age                    int64
balance                int64
day                    int64
duration               int64
campaign               int64
pdays                  int64
previous               int64
job_blue-collar         bool
job_entrepreneur        bool
job_housemaid           bool
job_management          bool
job_retired             bool
job_self-employed       bool
job_services            bool
job_student             bool
job_technician          bool
job_unemployed          bool
job_unknown             bool
marital_married         bool
marital_single          bool
education_secondary     bool
education_tertiary      bool
education_unknown       bool
default_yes             bool
housing_yes             bool
loan_yes                bool
contact_telephone       bool
contact_unknown         bool
month_aug               bool
month_dec               bool
month_feb               bool
month_jan               bool
month_jul               bool
month_jun               bool
month_mar     

In [69]:
from sklearn.metrics import classification_report


In [84]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix

# Split the data into features (X) and target (y)
X = train_encoded.drop(columns=['y'])
y = train_encoded['y']

# Split into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train a Logistic Regression model
logreg = LogisticRegression(max_iter=1000)
logreg.fit(X_train, y_train)

# Predict on validation set
y_pred = logreg.predict(X_val)

# Calculate accuracy and F1 score
accuracy = accuracy_score(y_val, y_pred)
f1 = f1_score(y_val, y_pred)
conf_matrix = confusion_matrix(y_val, y_pred)

accuracy, f1, conf_matrix

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


(0.93425,
 0.34577114427860695,
 array([[7335,   89],
        [ 437,  139]], dtype=int64))

In [85]:
print("\nClassification Report (Balanced Model):\n", class_report_balanced)


Classification Report (Balanced Model):
               precision    recall  f1-score   support

           0       0.94      0.99      0.97      7424
           1       0.63      0.23      0.34       576

    accuracy                           0.93      8000
   macro avg       0.79      0.61      0.65      8000
weighted avg       0.92      0.93      0.92      8000



In [74]:
# Import necessary libraries
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Initialize the Random Forest model
rf_model = RandomForestClassifier(random_state=42, n_jobs=-1)

# Train the model using the training set
rf_model.fit(X_train, y_train)

# Predict the target variable on the validation set
y_pred = rf_model.predict(X_val)

# Evaluate model performance
accuracy = accuracy_score(y_val, y_pred)
conf_matrix = confusion_matrix(y_val, y_pred)
class_report = classification_report(y_val, y_pred)

# Output the results
print("Accuracy:", accuracy)
print("\nConfusion Matrix:\n", conf_matrix)
print("\nClassification Report:\n", class_report)


Accuracy: 0.93975

Confusion Matrix:
 [[7313  108]
 [ 374  205]]

Classification Report:
               precision    recall  f1-score   support

           0       0.95      0.99      0.97      7421
           1       0.65      0.35      0.46       579

    accuracy                           0.94      8000
   macro avg       0.80      0.67      0.71      8000
weighted avg       0.93      0.94      0.93      8000



In [76]:
pip install imbalanced-learn


Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [81]:
from sklearn.model_selection import RandomizedSearchCV

# Hyperparameter grid
param_grid_rf = {
    'n_estimators': [200, 300, 400],
    'max_depth': [10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'class_weight': ['balanced', 'balanced_subsample']
}

# RandomizedSearchCV with pipeline
grid_rf = RandomizedSearchCV(estimator=RandomForestClassifier(random_state=42),
                             param_distributions=param_grid_rf,
                             scoring='f1',
                             cv=3,
                             n_jobs=-1,
                             verbose=2)

# Fit and evaluate
pipeline_tuned = Pipeline([('smote', smote), ('grid_rf', grid_rf)])
pipeline_tuned.fit(X_train, y_train)

y_pred_tuned = pipeline_tuned.predict(X_val)
f1_tuned = f1_score(y_val, y_pred_tuned)

print("F1 Score after tuning:", f1_tuned)


Fitting 3 folds for each of 10 candidates, totalling 30 fits
F1 Score after tuning: 0.537714712471994


## Hyperparameter Tuning

In [40]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

# Define the parameter grid for tuning
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'class_weight': [None, 'balanced']
}

# Initialize GridSearchCV with RandomForestClassifier
grid_search = GridSearchCV(estimator=RandomForestClassifier(random_state=42),
                           param_grid=param_grid,
                           cv=3,  # 3-fold cross-validation
                           scoring='f1',  # We want to optimize for F1 score
                           n_jobs=-1,  # Use all available cores for faster processing
                           verbose=2)

# Fit grid search on the training data
grid_search.fit(X_train, y_train)

# Best parameters from grid search
best_params = grid_search.best_params_

# Best model after hyperparameter tuning
best_rf_model = grid_search.best_estimator_

print(f"Best Hyperparameters: {best_params}")


Fitting 3 folds for each of 216 candidates, totalling 648 fits
Best Hyperparameters: {'class_weight': 'balanced', 'max_depth': 20, 'min_samples_leaf': 2, 'min_samples_split': 10, 'n_estimators': 300}


In [41]:
# Train the Random Forest model with the best hyperparameters
best_rf_model = RandomForestClassifier(
    class_weight='balanced',
    max_depth=20,
    min_samples_leaf=2,
    min_samples_split=10,
    n_estimators=300,
    random_state=42
)

# Fit the model to the training data
best_rf_model.fit(X_train, y_train)

# Make predictions on the validation set (X_val)
y_pred_best = best_rf_model.predict(X_val)

# Evaluate the model
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix

# Accuracy
accuracy_best = accuracy_score(y_val, y_pred_best)
# F1 Score
f1_score_best = f1_score(y_val, y_pred_best)
# Confusion Matrix
conf_matrix_best = confusion_matrix(y_val, y_pred_best)

accuracy_best, f1_score_best, conf_matrix_best


(0.922,
 0.5800807537012113,
 array([[6945,  479],
        [ 145,  431]], dtype=int64))

In [50]:
from imblearn.over_sampling import ADASYN

# Apply ADASYN for oversampling
adasyn = ADASYN(random_state=42)
X_train_adasyn, y_train_adasyn = adasyn.fit_resample(X_train, y_train)

# Train the Random Forest on the resampled data
best_rf_model.fit(X_train_adasyn, y_train_adasyn)
y_pred_best = best_rf_model.predict(X_val)

# Evaluate the F1 score
f1_score_best = f1_score(y_val, y_pred_best)
print("F1 score with ADASYN and RandomForest:", f1_score_best)


F1 score with ADASYN and RandomForest: 0.5083487940630798


In [79]:
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline

# Initialize SMOTE and Random Forest model
smote = SMOTE(random_state=42)
rf_model_smote = RandomForestClassifier(
    n_estimators=300,
    max_depth=20,
    min_samples_split=10,
    min_samples_leaf=2,
    random_state=42,
    n_jobs=-1
)

# Create a pipeline with SMOTE and Random Forest model
pipeline = Pipeline([('smote', smote), ('rf', rf_model_smote)])

# Train the model using the training set
pipeline.fit(X_train, y_train)

# Predict the target variable on the validation set
y_pred_smote = pipeline.predict(X_val)

# Evaluate model performance
accuracy_smote = accuracy_score(y_val, y_pred_smote)
conf_matrix_smote = confusion_matrix(y_val, y_pred_smote)
class_report_smote = classification_report(y_val, y_pred_smote)

# Output the results
print("Accuracy (SMOTE Model):", accuracy_smote)
print("\nConfusion Matrix (SMOTE Model):\n", conf_matrix_smote)
print("\nClassification Report (SMOTE Model):\n", class_report_smote)


Accuracy (SMOTE Model): 0.9145

Confusion Matrix (SMOTE Model):
 [[6915  506]
 [ 178  401]]

Classification Report (SMOTE Model):
               precision    recall  f1-score   support

           0       0.97      0.93      0.95      7421
           1       0.44      0.69      0.54       579

    accuracy                           0.91      8000
   macro avg       0.71      0.81      0.75      8000
weighted avg       0.94      0.91      0.92      8000



In [51]:
from imblearn.combine import SMOTETomek

# Apply SMOTE + Tomek Links
smote_tomek = SMOTETomek(random_state=42)
X_train_resampled, y_train_resampled = smote_tomek.fit_resample(X_train, y_train)

# Train the Random Forest model on the resampled data
best_rf_model.fit(X_train_resampled, y_train_resampled)
y_pred_resampled = best_rf_model.predict(X_val)

# Evaluate the F1 score with SMOTE + Tomek Links
f1_score_resampled = f1_score(y_val, y_pred_resampled)
print("F1 score with SMOTE + Tomek:", f1_score_resampled)


F1 score with SMOTE + Tomek: 0.5434027777777778


# amongst all randomforest classifier performs better in training and testing data , so we are using it for the prediction

## Prediction

In [65]:
# Encode the test dataset (same steps as for the training data)
test_encoded = pd.get_dummies(test_data, columns=['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'poutcome'], drop_first=True)

# Align test data to match the training data's columns (ensure consistency)
test_encoded = test_encoded.reindex(columns=X.columns, fill_value=0)


In [66]:
# Make predictions on the test dataset
test_predictions = best_rf_model.predict(test_encoded)

# Convert predictions to a DataFrame for easy submission
test_predictions_df = pd.DataFrame(test_predictions, columns=['y'])
test_predictions_df['y'] = test_predictions_df['y'].apply(lambda x: 'yes' if x == 1 else 'no')


In [67]:

#  test predictions are in a DataFrame named `test_data`
# Replace 'test_data' with your test DataFrame and 'predicted_y' with your predictions.

# Add the predicted 'y' column to the test DataFrame
test_data['y'] = test_predictions_df['y']  # Replace test_predictions with your prediction  DataFrame column

# Save the DataFrame to CSV in the required format
key_code = "5995"  # Replace this with your actual Key Code
output_path = f"C:/Users/krush/OneDrive/Desktop/New folder (3)/{key_code}.CSV"

test_data.to_csv(output_path, index=False)

print(f"Prediction file saved successfully as {output_path}")


Prediction file saved successfully as C:/Users/krush/OneDrive/Desktop/New folder (3)/5995.CSV
