In [2]:
import numpy as np 
import pandas as pd 

In [4]:
# Lift pandas' column display cap so wide frames are shown in full
pd.set_option('display.max_columns', None)

# Read the training and test tables from the working directory
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

In [6]:
from sklearn.impute import SimpleImputer

# Split the columns by dtype so each group gets its own fill strategy:
# object/category columns vs. int64/float64 columns.
categorical_columns = df_train.select_dtypes(include=['object', 'category']).columns
numerical_columns = df_train.select_dtypes(include=['int64', 'float64']).columns

# Missing categorical values are filled with the column's most frequent value,
# missing numeric values with the column mean.
categorical_imputer = SimpleImputer(strategy='most_frequent')
numerical_imputer = SimpleImputer(strategy='mean')

# NOTE(review): both imputers are fitted on all of df_train, i.e. before the
# later train/validation split, so the fill values are computed with the
# validation rows included.
# NOTE(review): if 'target' has one of the selected dtypes it is imputed here
# together with the features -- confirm that is intended.

# SimpleImputer rejects an input with zero columns, so a dtype group that is
# empty for this dataset is skipped instead of crashing the cell.
if len(categorical_columns) > 0:
    df_train[categorical_columns] = categorical_imputer.fit_transform(df_train[categorical_columns])
if len(numerical_columns) > 0:
    df_train[numerical_columns] = numerical_imputer.fit_transform(df_train[numerical_columns])


In [8]:
from sklearn.preprocessing import LabelEncoder

# One LabelEncoder per categorical column, kept in a dict keyed by column name
# so each column's value -> integer-code mapping stays available afterwards.
label_encoders = {name: LabelEncoder() for name in categorical_columns}

# Fit each encoder on its own column and overwrite the column with the codes
for col, encoder in label_encoders.items():
    df_train[col] = encoder.fit_transform(df_train[col])


In [11]:
from sklearn.model_selection import train_test_split

# The 'target' column is the label; every other column is a model input
y = df_train['target']
X = df_train.drop(columns='target')

# Hold out 20% of the rows for validation; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)


In [12]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import accuracy_score


In [13]:
# Model inputs (all columns except the label) and the label column itself
X, y = df_train.drop(columns=['target']), df_train['target']

# 80/20 train/validation split with a fixed seed for repeatability
split_options = {'test_size': 0.2, 'random_state': 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)


In [14]:
# Decision tree with a pinned seed so repeated fits build the same tree
tree_seed = 42
dt_model = DecisionTreeClassifier(random_state=tree_seed)


In [15]:
# Candidate values for the search: tree depth cap, minimum samples needed to
# split a node, and minimum samples required in a leaf
param_grid = dict(
    max_depth=[20, 30],
    min_samples_split=[2, 5],
    min_samples_leaf=[1, 2],
)


In [16]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

# Seeded decision tree used as the base estimator for the search
dt_model = DecisionTreeClassifier(random_state=42)

# 2 x 2 x 2 = 8 hyperparameter combinations to evaluate
param_grid = dict(
    max_depth=[20, 30],
    min_samples_split=[2, 5],
    min_samples_leaf=[1, 2],
)

# Exhaustive search scored by accuracy with 3-fold cross-validation;
# verbose=1 prints a short progress summary
search_options = dict(scoring='accuracy', cv=3, verbose=1)
grid_search = GridSearchCV(dt_model, param_grid, **search_options)

# Run the search on the training split only
grid_search.fit(X_train, y_train)

# Report the depth chosen by the best-scoring combination
best_params = grid_search.best_params_
print("Best max_depth:", best_params['max_depth'])


Fitting 3 folds for each of 8 candidates, totalling 24 fits
Best max_depth: 20


In [17]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

# The decision-tree grid search fitted in an earlier cell uses a fixed
# random_state, the same parameter grid and the same training split, so
# rebuilding and refitting it here would only repeat the same 24 fits and
# reach the same result. Read the tuned value from the already-fitted
# `grid_search` instead of running the search again.
best_params = grid_search.best_params_
print("Best min_samples_split:", best_params['min_samples_split'])


Fitting 3 folds for each of 8 candidates, totalling 24 fits
Best min_samples_split: 2


In [18]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

# The decision-tree grid search fitted in an earlier cell uses a fixed
# random_state, the same parameter grid and the same training split, so
# rebuilding and refitting it here would only repeat the same 24 fits and
# reach the same result. Read the tuned value from the already-fitted
# `grid_search` instead of running the search again.
best_params = grid_search.best_params_
print("Best min_samples_leaf:", best_params['min_samples_leaf'])


Fitting 3 folds for each of 8 candidates, totalling 24 fits
Best min_samples_leaf: 2


In [19]:
from sklearn.metrics import accuracy_score

# Best-scoring estimator exposed by the fitted search
best_model = grid_search.best_estimator_

# Score the held-out split: predicted labels vs. true labels
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

# Show the accuracy to two decimal places
rounded_accuracy = round(accuracy, 2)
print("Validation Set Accuracy (rounded to 2 decimals):", rounded_accuracy)


Validation Set Accuracy (rounded to 2 decimals): 0.57


In [20]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score


In [21]:
# AdaBoost ensemble with a pinned seed so repeated fits are reproducible
boost_seed = 42
adaboost_model = AdaBoostClassifier(random_state=boost_seed)


In [22]:
# Search space for AdaBoost: ensemble size, learning rate and boosting algorithm.
# NOTE(review): learning rates of 5 and 10 are far above the values usually
# tried for AdaBoost -- confirm these are the intended candidates.
# NOTE(review): the `algorithm` option appears to be deprecated in recent
# scikit-learn releases -- check against the installed version.
param_grid = dict(
    n_estimators=[10, 20, 30],
    learning_rate=[5, 10],
    algorithm=['SAMME'],
)


In [25]:
# Accuracy-scored, 3-fold cross-validated search over the AdaBoost grid;
# verbose=1 prints progress updates
search_options = dict(scoring='accuracy', cv=3, verbose=1)
grid_search = GridSearchCV(adaboost_model, param_grid, **search_options)

# Fit on the training split; left as the cell's final expression so the
# fitted search object is still displayed as the cell output
grid_search.fit(X_train, y_train)


Fitting 3 folds for each of 6 candidates, totalling 18 fits




In [26]:
# Pull the best-scoring estimator and its hyperparameters from the fitted search
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_

print("Best Hyperparameters:", best_params)


Best Hyperparameters: {'algorithm': 'SAMME', 'learning_rate': 5, 'n_estimators': 10}


In [27]:
# Labels predicted by the selected model for the held-out rows
y_pred = best_model.predict(X_test)

# Fraction of held-out rows whose predicted label matches the true label
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

# Report the score to two decimal places
print(
    "Validation Set Accuracy (rounded to 2 decimals):",
    round(accuracy, 2),
)


Validation Set Accuracy (rounded to 2 decimals): 0.43


In [28]:
# Winning hyperparameter combination from the fitted search
best_params = grid_search.best_params_

# Show the selected number of estimators
best_n_estimators = best_params['n_estimators']
print("Best n_estimators:", best_n_estimators)


Best n_estimators: 10


In [29]:
# Winning hyperparameter combination from the fitted search
best_params = grid_search.best_params_

# Show the selected learning rate
best_learning_rate = best_params['learning_rate']
print("Best learning_rate:", best_learning_rate)


Best learning_rate: 5


In [30]:
from sklearn.metrics import accuracy_score

# Best-scoring estimator exposed by the fitted search
best_model = grid_search.best_estimator_

# Predict the held-out rows and compare against their true labels
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

# Report the score to two decimal places
rounded_accuracy = round(accuracy, 2)
print("Validation Set Accuracy (rounded to 2 decimals):", rounded_accuracy)


Validation Set Accuracy (rounded to 2 decimals): 0.43
