In [1]:
# Import necessary libraries
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer

In [2]:
# Load labeled and unlabeled data
labeled_data = pd.read_csv("/Users/yevrud/redi_school_ML_AI/final_project/data/train.csv")
unlabeled_data = pd.read_csv("/Users/yevrud/redi_school_ML_AI/final_project/data/test.csv")

# Create copies of the datasets
df_train = labeled_data.copy()
df_test = unlabeled_data.copy()


In [3]:
# Drop confidential columns
confidential_cols = ['name', 'email', 'phone-number', 'credit_card', 'id']
df_train.drop(columns=confidential_cols, inplace=True)
df_test.drop(columns=confidential_cols, inplace=True)

In [4]:
# Separate target and features
X = df_train.drop(columns=['is_canceled'])
y = df_train['is_canceled']


In [5]:
# Identify feature types
numerical_features = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_features = X.select_dtypes(include=['object']).columns.tolist()

In [6]:
# Fill missing values
num_imputer = SimpleImputer(strategy='median')
X[numerical_features] = num_imputer.fit_transform(X[numerical_features])

cat_imputer = SimpleImputer(strategy='most_frequent')
X[categorical_features] = cat_imputer.fit_transform(X[categorical_features])


In [7]:
# Apply one-hot encoding to categorical features
X = pd.get_dummies(X, columns=categorical_features, drop_first=True)

In [8]:
# Split the data
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, stratify=y, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=42)

In [9]:
# Train logistic regression model
logreg = LogisticRegression(max_iter=2000, random_state=42)
logreg.fit(X_train, y_train)

# Predict on test set and evaluate
y_pred = logreg.predict(X_test)
f1 = f1_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

print(f"F1 Score: {f1:.4f}")
print("Confusion Matrix:")
print(conf_matrix)

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


F1 Score: 0.8740
Confusion Matrix:
[[9451  625]
 [1001 5638]]


**Predict on unlabeled data and save predictions**

In [10]:
# Save the IDs before dropping columns
ids = unlabeled_data['id']

# Fill missing values in test data
df_test[numerical_features] = num_imputer.transform(df_test[numerical_features])
df_test[categorical_features] = cat_imputer.transform(df_test[categorical_features])

# Apply same one-hot encoding
df_test = pd.get_dummies(df_test, columns=categorical_features, drop_first=True)

# Align columns with training data
df_test = df_test.reindex(columns=X.columns, fill_value=0)

# Predict
y_unlabeled_pred = logreg.predict(df_test)

# Save to CSV
output = pd.DataFrame({
    "id": ids,
    "is_canceled": y_unlabeled_pred
})
output.to_csv("simple_unlabeled_predictions.csv", index=False)
print("Predictions saved to 'simple_unlabeled_predictions.csv'")

Predictions saved to 'simple_unlabeled_predictions.csv'
