In [1]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression


In [2]:
# Load train and test data
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

# Drop the unnecessary columns
train_df.drop(['Booking_ID', 'type_of_meal_plan'], axis=1, inplace=True)
test_df.drop(['Booking_ID', 'type_of_meal_plan'], axis=1, inplace=True)

# Encode categorical variables using one-hot encoding
train_df = pd.get_dummies(train_df)
test_df = pd.get_dummies(test_df)

# Separate features and target variable
X_train = train_df.drop('booking_status_Canceled', axis=1)
y_train = train_df['booking_status_Canceled']
X_test = test_df.drop('booking_status_Canceled', axis=1)
y_test = test_df['booking_status_Canceled']

# Create a logistic regression object
logreg = LogisticRegression()

# Fit the model to the training data
logreg.fit(X_train, y_train)

# Predict on the test data
y_pred = logreg.predict(X_test)

# Calculate accuracy on test data
acc_test = accuracy_score(y_test, y_pred)
print("Test Accuracy:", acc_test)

# Calculate accuracy on training data
y_pred_train = logreg.predict(X_train)
acc_train = accuracy_score(y_train, y_pred_train)
print("Training Accuracy:", acc_train)


Test Accuracy: 0.9965541006202618
Training Accuracy: 0.9979669193659545


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [3]:
#save trained logistic regression model using pickle
import pickle

# Save the trained logistic regression model
with open("logreg_model.pkl", "wb") as f:
    pickle.dump(logreg, f)

In [4]:
column_names = list(train_df)
print(column_names)

['no_of_adults', 'no_of_children', 'no_of_weekend_nights', 'no_of_week_nights', 'required_car_parking_space', 'lead_time', 'arrival_year', 'arrival_month', 'arrival_date', 'repeated_guest', 'no_of_previous_cancellations', 'no_of_previous_bookings_not_canceled', 'avg_price_per_room', 'no_of_special_requests', 'room_type_reserved_Room_Type 1', 'room_type_reserved_Room_Type 2', 'room_type_reserved_Room_Type 3', 'room_type_reserved_Room_Type 4', 'room_type_reserved_Room_Type 5', 'room_type_reserved_Room_Type 6', 'room_type_reserved_Room_Type 7', 'market_segment_type_Aviation', 'market_segment_type_Complementary', 'market_segment_type_Corporate', 'market_segment_type_Offline', 'market_segment_type_Online', 'booking_status_Canceled', 'booking_status_Not_Canceled']


In [5]:
# Load the trained logistic regression model
logreg = pickle.load(open("logreg_model.pkl", "rb"))

# Use the model to predict the "booking_status_Canceled" column for the test dataset
predictions = logreg.predict(X_test)

#convert into categorical values
predictions = ['Canceled' if p == 1 else 'Not_Canceled' for p in predictions]

# Save the predictions to a CSV file
output_df = pd.DataFrame({'booking_status_Canceled': predictions})
output_df.to_csv("predictions.csv", index=False, header=False)
