In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_recall_fscore_support, confusion_matrix,classification_report,ConfusionMatrixDisplay


In [None]:
# Read both CSVs once and keep the raw frames (df0 / test_df0) untouched;
# every later transformation is applied to the working copies.
df0 = pd.read_csv('train.csv')
test_df0 = pd.read_csv('test.csv')

df = df0.copy()
test_df = test_df0.copy()

In [None]:
# A notebook cell only renders its last expression, so print the shape
# explicitly and leave the preview as the final, rich-displayed expression.
print(df.shape)
df.head()

In [None]:
# Count the missing values in each training column.
df.isna().sum()

In [None]:
# Remove the PassengerId and Name columns from the working frame.
df = df.drop(['PassengerId', 'Name'], axis=1)

In [None]:
# Numeric columns: fill missing values with the column mean.
columns_to_impute = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

for col in columns_to_impute:
    mean_value = df[col].mean()
    # Assign the filled Series back to the frame. Calling
    # `df[col].fillna(..., inplace=True)` is chained assignment: it operates on
    # an intermediate object, emits a FutureWarning on recent pandas and leaves
    # `df` unchanged once Copy-on-Write is enabled.
    df[col] = df[col].fillna(mean_value)

In [None]:
# Categorical / boolean columns: fill missing values with the most frequent value.
columns_to_impute = ['HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'VIP']
for col in columns_to_impute:
    mode_value = df[col].mode()[0]  # mode() returns a Series; take its first entry
    # Assign back instead of using `inplace=True` on the selected column, which
    # is chained assignment and does not update `df` under Copy-on-Write.
    df[col] = df[col].fillna(mode_value)

In [None]:
# Preview the first rows after the missing values have been filled.
df.head()

In [None]:
def extract_deck_side(cabin):
    """Build a deck/side code from a cabin string, e.g. ``'B/0/P'`` -> ``'BP'``.

    The cabin is split on ``'/'``; the first segment (deck) is concatenated
    with the first character of the third segment (side).

    Parameters
    ----------
    cabin : object
        Cabin value; anything that is not a ``str`` (``None``, NaN, ...) is
        treated as missing.

    Returns
    -------
    str or None
        The combined code, or ``None`` when the value is missing or does not
        contain a non-empty deck and a non-empty side.
    """
    if not isinstance(cabin, str):
        return None

    parts = cabin.split('/')
    # Require at least three segments AND non-empty deck/side: an empty side
    # segment (e.g. 'B/0/') would otherwise raise IndexError on parts[2][0],
    # and an empty deck would yield a one-character code with no deck in it.
    if len(parts) < 3 or not parts[0] or not parts[2]:
        return None

    return parts[0] + parts[2][0]

# Store the deck/side code computed from each passenger's cabin string.
df['Deck_Side'] = df['Cabin'].map(extract_deck_side)



In [None]:
# The raw cabin string is no longer needed once Deck_Side exists.
df = df.drop('Cabin', axis=1)


In [None]:
# Preview the frame after Cabin has been replaced by Deck_Side.
df.head()

In [None]:
# One-hot encode the categorical columns and the target; drop_first removes
# the first level of each encoded column.
df = pd.get_dummies(
    df,
    columns=['CryoSleep', 'HomePlanet', 'Destination', 'Transported'],
    drop_first=True,
)

In [None]:
# Number of passengers in each deck/side group.
df['Deck_Side'].value_counts()

In [None]:
# Mean of the Transported_True indicator within each deck/side group.
survival_rate = df.groupby('Deck_Side')['Transported_True'].mean()

# Bar chart of the per-group rate, labelled through the returned Axes.
ax = survival_rate.plot.bar(color='skyblue')
ax.set(
    title='Transportation Rate by Deck/Side',
    xlabel='Deck/Side',
    ylabel='Transportation Rate',
)
plt.xticks(rotation=0)  # keep the group labels horizontal

plt.tight_layout()
plt.show()

In [None]:
# The side letter is the second character of the deck/side code.
df['Side'] = df['Deck_Side'].str.get(1)


In [None]:
# Preview the frame with the new Side column.
df.head()

In [None]:
# Only the Side column is kept; remove the combined deck/side code.
df = df.drop('Deck_Side', axis=1)

In [None]:
# One-hot encode Side, dropping its first level.
df = pd.get_dummies(
    df,
    columns=['Side'],
    drop_first=True,
)

In [None]:
# Cast every dummy / indicator column to integer.
indicator_columns = [
    'CryoSleep_True',
    'HomePlanet_Europa',
    'HomePlanet_Mars',
    'Destination_PSO J318.5-22',
    'Destination_TRAPPIST-1e',
    'Transported_True',
    'Side_S',
]
for indicator_column in indicator_columns:
    df[indicator_column] = df[indicator_column].astype(int)

In [None]:
# Store the VIP flag as an integer column.
df = df.assign(VIP=df['VIP'].astype(int))

In [None]:
# Add up the five amenity columns, left to right, into a single feature.
spending_columns = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
total_spending = df[spending_columns[0]]
for spending_column in spending_columns[1:]:
    total_spending = total_spending + df[spending_column]
df['TotalSpending'] = total_spending

In [None]:
# Preview the frame with the new TotalSpending column.
df.head()

In [None]:
# Mean total spending for each value of the target indicator.
average_spending = df.groupby('Transported_True')['TotalSpending'].mean()

# Draw on an explicit Axes so the figure size and labels belong to this plot.
fig, ax = plt.subplots(figsize=(8, 6))
average_spending.plot(kind='bar', color='skyblue', ax=ax)

ax.set_title('Average Total Spending by Transported')
ax.set_xlabel('Transported')
ax.set_ylabel('Average Total Spending')
ax.set_xticks([0, 1])
ax.set_xticklabels(['Not Transported', 'Transported'], rotation=0)

# End with plt.show() so the cell renders the figure instead of echoing the
# return value of the last plotting call.
plt.tight_layout()
plt.show()

In [None]:
# The individual amenity columns are removed; TotalSpending stays.
df = df.drop(
    ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck'],
    axis=1,
)

In [None]:
# Preview the frame after the individual spending columns were dropped.
df.head()

In [None]:
# Mean age for each value of the target indicator. Stored under its own name
# so the spending averages computed earlier are not overwritten by age values.
average_age = df.groupby('Transported_True')['Age'].mean()

fig, ax = plt.subplots(figsize=(8, 6))
average_age.plot(kind='bar', color='skyblue', ax=ax)

ax.set_title('Age by Transported')
ax.set_xlabel('Transported')
ax.set_ylabel('Age')
ax.set_xticks([0, 1])
ax.set_xticklabels(['Not Transported', 'Transported'], rotation=0)

# End with plt.show() so the cell renders the figure instead of echoing the
# return value of the last plotting call.
plt.tight_layout()
plt.show()

In [None]:
target_column = 'Transported_True'

# Feature matrix: every column except the target.
X = df.drop(columns=[target_column])

# Target vector.
y = df[target_column]

In [None]:
# Hold out 20% of the rows, stratified on the target, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [None]:
# Count the missing values in each feature column.
X.isna().sum()

In [None]:
# Seed shared by every stochastic estimator so that Restart & Run All
# reproduces the same search results and the same selected models.
RANDOM_STATE = 42

# Candidate models, keyed by the display name used in the report below.
classifiers = {
    'Naive Bayes': GaussianNB(),
    'Random Forest': RandomForestClassifier(random_state=RANDOM_STATE),
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Decision Tree': DecisionTreeClassifier(random_state=RANDOM_STATE),
    'Gradient Boosting': GradientBoostingClassifier(random_state=RANDOM_STATE)
}

# Hyper-parameter grids, keyed like `classifiers`. An empty dict means
# "evaluate the default configuration only".
param_grids = {
    'Naive Bayes': {},
    'Random Forest': {
        'n_estimators': [100, 200, 300],
        'max_depth': [None, 10, 20],
        'min_samples_split': [2, 5, 10]
    },
    # Two sub-grids: C only matters for the penalised model, so it is not
    # searched for the unpenalised one. `None` replaces the string 'none',
    # which scikit-learn deprecated in 1.2 and rejects from 1.4 onwards
    # (on scikit-learn < 1.2 the string form is the one to use).
    'Logistic Regression': [
        {'penalty': ['l2'], 'C': [0.001, 0.01, 0.1, 1, 10, 100]},
        {'penalty': [None]}
    ],
    'Decision Tree': {
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10]
    },
    'Gradient Boosting': {
        'n_estimators': [100, 200, 300],
        'learning_rate': [0.001, 0.01, 0.1],
        'max_depth': [3, 5, 7]
    }
}

# results[name] = (best estimator refit on X_train, its parameters)
results = {}
for clf_name, clf in classifiers.items():
    # 5-fold cross-validated accuracy on the training split only.
    grid_search = GridSearchCV(clf, param_grids[clf_name], cv=5, scoring='accuracy', n_jobs=-1)
    grid_search.fit(X_train, y_train)
    best_model = grid_search.best_estimator_
    results[clf_name] = (best_model, grid_search.best_params_)

# Report each selected model's accuracy on the held-out split.
for clf_name, (best_model, best_params) in results.items():
    y_pred = best_model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{clf_name} - Best Model Accuracy: {accuracy}, Best Parameters: {best_params}")


In [None]:
# Reuse the Random Forest selected by the grid search. GridSearchCV (default
# refit=True) has already refit this estimator on all of X_train; building a
# fresh, unseeded RandomForestClassifier from the parameters would train a
# different forest than the one whose accuracy was reported by the search.
# NOTE: the `_dt` suffix is historical - these names hold the Random Forest.
best_model_dt, best_params_dt = results['Random Forest']

y_pred_dt = best_model_dt.predict(X_test)
accuracy_dt = accuracy_score(y_test, y_pred_dt)
print("Random Forest - Test Accuracy:", accuracy_dt)

In [None]:
# Confusion matrix and binary precision / recall / F1 for the hold-out predictions.
conf_matrix = confusion_matrix(y_test, y_pred_dt)
precision, recall, f1_score, _ = precision_recall_fscore_support(
    y_test,
    y_pred_dt,
    average='binary',
)

In [None]:
# Scalar metrics first, then the per-class report.
for metric_label, metric_value in (
    ("Precision:", precision),
    ("Recall:", recall),
    ("F1 Score:", f1_score),
):
    print(metric_label, metric_value)
print("Classification Report:")
print(classification_report(y_test, y_pred_dt))

# Render the confusion matrix computed earlier as a heat map.
disp = ConfusionMatrixDisplay(
    confusion_matrix=conf_matrix,
    display_labels=[0, 1],
)
disp.plot(cmap='Blues', values_format='d')
plt.title('Confusion Matrix')
plt.show()

In [None]:
# Count the missing values in each test column.
test_df.isna().sum()

In [None]:
# Remove the PassengerId and Name columns from the working test frame
# (PassengerId remains available in test_df0).
test_df = test_df.drop(['PassengerId', 'Name'], axis=1)

In [None]:
# Fill missing test values with statistics taken from the raw TRAINING data
# (df0), i.e. the same fill values the training frame received. Using the test
# set's own mean/mode would preprocess the two sets differently.

# Numeric columns: training mean.
columns_to_impute = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

for col in columns_to_impute:
    mean_value = df0[col].mean()
    # Assign back rather than `test_df[col].fillna(..., inplace=True)`, which
    # is chained assignment and does not update the frame under Copy-on-Write.
    test_df[col] = test_df[col].fillna(mean_value)


# Categorical / boolean columns: most frequent training value.
columns_to_impute = ['HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'VIP']
for col in columns_to_impute:
    mode_value = df0[col].mode()[0]  # mode() returns a Series; take its first entry
    test_df[col] = test_df[col].fillna(mode_value)

In [None]:
# extract_deck_side is already defined in the training-preprocessing section
# of this notebook; reuse it here instead of redefining an identical copy that
# could drift out of sync.
# Store the deck/side code computed from each test passenger's cabin string.
test_df['Deck_Side'] = test_df['Cabin'].apply(extract_deck_side)


In [None]:
# The raw cabin string is no longer needed once Deck_Side exists.
test_df = test_df.drop('Cabin', axis=1)


In [None]:
# One-hot encode the categorical test columns; drop_first removes the first
# level of each encoded column.
test_df = pd.get_dummies(
    test_df,
    columns=['CryoSleep', 'HomePlanet', 'Destination'],
    drop_first=True,
)

In [None]:
# Preview the test frame after one-hot encoding.
test_df.head()

In [None]:
# The side letter is the second character of the deck/side code.
test_df['Side'] = test_df['Deck_Side'].str.get(1)

In [None]:
# Only the Side column is kept; remove the combined deck/side code.
test_df = test_df.drop('Deck_Side', axis=1)

In [None]:
# One-hot encode Side, dropping its first level.
test_df = pd.get_dummies(
    test_df,
    columns=['Side'],
    drop_first=True,
)

In [None]:
# Cast every dummy / indicator column of the test frame to integer.
test_indicator_columns = [
    'CryoSleep_True',
    'HomePlanet_Europa',
    'HomePlanet_Mars',
    'Destination_PSO J318.5-22',
    'Destination_TRAPPIST-1e',
    'Side_S',
]
for test_indicator_column in test_indicator_columns:
    test_df[test_indicator_column] = test_df[test_indicator_column].astype(int)

In [None]:
# Store the VIP flag as an integer column.
test_df = test_df.assign(VIP=test_df['VIP'].astype(int))

In [None]:
# Add up the five amenity columns, left to right, into a single feature.
test_spending_columns = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
test_total_spending = test_df[test_spending_columns[0]]
for test_spending_column in test_spending_columns[1:]:
    test_total_spending = test_total_spending + test_df[test_spending_column]
test_df['TotalSpending'] = test_total_spending


In [None]:
# The individual amenity columns are removed; TotalSpending stays.
test_df = test_df.drop(
    ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck'],
    axis=1,
)

In [None]:
# Preview the fully preprocessed test frame before prediction.
test_df.head()

In [None]:

# Align the test features with the exact columns (and column order) the model
# was trained on. The train and test frames are one-hot encoded separately, so
# a category missing from one of them would otherwise produce mismatched
# columns; dummy columns absent from the test set are filled with 0.
test_features = test_df.reindex(columns=X_train.columns, fill_value=0)

# Probability of the positive class (column 1) for every test passenger.
transported_probabilities = best_model_dt.predict_proba(test_features)[:, 1]
threshold = 0.5
# The comparison already yields a boolean array, so no further cast is needed.
predictions = transported_probabilities > threshold

# PassengerId comes from the untouched raw test frame, which has the same row
# order as the preprocessed one.
submission_df = pd.DataFrame({
    'PassengerId': test_df0['PassengerId'],
    'Transported': predictions
})
submission_df.to_csv('submission.csv', index=False)