<a href="https://www.kaggle.com/code/arunjangir245/titanic-machine-learning-logistic-regression?scriptVersionId=144194299" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

<div style="text-align: center; background-color: #ccffcc; color: #006600; padding: 20px; border-radius: 5px;">
    <h2 style="margin: 0; font-size: 13px;">Don't forget to upvote if you liked the notebook</h2>
</div>


## Import Packages

In [None]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.impute import KNNImputer

## Loading the data

In [None]:
# Load the Kaggle Titanic training and test splits into DataFrames.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in ("/kaggle/input/titanic/train.csv", "/kaggle/input/titanic/test.csv")
)

In [None]:
# Preview the first rows of the training data.
train_data.head()

In [None]:
# (number of rows, number of columns) of the training data.
train_data.shape

In [None]:
# Column names, dtypes and non-null counts of the raw training data.
train_data.info()

In [None]:
# Number of passengers for each value of the target column 'Survived'.
train_data['Survived'].value_counts()

## Target Variable (Dependent Variable):
The target variable, also known as the dependent variable or objective variable, is the variable that you want to predict or explain. It is the outcome or result that you are interested in understanding or forecasting. 
In this dataset the target is `Survived`: 0 means the passenger did not survive and 1 means the passenger survived.

## Independent Variables:
Independent variables, also known as predictor variables or features, are the variables that are used to explain or predict the target variable. They are the inputs to your prediction model and are assumed to influence or have a relationship with the target variable.
In this dataset these are the other columns, such as `Pclass`, `Sex`, `Age`, and `Fare`.

In [None]:
train_data['Survived'].value_counts().keys()

# value_counts() lists each category (0 and 1) with its count; .keys() returns only the category labels.

Here `value_counts()` lists the categories `0` and `1` together with their counts; to get only the category labels, we use the `.keys()` method.

## Visualization Using Matplotlib

In [None]:
# Bar chart of passenger counts per 'Survived' value; the counts are computed once and reused.
survived_counts = train_data["Survived"].value_counts()

plt.figure(figsize=(5, 5))
plt.bar(
    list(survived_counts.keys()),
    list(survived_counts),
    color=["r", "g"],
)
plt.show()

## Pclass - Ticket class (1st, 2nd and 3rd)

In [None]:
# Number of passengers in each ticket class.
train_data['Pclass'].value_counts()

In [None]:
# Only the ticket-class labels, without their counts.
train_data['Pclass'].value_counts().keys()

In [None]:
# Bar chart of passenger counts per ticket class.
pclass_counts = train_data["Pclass"].value_counts()
class_labels = list(pclass_counts.keys())
class_sizes = list(pclass_counts)

plt.figure(figsize=(5, 5))
plt.bar(class_labels, class_sizes, color=["blue", "orange", "green"])
plt.show()

In [None]:
# Number of passengers for each value of 'Sex'.
train_data['Sex'].value_counts()

In [None]:
# Only the 'Sex' category labels, without their counts.
train_data['Sex'].value_counts().keys()

In [None]:
# Bar chart of passenger counts per 'Sex' value.
sex_counts = train_data["Sex"].value_counts()

plt.figure(figsize=(5, 5))
plt.bar(x=list(sex_counts.keys()), height=list(sex_counts), color=["orange", "green"])
plt.show()

In [None]:
# Histogram of passenger ages in the training data.
age_values = train_data["Age"]

plt.figure(figsize=(5, 7))
plt.title("Distribution of Age")
plt.xlabel("Age")
plt.hist(age_values, color="grey")
plt.show()

## Data Visualization using Seaborn

In [None]:
# Count of passengers per 'Survived' value.
sns.countplot(data=train_data, x="Survived")

In [None]:
# Age distribution on the y-axis, stacked by 'Survived'.
sns.histplot(data=train_data, y="Age", hue="Survived", multiple="stack")

In [None]:
# Age by Sex. General form: sns.barplot(x=<categorical column>, y=<numeric column>, data=<frame>)
sns.barplot(data=train_data, x='Sex', y='Age')

In [None]:

# Age by survival outcome.
sns.barplot(data=train_data, x='Survived', y='Age')

In [None]:
# Individual passenger ages, grouped by ticket class.
sns.stripplot(data=train_data, x='Pclass', y='Age')

In [None]:
# Summary statistics of the training data.
train_data.describe()

## Preprocess the training data

In [None]:
# Remove columns that are not used as features, then encode 'Sex' as integer labels.
unused_columns = ['PassengerId', 'Name', 'Ticket', 'Embarked']
train_data = train_data.drop(unused_columns, axis=1)

sex_encoder = LabelEncoder()
train_data['Sex'] = sex_encoder.fit_transform(train_data['Sex'])

## Preprocess the test data

In [None]:
# Remove the same non-feature columns that were removed from the training data.
test_data = test_data.drop(['PassengerId', 'Name', 'Ticket', 'Embarked'], axis=1)

# Encode 'Sex' with a fixed mapping instead of fitting a second LabelEncoder on the
# test set: a separately fitted encoder derives its codes from whatever labels the
# test set happens to contain, so they are not guaranteed to match the training codes.
# LabelEncoder assigns codes in sorted label order, i.e. female -> 0, male -> 1.
SEX_CODES = {'female': 0, 'male': 1}
test_data['Sex'] = test_data['Sex'].map(SEX_CODES)

# Fail loudly if the test set contains a label the mapping does not know about.
if test_data['Sex'].isna().any():
    raise ValueError("test_data['Sex'] contains values other than 'female'/'male'")
test_data['Sex'] = test_data['Sex'].astype(int)

## Extract the first letter of the 'Cabin' column and encode it using one-hot encoding

In [None]:
# Keep only the deck letter (first character) of each cabin, then one-hot encode it.
cabin_deck = train_data['Cabin'].str[0]
train_data['Cabin'] = cabin_deck
train_data = pd.get_dummies(data=train_data, columns=['Cabin'])

In [None]:
# Inspect columns and non-null counts after the Cabin one-hot encoding.
train_data.info()

## Handling Missing values of Training Dataset

In [None]:
# Fill missing Age/Fare values in the training data from the 5 nearest neighbours.
imputer = KNNImputer(n_neighbors=5)
imputed_cols = ['Age', 'Fare']
train_imputed = imputer.fit_transform(train_data[imputed_cols])
train_data[imputed_cols] = train_imputed



## Handling Missing values of Test Dataset

In [None]:

# Fill missing Age/Fare values in the test data with the imputer that was fitted on
# the training data. Use transform(), not fit_transform(): refitting here would
# replace the training neighbours with test rows, so train and test would be
# imputed from different reference data.
test_data[['Age', 'Fare']] = imputer.transform(test_data[['Age', 'Fare']])

## Align the columns in the test data with the training data

In [None]:
# Reduce Cabin to its deck letter and one-hot encode it, as was done for the training data.
test_data['Cabin'] = test_data['Cabin'].str[0]
test_data = pd.get_dummies(test_data, columns=['Cabin'])

# Columns that exist in the training data but not in the test data (kept for inspection).
missing_columns = set(train_data.columns) - set(test_data.columns)

# Reindex to the training columns. This adds every missing column filled with 0
# (including a placeholder 'Survived' column, as before), drops any column the
# training data never had, and puts the columns in the training order so the
# feature layout matches what the model is fitted on.
test_data = test_data.reindex(columns=train_data.columns, fill_value=0)

In [None]:
# Inspect the training data after imputation: columns, dtypes and non-null counts.
train_data.info()

## Building the Dependent and Independent Variables

In [None]:
# Separate the target column from the feature columns.
target_column = 'Survived'
y_train = train_data[target_column]
X_train = train_data.drop(columns=[target_column])

## Building the model (LogisticRegression)

## Scale the numerical features

In [None]:
# Standardise Age and Fare using statistics learned from the training features only.
# X_train was copied out of train_data before this cell, so the scaled values must be
# written into X_train itself. Scaling only train_data would leave the model fitted on
# unscaled Age/Fare while test_data is scaled, so predictions would be made on
# features with a different scale from the ones the model was trained on.
scaler = StandardScaler()
scaled_cols = ['Age', 'Fare']
X_train[scaled_cols] = scaler.fit_transform(X_train[scaled_cols])
train_data[scaled_cols] = X_train[scaled_cols]  # keep train_data in step with X_train
test_data[scaled_cols] = scaler.transform(test_data[scaled_cols])

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler

In [None]:
# Logistic regression classifier; the solver is allowed up to 1000 iterations.
model = LogisticRegression(max_iter=1000)

In [None]:
# Fit the classifier on the training features and the 'Survived' target.
model.fit(X_train, y_train)

## Predicting values on training data

In [None]:
# Predictions on the same rows the model was fitted on (used for training accuracy).
y_train_pred = model.predict(X_train)

## Calculate accuracy on the training data

In [None]:
# Fraction of training rows whose predicted label equals the true label.
train_accuracy = accuracy_score(y_true=y_train, y_pred=y_train_pred)
print("Accuracy:", train_accuracy)

## Predicting values on test data

In [None]:
# Keep exactly the feature columns the model was fitted on, in the same order.
# This removes the placeholder 'Survived' column and, unlike an in-place drop,
# can be re-run without raising a KeyError once the column is gone.
test_data = test_data.reindex(columns=X_train.columns, fill_value=0)

In [None]:
# Predicted 'Survived' label for every row of the preprocessed test data.
predictions = model.predict(test_data)

In [None]:
# PassengerId was dropped from test_data during preprocessing, so read it again
# from the original file and pair it with the predictions.
raw_test = pd.read_csv('/kaggle/input/titanic/test.csv')
passenger_ids = raw_test['PassengerId']
output = pd.DataFrame(data={'PassengerId': passenger_ids, 'Survived': predictions})

In [None]:
# Show the PassengerId / predicted Survived table as plain text.
print(output)