In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file below the Kaggle input directory,
# then print them one per line in os.walk order.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# **Import other Libraries**

In [None]:
#Import other necessary libraries
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

 # **Load and Explore Dataset**

In [None]:
# Locations of the Titanic competition CSV files.
train = "/kaggle/input/titanic/train.csv" #train data file path
test = "/kaggle/input/titanic/test.csv"  #test data file path

# Read both files with the same call: train first, then test.
train_data, test_data = (pd.read_csv(csv_path) for csv_path in (train, test))

**Train Dataset**

In [None]:
#Displaying first few rows of train dataset
# Left as the cell's last expression so the notebook renders the frame as a
# table instead of the plain-text dump that print() produces.
train_data.head()

In [None]:
#Train_data summary statistics
# Left as the cell's last expression so the summary frame is rendered as a
# table instead of the plain-text dump that print() produces.
train_data.describe()

In [None]:
#Count the missing values in each train_data column
print(train_data.isna().sum())

In [None]:
#Heatmap of the train_data null mask (one row per passenger, one column per feature)
sns.heatmap(train_data.isna(), cmap='viridis', cbar=False)
plt.show()

In [None]:
#Boxplot of passenger class for each port of embarkation (train_data)
sns.boxplot(data=train_data, x='Embarked', y='Pclass')
# plt.show must be called; the bare attribute reference only echoes the
# function object and never explicitly renders/flushes the figure.
plt.show()

In [None]:
#Histogram of the known (non-missing) ages in train_data, 20 bins, no KDE overlay
sns.histplot(train_data['Age'].dropna(), bins=20, kde=False)
plt.show()

#Bar chart of the Survived class counts in train_data
sns.countplot(x='Survived', data=train_data)
plt.show()

**Test Dataset**

In [None]:
#Displaying first few rows of test dataset
# Left as the cell's last expression so the notebook renders the frame as a
# table instead of the plain-text dump that print() produces.
test_data.head()

In [None]:
#Count the missing values in each test_data column
print(test_data.isna().sum())

In [None]:
#Test data summary statistics
# Left as the cell's last expression so the summary frame is rendered as a
# table instead of the plain-text dump that print() produces.
test_data.describe()

In [None]:
#Visualize missing test_data values (null mask heatmap)
sns.heatmap(test_data.isnull(), cbar=False, cmap='viridis')
# plt.show must be called; the bare attribute reference only echoes the
# function object and never explicitly renders/flushes the figure.
plt.show()

In [None]:
#Boxplot of Fare for each passenger class (test_data)
sns.boxplot(x='Pclass', y='Fare', data=test_data)
plt.show()

In [None]:
#Histogram of the known (non-missing) ages in test_data, 20 bins, no KDE overlay
sns.histplot(test_data['Age'].dropna(), bins=20, kde=False)
plt.show()

# **Cleaning Data**

**Treating Missing Values**

In [None]:
#Train_data
#Impute train_data missing Age values with the mean Age
# mean() has to be *called*: without the parentheses fillna is handed the bound
# method object instead of a number, so no numeric imputation takes place.
# The original 'Age' column is left untouched; the imputed copies live in
# 'Age1' and 'New_Age1'.
train_data['Age1'] = train_data['Age'].fillna(train_data['Age'].mean())
train_data['New_Age'] = pd.to_numeric(train_data['Age'], errors='coerce')
train_data['New_Age1'] = train_data['New_Age'].fillna(train_data['New_Age'].mean())

#Impute train_data missing Embarked value with the mode
# Assign the filled column back instead of calling fillna(inplace=True) on a
# column selection, which is not guaranteed to update the parent frame.
train_data['Embarked'] = train_data['Embarked'].fillna(train_data['Embarked'].mode()[0])

#Assigning zero(0) for missing cabin values
train_data['Cabin'] = train_data['Cabin'].fillna(0)

# Family size = siblings/spouses + parents/children + the passenger themself.
train_data['Family_Size'] = train_data['SibSp'] + train_data['Parch'] + 1

print(train_data.dtypes)


In [None]:
#Test_data
#Impute test_data missing Age values with mean age
# mean() has to be *called*: without the parentheses fillna is handed the bound
# method object instead of a number. The filled column is assigned back rather
# than using fillna(inplace=True) on a column selection, which is not
# guaranteed to update the parent frame.
test_data['Age'] = test_data['Age'].fillna(test_data['Age'].mean())
test_data['New_Age1'] = pd.to_numeric(test_data['Age'], errors='coerce')

#Impute group mean for missing Fare value
# Reference group: 3rd class, male, embarked at 'S', no siblings/spouse and no
# parents/children aboard.
# NOTE(review): presumably chosen to match the passenger(s) whose Fare is
# missing - confirm against the data.
group = test_data.loc[
    (test_data['Pclass'] == 3)
    & (test_data['Sex'] == 'male')
    & (test_data['Embarked'] == 'S')
    & (test_data['SibSp'] == 0)
    & (test_data['Parch'] == 0)
]
group_mean = group['Fare'].mean()

test_data['Fare'] = test_data['Fare'].fillna(group_mean)

#Assigning zero(0) for missing cabin values
test_data['Cabin'] = test_data['Cabin'].fillna(0)

# Family size must be derived from the test rows themselves; it was previously
# computed from train_data, which copied unrelated training passengers' values
# onto the test rows by index alignment.
test_data['Family_Size'] = test_data['SibSp'] + test_data['Parch'] + 1

print(test_data.dtypes)

In [None]:
#One-hot encode the 'Sex' and 'Embarked' columns of both frames
# (each original column is replaced by one indicator column per category).
train_data, test_data = (
    pd.get_dummies(frame, columns=['Sex', 'Embarked'])
    for frame in (train_data, test_data)
)


In [None]:
#Helper functions to clean the datasets

def name_clean(Name):
    """Strip surrounding punctuation from every space-separated word of a name.

    The characters . , ( ) " ' ] [ are removed from the start and end of each
    word only; punctuation inside a word (e.g. the apostrophe in "O'Dwyer")
    is kept. Words are re-joined with single spaces.
    """
    punctuation = ".,()\"']["
    words = (word.strip(punctuation) for word in Name.split(" "))
    return " ".join(words)

def ticket_split(Ticket):
    """Return the numeric part of a ticket string.

    The text after the last space (or the whole string when there is no
    space) is parsed as an integer. Tickets whose last token is not a valid
    integer (e.g. 'LINE') yield 0.
    """
    last_token = Ticket.rsplit(" ", 1)[-1]
    try:
        number = int(last_token)
    except ValueError:
        number = 0
    return number

def ticket_split2(Ticket):
    """Return the prefix code of a ticket string, or None if it has none.

    Everything before the last space-separated token is concatenated without
    separators (e.g. 'STON/O 2. 3101282' -> 'STON/O2.'). A ticket made of a
    single token has no prefix and yields None.
    """
    *prefix_parts, _ = Ticket.split(" ")
    if not prefix_parts:
        return None
    return "".join(prefix_parts)

In [None]:
#Work on independent copies so the cleaning steps below leave
#train_data / test_data untouched
train_data_clean, test_data_clean = train_data.copy(), test_data.copy()


In [None]:
#Train data clean
# Replace 'Name' with its punctuation-stripped form and derive the numeric and
# prefix parts of 'Ticket' as two new columns.
train_data_clean = train_data_clean.assign(
    Name=train_data_clean['Name'].apply(name_clean),
    Ticket_Number=train_data_clean['Ticket'].apply(ticket_split),
    Ticket_Code=train_data_clean['Ticket'].apply(ticket_split2),
)

train_data_clean.head()

In [None]:
#Test Data Clean
# Replace 'Name' with its punctuation-stripped form and derive the numeric and
# prefix parts of 'Ticket' as two new columns.
test_data_clean = test_data_clean.assign(
    Name=test_data_clean['Name'].apply(name_clean),
    Ticket_Number=test_data_clean['Ticket'].apply(ticket_split),
    Ticket_Code=test_data_clean['Ticket'].apply(ticket_split2),
)

test_data_clean.head()

In [None]:
#Convert columns to int
# Any age still missing at this point falls back to 0 so the integer cast
# cannot fail on NaN. The filled column is assigned back instead of calling
# fillna(inplace=True) on a column selection, which is not guaranteed to
# update the parent frame (and would then leave NaN for astype to choke on).
# astype('int') drops the fractional part of Age and Fare.

#Train_data
train_data_clean['New_Age1'] = train_data_clean['New_Age1'].fillna(0).astype('int')
train_data_clean['Fare'] = train_data_clean['Fare'].astype('int')

#Test Data
test_data_clean['New_Age1'] = test_data_clean['New_Age1'].fillna(0).astype('int')
test_data_clean['Fare'] = test_data_clean['Fare'].astype('int')


In [None]:
#Select needed columns
# The same feature list is applied to both frames so the model sees identical
# columns, in identical order, at fit and predict time.
feature_columns = [
    'Pclass',
    'Sex_female', 'Sex_male',
    'New_Age1', 'Fare', 'Family_Size',
    'Embarked_C', 'Embarked_Q', 'Embarked_S',
    'Ticket_Number',
]

# Target labels exist only in the training frame.
y_train = train_data_clean['Survived']

X_train, X_test = (
    frame[feature_columns] for frame in (train_data_clean, test_data_clean)
)

print(X_train.dtypes, '\n\n', y_train.dtypes, '\n\n', X_test.dtypes)

# **Model Training and Evaluation**

In [None]:
#Initializing the Logistic Regression model
# NOTE(review): the features are fed in unscaled even though StandardScaler is
# imported; large-valued columns such as Ticket_Number may slow or prevent
# solver convergence - consider standardizing. Left unchanged here so the
# submitted predictions stay the same.
model = LogisticRegression()

#Train model with training data
model.fit(X_train, y_train)

#Predictions on test data based on training
y_predict = model.predict(X_test)

#Model Evaluations
# No y_test is defined anywhere in the cells shown (only the training frame
# supplies 'Survived'), so the previously commented-out y_test metrics could
# not run. Report in-sample metrics on the training data instead; these are
# optimistic and serve only as a sanity check, not as an estimate of
# performance on unseen passengers.
y_train_predict = model.predict(X_train)
Model_accuracy = accuracy_score(y_train, y_train_predict)
Conf_matrix = confusion_matrix(y_train, y_train_predict)
Class_report = classification_report(y_train, y_train_predict)
print(f"Training accuracy: {Model_accuracy:.4f}")
print(Conf_matrix)
print(Class_report)

# Build the submission: one predicted label per test PassengerId.
result = pd.DataFrame({'PassengerId': test_data_clean.PassengerId, 'Survived': y_predict})
result.to_csv('submission.csv', index=False)
print("result successfully saved!")