<a href="https://colab.research.google.com/github/aaldayarova/titanic-survival/blob/main/titanic_aml_hw1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

# Number of bytes requested per read() call while streaming a download.
CHUNK_SIZE = 40960
# Comma-separated list of '<directory>:<URL-encoded download URL>' entries;
# each entry is split on ':' and the URL part is percent-decoded before use.
# NOTE(review): the URL carries signed-request parameters
# (X-Goog-Date=20240918T153015Z, X-Goog-Expires=259200), so it has presumably
# expired by now - regenerate it if the download fails.
DATA_SOURCE_MAPPING = 'titanic:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-competitions-data%2Fkaggle-v2%2F3136%2F26502%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240918%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240918T153015Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D764f21ba7a802d15feeff282083811901cc5fc62d1c3ca1e665927e3c3128eb7dcc4fe73ad710a3fbd3ca56fe1d4d7c012d2b1f75a1e887a6129dd7f8b3cbdb44b42917a3cc5b41eb49d18311f4178c90d5161934e691e1ff1d371c409ed08871626ab2f3b159a20e1ebbcf0da6366d2bb3145d99b9f6a50a5b0498b7674b94fba509a46526f897e699dadda225af2835a97eed3d68bed375822e2ff431dde2407956ae4e90b34c5146a7674e0744d021529338733599e1e3553122365f422fa5ca467eafa8a79c4dd055cf45423d6419e29da32e3040afcd6188c6bc128abc5159c29dc9896a04270c96c706eee3192f7366e32aedb56205aacc36bbc8e7070'

# Directory under which every data source is extracted (one sub-directory each).
KAGGLE_INPUT_PATH='/kaggle/input'
# Output directory; it is created below and symlinked as ../working.
KAGGLE_WORKING_PATH='/kaggle/working'
# NOTE(review): not referenced anywhere in the visible code.
KAGGLE_SYMLINK='kaggle'

# Reset the input location before downloading. The order matters: unmount,
# then delete, then recreate.
# The leading '!' is an IPython/Colab shell escape (not Python syntax); the
# command's stderr is discarded via '2> /dev/null'.
!umount /kaggle/input/ 2> /dev/null
# Remove whatever is left at the input path; ignore_errors=True makes a
# missing directory (or any other removal failure) a silent no-op.
shutil.rmtree('/kaggle/input', ignore_errors=True)
# Recreate both directories with mode 0o777; exist_ok=True tolerates
# directories that are already present.
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

# Expose the input and working directories as ../input and ../working.
# A link (or any file) that already exists at that location is left untouched.
for link_target, link_basename in (
    (KAGGLE_INPUT_PATH, 'input'),
    (KAGGLE_WORKING_PATH, 'working'),
):
    try:
        os.symlink(link_target, os.path.join("..", link_basename), target_is_directory=True)
    except FileExistsError:
        pass

# Download every configured data source into a temporary file, then extract it
# (zip or tar) into its own sub-directory of KAGGLE_INPUT_PATH. A source that
# fails to download or extract is reported and skipped; the loop carries on.
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    # Split on the first ':' only, so a stray ':' inside the encoded URL
    # cannot break the unpacking.
    directory, download_url_encoded = data_source_mapping.split(':', 1)
    download_url = unquote(download_url_encoded)
    # Only the URL path is inspected, so the query string cannot hide the
    # archive's file extension.
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            total_length = fileres.headers['content-length']
            print(f'Downloading {directory}, {total_length} bytes compressed')
            # The header is optional (e.g. chunked responses); without a
            # usable size the bar stays empty instead of raising.
            total_bytes = int(total_length) if total_length else 0
            dl = 0
            # iter(callable, sentinel) keeps reading until read() returns b''.
            for data in iter(lambda: fileres.read(CHUNK_SIZE), b''):
                dl += len(data)
                tfile.write(data)
                done = int(50 * dl / total_bytes) if total_bytes else 0
                sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                sys.stdout.flush()
            # Push buffered bytes to disk and rewind so the archive readers
            # below see the complete download from its first byte.
            tfile.flush()
            tfile.seek(0)
            if filename.endswith('.zip'):
                with ZipFile(tfile) as zfile:
                    zfile.extractall(destination_path)
            else:
                # Read through the open handle instead of re-opening by name,
                # and bind the archive to its own name so the `tarfile`
                # module is not shadowed for later iterations.
                with tarfile.open(fileobj=tfile) as tar_archive:
                    tar_archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError:
        # HTTPError is a subclass of OSError, so it must be handled first.
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
        continue
    except OSError:
        print(f'Failed to load {download_url} to path {destination_path}')
        continue

print('Data source import complete.')


In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the input tree top-down and echo the full path of every file found.
input_listing = os.walk('/kaggle/input')
for dirname, _, filenames in input_listing:
    for filename in filenames:
        full_path = os.path.join(dirname, filename)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the Titanic training and test CSV files into DataFrames
train_csv_path = "/kaggle/input/titanic/train.csv"
test_csv_path = "/kaggle/input/titanic/test.csv"
train_data = pd.read_csv(train_csv_path)
test_data = pd.read_csv(test_csv_path)
# Report the row count of each frame, one per line
print(len(train_data), len(test_data), sep="\n")

In [None]:
# Pre-processing our train and test data
# Step 1: Remove the 'Name', 'Fare', 'Ticket', and 'Cabin' columns from both
# frames; we consider them unnecessary for prediction and/or unique identifiers
train_data = train_data.drop(columns=["Name", "Fare", "Ticket", "Cabin"])
test_data = test_data.drop(columns=["Name", "Fare", "Ticket", "Cabin"])
# Dropping columns must leave the row counts unchanged; print them to confirm
print(len(train_data), len(test_data), sep="\n")
# train_data.head()

In [None]:
# Step 2: Handle missing values. Before deciding how to treat them, find out
# which columns of each dataset contain at least one null/NA entry.
# Boolean Series indexed by column name: True where the column has a null.
null_columns_train = train_data.isna().any()
null_columns_test = test_data.isna().any()

# Keep only the column names flagged True and report them
columns_with_nulls_train = null_columns_train.index[null_columns_train.to_numpy()]
print("Columns with null values in training set:", columns_with_nulls_train.tolist())

columns_with_nulls_test = null_columns_test.index[null_columns_test.to_numpy()]
print("Columns with null values in testing set:", columns_with_nulls_test.tolist())

In [None]:
# Update missing values in 'Age' with the mean age of the TRAINING set, in
# both frames. Using a single statistic learned from the training data keeps
# the feature identically defined for the rows the model is fitted on and the
# rows it predicts, instead of imputing the test set with its own mean.
train_age_mean = train_data['Age'].mean()
train_data.fillna({'Age': train_age_mean}, inplace=True)
test_data.fillna({'Age': train_age_mean}, inplace=True)

In [None]:
# Discard every training row that still holds a missing value in any column.
# This is aimed at the rows lacking 'Embarked' (presumably the only column
# with gaps left at this point), since missing values may cause errors in
# prediction if they are kept.
train_data = train_data.dropna(axis=0, how='any')

In [None]:
# Sanity check: recompute the per-column null flags after the clean-up
null_columns_train = train_data.isna().any()
null_columns_test = test_data.isna().any()

# List whichever columns are still flagged; the printed lists show the
# columns that still contain nulls, if any
columns_with_nulls_train = null_columns_train.loc[null_columns_train].index
print("Columns with null values in training set:", columns_with_nulls_train.tolist())

columns_with_nulls_test = null_columns_test.loc[null_columns_test].index
print("Columns with null values in testing set:", columns_with_nulls_test.tolist())

In [None]:
# Step 3: One-hot encode the categorical fields ('Sex' and 'Embarked'),
# applying the same call to the training frame first and the test frame second
encoded_train_data, encoded_test_data = (
    pd.get_dummies(frame, columns=['Sex', 'Embarked'])
    for frame in (train_data, test_data)
)

In [None]:
# Step 4: Normalize numerical data; we will be using min-max scaling
# Work on copies so the one-hot encoded frames keep their raw values
train_min_max_scaled = encoded_train_data.copy()
test_min_max_scaled = encoded_test_data.copy()

# Numeric feature columns that get rescaled
NUMERIC_COLUMNS = ('Pclass', 'Age', 'SibSp', 'Parch')


def minMaxScale(df, column, reference=None):
    """Min-max scale ``df[column]`` in place.

    Args:
        df: DataFrame whose column is overwritten with the scaled values.
        column: Name of the column to scale.
        reference: Optional DataFrame that supplies the minimum and maximum
            used for scaling. Defaults to ``df`` itself. Pass the unscaled
            training frame here when scaling test data, so both sets share
            one mapping; scaled test values may then fall outside [0, 1].

    Returns:
        None. ``df`` is modified in place.
    """
    source = df if reference is None else reference
    low = source[column].min()
    span = source[column].max() - low
    if span == 0:
        # A constant column would divide 0 by 0 and turn into NaN; map it to
        # 0.0 instead.
        df[column] = 0.0
        return
    df[column] = (df[column] - low) / span


# Normalize both sets with the minimum/maximum of the TRAINING data. Scaling
# the test set with its own extremes would map the same raw value (e.g. the
# same age) to different feature values in the two sets.
for numeric_column in NUMERIC_COLUMNS:
    minMaxScale(train_min_max_scaled, numeric_column)
    minMaxScale(test_min_max_scaled, numeric_column, reference=encoded_train_data)

# View normalized data
# display(train_min_max_scaled)
# display(test_min_max_scaled)

In [None]:
# Implementing logistic regression using sklearn library
from sklearn.linear_model import LogisticRegression

# Initialize the model and split the pre-processed training data into
# features and target
our_log_reg = LogisticRegression(max_iter=1000)
# 'PassengerId' is an arbitrary row identifier, not a property of the
# passenger, so it is excluded from the features together with the target.
X_train = train_min_max_scaled.drop(['Survived', 'PassengerId'], axis=1)
y_train = train_min_max_scaled['Survived']
# Give the test features exactly the training columns, in the same order.
# A dummy column that only exists in the training data is added as all zeros;
# a column unknown to the model (including 'PassengerId') is dropped.
X_test = test_min_max_scaled.reindex(columns=X_train.columns, fill_value=0)

# Train the model on our training data
our_log_reg.fit(X_train, y_train)

# Predict targets of testing data
y_test_pred = our_log_reg.predict(X_test)

# Pair every prediction with its passenger ID and write the submission file
passenger_ids = test_data['PassengerId']
output = pd.DataFrame({'PassengerId': passenger_ids, 'Survived': y_test_pred})
# output
output.to_csv('submission.csv', index=False)
print("Your submission was successfully saved!")