In [1]:
# Importing Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style="darkgrid")
import os
from dotenv import load_dotenv
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder


In [2]:
load_dotenv()

True

In [3]:
url = os.getenv("url")

In [4]:
# read data
def get_data(file_path):
    """Reading data from a CSV file and returning the first 10 rows."""
    try:
        data = pd.read_csv(file_path)
    except FileNotFoundError as e:
        print(f"Error reading {file_path}: {e}")
    else:
        print(f"Successfully read data from source")
        return data.head(10)
    finally:
        print("Data loading attempt finished.")

In [5]:
def save_data(data, file_path):
    """Saving data to a CSV file."""
    try:
        data.to_csv(file_path, index=False)
    except Exception as e:
        print(f"Error saving data to directory: {e}")
    else:
        print(f"Data saved successfully to directory")    

In [6]:
data = get_data(url)
data

Successfully read data from source
Data loading attempt finished.


Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Survived
0,693,3,"Lam, Mr. Ali",male,,0,0,1601,56.4958,,S,1
1,482,2,"Frost, Mr. Anthony Wood ""Archie""",male,,0,0,239854,0.0,,S,0
2,528,1,"Farthing, Mr. John",male,,0,0,PC 17483,221.7792,C95,S,0
3,856,3,"Aks, Mrs. Sam (Leah Rosen)",female,18.0,0,1,392091,9.35,,S,1
4,802,2,"Collyer, Mrs. Harvey (Charlotte Annie Tate)",female,31.0,1,1,C.A. 31921,26.25,,S,1
5,653,3,"Kalvik, Mr. Johannes Halvorsen",male,21.0,0,0,8475,8.4333,,S,0
6,510,3,"Lang, Mr. Fang",male,26.0,0,0,1601,56.4958,,S,1
7,558,1,"Robbins, Mr. Victor",male,,0,0,PC 17757,227.525,,C,0
8,829,3,"McCormack, Mr. Thomas Joseph",male,,0,0,367228,7.75,,Q,1
9,19,3,"Vander Planke, Mrs. Julius (Emelia Maria Vande...",female,31.0,1,0,345763,18.0,,S,0


In [7]:
save_data(data, "../data/titanic.csv")

Data saved successfully to directory


In [8]:
def get_description(data):
    """Get a statistical summary of the dataset."""
    try:
        info = data.info()
        stats = data.describe(include='all')
    except Exception as e:
        print(f"Error getting Statistics: {e}")
    else:
        return info, stats, data.shape
    finally:
        print("Data description attempt finished.")

In [9]:
get_description(data)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  10 non-null     int64  
 1   Pclass       10 non-null     int64  
 2   Name         10 non-null     object 
 3   Sex          10 non-null     object 
 4   Age          5 non-null      float64
 5   SibSp        10 non-null     int64  
 6   Parch        10 non-null     int64  
 7   Ticket       10 non-null     object 
 8   Fare         10 non-null     float64
 9   Cabin        1 non-null      object 
 10  Embarked     10 non-null     object 
 11  Survived     10 non-null     int64  
dtypes: float64(2), int64(5), object(5)
memory usage: 1.1+ KB
Data description attempt finished.


(None,
         PassengerId     Pclass          Name   Sex       Age      SibSp  \
 count     10.000000  10.000000            10    10   5.00000  10.000000   
 unique          NaN        NaN            10     2       NaN        NaN   
 top             NaN        NaN  Lam, Mr. Ali  male       NaN        NaN   
 freq            NaN        NaN             1     7       NaN        NaN   
 mean     593.000000   2.400000           NaN   NaN  25.40000   0.200000   
 std      244.104258   0.843274           NaN   NaN   5.85662   0.421637   
 min       19.000000   1.000000           NaN   NaN  18.00000   0.000000   
 25%      514.500000   2.000000           NaN   NaN  21.00000   0.000000   
 50%      605.500000   3.000000           NaN   NaN  26.00000   0.000000   
 75%      774.750000   3.000000           NaN   NaN  31.00000   0.000000   
 max      856.000000   3.000000           NaN   NaN  31.00000   1.000000   
 
             Parch Ticket        Fare Cabin Embarked   Survived  
 count   10.0

In [10]:
def transform_columns(data):
    """Transform columns by removing spaces and converting to lowercase."""
    try:
        data.columns = [col.strip().lower().replace(" ", "_") for col in data.columns]
    except Exception as e:
        print(f"Error transforming columns: {e}")
    else:
        print("Columns transformed successfully.")
        return data
    finally:
        print("Column transformation attempt finished.")

In [11]:
data = transform_columns(data)
data

Columns transformed successfully.
Column transformation attempt finished.


Unnamed: 0,passengerid,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,survived
0,693,3,"Lam, Mr. Ali",male,,0,0,1601,56.4958,,S,1
1,482,2,"Frost, Mr. Anthony Wood ""Archie""",male,,0,0,239854,0.0,,S,0
2,528,1,"Farthing, Mr. John",male,,0,0,PC 17483,221.7792,C95,S,0
3,856,3,"Aks, Mrs. Sam (Leah Rosen)",female,18.0,0,1,392091,9.35,,S,1
4,802,2,"Collyer, Mrs. Harvey (Charlotte Annie Tate)",female,31.0,1,1,C.A. 31921,26.25,,S,1
5,653,3,"Kalvik, Mr. Johannes Halvorsen",male,21.0,0,0,8475,8.4333,,S,0
6,510,3,"Lang, Mr. Fang",male,26.0,0,0,1601,56.4958,,S,1
7,558,1,"Robbins, Mr. Victor",male,,0,0,PC 17757,227.525,,C,0
8,829,3,"McCormack, Mr. Thomas Joseph",male,,0,0,367228,7.75,,Q,1
9,19,3,"Vander Planke, Mrs. Julius (Emelia Maria Vande...",female,31.0,1,0,345763,18.0,,S,0


In [12]:
def data_quality_report(data):
    """Generate a data quality report."""
    try:
        missing_values = data.isnull().sum()
        duplicates_per_column = data.apply(lambda x: x.duplicated().sum())
        unique_values = data.nunique()
        dtypes = data.dtypes

    except Exception as e:
        print(f"Error generating data quality report: {e}")
    else:
        report = pd.DataFrame({
            'Missing Values': missing_values,
            'Duplicates': duplicates_per_column,
            'Unique Values': unique_values,
            'Data Types': dtypes
        })
        return report
    finally:
        print("Data quality report attempt finished.")

In [13]:
data_quality_report(data)

Data quality report attempt finished.


Unnamed: 0,Missing Values,Duplicates,Unique Values,Data Types
passengerid,0,0,10,int64
pclass,0,7,3,int64
name,0,0,10,object
sex,0,8,2,object
age,5,5,4,float64
sibsp,0,8,2,int64
parch,0,8,2,int64
ticket,0,1,9,object
fare,0,1,9,float64
cabin,9,8,1,object


In [14]:
def preprocess_data(data):
    """Preprocess the data by handling missing values and encoding categorical variables."""
    try:
        # Handling missing values
        data['age'].fillna(data['age'].mean(), inplace=True)
        data['cabin'].fillna(data['cabin'].mode()[0], inplace=True)
    except Exception as e:
        print(f"Error preprocessing data: {e}")
    else:
        print("Data preprocessing completed successfully.")
        return data
    finally:
        print("Data preprocessing attempt finished.")


In [15]:
preprocess_data(data)

Data preprocessing completed successfully.
Data preprocessing attempt finished.


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['age'].fillna(data['age'].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['cabin'].fillna(data['cabin'].mode()[0], inplace=True)


Unnamed: 0,passengerid,pclass,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,survived
0,693,3,"Lam, Mr. Ali",male,25.4,0,0,1601,56.4958,C95,S,1
1,482,2,"Frost, Mr. Anthony Wood ""Archie""",male,25.4,0,0,239854,0.0,C95,S,0
2,528,1,"Farthing, Mr. John",male,25.4,0,0,PC 17483,221.7792,C95,S,0
3,856,3,"Aks, Mrs. Sam (Leah Rosen)",female,18.0,0,1,392091,9.35,C95,S,1
4,802,2,"Collyer, Mrs. Harvey (Charlotte Annie Tate)",female,31.0,1,1,C.A. 31921,26.25,C95,S,1
5,653,3,"Kalvik, Mr. Johannes Halvorsen",male,21.0,0,0,8475,8.4333,C95,S,0
6,510,3,"Lang, Mr. Fang",male,26.0,0,0,1601,56.4958,C95,S,1
7,558,1,"Robbins, Mr. Victor",male,25.4,0,0,PC 17757,227.525,C95,C,0
8,829,3,"McCormack, Mr. Thomas Joseph",male,25.4,0,0,367228,7.75,C95,Q,1
9,19,3,"Vander Planke, Mrs. Julius (Emelia Maria Vande...",female,31.0,1,0,345763,18.0,C95,S,0


In [16]:
def encode(data):
    """Encode categorical variables using one-hot encoding."""
    try:
        categorical_cols = data.select_dtypes(include=['object']).columns
        for col in categorical_cols:
            le = LabelEncoder()
            data[col] = le.fit_transform(data[col])
    except Exception as e:
        print(f"Error encoding categorical variables: {e}")
    else:
        print("Categorical variables encoded successfully.")
        return data
    finally:
        print("Encoding attempt finished.")

In [17]:
def split_data(data):
    """Split the data into features and target variable."""
    try:
        X = data.drop(columns=['survived'], axis=1)
        y = data['survived']
    except Exception as e:
        print(f"Error splitting data: {e}")
    else:
        print("Data splitting completed successfully.")
        return X, y 
    finally:
        print("Data splitting attempt finished.")

In [18]:
split_data(data)

Data splitting completed successfully.
Data splitting attempt finished.


(   passengerid  pclass                                               name  \
 0          693       3                                       Lam, Mr. Ali   
 1          482       2                   Frost, Mr. Anthony Wood "Archie"   
 2          528       1                                 Farthing, Mr. John   
 3          856       3                         Aks, Mrs. Sam (Leah Rosen)   
 4          802       2        Collyer, Mrs. Harvey (Charlotte Annie Tate)   
 5          653       3                     Kalvik, Mr. Johannes Halvorsen   
 6          510       3                                     Lang, Mr. Fang   
 7          558       1                                Robbins, Mr. Victor   
 8          829       3                       McCormack, Mr. Thomas Joseph   
 9           19       3  Vander Planke, Mrs. Julius (Emelia Maria Vande...   
 
       sex   age  sibsp  parch      ticket      fare cabin embarked  
 0    male  25.4      0      0        1601   56.4958   C95        S  


In [19]:
def drop_columns(data):
    """Drop unnecessary columns from the dataset."""
    try:
        columns_to_drop = ['passengerid', 'name', 'ticket', 'cabin']
        data = data.drop(columns=columns_to_drop, axis=1)
    except Exception as e:
        print(f"Error dropping columns: {e}")
    else:
        print("Columns dropped successfully.")
        return data
    finally:
        print("Drop columns attempt finished.")

In [20]:
def select_features(data):
    """Select relevant features for analysis."""
    try:
        data = drop_columns(data)
        X, y = split_data(data)
        X = encode(X)
        selector_mutual_info = SelectKBest(mutual_info_classif, k=5)
        X_selected_mutual_info = selector_mutual_info.fit_transform(X, y)
        selected_features_mutual_info = X.columns[selector_mutual_info.get_support()]
    except Exception as e:
        print(f"Error selecting features: {e}")
    else:
        print("Feature selection completed successfully.")
        return selected_features_mutual_info
    finally:
        print("Feature selection attempt finished.")

In [21]:
selected_features = select_features(data)
selected_features

Columns dropped successfully.
Drop columns attempt finished.
Data splitting completed successfully.
Data splitting attempt finished.
Categorical variables encoded successfully.
Encoding attempt finished.
Feature selection completed successfully.
Feature selection attempt finished.


Index(['pclass', 'sex', 'parch', 'fare', 'embarked'], dtype='object')