In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
            
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _, names in os.walk('/kaggle/input'):
    for path in (os.path.join(folder, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# *Dataset Cleaning and Transformation*
   
   Objective: Clean `raw_df` and decode its original and transformed variables from their raw initial form into a more interpretable one, following the definitions below. This sets the conditions for a robust `data-imputation`.

# Original Variables:
 * `raw_df['Pclass']`: decoded to upper, middle and lower socioeconomic class.
 * `raw_df['Parch']`: Supposing a hierarchical family structure, vertical family members are composed by parents and children.
 * `raw_df['Sibsp']`: Supposing a hierarchical family structure, lateral family members are brothers, sisters, stepbrothers, stepsisters and spouse.
 
# Transformed Variables:

* `raw_df['Cabin']`: Get the total number of purchased cabins and the section where the passenger was hosted.


   
                                  Image 1.0: Titanic Sections 
![](https://upload.wikimedia.org/wikipedia/commons/5/5d/Titanic_side_plan_annotated_English.png)


* `raw_df['Name']`: Decode whether a passenger is a married woman and whether the passenger is a boy.

    If `Mrs` is found in `raw_df['Name']` ---> the passenger is a married female.
    
    If `Master` is found in `raw_df['Name']` --> the passenger is a boy.
    
    
    For reference: https://dictionary.cambridge.org/es/diccionario/ingles/mrs
                   https://en.wikipedia.org/wiki/Master_(form_of_address)


# Data Imputation #

* For numerical variables: As it is a small sample, it is not safe to assume a Gaussian data-generating process, so the `.median()` of each respective column is used as a robust statistic.

* For categorical variables: If a missing value is present, replace it with the most frequent value of the column. 

In [None]:
####################################################
#                                                  #
#  Importing all required modules                  #
#                                                  #
####################################################

!pip install bs4

############## operative system treatment ###########
import os
from os import path
import os.path
from datetime import datetime

############### database binding ##################
import string
import pandas as pd
import numpy as np
import re

############## data visualization ###################
import matplotlib
import matplotlib.pyplot as plt
from pandas.plotting import scatter_matrix
from pandas import plotting
from pandas.plotting import scatter_matrix
import seaborn as sns

##### to vectorize the dataset ###########
from sklearn.preprocessing import LabelEncoder
from math import log2
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler


##### to scrap some web data #############
import requests
from bs4 import BeautifulSoup
import lxml


##### for machine learning testing ########
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import RidgeCV
from sklearn.linear_model import LassoCV
from sklearn.linear_model import ElasticNetCV



print("All required modules imported to workspace!")

In [None]:
#############################
#      Column Layers        #
#                           #
#############################

# Continuous features: read by the cleaner for median imputation.
numerical_columns = [
    'age',
    'lateral_family_members',
    'vertical_family_members',
    'fare',
    'total_number_of_purchased_cabins',
]

# Categorical features: read by the cleaner for most-frequent-value imputation.
categorical_columns = ['sex', 'cabin', 'port_of_embarkation']

# Continuous attributes that get min-max scaled; same names as above, kept as
# an independent list object.
numerical_attributes = list(numerical_columns)


print("Succesful setup for Column layers!")

# Scraping Economic Data #

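* In order to get the real (inflation-adjusted) value of the Fare column, we scrape inflation data from `data_url`.

* The following formula converts raw_fare into real_fare:

$$\text{real\_fare} = \text{raw\_fare} \times (1 + i)^{n}$$

  where $i$ is the interest rate and $n$ is the number of years elapsed (in the code below: the 1970 inflation rate and the number of years between 1970 and the current year).

* For this case the interest rate will be the inflation.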

In [None]:

data_url='https://www.macrotrends.net/countries/GBR/united-kingdom/inflation-rate-cpi'

def load_historical_inflation_rate(url, inflation_ts, timeout=30):
    """Scrape the inflation table at ``url`` and append it to ``inflation_ts``.

    The second ``<table>`` of the page is parsed. Column names come from its
    centred ``<th>`` cells (the first one is skipped); every ``<tr>`` of the
    table body contributes one row, with the i-th ``<td>`` going to the i-th
    column.

    Parameters
    ----------
    url : str
        Page containing the inflation-rate table.
    inflation_ts : pandas.DataFrame
        Existing time series to extend (pass an empty frame to start fresh).
    timeout : float, optional
        Seconds to wait for the HTTP response before giving up.

    Returns
    -------
    pandas.DataFrame
        ``inflation_ts`` followed by the scraped rows. The first scraped column
        is kept as text; the second and third are converted from ``"x%"``
        strings to fractions (``x / 100``). Cells that cannot be parsed as a
        number become NaN.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    """
    # Fail fast on network problems instead of hanging or parsing an error page.
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'lxml')
    table = soup.find_all('table')[1]

    # Column names.
    header_cells = table.find_all('th', {'style': 'text-align:center;'})
    colnames = [hdr.text for hdr in header_cells][1:]

    # Parse every body row exactly once into a list of cell texts.
    records = [
        [cell.text for cell in tr.find_all('td')]
        for tr in table.find('tbody').find_all('tr')
    ]
    cells = pd.Series(records, dtype='object')

    # The i-th cell of each row feeds the i-th column (NaN when the row is short).
    scraped = pd.DataFrame({name: cells.str[pos] for pos, name in enumerate(colnames)})

    # Second and third columns: "12.3%" -> 0.123.
    for name in colnames[1:3]:
        scraped[name] = pd.to_numeric(scraped[name].str.rstrip('%'), errors='coerce') / 100.0

    return pd.concat([inflation_ts, scraped], ignore_index=True)


# calling scraper
# Scrape into a fresh, empty frame and keep the result as the notebook-level time series.
inflation_ts = load_historical_inflation_rate(url=data_url, inflation_ts=pd.DataFrame())

# Preview the last ten rows.
inflation_ts.tail(10)


In [None]:
def get_real_fare_price(raw_price):
    """Compound ``raw_price`` with the 1970 inflation rate up to the current year.

    Reads the notebook-level ``inflation_ts`` frame: the row whose ``'Year'``
    equals the string ``'1970'`` supplies the rate from the
    ``'Inflation Rate (%)'`` column.

    Parameters
    ----------
    raw_price : float
        Price to adjust.

    Returns
    -------
    float
        ``raw_price * (1 + rate) ** (current_year - 1970)``.

    Raises
    ------
    ValueError
        If ``inflation_ts`` has no row for 1970.
    """
    matches = inflation_ts.loc[inflation_ts['Year'] == '1970', 'Inflation Rate (%)']
    if matches.empty:
        raise ValueError("No inflation rate found for year 1970 in inflation_ts")

    # Take an explicit scalar: calling float() on a 1-element array is deprecated in NumPy.
    inflation_rate = float(matches.iloc[0])
    years = datetime.now().year - 1970

    return float(raw_price * (1 + inflation_rate) ** years)

In [None]:
#####################################################
#                                                   #
#    Loading, Cleaning and Transforming Dataset     #
#                                                   # 
##################################################### 


##################### LOADING  ######################

class CLEANER():
    """Load the Titanic CSV files and turn them into a numeric feature table.

    ``clean_and_transform_df`` relies on names defined elsewhere in the
    notebook: the lists ``numerical_columns``, ``categorical_columns`` and
    ``numerical_attributes``, the function ``get_real_fare_price`` and
    scikit-learn's ``MinMaxScaler``.
    """

    def __init__(self):
        # Directory that holds train.csv and test.csv.
        self.initial_path = '/kaggle/input/titanic'

    ##################### LOADING  ######################

    def load_titanic_dataset(self):
        """Read ``train.csv`` and ``test.csv`` and stack them into one frame.

        Returns
        -------
        pandas.DataFrame
            Train rows followed by test rows, re-indexed 0..n-1 so that row
            labels are unique.
        """
        train_path = os.path.join(self.initial_path, 'train.csv')
        test_path = os.path.join(self.initial_path, 'test.csv')

        train = pd.read_csv(train_path, header=0, na_values='NA')
        test = pd.read_csv(test_path, header=0, na_values='NA')

        # ignore_index avoids duplicated labels coming from the two files.
        return pd.concat([train, test], ignore_index=True)

    ######################################################

    def string_split(self, row):
        """Count the whitespace-separated tokens in ``row``.

        Returns ``np.nan`` when ``row`` is not a string (e.g. a missing value).
        """
        # A type check is used instead of `row is np.nan`, which only matches
        # one particular NaN object.
        if not isinstance(row, str):
            return np.nan
        return len(row.split())

    ##################################################

    def decode_pclass_column(self, x):
        """Map the class code 1 / 2 / 3 to its label; anything else gives ``'NA'``."""
        labels = {
            1: 'upper_socioeconomic',
            2: 'middle_socioeconomic',
            3: 'lower_socioeconomic',
        }
        return labels.get(x, 'NA')

    ########## CLEANING AND TRANSFORMING #################

    def clean_and_transform_df(self, raw_df):
        """Clean, impute, encode and scale the raw Titanic frame.

        Steps, in order: strip spaces from column names, derive cabin count /
        cabin section / married-female / boy flags, decode ``Pclass``, rename
        columns, impute missing values (median for ``numerical_columns``, most
        frequent value for ``categorical_columns``), convert ``fare`` with
        ``get_real_fare_price``, encode ``sex`` as 1/0, one-hot encode class,
        port and cabin section, and min-max scale ``numerical_attributes``.

        Parameters
        ----------
        raw_df : pandas.DataFrame
            Frame as returned by ``load_titanic_dataset``. It is not modified;
            the work is done on a copy so the call can be repeated.

        Returns
        -------
        pandas.DataFrame
            The transformed frame. Only the columns listed in
            ``numerical_columns`` / ``categorical_columns`` are imputed; every
            other column (including the target) keeps its missing values.
        """
        df = raw_df.copy()

        ################ cleaning whitespaces ##############
        df.columns = df.columns.str.replace(' ', '')

        ############### transforming column values #########

        # Total number of cabins listed for the passenger (NaN when unknown).
        df['total_number_of_purchased_cabins'] = df['Cabin'].apply(self.string_split)

        # Cabin section = first character of the cabin string.
        df['Cabin'] = df['Cabin'].str.get(0)

        # Title-based flags, stored directly as 1/0.
        df['is_married_female'] = np.where(
            df['Name'].str.contains('Mrs', na=False, regex=False), 1, 0)
        df['is_boy'] = np.where(
            df['Name'].str.contains('Master', na=False, regex=False), 1, 0)

        # Socioeconomic class label.
        # TO DO: get ratios instead of one-hot-encoding
        df['Pclass'] = df['Pclass'].apply(self.decode_pclass_column)

        ###################### changing column names ##################
        # Objective: get more interpretable features.
        df.columns = df.columns.str.lower()
        df = df.rename(columns={'pclass': 'socio_economic_status',
                                'sibsp': 'lateral_family_members',
                                'parch': 'vertical_family_members',
                                'embarked': 'port_of_embarkation'})

        ##################### data imputation #########################
        # median = robust centre estimate; mode = most frequent category.
        # The filled column is assigned back (fillna returns a new object), and
        # each fill is restricted to its own column.
        for column in numerical_columns:
            median = np.round(df[column].median(skipna=True), 0)
            df[column] = df[column].fillna(median)

        for column in categorical_columns:
            modes = df[column].mode()
            if not modes.empty:
                # mode() returns a Series; use its first entry as the fill value.
                df[column] = df[column].fillna(modes.iloc[0])

        ####### get real value for fare ##########
        # real value in pounds
        df['fare'] = df['fare'].apply(get_real_fare_price)

        ###### data vectorization ################
        # Binary encoding: female -> 1, male -> 0 (any other value becomes NaN).
        df['sex'] = df['sex'].map({'female': 1, 'male': 0})

        ## One-Hot Encoding ##
        df = pd.get_dummies(
            data=df,
            columns=['socio_economic_status', 'port_of_embarkation', 'cabin'],
            prefix=['class', 'port_of_embarkation', 'cabin'])

        # Min-max normalisation of the continuous attributes.
        df[numerical_attributes] = MinMaxScaler().fit_transform(df[numerical_attributes])

        return df



# Entry point: build the cleaner, load the combined train/test frame and run the
# full cleaning pipeline. `raw_df` and `main_df` are left as notebook-level
# variables that the following cells read.
if __name__=="__main__":   
   
    cleaner= CLEANER()
    raw_df= cleaner.load_titanic_dataset()
    main_df= cleaner.clean_and_transform_df(raw_df=raw_df)
   

    print('main_df is ready for feature selection')

# *Exploratory Data Analysis* #
              
* Category Attribute/ Category Attribute ----> Within Survivors: percentage of category survivor.
* Category Attribute/ Numerical Attribute ---> Within Survivors: Boxplot
* Numerical Attribute / Numerical Attribute --> Search for linear relationships.

In [None]:
# Summary statistics of the cleaned frame.
main_df.describe()

In [None]:
# Column dtypes and non-null counts of the cleaned frame.
main_df.info()

# Excluding string `dtypes()`
   * In order to vectorize the dataset

In [None]:
# Keep only the non-object columns; every column still stored as Python objects
# (free-text strings) is dropped here and is not available to later cells.
main_df = main_df.select_dtypes(exclude=['object'])


print("main_df is now fully vectorized!")

In [None]:
###################################
#                                 #
#   Exploratory Data Analysis     #
#                                 #
###################################


########## Numerical columns ######

numerical_attributes=['age', 'lateral_family_members', 'vertical_family_members', 'fare', 'total_number_of_purchased_cabins']

# The grid of axes is stored as `scatter_axes` rather than `scatter_matrix`, so it
# does not overwrite the `scatter_matrix` function imported from pandas.plotting.
scatter_axes = pd.plotting.scatter_matrix(main_df[numerical_attributes], figsize=[15, 15], diagonal="kde")

# Shrink the axis labels so the long feature names fit in the grid.
for ax in scatter_axes.ravel():
    ax.set_xlabel(ax.get_xlabel(), fontsize=7)
    ax.set_ylabel(ax.get_ylabel(), fontsize=7)
    
    
# Conclusion: no linear relationships between variables (low risk of multicollinearity for estimators)
####################################


In [None]:
########################## BoxPlots ################################


figure, axes = plt.subplots(2, 3, sharex=True, figsize=(18,15))

# One boxplot per continuous feature, split by survival; panels are filled row by row.
boxplot_features = [
    'fare',
    'age',
    'total_number_of_purchased_cabins',
    'lateral_family_members',
    'vertical_family_members',
]
for panel, feature in zip(axes.ravel(), boxplot_features):
    sns.boxplot(x='survived', y=feature, data=main_df, ax=panel)



In [None]:
#figure, axes = plt.subplots(2, 3, sharex=True, figsize=(25,15))
#plt.subplots_adjust(left=1, bottom=1, right=2, top=2, wspace=1, hspace=1)
#sns.set(font_scale=5) 

#sns.barplot(x='survived', y='sex', data=main_df, estimator=lambda x: sum(x==1)*100.0/len(x),ax=axes[0,0]).set(xlabel='Percentage of Survivors', ylabel='Sex')
#sns.barplot(x='survived', y='cabin', data=main_df,estimator=lambda x: sum(x==1)*100.0/len(x) ,ax=axes[0,1]).set(xlabel='Percentage of Survivors', ylabel='Cabin Section')
#sns.barplot(x='survived', y='port_of_embarkation',estimator=lambda x: sum(x==1)*100.0/len(x) ,data=main_df, ax=axes[0,2]).set(xlabel='Percentage of Survivors', ylabel='Port of Embarkation')
#sns.barplot(x='survived', y='is_boy', data=main_df,estimator=lambda x: sum(x==1)*100.0/len(x) ,ax=axes[1,0]).set(xlabel='Percentage of Survivors', ylabel='Population: Boys')
#sns.barplot(x='survived', y='is_married_female',data=main_df,estimator=lambda x: sum(x==1)*100.0/len(x)  ,ax=axes[1,1]).set(xlabel='Percentage of Survivors', ylabel='Population: Married Female')
#sns.barplot(x='survived', y='socio_economic_status',data=main_df,estimator=lambda x: sum(x==1)*100.0/len(x)  ,ax=axes[1,2]).set(xlabel='Percentage of Survivors', ylabel='Economic Status')




# Feature Engineering #

*Entropy:*


  * If equal to 0 then all data is in one class. --> Lower Uncertainty
  
  
  * Entropy grows as the two classes become more balanced (here, as positive cases, survived = 1, gain weight), reaching 1 bit at a 50/50 split. --> Higher Uncertainty
 
  
**![](https://wikimedia.org/api/rest_v1/media/math/render/svg/bfe3616dee43f6287d4a4e2a557de8d48ad24926)**

   Where `i=1,0`.
   

In [None]:
######### Entropy Between Classes #########

# Share of each survival class among the rows with a known label.
class_proportion = main_df['survived'].value_counts(normalize=True)

class0 = class_proportion[0]
class1 = class_proportion[1]

# Shannon entropy (base 2) of the two-class distribution.
entropy = -sum(share * log2(share) for share in (class0, class1))
print('entropy: %.3f bits' % entropy)

# very unordered classes: higher uncertainty


# caution: overfitting is a risk.

# Train/ Test Split
   * drop `passengerid`
   * select all not na rows
   * 70/30 % (train/test split)

In [None]:
# errors='ignore' keeps the cell re-runnable: on a second execution the column is
# already gone and a plain drop would raise KeyError.
main_df = main_df.drop(columns='passengerid', errors='ignore')

# Keep only the rows that have a known target value.
main_df = main_df[main_df['survived'].notna()]

In [None]:
import numpy as np
from sklearn.model_selection import train_test_split

# Define x_train, x_test, y_train and y_test:
# 70% of the rows for training and 30% for validation.
X = main_df.drop('survived', axis=1)
y = main_df[['survived']]

x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# Fill remaining NaNs with column means computed on the TRAINING split only, and
# reuse those same means for the test split, so no statistic of the held-out
# data is used during preprocessing.
train_means = x_train.mean()
x_train = x_train.fillna(train_means)
x_test = x_test.fillna(train_means)


print("Succesful main_df split into train and test samples!")

# Selection Criteria #


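* MIC (Mutual Information Criteria) : Survived [0,1] over numerical features. (Note: the cell below scores the numerical features with the ANOVA F-test, `f_classif`, not with mutual information.)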
* Chi-squared test : Survived [0,1] over each binary variable (dummy variable)

In [None]:
from sklearn.metrics import mutual_info_score
from sklearn.feature_selection import SelectKBest, f_classif

numerical_features=['age', 'lateral_family_members', 'vertical_family_members', 'fare', 'total_number_of_purchased_cabins']

########################################################################

def get_numerical_features(features, class_label):
    """Print the ``f_classif`` score of every column of ``features``.

    Parameters
    ----------
    features : pandas.DataFrame
        Numerical feature columns.
    class_label : array-like
        Target values, one per row of ``features``.
    """
    fs = SelectKBest(f_classif, k='all')
    fs.fit(features, class_label)

    # Pair each column name with its score. The loop runs over the columns, not
    # over range(len(features)), which is the number of rows.
    for feature, score in zip(features.columns, fs.scores_):
        print('Feature %s: %f' % (feature, score))


#######################################################################


get_numerical_features(features=x_train[numerical_features], class_label=y_train['survived']) 


In [None]:
# Now let's test hypotheses over the binary columns.

# Collect the binary features: everything in x_train except the numerical ones.
# `columns=` is used because the positional axis argument of DataFrame.drop is no
# longer accepted by pandas 2.x.
binary_features = x_train.drop(columns=numerical_features)



In [None]:
# perform the test 
from sklearn.feature_selection import chi2

# chi2 returns a pair of arrays with one entry per feature; the element at index 1
# is used as the p-values in the next cell.
chi_scores = chi2(binary_features, y_train)
chi_scores

In [None]:
# p-value per binary feature, largest first, shown as a bar chart.
p_values = (
    pd.Series(chi_scores[1], index=binary_features.columns)
    .sort_values(ascending=False)
)

p_values.plot.bar()

# For logistic-regression the next variables will be selected:


`numerical_features`: age, fare, lateral_family_members, vertical_family_members


`binary_features`: cabin_A, cabin_G, port_of_embarkation_Q
    

In [None]:
# Selecting Features
# NOTE(review): both selection statements below are commented out, so x_train and
# x_test still contain every column; the message printed at the end does not
# correspond to an actual column subset.
#x_train=x_train[['age', 'fare', 'lateral_family_members', 'vertical_family_members', 'cabin_A', 'cabin_G', 'port_of_embarkation_Q']]
#x_test=x_test[['age', 'fare', 'lateral_family_members', 'vertical_family_members', 'cabin_A', 'cabin_G', 'port_of_embarkation_Q']]

print("Succesful feature selection!")

In [None]:
from sklearn.linear_model import LogisticRegression

# Elastic-net logistic regression. `saga` is a stochastic solver, so the seed is
# fixed (same value as the train/test split) to make the fit reproducible.
log_reg = LogisticRegression(
    penalty='elasticnet',
    solver='saga',
    C=0.01,
    l1_ratio=0.07316807143427177,
    random_state=0,
)
log_reg.fit(x_train, y_train.values.ravel())

In [None]:
# Class predictions for the held-out rows, then the mean accuracy on the same rows.
y_pred = log_reg.predict(x_test)
test_accuracy = log_reg.score(x_test, y_test)
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(test_accuracy))

In order to improve results, let's try distinct regressions:

   * LASSO Regression (`|L1|` regularization) 
   * Ridge Regression (`|L2|` regularization)
   * Elastic-Net Regression (weighted `|L1|` and `|L2|` regularization)

# Ridge Regression

In [None]:
# Model creation and training (cross-validated search for the optimal alpha)
# ==============================================================================
# By default RidgeCV uses the mean squared error
# NOTE(review): the `normalize` argument was removed from scikit-learn's linear
# models in release 1.2, and `store_cv_values` was renamed in later releases;
# confirm that the installed version still accepts both before running.
modelo = RidgeCV(
            alphas          = np.logspace(-10, 2, 200),
            fit_intercept   = True,
            normalize       = True,
            store_cv_values = True
         )

_ = modelo.fit(X = x_train, y = y_train)

print("Succesful setup for Ridge Regression!")



In [None]:
alphas = modelo.alphas

# One Ridge fit per candidate alpha; each entry is that fit's flattened coefficient vector.
coefs = [
    Ridge(alpha=alpha, fit_intercept=False, normalize=True)
    .fit(x_train, y_train)
    .coef_.flatten()
    for alpha in alphas
]

# Coefficient paths against the regularisation strength (log-scaled x axis).
fig, ax = plt.subplots(figsize=(7, 3.84))
ax.plot(alphas, coefs)
ax.set(
    xscale='log',
    xlabel='alpha',
    ylabel='coeficientes',
    title='Coeficientes del modelo en función de la regularización',
)
plt.axis('tight')
plt.show()

In [None]:
# Best alpha value found by the cross-validated search
# ==============================================================================
print(f"Mejor valor de alpha encontrado: {modelo.alpha_}")

In [None]:
# Model coefficients
# ==============================================================================
# One row per predictor with the coefficient fitted by `modelo`.
df_coeficientes = pd.DataFrame({
    'predictor': x_train.columns,
    'coef': modelo.coef_.flatten(),
})

# Stem plot of the coefficients, one stem per predictor.
fig, ax = plt.subplots(figsize=(11, 3.84))
ax.stem(df_coeficientes['predictor'], df_coeficientes['coef'], markerfmt=' ')
plt.xticks(rotation=90, ha='right', size=5)
ax.set(xlabel='variable', ylabel='coeficientes', title='Coeficientes del modelo');