In [167]:
# data analysis and wrangling
import pandas as pd
import numpy as np
from pandas_profiling import ProfileReport

# visualization
import pydotplus
from sklearn.tree import export_graphviz
import seaborn as sns
import matplotlib.pyplot as plt

import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls

# machine learning
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, VotingClassifier
from sklearn.ensemble import GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
import xgboost as xgb
from sklearn.decomposition import FactorAnalysis

#preprocessing
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score
from sklearn.preprocessing import OrdinalEncoder


import warnings
warnings.filterwarnings('ignore')


# I. Acquire data

The Python pandas package helps us work with our datasets. We start by loading the training and test datasets into pandas DataFrames. We also combine these datasets to run certain operations on both of them together.

In [168]:
# Load the raw Kaggle Titanic files from the `titanic` folder next to the notebook.
# Forward slashes are used because they resolve on Windows as well as on
# Linux/macOS, whereas the backslash form only works on Windows.
train_data = pd.read_csv('./titanic/train.csv')
test_data = pd.read_csv('./titanic/test.csv')

# II. Exploratory Data Analysis

# III. Let's build our first model : baseline

# IV. Let's improve our model



## a. Preprocessing

In [169]:
# Work on copies so the raw frames stay untouched and this cell can be re-run.
train_df = train_data.copy()
test_df = test_data.copy()

# Sex only takes two values here, so a 0/1 mapping is enough.
sex_codes = {'female': 0, 'male': 1}

# Preprocessing (train): mean-impute Age, mark missing Embarked/Cabin with a
# sentinel category.
# The result is assigned back to the column instead of calling
# `fillna(..., inplace=True)` on the selected column: that chained in-place form
# raises a FutureWarning in pandas 2.x (hidden here because warnings are
# silenced) and stops updating the frame once copy-on-write is the default.
train_df['Age'] = train_df['Age'].fillna(train_df['Age'].mean())
train_df['Embarked'] = train_df['Embarked'].fillna('X')
train_df['Cabin'] = train_df['Cabin'].fillna('XX')
train_df['Sex'] = train_df['Sex'].map(sex_codes)

# Preprocessing (test): the test frame is imputed with its own means, so no
# test-set statistic ever enters the training frame.
# NOTE(review): the usual alternative is to reuse the train means here; kept
# as-is so the results recorded further down stay comparable.
test_df['Age'] = test_df['Age'].fillna(test_df['Age'].mean())
test_df['Embarked'] = test_df['Embarked'].fillna('X')
test_df['Cabin'] = test_df['Cabin'].fillna('XX')
test_df['Sex'] = test_df['Sex'].map(sex_codes)
test_df['Fare'] = test_df['Fare'].fillna(test_df['Fare'].mean())

# One-hot encoding of Embarked.
# For the difference between get_dummies and OneHotEncoder see
# https://stackoverflow.com/questions/36631163/pandas-get-dummies-vs-sklearns-onehotencoder-what-are-the-pros-and-cons
# The encoder is fitted on the training frame only; categories that appear
# only in the test frame are encoded as all zeros (handle_unknown="ignore").
encoder = OneHotEncoder(handle_unknown="ignore")
encoder.fit(train_df[['Embarked']])

# One output column per category learned by the encoder, named Embarked_1..n
# (n is 4 when the training data contains C, Q, S and the 'X' sentinel).
embarked_columns = [
    'Embarked_' + str(position)
    for position in range(1, len(encoder.categories_[0]) + 1)
]


def _add_embarked_dummies(frame):
    """Return `frame` joined with the one-hot Embarked columns from `encoder`.

    The dummy frame reuses `frame.index` so the join stays row-aligned even if
    the frame does not carry a default 0..n-1 index.
    """
    dummies = pd.DataFrame(
        data=encoder.transform(frame[['Embarked']]).toarray(),
        columns=embarked_columns,
        index=frame.index,
    )
    return frame.join(dummies, lsuffix='_caller', rsuffix='_other')


train_df = _add_embarked_dummies(train_df)
test_df = _add_embarked_dummies(test_df)

In [170]:
# Model inputs: the numeric passenger attributes followed by the four one-hot
# Embarked indicator columns. The same column list is applied to both frames
# so train and submission features line up.
model_columns = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare'] + [
    'Embarked_' + str(k) for k in (1, 2, 3, 4)
]

X = train_df[model_columns]
sub_test = test_df[model_columns]


## b. Factor Analysis

In [173]:
#https://www.datacamp.com/community/tutorials/introduction-factor-analysis
from factor_analyzer.factor_analyzer import calculate_bartlett_sphericity,calculate_kmo
from factor_analyzer import FactorAnalyzer

# Variables studied in the factor analysis; the same subset is taken from the
# training frame and from the submission frame.
fa_columns = ['Fare', 'SibSp', 'Parch', 'Sex']
X = train_df[fa_columns]
sub_test = test_df[fa_columns]
y = train_df['Survived']

# Bartlett's test of sphericity compares the observed correlation matrix with
# the identity matrix, i.e. it checks whether the variables are redundant
# enough to be summarised by a small number of factors.
# https://www.statology.org/a-guide-to-bartletts-test-of-sphericity/
# https://easystats.github.io/parameters/reference/check_sphericity.html
bartlett_result = calculate_bartlett_sphericity(X)
chi_square_value, p_value = bartlett_result
print("Bartlet Test(p_value sould be near from 0): ", chi_square_value, p_value)

# KMO test: only the overall score is reported; the rule of thumb used in this
# notebook is that it should exceed 0.6.
kmo_result = calculate_kmo(X)
kmo_all, kmo_model = kmo_result
print("KMO Test (should be > 0.6):", kmo_model)

Bartlet Test(p_value sould be near from 0):  291.60113582596244 2.541419377457986e-60
KMO Test (should be > 0.6): 0.6029796205259037


In [174]:
# Latent variables: fit a varimax-rotated model with three factors on X.
# https://factor-analyzer.readthedocs.io/en/latest/factor_analyzer.html
fa = FactorAnalyzer(n_factors=3, rotation="varimax")
fa.fit(X)

# Eigenvalues are used to judge how many factors are worth keeping
# (rule of thumb printed below: keep those greater than 1).
ev, v = fa.get_eigenvalues()

# Loadings matrix: one row per input variable, one column per factor.
factor_loadings = fa.loadings_
print("Contribution is the initial variables\n", factor_loadings)
print("Eigne values: Important are > 1:\n", ev)

Contribution is the initial variables
 [[ 0.14604528  0.21656252  0.36871657]
 [ 0.61304291  0.04644088  0.16288961]
 [ 0.61445101  0.36012295  0.13153666]
 [-0.10131338 -0.43648769 -0.19800565]]
Eigne values: Important are > 1:
 [1.68733458 0.93051801 0.82114509 0.56100231]


In [175]:
# Sanity check: show which container type `X` currently holds before it is
# passed to the factor model in the next cell.
type(X)

pandas.core.frame.DataFrame

In [164]:
# Re-select the raw inputs instead of reusing `X` / `sub_test`: this cell
# rebinds both names to the factor scores, so fitting on `X` would use the
# previous run's scores rather than the original variables whenever the cell
# is executed a second time.
fa_inputs = ['Fare', 'SibSp', 'Parch', 'Sex']
X_raw = train_df[fa_inputs]
sub_test_raw = test_df[fa_inputs]

# Two-factor model with varimax rotation, fitted on the training rows only.
fa = FactorAnalyzer(rotation="varimax", n_factors=2)
fa.fit(X_raw)

# Case 1 : only latent variables
# Both frames are projected with the factors learned on the training data.
X = fa.transform(X_raw)
sub_test = fa.transform(sub_test_raw)

#https://towardsdatascience.com/why-feature-correlation-matters-a-lot-847e8ba439c4
#https://statisticsbyjim.com/regression/multicollinearity-in-regression-analysis/
#https://www.kaggle.com/reisel/how-to-handle-correlated-features
#https://towardsdatascience.com/feature-selection-correlation-and-p-value-da8921bfb3cf

#Case 2 : study of correlation (raw variables plus the factor scores)
#X = np.concatenate((X_raw, fa.transform(X_raw)), axis=1)
#sub_test = np.concatenate((sub_test_raw, fa.transform(sub_test_raw)), axis=1)



In [165]:
# Correlation matrix between the columns of X (the factor scores).
pd.DataFrame(X).corr()

Unnamed: 0,0,1
0,1.0,7.33434e-09
1,7.33434e-09,1.0


In [166]:
# 5-fold stratified cross-validation of a depth-limited decision tree.
# Each fold's model also scores the submission set; the per-fold probabilities
# are averaged afterwards to build the final prediction.
n_splits = 5
cv = StratifiedKFold(n_splits=n_splits, random_state=10, shuffle=True)

# Plain arrays, so the positional indices returned by `cv.split` can be used
# directly whether X / y are numpy arrays or pandas objects.
X_values = np.asarray(X)
y_values = np.asarray(y)
sub_values = np.asarray(sub_test)

# One column of submission-set probabilities per fold.
results_df_test = pd.DataFrame()
# One row per fold: error rates in percent on the fold's train and held-out parts.
results_df_train = pd.DataFrame(data=np.zeros((n_splits, 2)), columns=['Train_error', 'Test_error'])

for fold, (train_index, test_index) in enumerate(cv.split(X_values, y_values)):
    X_train, y_train = X_values[train_index], y_values[train_index]
    X_test, y_test = X_values[test_index], y_values[test_index]

    model = DecisionTreeClassifier(max_depth=7, min_samples_leaf=2)
    model.fit(X_train, y_train)

    # Probability of the positive class (Survived = 1) on the submission set.
    pred_sub = model.predict_proba(sub_values)[:, 1]
    results_df_test['fold_' + str(fold)] = pred_sub

    pred_train = model.predict(X_train)
    pred_test = model.predict(X_test)

    train_error = 1 - accuracy_score(y_train, pred_train)
    results_df_train.loc[fold, 'Train_error'] = round(train_error * 100, 2)

    test_error = 1 - accuracy_score(y_test, pred_test)
    results_df_train.loc[fold, 'Test_error'] = round(test_error * 100, 2)

#Mean strategy
# Average the fold probabilities and threshold at 0.5. This reads
# `results_df_test`, the frame filled in the loop above; the previous code
# referenced `results_df`, a name this notebook never defines.
preds = (results_df_test.mean(axis=1) >= 0.5).astype(int)

# PassengerId comes from the test frame already loaded at the top of the
# notebook, so the CSV is not read a second time.
my_final_sub = test_data[['PassengerId']].copy()
my_final_sub['Survived'] = preds

my_final_sub.to_csv('submission_fa.csv', index=False)
#Kaggle Score : 0.76076

In [161]:
# Error rate (1 - accuracy) of the classifier currently bound to `model`,
# measured on the whole of X / y.
# NOTE(review): when the notebook is run top to bottom, `model` is the tree
# fitted in the last cross-validation fold, so most of these rows were part of
# its training data — confirm before quoting this as a generalisation error.
1-model.score(X,y)

0.14927048260381592

In [162]:
# Column-wise mean of the per-fold table: average train error and average
# held-out error over the folds, both expressed in percent.
results_df_train.mean()

Train_error    14.086
Test_error     19.416
dtype: float64

In [163]:
# Two variables : acc = 0.149, Train_error = 14.086, Test_error = 19.864
# NOTE(review): the Test_error noted above differs from the 19.416 shown in the
# output of the previous cell — presumably written down from another run; verify.
# Relative importance the fitted tree assigns to each input column of `model`.
model.feature_importances_

array([0.24036259, 0.75963741])