In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from scipy.stats import norm
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegressionCV
from scipy import stats
import warnings
warnings.filterwarnings('ignore')
from scipy.stats import skew
from scipy.special import boxcox1p

In [39]:
## Inputting data
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

PassengerId = test["PassengerId"]

train.drop(['PassengerId'],axis=1,inplace=True)
test.drop(['PassengerId'],axis=1,inplace=True)

all_data = pd.concat([train.drop('Survived',axis=1),test])


## Converting categorical variables to numerical
all_data = all_data.replace({'male': 0, 'female': 1})
all_data['Family'] = all_data['SibSp']+all_data['Parch']
#all_data['Child'] = all_data['Age'].loc(all_data['Age']<16)
all_data['Child'] = all_data['Age']<16
numeric_feats = all_data.dtypes[all_data.dtypes != "object"].index
all_data = all_data[numeric_feats]
all_data.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Family,Child
0,3,0,22.0,1,0,7.25,1,False
1,1,1,38.0,1,0,71.2833,1,False
2,3,1,26.0,0,0,7.925,0,False
3,1,1,35.0,1,0,53.1,1,False
4,3,0,35.0,0,0,8.05,0,False


In [40]:
## Filling missing variables with the median (dumb replacement)
def medianreplace(data):
    k = (data.isna().sum()!=0).sum()
    idx = data.isna().sum().nlargest(k).index[::-1]
    for i in idx:
        data[i] = data[i].fillna(data[i].median())
    
    return data


In [41]:
## Smarter replace using groupby
def groupbymedianreplace(data,feat):
    k = (data.isna().sum()!=0).sum()
    idx = data.isna().sum().nlargest(k).index[::-1]
    for i in idx:
        data[i].fillna(data.groupby(feat)[i].transform("median"), inplace=True)
    
    return data


In [114]:
#all_data = medianreplace(all_data)
all_data = groupbymedianreplace(all_data,['Sex','Pclass'])
#all_data['Age'] = np.log1p(all_data['Age'])
all_data['Age'] = (all_data['Age']-all_data['Age'].mean(axis=0))/all_data['Age'].std(axis=0)
all_data['Fare'] = np.log1p(all_data['Fare'])
all_data['Fare'] = (all_data['Fare']-all_data['Fare'].mean(axis=0))/all_data['Fare'].std(axis=0)
train = pd.concat([all_data.iloc[0:train.shape[0]],train['Survived']],axis=1)
test = all_data.iloc[train.shape[0]:]

In [115]:
corrmat = np.abs(train.corr())
k = train.shape[1]
cols = corrmat.nlargest(k, 'Survived')['Survived'].index
print(cols,corrmat)

Index(['Survived', 'Sex', 'Pclass', 'Fare', 'Child', 'Parch', 'SibSp', 'Age',
       'Family'],
      dtype='object')             Pclass       Sex       Age     SibSp     Parch      Fare  \
Pclass    1.000000  0.131900  0.416872  0.083081  0.018443  0.536338   
Sex       0.131900  1.000000  0.099553  0.114631  0.245489  0.146923   
Age       0.416872  0.099553  1.000000  0.105891  0.001652  0.257809   
SibSp     0.083081  0.114631  0.105891  1.000000  0.414838  0.247550   
Parch     0.018443  0.245489  0.001652  0.414838  1.000000  0.233408   
Fare      0.536338  0.146923  0.257809  0.247550  0.233408  1.000000   
Family    0.065997  0.200988  0.073198  0.890712  0.783111  0.291295   
Child     0.121920  0.111141  0.097685  0.352437  0.351481  0.060086   
Survived  0.338481  0.543351  0.032146  0.035322  0.081629  0.239116   

            Family     Child  Survived  
Pclass    0.065997  0.121920  0.338481  
Sex       0.200988  0.111141  0.543351  
Age       0.073198  0.097685  0.032146

In [116]:
for n in range(2,9):
    x = train[cols[1:n]].copy()
#    x.drop('Survived', axis=1,inplace = True)
    y = train['Survived']

    model = LogisticRegressionCV(cv = 10, fit_intercept= True)
    model.fit(x,y)
    print(model.score(x,y))
print(n)

0.7867564534231201
0.7867564534231201


ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [105]:
model.coef_

array([[ 2.73169566, -1.10609526,  0.17280962,  1.22516945, -0.18441138,
        -0.39153879, -0.45667521]])

In [97]:
test_copy = test[cols[1:n]].copy()
predictions = model.predict(test_copy)

In [98]:
submission = pd.DataFrame({"PassengerId":PassengerId, "Survived":predictions})
submission.to_csv("submission.csv", index=False)

In [None]:
all_data.groupby(['Sex','Pclass'])['Age'].mean()

In [None]:
all_data.loc(all_data['Age']<16)

In [106]:
all_data

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Family,Child
0,3,0,-0.549345,1,0,-0.502981,1,False
1,1,1,0.661100,1,0,0.734529,1,False
2,3,1,-0.246734,0,0,-0.489936,0,False
3,1,1,0.434142,1,0,0.383118,1,False
4,3,0,0.434142,0,0,-0.487520,0,False
5,3,0,-0.322387,0,0,-0.479629,0,False
6,1,0,1.871545,0,0,0.359202,0,False
7,3,0,-2.062402,3,1,-0.235799,4,True
8,3,1,-0.171081,0,2,-0.427932,2,False
9,2,1,-1.154568,1,0,-0.061945,1,True


In [108]:
skew(all_data['Fare'])

4.364196021412905