In [52]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import seaborn as sns
sns.set(rc={'figure.figsize':(12,8)})

In [None]:
def confusion_matrix(data,actual_values,model):
    """Build a 2x2 confusion matrix and the accuracy for a fitted binary model.

    Parameters
    ----------
    data : data frame or array
        Predictors laid out exactly like the model's input data (without the
        actual values), e.g. const, var1, var2, ... Column order matters.
    actual_values : data frame or array
        The actual values from the test data; for a logistic regression this
        is a single column of 0s and 1s.
    model : LogitResults object
        The fitted model (e.g. results_log in this course); only its
        ``predict(data)`` method is used.

    Returns
    -------
    tuple
        ``(cm, accuracy)``. ``cm[i, j]`` counts the observations whose actual
        value falls in bin ``i`` and whose predicted value falls in bin ``j``;
        ``accuracy`` is the share of observations on the main diagonal.
    """
    predicted = model.predict(data)
    # Two bins: values in [0, 0.5) count as class 0, values in [0.5, 1] as class 1
    edges = np.array([0, 0.5, 1])
    counts, _, _ = np.histogram2d(actual_values, predicted, bins=edges)
    hits = counts[0, 0] + counts[1, 1]
    return counts, hits / counts.sum()

In [53]:
# Load train.csv into a data frame and preview its first rows.
data = pd.read_csv('train.csv')
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
# Keep only the listed columns and discard rows whose 'Embarked' value is missing.
# Column selection returns a new frame, so `data` itself is left untouched.
col_to_include = ['Survived','Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
dataset = data[col_to_include].dropna(subset=['Embarked'])

In [57]:
from sklearn.impute import SimpleImputer

# Replace missing ages with the mean age. `verbose` is no longer accepted by
# SimpleImputer (removed in scikit-learn 1.3) and `fill_value` is only read when
# strategy='constant', so neither argument is passed.
imputer = SimpleImputer(strategy='mean')
# The imputer works on 2-D input, hence the single-column reshape.
age_column = dataset['Age'].to_numpy().reshape(-1, 1)
# Flatten the (n, 1) result so a plain 1-D column is assigned back.
# The fitted `imputer` is reused later to fill ages in the test data.
dataset['Age'] = imputer.fit_transform(age_column).ravel()


SimpleImputer(fill_value=nan, verbose=1)

In [None]:
# Separate the predictors from the 'Survived' target and list the predictor names.
x = dataset.drop(columns=['Survived'])
y = dataset['Survived']
print(x.columns.values)

In [12]:
# One-hot encode the categorical predictors, dropping the first level of each.
# dtype=int keeps the dummy columns numeric: pandas >= 2.0 defaults to bool,
# and a frame mixing bool with float columns converts to an object array,
# which statsmodels refuses to fit.
x = pd.get_dummies(x,columns=['Pclass','Sex','Embarked'],drop_first=True,dtype=int)
x.head()

In [15]:
# Training inputs: a copy of the predictors passed through sm.add_constant,
# plus a copy of the target.
x_train = sm.add_constant(x.copy())
y_train = y.copy()

In [17]:
# Fit a Logit model of y_train on x_train and display the fit summary.
model = sm.Logit(y_train,x_train)
res = model.fit()
res.summary()

In [22]:
# Confusion matrix and accuracy on the same rows the model was fitted on,
# i.e. an in-sample figure rather than a held-out estimate.
confusion_matrix(x_train,y_train,res)

(array([[477.,  72.],
        [102., 238.]]),
 0.8042744656917885)

# Generating Submission File

In [58]:
# Load test.csv, using the PassengerId column as the index.
x_final = pd.read_csv('test.csv',index_col='PassengerId')

In [59]:
# Keep the same predictor columns as the training set; there is no 'Survived'
# entry in this list. Note that this rebinds col_to_include.
col_to_include = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
x_final = x_final[col_to_include]

In [60]:
# Fill missing ages with the already-fitted imputer; only transform() is called,
# so the imputer is not refitted on this data.
x_final['Age'] = imputer.transform(x_final['Age'].to_numpy().reshape(-1, 1))

In [61]:
# Fill missing fares with the mean fare of this frame. The fill is restricted
# to the 'Fare' column: a frame-wide fillna would also write the mean fare into
# any other column that happened to contain a missing value (e.g. 'Embarked').
x_final = x_final.fillna({'Fare': x_final['Fare'].mean()})
# Same one-hot encoding as the training predictors. dtype=int keeps the dummy
# columns numeric; pandas >= 2.0 would otherwise produce bool columns.
x_final = pd.get_dummies(x_final,columns=['Pclass','Sex','Embarked'],drop_first=True,dtype=int)
x_final.head()

Unnamed: 0_level_0,Age,SibSp,Parch,Fare,Pclass_2,Pclass_3,Sex_male,Embarked_Q,Embarked_S
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
892,34.5,0,0,7.8292,0,1,1,1,0
893,47.0,1,0,7.0,0,1,0,0,1
894,62.0,0,0,9.6875,1,0,1,1,0
895,27.0,0,0,8.6625,0,1,1,0,1
896,22.0,1,1,12.2875,0,1,0,0,1


In [66]:
# Predicted probabilities for the test rows.
pred_probs = res.predict(sm.add_constant(x_final))
# Turn probabilities into 0/1 labels: strictly greater than 0.5 -> 1, else 0.
# The comparison is vectorised so this cell does not depend on the
# correct_values helper, which is only defined further down the notebook and
# would raise a NameError here on a fresh top-to-bottom run.
pred_values = (pred_probs > 0.5).astype(int)

In [76]:
def correct_values(pred_values):
    """Map a single predicted value to a class label.

    Returns 1 when ``pred_values`` is strictly greater than 0.5, otherwise 0
    (so exactly 0.5 maps to 0).
    """
    return 1 if pred_values > 0.5 else 0

In [70]:
# Two-column table (PassengerId, Survived) built from the index and values of
# pred_values, ordered by PassengerId.
result = pd.DataFrame(
    {'PassengerId': pred_values.index.values, 'Survived': pred_values.values}
).sort_values(by='PassengerId')

In [None]:
# Preview the first rows of the result table before writing it out.
result.head()

In [80]:
# Write the result table to Prediction_3.csv; index=False leaves the row index out of the file.
result.to_csv('Prediction_3.csv',index=False)