In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import re
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.metrics import roc_curve, roc_auc_score

In [3]:
train = pd.read_csv('titanic_train.csv')
test = pd.read_csv('titanic_test.csv')
submission = pd.read_csv('gender_submission.csv')
test = pd.merge(test, submission, on=['PassengerId', 'PassengerId'])
test['IS_VALIDATION'] = 1
train['IS_VALIDATION'] = 0
all_data=pd.concat([train,test],sort=True,copy=True)
t = all_data[all_data.IS_VALIDATION==1]

In [4]:
all_data_aux =  pd.read_csv('Call_Data.csv', nrows=20000)
#all_data_aux.head() 
t = all_data_aux[
    (all_data_aux['Event Clearance Description']=='ASSISTANCE RENDERED') |
    (all_data_aux['Event Clearance Description']=='UNABLE TO LOCATE INCIDENT OR COMPLAINANT') |
    (all_data_aux['Event Clearance Description']=='ORAL WARNING GIVEN') |
    (all_data_aux['Event Clearance Description']=='PROBLEM SOLVING PROJECT') |
    (all_data_aux['Event Clearance Description']=='REPORT WRITTEN (NO ARREST)')
]
t.count()
t.to_csv(path_or_buf='Call_Data_Clustering.csv')

In [5]:
t.to_csv(path_or_buf='test.csv')

In [6]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,IS_VALIDATION
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,0
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,0
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,0


In [7]:
train['Ticket'] = train['Ticket'].apply(lambda x: len(x))

In [8]:
#train[train.Ticket == 6].describe()
#train[train.Ticket == 3].describe()
#train.groupby(['Ticket', 'Pclass'], as_index=False).mean()
train = pd.read_csv('titanic_train.csv')
train['Ticket'] = train['Ticket'].apply(lambda x: len(x))
#d = train.groupby(['Ticket', 'Pclass'], as_index=False).std()
aux = train.groupby(['Ticket', 'Pclass']).agg('std')['Fare']
aux = aux.to_frame().reset_index()
#aux = train.groupby(['Ticket', 'Pclass'])['Fare'].std()
#print(aux.head(100))
#a = aux[["Ticket", "Pclass"]].get_group("Fare")
aux.head(100)
#value = aux[(aux.Ticket == 6) & (aux.Pclass==3)]['Fare']
#print(value)

Unnamed: 0,Ticket,Pclass,Fare
0,3,1,14.849242
1,4,1,
2,4,2,0.0
3,4,3,13.11641
4,5,1,59.847589
5,5,2,7.774127
6,5,3,4.606832
7,6,1,40.80365
8,6,2,11.308857
9,6,3,6.977589


In [9]:
aux[(aux.Ticket == 6) & (aux.Pclass==3)]

Unnamed: 0,Ticket,Pclass,Fare
9,6,3,6.977589


In [10]:
#sns.set_style('whitegrid')
#sns.countplot(x='Survived',hue='Ticket',data=train,palette='RdBu_r')

In [11]:
class MyTransformer():
    '''A template for a custom transformer.'''

    def __init__(self):
        pass

    def fit(self, X, y=None):
        return self
    
    def _one_hot_encoding(self, train):
        sex = pd.get_dummies(train['Sex'],prefix='sex', drop_first=True)
        embark = pd.get_dummies(train['Embarked'],prefix='embarked', drop_first=True)
        pclass = pd.get_dummies(train['Pclass'],prefix='class', drop_first=True)
        ticket = pd.get_dummies(train['Ticket'], prefix='ticket', drop_first=True)
        cabin = pd.get_dummies(train['Cabin'], prefix='cabin', drop_first=True)
        prefix_name = pd.get_dummies(train['Name'], prefix='prefix_name', drop_first=True)
        return pd.concat([train, sex, embark, pclass, ticket, cabin, prefix_name],axis=1)

    def _impute_age(self, cols):
        age = cols[0]
        pclass = cols[1]

        if pd.isnull(age):

            if pclass == 1:
                return 37

            elif pclass == 2:
                return 29

            else:
                return 24
        else:
            return age
       
    def _impute_values(self, train): 
        train['Age'] = train[['Age','Pclass']].apply(self._impute_age,axis=1)
        train['Cabin'] = train['Cabin'].apply(lambda x: 0 if x is None else 1)
        train['Ticket'] = train['Ticket'].apply(lambda x: len(x))
        train['Ticket'] = train['Ticket'].apply(lambda x: 1 if x < 6 else 0)
        #aux = train.groupby(['Ticket', 'Pclass']).agg('std')['Fare']
        #aux = aux.to_frame().reset_index()
        #train['Ticket'] = train[['Ticket','Pclass']].apply(self._set_ticket, args=(aux,),axis=1)
        train['Name'] = train[['Name']].apply(self._impute_name,axis=1)
        
    def _set_ticket(self, cols, aux):
        ticket = cols[0]
        pclass = cols[1]
        value = aux[(aux.Ticket == ticket) & (aux.Pclass==pclass)]['Fare'].values[0]
        print(value)
        if value is None:
            value = 0
        return value
    
    def _impute_name(self, cols):
        pattern = re.search("\\,(.+?)\\.", cols[0])
        found='Other'
        if pattern:
            found = pattern.group(1)
        return found  

    def transform(self, train):
        cols = ['Sex','Embarked','Name','Cabin','Ticket', 'PassengerId', 'Pclass']
        self._impute_values(train)
        train.dropna(inplace=True)
        train = self._one_hot_encoding(train)   
        train.drop(cols,axis=1,inplace=True)
        return train

In [12]:
from sklearn.pipeline import Pipeline
pipeline = Pipeline([
    ('test', MyTransformer()),
])
pipe = pipeline.fit(all_data)
data_prepared = pipe.transform(all_data)
train = data_prepared[data_prepared.IS_VALIDATION==0]
validation = data_prepared[data_prepared.IS_VALIDATION==1]

In [13]:
#train['Ticket'].isna().sum()
#train['Ticket'].fillna(train['Age'].mean())
#train['Ticket'].isnull().any()

In [14]:
pd.set_option('max_columns',100)
train.count()

Age                          889
Fare                         889
IS_VALIDATION                889
Parch                        889
SibSp                        889
Survived                     889
sex_male                     889
embarked_Q                   889
embarked_S                   889
class_2                      889
class_3                      889
ticket_1                     889
prefix_name_ Col             889
prefix_name_ Don             889
prefix_name_ Dona            889
prefix_name_ Dr              889
prefix_name_ Jonkheer        889
prefix_name_ Lady            889
prefix_name_ Major           889
prefix_name_ Master          889
prefix_name_ Miss            889
prefix_name_ Mlle            889
prefix_name_ Mme             889
prefix_name_ Mr              889
prefix_name_ Mrs             889
prefix_name_ Ms              889
prefix_name_ Rev             889
prefix_name_ Sir             889
prefix_name_ the Countess    889
dtype: int64

In [15]:
from sklearn.model_selection import train_test_split

In [16]:
cols_to_drop = ['Survived','IS_VALIDATION']
X_train, X_test, y_train, y_test = train_test_split(train.drop(cols_to_drop,axis=1), 
                                                    train['Survived'], test_size=0.30, 
                                                    random_state=101)

## Training and Predicting

In [38]:
from sklearn.neighbors import KNeighborsClassifier

k_range = list(range(3, 5))
param_grid = dict(n_neighbors = k_range)

mod_knn = KNeighborsClassifier()
grid_knn = GridSearchCV(estimator = mod_knn, param_grid = param_grid, cv = 10, scoring = 'accuracy') 

grid_knn.fit(X_train, y_train)
knn = grid_knn.best_estimator_

In [39]:
predictions = knn.predict(X_test)
print(classification_report(y_test,predictions))
#print('AUC: %.3f' % auc)

             precision    recall  f1-score   support

          0       0.77      0.75      0.76       163
          1       0.62      0.64      0.63       104

avg / total       0.71      0.71      0.71       267



In [17]:
from sklearn.linear_model import LogisticRegression

In [18]:
parameters = {'C':[1.0, 10.0, 50.0, 100,1000]}
logmodel = LogisticRegression()
clf = GridSearchCV(logmodel, parameters, cv=10, scoring='roc_auc')
clf.fit(X_train, y_train)
regr_log = clf.best_estimator_
#regr_log = LogisticRegression()
#regr_log.fit(X_train, y_train)
predictions = regr_log.predict(X_test)
probs = regr_log.predict_proba(X_test)
probs = probs[:, 1]
auc = roc_auc_score(y_test, probs)
print('AUC: %.3f' % auc)

AUC: 0.882


In [19]:
print(classification_report(y_test,predictions))

             precision    recall  f1-score   support

          0       0.84      0.94      0.89       163
          1       0.89      0.72      0.80       104

avg / total       0.86      0.86      0.85       267



In [23]:
#regr = RandomForestClassifier(random_state=101, criterion='entropy', max_depth=10)
#regr.fit(X_train,y_train)

parameters = {'min_samples_leaf':[2,5,7], 'min_samples_split':[2,5,7], 'max_depth':[2,6,10], 'random_state': [10]}
dtr = RandomForestClassifier()
clf = GridSearchCV(dtr, parameters, cv=10, scoring='roc_auc')
regr = clf.fit(X_train, y_train)
regr = clf.best_estimator_

In [24]:
print(regr.feature_importances_)

[0.13677158 0.17516877 0.02536718 0.02688175 0.15479542 0.0108864
 0.02805565 0.01988886 0.07258042 0.03003314 0.         0.
 0.         0.00070416 0.         0.         0.         0.01464365
 0.07346537 0.00122116 0.         0.16022708 0.06682615 0.
 0.00248326 0.         0.        ]


In [25]:
predictions = regr.predict(X_test)

In [26]:
probs = regr.predict_proba(X_test)
probs = probs[:, 1]
auc = roc_auc_score(y_test, probs)
print('AUC: %.3f' % auc)
print(classification_report(y_test,predictions))

AUC: 0.873
             precision    recall  f1-score   support

          0       0.84      0.91      0.88       163
          1       0.84      0.73      0.78       104

avg / total       0.84      0.84      0.84       267



Let's move on to evaluate our model!

## Evaluation

We can check precision,recall,f1-score using classification report!

In [27]:
train.drop('Survived',axis=1).columns

Index(['Age', 'Fare', 'IS_VALIDATION', 'Parch', 'SibSp', 'sex_male',
       'embarked_Q', 'embarked_S', 'class_2', 'class_3', 'ticket_1',
       'prefix_name_ Col', 'prefix_name_ Don', 'prefix_name_ Dona',
       'prefix_name_ Dr', 'prefix_name_ Jonkheer', 'prefix_name_ Lady',
       'prefix_name_ Major', 'prefix_name_ Master', 'prefix_name_ Miss',
       'prefix_name_ Mlle', 'prefix_name_ Mme', 'prefix_name_ Mr',
       'prefix_name_ Mrs', 'prefix_name_ Ms', 'prefix_name_ Rev',
       'prefix_name_ Sir', 'prefix_name_ the Countess'],
      dtype='object')

In [28]:
from sklearn.metrics import confusion_matrix

In [29]:
confusion_matrix(y_test, predictions, labels=[0,1])

array([[149,  14],
       [ 28,  76]], dtype=int64)

In [30]:
a = regr_log.coef_[0,:]
print(a)
cols = train.drop(cols_to_drop,axis=1).columns
#print(cols.reindex())
coef_dict = {}
coefficients = pd.DataFrame({"Feature":cols,"Coefficients":a})
for coef, feat in zip(regr_log.coef_[0,:],cols):
    coef_dict[feat] = coef
print(coef_dict)

[-0.02500819  0.00608964 -0.25981567 -0.4541372  -0.96565789  0.10358915
 -0.0969101  -0.54595169 -1.53991341  0.69764067  0.31793828 -0.41768879
  0.         -0.35076937 -0.51314723  0.201589    0.07127148  1.66155694
  0.5586752   0.25907875  0.         -1.11757437  1.13691794  0.28641269
 -0.84498078  0.48029748  0.16044337]
{'prefix_name_ Sir': 0.4802974849513082, 'sex_male': -0.9656578942385765, 'Parch': -0.2598156694215192, 'prefix_name_ Mrs': 1.1369179425349234, 'embarked_S': -0.0969100976124486, 'SibSp': -0.45413719914088746, 'prefix_name_ Master': 1.661556936899081, 'prefix_name_ Jonkheer': -0.5131472328722951, 'prefix_name_ Lady': 0.2015889997662359, 'prefix_name_ Miss': 0.5586751958404883, 'prefix_name_ Don': -0.4176887921974983, 'class_2': -0.5459516935411821, 'Fare': 0.006089637011921782, 'class_3': -1.5399134067944358, 'prefix_name_ Ms': 0.2864126882600973, 'prefix_name_ Col': 0.31793828430017024, 'prefix_name_ Mr': -1.1175743735530472, 'prefix_name_ Dona': 0.0, 'prefix_n

In [31]:
train.head()

Unnamed: 0,Age,Fare,IS_VALIDATION,Parch,SibSp,Survived,sex_male,embarked_Q,embarked_S,class_2,class_3,ticket_1,prefix_name_ Col,prefix_name_ Don,prefix_name_ Dona,prefix_name_ Dr,prefix_name_ Jonkheer,prefix_name_ Lady,prefix_name_ Major,prefix_name_ Master,prefix_name_ Miss,prefix_name_ Mlle,prefix_name_ Mme,prefix_name_ Mr,prefix_name_ Mrs,prefix_name_ Ms,prefix_name_ Rev,prefix_name_ Sir,prefix_name_ the Countess
0,22.0,7.25,0,0,1,0,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
1,38.0,71.2833,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
2,26.0,7.925,0,0,0,1,0,0,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
3,35.0,53.1,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
4,35.0,8.05,0,0,0,0,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0


Not so bad! You might want to explore other feature engineering and the other titanic_text.csv file, some suggestions for feature engineering:

* Try grabbing the Title (Dr.,Mr.,Mrs,etc..) from the name as a feature
* Maybe the Cabin letter could be a feature
* Is there any info you can get from the ticket?

## Great Job!

In [32]:
#submission[submission.PassengerId == 892].head()
y_val = validation.Survived
validation_features = validation.drop(cols_to_drop,axis=1)
#validation_features.head()
predictions = regr.predict(validation_features)
print('RF', classification_report(y_val,predictions))
print('roc_curve', roc_curve(y_val,predictions))
print(confusion_matrix(y_val, predictions, labels=[0,1]))
# calculate AUC
probs = regr.predict_proba(validation_features)
probs = probs[:, 1]
auc = roc_auc_score(y_val, probs)
print('AUC: %.3f' % auc)

RF              precision    recall  f1-score   support

          0       0.94      0.90      0.92       265
          1       0.84      0.89      0.87       152

avg / total       0.90      0.90      0.90       417

roc_curve (array([0.        , 0.09811321, 1.        ]), array([0.        , 0.89473684, 1.        ]), array([2, 1, 0], dtype=int64))
[[239  26]
 [ 16 136]]
AUC: 0.967


In [33]:
predictions = regr_log.predict(validation_features)
print('RL', classification_report(y_val,predictions))
print('roc_curve', roc_curve(y_val,predictions))
print(confusion_matrix(y_val, predictions, labels=[0,1]))
# calculate AUC
probs = regr_log.predict_proba(validation_features)
probs = probs[:, 1]
auc = roc_auc_score(y_val, probs)
print('AUC: %.3f' % auc)

RL              precision    recall  f1-score   support

          0       0.95      0.92      0.94       265
          1       0.87      0.92      0.89       152

avg / total       0.92      0.92      0.92       417

roc_curve (array([0.        , 0.07924528, 1.        ]), array([0.        , 0.92105263, 1.        ]), array([2, 1, 0], dtype=int64))
[[244  21]
 [ 12 140]]
AUC: 0.952


In [1683]:
'''from sklearn.ensemble import ExtraTreesClassifier
# load the iris datasets
dataset = datasets.load_iris()
# fit an Extra Trees model to the data
model = ExtraTreesClassifier()
model.fit(dataset.data, dataset.target)
# display the relative importance of each attribute
print(model.feature_importances_)
predict_proba'''

'from sklearn.ensemble import ExtraTreesClassifier\n# load the iris datasets\ndataset = datasets.load_iris()\n# fit an Extra Trees model to the data\nmodel = ExtraTreesClassifier()\nmodel.fit(dataset.data, dataset.target)\n# display the relative importance of each attribute\nprint(model.feature_importances_)\npredict_proba'

In [None]:
# plotting them against each other
def plot_roc_curve(false_positive_rate, true_positive_rate, label=None):
    plt.plot(false_positive_rate, true_positive_rate, linewidth=2, label=label)
    plt.plot([0, 1], [0, 1], 'r', linewidth=4)
    plt.axis([0, 1, 0, 1])
    plt.xlabel('False Positive Rate (FPR)', fontsize=16)
    plt.ylabel('True Positive Rate (TPR)', fontsize=16)

plt.figure(figsize=(14, 7))
plot_roc_curve(false_positive_rate, true_positive_rate)
plt.show()
#https://www.kaggle.com/niklasdonges/end-to-end-project-with-python