# **Importing Librarys and Datasets**

In [200]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RepeatedKFold
%matplotlib inline
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [201]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

# **Preparing the datasets**

In [202]:
def sex_to_binary(value):
  if value == 'female':
    return 1
  else:
    return 0

train['Sex'] = train['Sex'].map(sex_to_binary)
test['Sex'] = test['Sex'].map(sex_to_binary)

In [203]:
variables = ['Sex', 'Age', 'Pclass', 'SibSp', 'Parch', 'Fare']

x = train[variables].fillna(-1)
y = train['Survived']

# **Cross-validation - RandomForest**

In [204]:
result = []

kf = RepeatedKFold(n_splits=2, n_repeats=10, random_state=10)
for train_line, test_line in kf.split(x):
  print("Train:", train_line.shape[0])
  print("Test:", test_line.shape[0])

  x_train, x_test = x.iloc[train_line], x.iloc[test_line]
  y_train, y_test = y.iloc[train_line], y.iloc[test_line]

  model = RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=0,)
  model.fit(x_train, y_train)

  Prediction = model.predict(x_test)
  
  accuracy = np.mean(y_test == Prediction)
  result.append(accuracy)

  print("Accuracy:", accuracy)
  print()

Train: 445
Test: 446
Accuracy: 0.7869955156950673

Train: 446
Test: 445
Accuracy: 0.7797752808988764

Train: 445
Test: 446
Accuracy: 0.827354260089686

Train: 446
Test: 445
Accuracy: 0.8179775280898877

Train: 445
Test: 446
Accuracy: 0.7847533632286996

Train: 446
Test: 445
Accuracy: 0.7842696629213484

Train: 445
Test: 446
Accuracy: 0.8161434977578476

Train: 446
Test: 445
Accuracy: 0.7842696629213484

Train: 445
Test: 446
Accuracy: 0.8004484304932735

Train: 446
Test: 445
Accuracy: 0.8

Train: 445
Test: 446
Accuracy: 0.8183856502242153

Train: 446
Test: 445
Accuracy: 0.802247191011236

Train: 445
Test: 446
Accuracy: 0.8116591928251121

Train: 446
Test: 445
Accuracy: 0.8067415730337079

Train: 445
Test: 446
Accuracy: 0.820627802690583

Train: 446
Test: 445
Accuracy: 0.7887640449438202

Train: 445
Test: 446
Accuracy: 0.8385650224215246

Train: 446
Test: 445
Accuracy: 0.8044943820224719

Train: 445
Test: 446
Accuracy: 0.7982062780269058

Train: 446
Test: 445
Accuracy: 0.8112359550561797

In [205]:
np.mean(result)

0.8041457147175896

# **Analysis**

In [206]:
x_test_check = train.iloc[test_line].copy()
x_test_check['Prediction'] = Prediction
x_test_check.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Prediction
0,1,0,3,"Braund, Mr. Owen Harris",0,22.0,1,0,A/5 21171,7.25,,S,0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.0,1,0,PC 17599,71.2833,C85,C,1
2,3,1,3,"Heikkinen, Miss. Laina",1,26.0,0,0,STON/O2. 3101282,7.925,,S,0
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35.0,1,0,113803,53.1,C123,S,1
4,5,0,3,"Allen, Mr. William Henry",0,35.0,0,0,373450,8.05,,S,0


In [207]:
errors = x_test_check[x_test_check['Survived'] != x_test_check['Prediction']]
errors = errors[['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 
           'Ticket', 'Fare', 'Cabin', 'Embarked', 'Prediction', 'Survived']]

In [208]:
womens = errors[errors['Sex'] == 1]
mans = errors[errors['Sex'] == 0]

In [209]:
womens.sort_values('Survived')

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Prediction,Survived
888,889,3,"Johnston, Miss. Catherine Helen ""Carrie""",1,,1,2,W./C. 6607,23.45,,S,1,0
501,502,3,"Canavan, Miss. Mary",1,21.0,0,0,364846,7.75,,Q,1,0
498,499,1,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",1,25.0,1,2,113781,151.55,C22 C26,S,1,0
882,883,3,"Dahlberg, Miss. Gerda Ulrika",1,22.0,0,0,7552,10.5167,,S,1,0
474,475,3,"Strandberg, Miss. Ida Sofia",1,22.0,0,0,7553,9.8375,,S,1,0
419,420,3,"Van Impe, Miss. Catharina",1,10.0,0,2,345773,24.15,,S,1,0
357,358,2,"Funk, Miss. Annie Clemmer",1,38.0,0,0,237671,13.0,,S,1,0
254,255,3,"Rosblom, Mrs. Viktor (Helena Wilhelmina)",1,41.0,0,2,370129,20.2125,,S,1,0
503,504,3,"Laitinen, Miss. Kristina Sofia",1,37.0,0,0,4135,9.5875,,S,1,0
654,655,3,"Hegarty, Miss. Hanora ""Nora""",1,18.0,0,0,365226,6.75,,Q,1,0


In [210]:
mans.sort_values('Survived')

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Prediction,Survived
238,239,2,"Pengelly, Mr. Frederick William",0,19.0,0,0,28665,10.5,,S,1,0
442,443,3,"Petterson, Mr. Johan Emil",0,25.0,1,0,347076,7.775,,S,1,0
422,423,3,"Zimmerman, Mr. Leo",0,29.0,0,0,315082,7.875,,S,1,0
373,374,1,"Ringhini, Mr. Sante",0,22.0,0,0,PC 17760,135.6333,,C,1,0
371,372,3,"Wiklund, Mr. Jakob Alfred",0,18.0,1,0,3101267,6.4958,,S,1,0
698,699,1,"Thayer, Mr. John Borland",0,49.0,1,1,17421,110.8833,C68,C,1,0
262,263,1,"Taussig, Mr. Emil",0,52.0,1,1,110413,79.65,E67,S,1,0
231,232,3,"Larsson, Mr. Bengt Edvin",0,29.0,0,0,347067,7.775,,S,1,0
214,215,3,"Kiernan, Mr. Philip",0,,1,0,367229,7.75,,Q,1,0
721,722,3,"Jensen, Mr. Svend Lauritz",0,17.0,1,0,350048,7.0542,,S,1,0


# **New Variables**

In [211]:
train['Embarked_S'] = (train['Embarked'] == 'S').astype(int)
train['Embarked_C'] = (train['Embarked'] == 'C').astype(int)

train['Cabin_Null'] = train['Cabin'].isnull().astype(int)

train['Name_contain_Miss'] = train['Name'].str.contains("Miss").astype(int)
train['Name_contain_Mrs'] = train['Name'].str.contains("Mrs").astype(int)

train['Name_contain_Master'] = train['Name'].str.contains("Master").astype(int)
train['Name_contain_Col'] = train['Name'].str.contains("Col").astype(int)
train['Name_contain_Major'] = train['Name'].str.contains("Major").astype(int)
train['Name_contain_Mr'] = train['Name'].str.contains("Mr").astype(int)

In [212]:
variables = ['Sex', 'Age', 'Pclass', 'SibSp', 'Parch', 'Fare', 'Embarked_S', 'Embarked_C', 'Cabin_Null',
             'Name_contain_Miss', 'Name_contain_Mrs',
             'Name_contain_Master', 'Name_contain_Col', 'Name_contain_Major', 'Name_contain_Mr']

X = train[variables].fillna(-1)
y = train['Survived']

# **Cross-validation - LogisticRegression**

In [213]:
result2 = []

kf = RepeatedKFold(n_splits=2, n_repeats=10, random_state=10)
for train_line, test_line in kf.split(x):
  print("Train:", train_line.shape[0])
  print("Test:", test_line.shape[0])

  x_train, x_test = x.iloc[train_line], x.iloc[test_line]
  y_train, y_test = y.iloc[train_line], y.iloc[test_line]

  model = LogisticRegression()
  model.fit(x_train, y_train)

  Prediction = model.predict(x_test)
  
  accuracy = np.mean(y_test == Prediction)
  result2.append(accuracy)

  print("Accuracy:", accuracy)
  print()

Train: 445
Test: 446
Accuracy: 0.7914798206278026

Train: 446
Test: 445
Accuracy: 0.7707865168539326

Train: 445
Test: 446
Accuracy: 0.7959641255605381

Train: 446
Test: 445
Accuracy: 0.7797752808988764

Train: 445
Test: 446
Accuracy: 0.7713004484304933

Train: 446
Test: 445
Accuracy: 0.7932584269662921

Train: 445
Test: 446
Accuracy: 0.7937219730941704

Train: 446
Test: 445
Accuracy: 0.7955056179775281

Train: 445
Test: 446
Accuracy: 0.7937219730941704

Train: 446
Test: 445
Accuracy: 0.7865168539325843

Train: 445
Test: 446
Accuracy: 0.7869955156950673

Train: 446
Test: 445
Accuracy: 0.7910112359550562

Train: 445
Test: 446
Accuracy: 0.804932735426009

Train: 446
Test: 445
Accuracy: 0.7797752808988764

Train: 445
Test: 446
Accuracy: 0.7847533632286996

Train: 446
Test: 445
Accuracy: 0.7662921348314606

Train: 445
Test: 446
Accuracy: 0.804932735426009

Train: 446
Test: 445
Accuracy: 0.7752808988764045

Train: 445
Test: 446
Accuracy: 0.7937219730941704

Train: 446
Test: 445
Accuracy: 0.

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [195]:
np.mean(result2)

0.7874245477905981

In [None]:
test['Embarked_S'] = (test['Embarked'] == 'S').astype(int)
test['Embarked_C'] = (test['Embarked'] == 'C').astype(int)

test['Cabin_Null'] = (test['Cabin'].isnull()).astype(int)

test['Name_contain_Miss'] = test['Name'].str.contains("Miss").astype(int)
test['Name_contain_Mrs'] = test['Name'].str.contains("Mrs").astype(int)

test['Name_contain_Master'] = test['Name'].str.contains("Master").astype(int)
test['Name_contain_Col'] = test['Name'].str.contains("Col").astype(int)
test['Name_contain_Major'] = test['Name'].str.contains("Major").astype(int)
test['Name_contain_Mr'] = test['Name'].str.contains("Mr").astype(int)

model = LogisticRegression()
model.fit(X, y)

Prediction = model.predict(test[variaveis].fillna(-1))

# **CSV Submition**

In [None]:
submit = pd.Series(Prediction, index=test["PassengerId"], name="Survived")
submit.shape

(418,)

In [None]:
submit.to_csv("Model.csv", header=True)