<a href="https://colab.research.google.com/github/aubrin-s/AEOP-REAP/blob/master/Titanic_aubrin_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Titanic Problem

## Functions

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import make_scorer, accuracy_score, f1_score, recall_score, precision_score
from sklearn.model_selection import GridSearchCV,train_test_split
import warnings

# Silence every warning for the rest of the session. This is a blanket filter,
# so library deprecation or convergence warnings will not be shown either.
warnings.filterwarnings("ignore")

# Load the two CSV files from the current working directory.
# NOTE(review): presumably the Kaggle Titanic train/test files — confirm.
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')


def simplify_ages(df):
    """Replace the numeric ``Age`` column with labelled age brackets.

    Missing ages are filled with -0.5 so they land in the ``(-1, 0]`` bin and
    receive the ``'Unknown'`` label (mean imputation was tried earlier and
    abandoned).  The frame is modified in place and also returned.
    """
    age_edges = (-1, 0, 5, 12, 18, 25, 35, 60, 120)
    age_labels = [
        'Unknown', 'Baby', 'Child', 'Teenager',
        'Student', 'Young Adult', 'Adult', 'Senior',
    ]
    ages_filled = df.Age.fillna(-0.5)
    df['Age'] = pd.cut(ages_filled, age_edges, labels=age_labels)
    return df

def simplify_cabins(df):
    """Reduce ``Cabin`` to its first character, using ``'N'`` when missing.

    The frame is modified in place and also returned.
    """
    cabins_filled = df.Cabin.fillna('N')
    # Keep only the leading character of each cabin string.
    df['Cabin'] = cabins_filled.map(lambda cabin: cabin[0])
    return df

def simplify_fares(df):
    """Replace the numeric ``Fare`` column with labelled fare brackets.

    Missing fares are filled with -0.5 so they fall into the ``(-1, 0]`` bin
    labelled ``'Unknown'``; because the bins are right-closed, a fare of
    exactly 0 gets that label too.  The frame is modified in place and also
    returned.
    """
    fare_edges = (-1, 0, 8, 15, 31, 1000)
    fare_labels = ['Unknown', '1_quartile', '2_quartile', '3_quartile', '4_quartile']
    fares_filled = df.Fare.fillna(-0.5)
    df['Fare'] = pd.cut(fares_filled, fare_edges, labels=fare_labels)
    return df

def _split_name(name):
    """Return ``(surname, title)`` parsed from one passenger name string."""
    surname, comma, remainder = name.partition(',')
    if comma:
        # "Surname, Title. Given names": everything before the comma is the
        # surname, even when the surname itself contains spaces.
        given_tokens = remainder.split()
        title = given_tokens[0] if given_tokens else ''
        return surname.strip(), title
    # No comma present: fall back to the first two whitespace-separated
    # tokens, tolerating names that have fewer than two.
    tokens = name.split()
    surname = tokens[0] if tokens else ''
    title = tokens[1] if len(tokens) > 1 else ''
    return surname, title


def format_name(df):
    """Add ``Lname`` (surname) and ``NamePrefix`` (title) columns from ``Name``.

    Names are parsed around the comma that separates the surname from the
    rest, so multi-word surnames such as "Vander Planke, Mrs. Julius" yield
    the surname "Vander Planke" and the title "Mrs." instead of treating the
    second surname word as the title.  Names with a single token no longer
    raise ``IndexError``; their title is the empty string.

    The frame is modified in place and also returned.
    """
    parsed = [_split_name(name) for name in df['Name']]
    df['Lname'] = [surname for surname, _ in parsed]
    df['NamePrefix'] = [title for _, title in parsed]
    return df
    
def drop_features(df):
    """Return a new frame without the Ticket, Name and Embarked columns."""
    unused_columns = ['Ticket', 'Name', 'Embarked']
    return df.drop(columns=unused_columns)

def transform_features(df):
    """Run the full feature-engineering pipeline on *df* and return the result.

    The steps run in a fixed order: age binning, cabin simplification, fare
    binning, name parsing, and finally removal of the unused columns (which
    must come after name parsing because it drops ``Name``).
    """
    pipeline = (
        simplify_ages,
        simplify_cabins,
        simplify_fares,
        format_name,
        drop_features,
    )
    for step in pipeline:
        df = step(df)
    return df

# Apply the same feature engineering to both frames so their columns match.
train = transform_features(train)
test = transform_features(test)

def encode_features(df_train, df_test):
    """Integer-encode the categorical columns of both frames consistently.

    One ``LabelEncoder`` per column is fitted on the train and test values
    stacked together, so a label seen in either frame maps to the same code
    in both.  Both frames are modified in place and returned as a tuple.
    """
    columns = ['Fare', 'Cabin', 'Age', 'Sex', 'Lname', 'NamePrefix']
    stacked = pd.concat([df_train[columns], df_test[columns]])

    for column in columns:
        encoder = preprocessing.LabelEncoder().fit(stacked[column])
        for frame in (df_train, df_test):
            frame[column] = encoder.transform(frame[column])
    return df_train, df_test

# Accuracy
def accuracy(clf, X_val, y_val):
    """Print the classifier's validation score as a percentage.

    ``clf.score(X_val, y_val)`` is scaled to percent and rounded to two
    decimals before printing.  Nothing is returned.
    """
    percent = round(clf.score(X_val, y_val) * 100, 2)
    print(f"Validation Accuracy: {percent}%")

# Recall
def recall(y_val, y_pred_val):
    """Print the validation recall as a percentage rounded to two decimals."""
    percent = round(recall_score(y_val, y_pred_val) * 100, 2)
    print(f"Validation Recall:{percent}%")

# Precision
def precision(y_val, y_pred_val):
    """Print the validation precision as a percentage rounded to two decimals."""
    percent = round(precision_score(y_val, y_pred_val) * 100, 2)
    print(f"Validation Precision:{percent}%")
# F1
def f1(y_val, y_pred_val):
    """Print the validation F1 score as a percentage rounded to two decimals."""
    percent = round(f1_score(y_val, y_pred_val) * 100, 2)
    print(f"Validation F1:{percent}%")

# Integer-encode the categorical columns of both frames with shared encoders.
train, test = encode_features(train, test)
# NOTE(review): the result of this expression is discarded here; presumably it
# only rendered a preview when it was the last line of a notebook cell.
train.head()


# Features: every column except the target and the passenger identifier.
X_all = train.drop(['Survived','PassengerId'], axis=1)
# Target: the Survived column.
y_all = train['Survived']
# Test-set features, used later to produce the submission predictions.
X_test = test.drop("PassengerId", axis=1).copy()
# Hold out 20% of the labelled rows for validation; the fixed random_state
# keeps the split the same on every run.
num_test = 0.20
X_train, X_val, y_train, y_val = train_test_split(X_all, y_all, test_size=num_test, random_state=23)


# Disabled draft of the submission step (it refers to `clf`, which is not yet
# defined at this point in the file).
#ids = test['PassengerId']
#predictions = clf.predict(test.drop('PassengerId', axis=1))

#output = pd.DataFrame({ 'PassengerId' : ids, 'Survived': predictions })
#output.to_csv('titanic-predictions.csv', index = False)

## RandomForestClassifier

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Random forest with default hyper-parameters, fitted on the training split.
# NOTE(review): no random_state is passed, so the scores below are presumably
# not reproducible from run to run — confirm.
clf = RandomForestClassifier()
clf.fit(X_train, y_train)
y_pred_val = clf.predict(X_val)
# accuracy
accuracy(clf,X_val,y_val)
# recall
recall(y_val,y_pred_val)
# precision
precision(y_val,y_pred_val)
# F1
f1(y_val, y_pred_val)

Validation Accuracy: 81.56%
Validation Recall:70.31%
Validation Precision:76.27%
Validation F1:73.17%


## GradientBoostingClassifier

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Gradient boosting with 100 depth-1 trees, fitted on the training split.
# random_state is fixed so the fit is repeatable.  `clf1` is reused later to
# predict the test set for the submission file.
clf1 = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0,
     max_depth=1, random_state=0).fit(X_train, y_train)
y_pred_gb = clf1.predict(X_val)

# clf1.score(X_val, y_val) would report the same figure as accuracy() below.

accuracy(clf1,X_val,y_val)
recall(y_val,y_pred_gb)
precision(y_val,y_pred_gb)
f1(y_val, y_pred_gb)

Validation Accuracy: 82.68%
Validation Recall:73.44%
Validation Precision:77.05%
Validation F1:75.2%


## DecisionTreeClassifier

In [None]:
from sklearn import tree
# Decision tree with default hyper-parameters, fitted on the training split.
# This rebinds `clf`, replacing the random-forest model bound earlier.
clf = tree.DecisionTreeClassifier()
clf.fit(X_train, y_train)
y_pred_dtc = clf.predict(X_val)

# Report accuracy, recall, precision and F1 on the validation split.
accuracy(clf,X_val,y_val)
recall(y_val,y_pred_dtc)
precision(y_val,y_pred_dtc)
f1(y_val, y_pred_dtc)


Validation Accuracy: 76.54%
Validation Recall:73.44%
Validation Precision:65.28%
Validation F1:69.12%


## NearestCentroid

In [None]:
from sklearn.neighbors import NearestCentroid
# Nearest-centroid classifier with default settings, fitted on the training
# split.
nc = NearestCentroid()
nc.fit(X_train, y_train)
y_pred_nc= nc.predict(X_val)

# Report accuracy, recall, precision and F1 on the validation split.
accuracy(nc,X_val,y_val)
recall(y_val,y_pred_nc)
precision(y_val,y_pred_nc)
f1(y_val, y_pred_nc)

Validation Accuracy: 49.72%
Validation Recall:43.75%
Validation Precision:34.15%
Validation F1:38.36%


## MLPClassifier

In [None]:
from sklearn.neural_network import MLPClassifier
# Multi-layer perceptron with two hidden layers (5 and 2 units), the L-BFGS
# solver and a fixed random_state.  This rebinds `clf` once more.
clf = MLPClassifier(solver='lbfgs', alpha=1e-5,
                     hidden_layer_sizes=(5, 2), random_state=1)
clf.fit(X_train, y_train)
y_pred_nn = clf.predict(X_val)

# Report accuracy, recall, precision and F1 on the validation split.
accuracy(clf,X_val,y_val)
recall(y_val,y_pred_nn)
precision(y_val,y_pred_nn)
f1(y_val, y_pred_nn)

## GaussianNB

In [None]:
from sklearn.naive_bayes import GaussianNB
# NOTE(review): load_iris is not used anywhere in the visible code.
from sklearn.datasets import load_iris
# Gaussian naive Bayes: fit on the training split and predict the validation
# split in one chained call (fit() returns the fitted estimator).
gnb = GaussianNB()
y_pred_gnb = gnb.fit(X_train, y_train).predict(X_val)

# Report accuracy, recall, precision and F1 on the validation split.
accuracy(gnb,X_val,y_val)
recall(y_val,y_pred_gnb)
precision(y_val,y_pred_gnb)
f1(y_val, y_pred_gnb)

Validation Accuracy: 77.65%
Validation Recall:76.56%
Validation Precision:66.22%
Validation F1:71.01%


## Creating Submissions

In [None]:
import pandas as pd

# Predict the test set with `clf1`, the gradient-boosting model fitted earlier,
# and pair each prediction with its PassengerId.
y_pred_test = clf1.predict(X_test)
submission = pd.DataFrame({"PassengerId": test["PassengerId"], "Survived": y_pred_test})
# Write the two-column result to disk without the DataFrame index.
submission.to_csv('titanicresult.csv', index=False)