In [187]:
# library imports
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from copy import copy

In [188]:
# data imports
train_original = pd.read_csv('C:/Users/Andrew Mark/Google Drive/Projects/Kaggle/titanic/train.csv')
test_original = pd.read_csv('C:/Users/Andrew Mark/Google Drive/Projects/Kaggle/titanic/test.csv')
answers = pd.read_csv('C:/Users/Andrew Mark/Google Drive/Projects/Kaggle/titanic/gender_submission.csv')

train = train_original
test = copy(test_original)
test = pd.DataFrame(test)

In [189]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


In [190]:
# data cleansing/pre-processing

In [191]:
ids = test['PassengerId'].unique()

In [192]:
f = train.append(test, ignore_index=True, sort=False)

In [193]:
f.drop(['Name','Ticket','Cabin'], axis = 1, inplace = True)

In [194]:
f = f.drop(['Survived'], axis=1)

In [195]:
#Filling missing values
def fill_missing_values(df):
    missing = df.isnull().sum()
    missing = missing[missing > 0]
    for column in list(missing.index):
        if df[column].dtype == 'object':
            df[column].fillna(df[column].value_counts().index[0], inplace=True)
        elif df[column].dtype == 'int64' or 'float64' or 'int16' or 'float16':
            df[column].fillna(df[column].median(), inplace=True)      

In [196]:
#Converting String to Int
def convert (df):
    # Find the columns of object type along with their column index
    object_cols = list(df.select_dtypes(exclude=[np.number]).columns)
    object_cols_ind = []
    for col in object_cols:
        object_cols_ind.append(df.columns.get_loc(col))

    # Encode the categorical columns with numbers    
    label_enc = LabelEncoder()
    for i in object_cols_ind:
        df.iloc[:,i] = label_enc.fit_transform(df.iloc[:,i])

In [197]:
fill_missing_values(f)

In [198]:
convert(f)

In [199]:
# create bins for Fare
f['Fare'] = pd.cut(f['Fare'], 1)
# create bins for Age
f['Age'] = pd.cut(f['Age'], 1)

In [200]:
# convert bins of NewAge and NewFare to int
f['Age'] = pd.get_dummies(f['Age'], columns = ['Age'], prefix=['Int'])

f['Fare'] = pd.get_dummies(f['Fare'], columns = ['Fare'], prefix=['Int'])

In [201]:
# New feature
f['Family_Size'] = f['SibSp'] + f['Parch'] + 1

In [202]:
f = f.dropna()

In [203]:
train_df = f[~f['PassengerId'].isin(ids)]

In [204]:
test_df = f[f['PassengerId'].isin(ids)]

In [205]:
# separate features and labels for train and test datasets
X_test = test_df.drop('PassengerId',1)
y_test = answers['Survived']

In [206]:
X_train = train_df.drop('PassengerId',1)
y_train = train['Survived']

In [207]:
from sklearn.preprocessing import MinMaxScaler
X_test = MinMaxScaler().fit_transform(X_test)
X_train = MinMaxScaler().fit_transform(X_train)

In [208]:
# training and prediction
import xgboost as XGBClassifier
import xgboost as xgb
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB

logmodel = LogisticRegression()
logmodel.fit(X_train,y_train)
lr_pred = logmodel.predict(X_test)

rf = RandomForestClassifier(n_estimators = 100)
rf.fit(X_train, y_train)
rf_pred = rf.predict(X_test)

model = xgb.XGBClassifier (random_state=53, n_jobs=-1, learning_rate=0.05, 
                  n_estimators=100, max_depth=4)
model.fit(X_train,y_train)
y_pred = model.predict(X_test)

gbc = GradientBoostingClassifier (random_state=53,n_estimators=100,learning_rate=0.05)
gbc.fit(X_train,y_train)
gbc_pred = gbc.predict(X_test)

print("Logistic Regression Accuracy: \t",metrics.accuracy_score(y_test, lr_pred))
print("Random Forest Accuracy: \t", metrics.accuracy_score(y_test, rf_pred))
print("XGBClassifier Accuracy: \t", metrics.accuracy_score(y_test, y_pred))
print("Gradient Boosting Accuracy: \t", metrics.accuracy_score(y_test, gbc_pred))
print("\n")

Logistic Regression Accuracy: 	 0.9904306220095693
Random Forest Accuracy: 	 0.8588516746411483
XGBClassifier Accuracy: 	 0.930622009569378
Gradient Boosting Accuracy: 	 0.9401913875598086




  if diff:


In [210]:
submit = pd.DataFrame({'PassengerId':test_original['PassengerId'], 'Survived':lr_pred})
print(submit.head(10))
submit.to_csv('C:/Users/Andrew Mark/Google Drive/Projects/Kaggle/titanic/submission.csv', index = False)

   PassengerId  Survived
0          892         0
1          893         1
2          894         0
3          895         0
4          896         1
5          897         0
6          898         1
7          899         0
8          900         1
9          901         0
