In [114]:
import pandas as pd
import numpy as np
import random as rnd
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.metrics import make_scorer, accuracy_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import train_test_split
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals

In [3]:
import functools
import time


def timeit(method):
    """Decorator that measures how long each call to ``method`` takes.

    All positional and keyword arguments are forwarded to ``method``
    unchanged, including the two reporting keywords below, so a decorated
    function must accept them (e.g. via ``**kw``) if callers pass them.

    Reporting:
        * If the call includes a ``log_time`` keyword (a dict-like object),
          the elapsed time in whole milliseconds is stored in it under the
          key given by the ``log_name`` keyword, or the upper-cased function
          name when ``log_name`` is absent.
        * Otherwise the function name and elapsed milliseconds are printed.

    Returns:
        A wrapper that returns whatever ``method`` returns. Nothing is
        recorded or printed when ``method`` raises.
    """
    # Prefer a monotonic clock so system clock adjustments cannot distort the
    # measurement; fall back to time.time() where perf_counter is unavailable.
    clock = getattr(time, 'perf_counter', time.time)

    # wraps() keeps the decorated function's __name__/__doc__ instead of
    # exposing the wrapper's.
    @functools.wraps(method)
    def timed(*args, **kw):
        ts = clock()
        result = method(*args, **kw)
        elapsed_ms = (clock() - ts) * 1000
        if 'log_time' in kw:
            name = kw.get('log_name', method.__name__.upper())
            kw['log_time'][name] = int(elapsed_ms)
        else:
            print('%r  %2.2f ms' % (method.__name__, elapsed_ms))
        return result
    return timed

In [137]:
class Titanic(object):
    """Titanic survival pipeline: load, clean, encode, fit, and predict.

    On construction the training and test CSVs are read from ``path``,
    cleaned by :meth:`preprocess`, one-hot encoded by :meth:`OHE`, and the
    training frame is split into features and target.

    Attributes set by ``__init__``:
        random_state: seed used for the hold-out split and the default model.
        train, test: preprocessed, one-hot encoded DataFrames.
        X_train, y_train: training features and the ``Survived`` column.

    Attributes set by :meth:`classify`:
        model, accuracy, confusion_matrix.

    Attribute set by :meth:`store_predictions`:
        X_test.
    """

    # Titles collapsed into the single 'Rare' bucket.
    _RARE_TITLES = ['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr', 'Major',
                    'Rev', 'Sir', 'Jonkheer', 'Dona']
    # Alternative spellings folded into their common title.
    _TITLE_ALIASES = {'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'}
    # Integer code per title; any title not listed here is encoded as 0.
    _TITLE_CODES = {"Mr": 1, "Miss": 2, "Mrs": 3, "Master": 4, "Rare": 5}
    # Columns that must never be used as model features.
    _NON_FEATURE_COLUMNS = ('Survived', 'PassengerId')

    def OHE(self, df):
        """Return ``df`` with its categorical columns one-hot encoded."""
        return pd.get_dummies(df)

    def xy_separator(self, df):
        """Split a frame into model features and, when present, the target.

        Features are every column except ``Survived`` and ``PassengerId``.
        The columns are removed by name, so the result does not depend on
        column order and the target can never leak into the features.

        Returns:
            ``[X, y]`` when ``df`` has a ``Survived`` column, otherwise ``X``.
        """
        excluded = [col for col in self._NON_FEATURE_COLUMNS if col in df.columns]
        X = df.drop(excluded, axis=1)
        if 'Survived' in df.columns:
            return [X, df['Survived']]
        return X

    def _encode_titles(self, names):
        """Extract the title from each name and return its integer code."""
        # The title is the word that directly precedes the first '.'.
        titles = names.str.extract(r' ([A-Za-z]+)\.', expand=False)
        titles = titles.replace(self._RARE_TITLES, 'Rare')
        titles = titles.replace(self._TITLE_ALIASES)
        return titles.map(self._TITLE_CODES).fillna(0).astype(int)

    def _impute_age(self, dataset):
        """Fill missing ages in place, then truncate ``Age`` to integers.

        A missing age is replaced by the median known age of passengers with
        the same ``Sex`` and ``Pclass``, rounded to the nearest 0.5. Groups
        with nothing to fill are skipped; a group with no known age at all
        falls back to the median over the whole frame.

        Raises:
            ValueError: if an age must be filled but the frame contains no
                known age to derive it from.
        """
        missing = dataset['Age'].isnull()
        overall_median = dataset['Age'].median()
        for sex in (0, 1):
            for pclass in (1, 2, 3):
                in_group = (dataset['Sex'] == sex) & (dataset['Pclass'] == pclass)
                to_fill = in_group & missing
                if not to_fill.any():
                    continue
                age_guess = dataset.loc[in_group, 'Age'].median()
                if pd.isnull(age_guess):
                    age_guess = overall_median
                # Round the median to the nearest 0.5.
                dataset.loc[to_fill, 'Age'] = int(age_guess / 0.5 + 0.5) * 0.5
        dataset['Age'] = dataset['Age'].astype(int)

    def preprocess(self, path):
        """Read ``train.csv`` and ``test.csv`` from ``path`` and clean them.

        Steps applied to both frames:
            * drop ``Ticket`` and ``Cabin``;
            * add ``Title`` (integer code derived from ``Name``);
            * encode ``Sex`` as 1 for female and 0 for male;
            * impute missing ``Age`` and truncate it to integers;
            * add ``FamilySize`` (``SibSp + Parch + 1``) and ``IsAlone``;
            * drop ``Name``;
            * fill missing ``Embarked`` with the training set's most common
              value.

        Additionally ``PassengerId`` is dropped from the training frame only,
        and missing ``Fare`` values in the test frame are filled with the
        test-set median.

        Returns:
            ``[train_df, test_df]``.
        """
        train_df = pd.read_csv(path + '/train.csv')
        test_df = pd.read_csv(path + '/test.csv')

        train_df = train_df.drop(['Ticket', 'Cabin'], axis=1)
        test_df = test_df.drop(['Ticket', 'Cabin'], axis=1)

        for dataset in (train_df, test_df):
            dataset['Title'] = self._encode_titles(dataset['Name'])
            dataset['Sex'] = dataset['Sex'].map({'female': 1, 'male': 0}).astype(int)
            self._impute_age(dataset)
            dataset['FamilySize'] = dataset['SibSp'] + dataset['Parch'] + 1
            dataset['IsAlone'] = (dataset['FamilySize'] == 1).astype(int)

        # PassengerId stays on the test frame: store_predictions writes it out.
        train_df = train_df.drop(['Name', 'PassengerId'], axis=1)
        test_df = test_df.drop(['Name'], axis=1)

        freq_port = train_df['Embarked'].dropna().mode()[0]
        train_df['Embarked'] = train_df['Embarked'].fillna(freq_port)
        test_df['Embarked'] = test_df['Embarked'].fillna(freq_port)

        # Assign the result back instead of a chained inplace fillna, which
        # does not reliably modify the frame.
        test_df['Fare'] = test_df['Fare'].fillna(test_df['Fare'].median())

        return [train_df, test_df]

    def __init__(self, path, random_state=42):
        """Load and prepare the data found in directory ``path``.

        Args:
            path: directory containing ``train.csv`` and ``test.csv``.
            random_state: seed for the hold-out split and the default model.
        """
        self.random_state = random_state
        self.train, self.test = self.preprocess(path)
        self.train = self.OHE(self.train)
        self.test = self.OHE(self.test)
        self.X_train, self.y_train = self.xy_separator(self.train)

    def classify(self, model=None, grid_search=True):
        """Fit ``model`` on 80% of the training data and score it on the rest.

        Args:
            model: estimator exposing ``fit`` and ``predict``. When ``None``
                a fresh ``RandomForestClassifier(oob_score=True)`` seeded
                with ``self.random_state`` is created for this call, so no
                estimator instance is shared between calls or instances.
            grid_search: accepted for interface compatibility; currently
                unused.

        Returns:
            ``self``, with ``model``, ``accuracy`` and ``confusion_matrix``
            set.
        """
        if model is None:
            model = RandomForestClassifier(oob_score=True,
                                           random_state=self.random_state)
        self.model = model
        X_fit, X_val, y_fit, y_val = train_test_split(
            self.X_train, self.y_train,
            test_size=0.2, random_state=self.random_state)
        self.model.fit(X_fit, y_fit)
        predictions = self.model.predict(X_val)
        self.accuracy = metrics.accuracy_score(y_val, predictions)
        self.confusion_matrix = metrics.confusion_matrix(y_val, predictions)
        return self

    def store_predictions(self, path='/', model=None, filename='predictions.csv'):
        """Predict the test set with the fitted model and write a CSV.

        The file has the columns ``PassengerId`` and ``Survived`` and is
        written to ``path + '/' + filename``.

        Args:
            path: output directory.
            model: accepted for interface compatibility and ignored; the
                model fitted by :meth:`classify` is always used.
            filename: name of the CSV file to write.

        Raises:
            AttributeError: if :meth:`classify` has not been called yet.
        """
        if not hasattr(self, 'model'):
            raise AttributeError(
                "No fitted model available: call classify() before store_predictions()")
        # Train and test are one-hot encoded separately, so give the test
        # features exactly the training columns (same order, absent ones = 0).
        self.X_test = self.xy_separator(self.test).reindex(
            columns=self.X_train.columns, fill_value=0)
        predictions = self.model.predict(self.X_test)
        prediction_output = pd.DataFrame(
            {'PassengerId': self.test['PassengerId'].values,
             'Survived': predictions},
            columns=['PassengerId', 'Survived'])
        prediction_output.to_csv(path + '/' + filename, index=False)
        return None

In [139]:
# Fit on the training CSVs, then write the test-set predictions to disk.
DATA_DIR = '/Users/arunabhsingh/Desktop/greyatom/titanic_whynot/csv/'
OUTPUT_DIR = '/Users/arunabhsingh/Desktop'

titanic = Titanic(path=DATA_DIR, random_state=0)
titanic.classify()
titanic.store_predictions(path=OUTPUT_DIR)