In [15]:
import pandas as pd
from sklearn import preprocessing
from sklearn import tree
from sklearn import model_selection
import matplotlib.pyplot as plt
from sklearn import ensemble

  from numpy.core.umath_tests import inner1d


In [3]:
# Read the training CSV into a DataFrame and print a per-column summary.
titanic_train = pd.read_csv(filepath_or_buffer="D:\\Data\\train.csv")
titanic_train.shape  # not the last expression in the cell, so it is not displayed
titanic_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB


In [4]:
# Read the test CSV into a DataFrame and print a per-column summary.
titanic_test = pd.read_csv(filepath_or_buffer='D:\\Data\\test.csv')
titanic_test.shape  # not the last expression in the cell, so it is not displayed
titanic_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
PassengerId    418 non-null int64
Pclass         418 non-null int64
Name           418 non-null object
Sex            418 non-null object
Age            332 non-null float64
SibSp          418 non-null int64
Parch          418 non-null int64
Ticket         418 non-null object
Fare           417 non-null float64
Cabin          91 non-null object
Embarked       418 non-null object
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB


In [5]:
# Give the test frame an (empty) Survived column so it has the same columns as the training frame.
# Bracket assignment is required here: `titanic_test.Survived = ...` only sets a Python
# attribute on the object (pandas emits a UserWarning) and does NOT create a column.
# NaN is used as the placeholder so the column is float64 rather than object.
titanic_test['Survived'] = float('nan')

In [6]:
# Stack the train and test rows into one frame so that every feature-engineering step
# below is applied to both sets in exactly the same way.
# sort=True states the column ordering explicitly (alphabetical) instead of relying on the
# pandas-version-dependent default, which also silences the FutureWarning about sorting.
# Note: the row index is not reset, so the labels 0..417 occur twice in the result.
titanic = pd.concat([titanic_train, titanic_test], sort=True)
titanic.shape  # not the last expression in the cell, so it is not displayed
titanic.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1309 entries, 0 to 417
Data columns (total 12 columns):
Age            1046 non-null float64
Cabin          295 non-null object
Embarked       1307 non-null object
Fare           1308 non-null float64
Name           1309 non-null object
Parch          1309 non-null int64
PassengerId    1309 non-null int64
Pclass         1309 non-null int64
Sex            1309 non-null object
SibSp          1309 non-null int64
Survived       891 non-null float64
Ticket         1309 non-null object
dtypes: float64(3), int64(4), object(5)
memory usage: 132.9+ KB


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  This is separate from the ipykernel package so we can avoid doing imports until


In [7]:
# Pull the title (e.g. 'Mr', 'Miss') out of a passenger name
def extract_title(name):
    """Return the text between the first comma and the following period, stripped.

    For 'Braund, Mr. Owen Harris' this is 'Mr'. A name without a comma
    raises IndexError, because there is no segment after the comma to read.
    """
    after_surname = name.split(',', 2)[1]
    title_part = after_surname.split('.', 1)[0]
    return title_part.strip()
# Series.apply calls extract_title once per passenger name and returns a new Series
# with the results, which is stored as the Title column.
titanic['Title'] = titanic['Name'].apply(extract_title)

In [8]:
# Fill missing Age / Fare values with the column means (mean is the imputer's strategy here).
# preprocessing.Imputer was deprecated in scikit-learn 0.20 and removed in 0.22, so prefer
# SimpleImputer when it is available and only fall back to the legacy class on old installs.
try:
    from sklearn.impute import SimpleImputer
    mean_imputer = SimpleImputer(strategy='mean')
except ImportError:  # scikit-learn < 0.20
    mean_imputer = preprocessing.Imputer()
# The means are learned from the training rows only, then applied to the combined frame.
mean_imputer.fit(titanic_train[['Age','Fare']])
# Age is missing in both train and test data.
# Fare is NOT missing in the train data but is missing in the test data. Because we work on the
# combined Titanic frame, the mean imputer is applied to Fare as well.
titanic[['Age','Fare']] = mean_imputer.transform(titanic[['Age','Fare']])

In [9]:
# Create a categorical age column from the numeric age.
# Keeping the rule in a function lets the same rule be applied to any other data set.
def convert_age(age):
    """Map a numeric age to 'Child', 'Young', 'Middle' or 'Old'.

    0..10 (inclusive) -> 'Child'; anything else up to 25 -> 'Young' (this
    includes negative values); up to 50 -> 'Middle'; everything else,
    including NaN (for which every comparison is False), -> 'Old'.
    """
    if 0 <= age <= 10:
        return 'Child'
    for upper_bound, label in ((25, 'Young'), (50, 'Middle')):
        if age <= upper_bound:
            return label
    return 'Old'
# Derive the categorical Age_Cat column by running convert_age over every Age value.
titanic['Age_Cat'] = titanic['Age'].apply(convert_age)

In [10]:
# Create a new column FamilySize = SibSp + Parch + 1 to see whether the combined value
# reveals a pattern that the two individual columns do not.
titanic['FamilySize'] = titanic['SibSp'].add(titanic['Parch']).add(1)
def convert_familysize(size):
    """Map a numeric family size to 'Single', 'Small', 'Medium' or 'Large'.

    Exactly 1 -> 'Single'; any other value up to 3 -> 'Small';
    up to 6 -> 'Medium'; anything larger -> 'Large'.
    """
    if size == 1:
        return 'Single'
    for upper_bound, label in ((3, 'Small'), (6, 'Medium')):
        if size <= upper_bound:
            return label
    return 'Large'
# Derive the categorical FamilySize_Cat column by running convert_familysize over every FamilySize value.
titanic['FamilySize_Cat'] = titanic['FamilySize'].apply(convert_familysize)

In [11]:
# Three columns were added above: Title, Age_Cat and FamilySize_Cat.
# One-hot encode every categorical column, including those three, with pd.get_dummies.
categorical_columns = ['Sex','Pclass','Embarked', 'Age_Cat', 'Title', 'FamilySize_Cat']
titanic1 = pd.get_dummies(data=titanic, columns=categorical_columns)
titanic1.shape  # not the last expression in the cell, so it is not displayed
titanic1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1309 entries, 0 to 417
Data columns (total 44 columns):
Age                      1309 non-null float64
Cabin                    295 non-null object
Fare                     1309 non-null float64
Name                     1309 non-null object
Parch                    1309 non-null int64
PassengerId              1309 non-null int64
SibSp                    1309 non-null int64
Survived                 891 non-null float64
Ticket                   1309 non-null object
FamilySize               1309 non-null int64
Sex_female               1309 non-null uint8
Sex_male                 1309 non-null uint8
Pclass_1                 1309 non-null uint8
Pclass_2                 1309 non-null uint8
Pclass_3                 1309 non-null uint8
Embarked_C               1309 non-null uint8
Embarked_Q               1309 non-null uint8
Embarked_S               1309 non-null uint8
Age_Cat_Child            1309 non-null uint8
Age_Cat_Middle           1309 no

In [14]:
# Drop the columns that are not used as model features; the result is a new frame, titanic2.
unused_columns = ['PassengerId','Name','Age','Ticket','Cabin','Survived']
titanic2 = titanic1.drop(columns=unused_columns)
# Check how many columns remain after adding 3 columns, one-hot encoding and dropping.
titanic2.shape  # not the last expression in the cell, so it is not displayed
titanic2.info()
# Split off the training rows: they are the first len(titanic_train) rows of the combined frame.
n_train_rows = len(titanic_train)
X_train = titanic2.iloc[:n_train_rows]
X_train.shape
X_train.info()
# The target comes straight from the original training frame.
y_train = titanic_train['Survived']

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1309 entries, 0 to 417
Data columns (total 38 columns):
Fare                     1309 non-null float64
Parch                    1309 non-null int64
SibSp                    1309 non-null int64
FamilySize               1309 non-null int64
Sex_female               1309 non-null uint8
Sex_male                 1309 non-null uint8
Pclass_1                 1309 non-null uint8
Pclass_2                 1309 non-null uint8
Pclass_3                 1309 non-null uint8
Embarked_C               1309 non-null uint8
Embarked_Q               1309 non-null uint8
Embarked_S               1309 non-null uint8
Age_Cat_Child            1309 non-null uint8
Age_Cat_Middle           1309 non-null uint8
Age_Cat_Old              1309 non-null uint8
Age_Cat_Young            1309 non-null uint8
Title_Capt               1309 non-null uint8
Title_Col                1309 non-null uint8
Title_Don                1309 non-null uint8
Title_Dona               1309 non-nul

In [None]:
# Base random-forest classifier; random_state is fixed so repeated runs give the same forest.
rf_estimator = ensemble.RandomForestClassifier(random_state=1)

# Hyper-parameter grid to search. Every entry must be a 'name': [values] pair;
# writing '=' instead of ':' inside a dict literal is a SyntaxError.
rf_grid = {'n_estimators':[50],'max_features':[5,6,7],'min_samples_split':[3,4,6]}
# Grid search with 10-fold cross-validation, run as 3 parallel jobs.
grid_rf_estimetor = model_selection.GridSearchCV(rf_estimator,rf_grid,cv=10,n_jobs=3)