In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
train_data = pd.read_csv('train.csv')

In [3]:
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
train_data.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [5]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [6]:
'''
Name , Sex , Ticket , Cabin , Embarked are object types

Embarked means from where passenger boarded the ship

'''

train_data.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [7]:
train_data['Embarked'].value_counts()

S    644
C    168
Q     77
Name: Embarked, dtype: int64

In [8]:
'''
Age is also a problem as almost 20% peoples donot have their age specified

80% people didn't have their cabin so we can make that value as None

Also Emarked has 2 null values we can replace it by 'S' as S is the most common location from where passenger boarded the ship
'''
train_data['Cabin'] = train_data['Cabin'].fillna('None')
train_data['Embarked'] = train_data['Embarked'].fillna('S')

In [9]:
train_data.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin            0
Embarked         0
dtype: int64

In [10]:
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [11]:
'''
Name will not be necessary for the model so insted we can simply give Title inplace of Name

for eg . Braud , Mr. Owen Harris is male so we can give his title as Mr.
'''
Title = []

def getTitle(df):
    for i in range(len(df)):
        if df['Sex'][i] == 'male':
            Title.append('Mr')
        else:
            if df['Age'][i] >=30:
                Title.append('Mrs')
            else:
                Title.append('Miss')

getTitle(train_data)

In [12]:
train_data['Title'] = Title

In [13]:
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,Mr
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,Mrs
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,Miss
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,Mrs
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,Mr


In [14]:
train_data['Title'].value_counts()

Mr      577
Miss    200
Mrs     114
Name: Title, dtype: int64

In [15]:
'''
Now we can drop Name column
'''
train_data = train_data.drop('Name',axis =1)

In [16]:
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title
0,1,0,3,male,22.0,1,0,A/5 21171,7.25,,S,Mr
1,2,1,1,female,38.0,1,0,PC 17599,71.2833,C85,C,Mrs
2,3,1,3,female,26.0,0,0,STON/O2. 3101282,7.925,,S,Miss
3,4,1,1,female,35.0,1,0,113803,53.1,C123,S,Mrs
4,5,0,3,male,35.0,0,0,373450,8.05,,S,Mr


In [17]:
'''
For Age We can replace NaN value by median of total age
'''
train_data['Age'].fillna(train_data['Age'].median(),inplace=True)

In [18]:
train_data.isna().sum()

PassengerId    0
Survived       0
Pclass         0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          0
Embarked       0
Title          0
dtype: int64

In [19]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Sex          891 non-null    object 
 4   Age          891 non-null    float64
 5   SibSp        891 non-null    int64  
 6   Parch        891 non-null    int64  
 7   Ticket       891 non-null    object 
 8   Fare         891 non-null    float64
 9   Cabin        891 non-null    object 
 10  Embarked     891 non-null    object 
 11  Title        891 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [20]:
'''
Passenger Id is not necessary as it is just the count of passenger.
but it is needed for output
'''


'\nPassenger Id is not necessary as it is just the count of passenger.\nbut it is needed for output\n'

In [21]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Sex          891 non-null    object 
 4   Age          891 non-null    float64
 5   SibSp        891 non-null    int64  
 6   Parch        891 non-null    int64  
 7   Ticket       891 non-null    object 
 8   Fare         891 non-null    float64
 9   Cabin        891 non-null    object 
 10  Embarked     891 non-null    object 
 11  Title        891 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [22]:
'''
Let's Check Which Feature is more important for our result

We can see that survivar chance mostly depends upon Fare and Pclass(class of passenger)
'''
train_data.corr()['Survived'].sort_values()

Pclass        -0.338481
Age           -0.064910
SibSp         -0.035322
PassengerId   -0.005007
Parch          0.081629
Fare           0.257307
Survived       1.000000
Name: Survived, dtype: float64

In [23]:
'''
Let's Create 3 more columns as dummies for Title as it is in str.
(Mr,Mrs,Miss)

and drop title column
'''
train_data = pd.get_dummies(train_data,columns=['Title'])

In [24]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 14 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Sex          891 non-null    object 
 4   Age          891 non-null    float64
 5   SibSp        891 non-null    int64  
 6   Parch        891 non-null    int64  
 7   Ticket       891 non-null    object 
 8   Fare         891 non-null    float64
 9   Cabin        891 non-null    object 
 10  Embarked     891 non-null    object 
 11  Title_Miss   891 non-null    uint8  
 12  Title_Mr     891 non-null    uint8  
 13  Title_Mrs    891 non-null    uint8  
dtypes: float64(2), int64(5), object(4), uint8(3)
memory usage: 79.3+ KB


In [25]:
'''
We can do the same for Embarked ans Sex column.
'''
train_data['Embarked'].value_counts()

S    646
C    168
Q     77
Name: Embarked, dtype: int64

In [26]:
train_data['Sex'].value_counts()

male      577
female    314
Name: Sex, dtype: int64

In [27]:
train_data = pd.get_dummies(train_data,columns=['Embarked','Sex'])

In [28]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 17 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Age          891 non-null    float64
 4   SibSp        891 non-null    int64  
 5   Parch        891 non-null    int64  
 6   Ticket       891 non-null    object 
 7   Fare         891 non-null    float64
 8   Cabin        891 non-null    object 
 9   Title_Miss   891 non-null    uint8  
 10  Title_Mr     891 non-null    uint8  
 11  Title_Mrs    891 non-null    uint8  
 12  Embarked_C   891 non-null    uint8  
 13  Embarked_Q   891 non-null    uint8  
 14  Embarked_S   891 non-null    uint8  
 15  Sex_female   891 non-null    uint8  
 16  Sex_male     891 non-null    uint8  
dtypes: float64(2), int64(5), object(2), uint8(8)
memory usage: 69.7+ KB


In [29]:
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Ticket,Fare,Cabin,Title_Miss,Title_Mr,Title_Mrs,Embarked_C,Embarked_Q,Embarked_S,Sex_female,Sex_male
0,1,0,3,22.0,1,0,A/5 21171,7.25,,0,1,0,0,0,1,0,1
1,2,1,1,38.0,1,0,PC 17599,71.2833,C85,0,0,1,1,0,0,1,0
2,3,1,3,26.0,0,0,STON/O2. 3101282,7.925,,1,0,0,0,0,1,1,0
3,4,1,1,35.0,1,0,113803,53.1,C123,0,0,1,0,0,1,1,0
4,5,0,3,35.0,0,0,373450,8.05,,0,1,0,0,0,1,0,1


In [30]:
'''
Now Let's Look at cabin 

We can Convert all of these only into cabin with letters 'A' 'B' 'C' and so on 
'''

train_data['Cabin'].value_counts()

None           687
C23 C25 C27      4
G6               4
B96 B98          4
C22 C26          3
              ... 
E34              1
C7               1
C54              1
E36              1
C148             1
Name: Cabin, Length: 148, dtype: int64

In [31]:
cabin_list = ['A', 'B', 'C', 'D', 'E', 'F', 'T', 'G']
deck =[]

for i in range(len(train_data["Cabin"])):
    for j in cabin_list :
        if j in train_data["Cabin"][i] :
            deck.append(j)
            break
    
    else:
        deck.append("None")

In [32]:
train_data['Deck'] = deck

In [33]:
train_data['Deck'].value_counts()

None    687
C        59
B        47
E        33
D        33
A        15
F        12
G         4
T         1
Name: Deck, dtype: int64

In [34]:
train_data.drop('Cabin',axis=1,inplace=True)

In [35]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 17 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Age          891 non-null    float64
 4   SibSp        891 non-null    int64  
 5   Parch        891 non-null    int64  
 6   Ticket       891 non-null    object 
 7   Fare         891 non-null    float64
 8   Title_Miss   891 non-null    uint8  
 9   Title_Mr     891 non-null    uint8  
 10  Title_Mrs    891 non-null    uint8  
 11  Embarked_C   891 non-null    uint8  
 12  Embarked_Q   891 non-null    uint8  
 13  Embarked_S   891 non-null    uint8  
 14  Sex_female   891 non-null    uint8  
 15  Sex_male     891 non-null    uint8  
 16  Deck         891 non-null    object 
dtypes: float64(2), int64(5), object(2), uint8(8)
memory usage: 69.7+ KB


In [36]:
'''
Now we can convert Deck also into dummies of int values
'''

train_data = pd.get_dummies(train_data,columns=['Deck'])

In [37]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 25 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Age          891 non-null    float64
 4   SibSp        891 non-null    int64  
 5   Parch        891 non-null    int64  
 6   Ticket       891 non-null    object 
 7   Fare         891 non-null    float64
 8   Title_Miss   891 non-null    uint8  
 9   Title_Mr     891 non-null    uint8  
 10  Title_Mrs    891 non-null    uint8  
 11  Embarked_C   891 non-null    uint8  
 12  Embarked_Q   891 non-null    uint8  
 13  Embarked_S   891 non-null    uint8  
 14  Sex_female   891 non-null    uint8  
 15  Sex_male     891 non-null    uint8  
 16  Deck_A       891 non-null    uint8  
 17  Deck_B       891 non-null    uint8  
 18  Deck_C       891 non-null    uint8  
 19  Deck_D  

In [38]:
train_data.drop('Deck_T',axis = 1,inplace= True)

In [39]:
train_data[train_data['Ticket'] == '347082']

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Ticket,Fare,Title_Miss,Title_Mr,...,Sex_female,Sex_male,Deck_A,Deck_B,Deck_C,Deck_D,Deck_E,Deck_F,Deck_G,Deck_None
13,14,0,3,39.0,1,5,347082,31.275,0,1,...,0,1,0,0,0,0,0,0,0,1
119,120,0,3,2.0,4,2,347082,31.275,1,0,...,1,0,0,0,0,0,0,0,0,1
541,542,0,3,9.0,4,2,347082,31.275,1,0,...,1,0,0,0,0,0,0,0,0,1
542,543,0,3,11.0,4,2,347082,31.275,1,0,...,1,0,0,0,0,0,0,0,0,1
610,611,0,3,39.0,1,5,347082,31.275,0,0,...,1,0,0,0,0,0,0,0,0,1
813,814,0,3,6.0,4,2,347082,31.275,1,0,...,1,0,0,0,0,0,0,0,0,1
850,851,0,3,4.0,4,2,347082,31.275,0,1,...,0,1,0,0,0,0,0,0,0,1


In [40]:
'''
For Ticket we can't make it as dummies values as it is almost differnet for each passenger.
so we conver it into categorical values and then to numerical values.
'''
'''
for label, content in train_data.items():
    if pd.api.types.is_string_dtype(content):
        train_data[label] = content.astype("category").cat.as_ordered()
'''
# This code if we have multiple srting(object) columns and want to convert them to category. 

for label, content in train_data.items():
    if pd.api.types.is_string_dtype(content):
        train_data[label] = content.astype("category").cat.as_ordered()

In [41]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 24 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   PassengerId  891 non-null    int64   
 1   Survived     891 non-null    int64   
 2   Pclass       891 non-null    int64   
 3   Age          891 non-null    float64 
 4   SibSp        891 non-null    int64   
 5   Parch        891 non-null    int64   
 6   Ticket       891 non-null    category
 7   Fare         891 non-null    float64 
 8   Title_Miss   891 non-null    uint8   
 9   Title_Mr     891 non-null    uint8   
 10  Title_Mrs    891 non-null    uint8   
 11  Embarked_C   891 non-null    uint8   
 12  Embarked_Q   891 non-null    uint8   
 13  Embarked_S   891 non-null    uint8   
 14  Sex_female   891 non-null    uint8   
 15  Sex_male     891 non-null    uint8   
 16  Deck_A       891 non-null    uint8   
 17  Deck_B       891 non-null    uint8   
 18  Deck_C       891 non-null    u

In [42]:
'''
Now convert The Ticket Column which is in Category type to numeric
'''
# Turn categorical variables into numbers
for label, content in train_data.items():
    # Check columns which *aren't* numeric
    if not pd.api.types.is_numeric_dtype(content):
        # We add the +1 because pandas encodes missing categories as -1
        train_data[label] = pd.Categorical(content).codes+1

In [43]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 24 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Age          891 non-null    float64
 4   SibSp        891 non-null    int64  
 5   Parch        891 non-null    int64  
 6   Ticket       891 non-null    int16  
 7   Fare         891 non-null    float64
 8   Title_Miss   891 non-null    uint8  
 9   Title_Mr     891 non-null    uint8  
 10  Title_Mrs    891 non-null    uint8  
 11  Embarked_C   891 non-null    uint8  
 12  Embarked_Q   891 non-null    uint8  
 13  Embarked_S   891 non-null    uint8  
 14  Sex_female   891 non-null    uint8  
 15  Sex_male     891 non-null    uint8  
 16  Deck_A       891 non-null    uint8  
 17  Deck_B       891 non-null    uint8  
 18  Deck_C       891 non-null    uint8  
 19  Deck_D  

In [44]:
train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Ticket,Fare,Title_Miss,Title_Mr,...,Sex_female,Sex_male,Deck_A,Deck_B,Deck_C,Deck_D,Deck_E,Deck_F,Deck_G,Deck_None
0,1,0,3,22.0,1,0,524,7.25,0,1,...,0,1,0,0,0,0,0,0,0,1
1,2,1,1,38.0,1,0,597,71.2833,0,0,...,1,0,0,0,1,0,0,0,0,0
2,3,1,3,26.0,0,0,670,7.925,1,0,...,1,0,0,0,0,0,0,0,0,1
3,4,1,1,35.0,1,0,50,53.1,0,0,...,1,0,0,0,1,0,0,0,0,0
4,5,0,3,35.0,0,0,473,8.05,0,1,...,0,1,0,0,0,0,0,0,0,1


In [45]:
train_data['Ticket'].value_counts()

334    7
569    7
81     7
250    6
567    6
      ..
514    1
99     1
213    1
607    1
467    1
Name: Ticket, Length: 681, dtype: int64

In [46]:
'''
Let's Recall What we did to this data step by step.

1. First remove or replace NaN values. i.e dealing with missing Datas
2. Convert some string columns into more suitable string columns. 
	for eg. we converted Name to Title
    		Cabin to Deck
3. Get Dummies Data(sub category) for string datas which had common values.
4. If not able to create sub class we convert it into Category datatype and converted into numeric.

'''
#Note : We Have to Do This for TEST DATA TOO!!


#For Missing Datas in test data we have to check tat that particular time.
#But for step 2 and 3 it is same for every df and train_Data so that will be same.
'''
Let's make some function for step 2 and 3.
'''


def gettitle(df):
    Title = []
    for i in range(len(df)):
        if df['Sex'][i] == 'male':
            Title.append('Mr')
        else:
            if df['Age'][i] >=30:
                Title.append('Mrs')
            else:
                Title.append('Miss')
    return Title




def Deck(df):
    cabin_list = ['A', 'B', 'C', 'D', 'E', 'F', 'T', 'G']
    deck =[]
    for i in range(len(df["Cabin"])):
        for j in cabin_list :
            if j in df["Cabin"][i] :
                deck.append(j)
                break

        else:
            deck.append("None")
    
    return deck

def FeatureEngineering(df):
    Title  = gettitle(df)
    
    df['Title'] = Title
    df.drop('Name',axis =1,inplace = True)
    
    
    deck = Deck(df)
    df['Deck'] = deck
    df.drop('Cabin',axis=1,inplace = True)
    
    #dummies
    
    df = pd.get_dummies(df,columns=['Title','Embarked','Sex','Deck'])
    
    #Categorical data
    
    for label, content in df.items():
    	if pd.api.types.is_string_dtype(content):
        	df[label] = content.astype("category").cat.as_ordered()

    # Turn categorical variables into numbers
    for label, content in df.items():
        # Check columns which *aren't* numeric
        if not pd.api.types.is_numeric_dtype(content):
            # We add the +1 because pandas encodes missing categories as -1
            df[label] = pd.Categorical(content).codes+1
    
    return df

In [47]:
''' 

Lets Start Model Trainig

'''

x = train_data.drop('Survived',axis = 1)
y = train_data['Survived']

In [48]:
np.random.seed(42)
from sklearn.model_selection import train_test_split

In [49]:
x_train , x_test , y_train , y_test = train_test_split(x,y,test_size=0.3)

In [50]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier()

model.fit(x_train,y_train)

RandomForestClassifier()

In [51]:
model.score(x_test,y_test)

0.7985074626865671

In [52]:
model.fit(x,y)

RandomForestClassifier()

In [53]:
import pickle

In [54]:
pickle.dump(model, open('model.pkl', 'wb'))

In [55]:
test_data = pd.read_csv('test.csv')

In [56]:
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Name         418 non-null    object 
 3   Sex          418 non-null    object 
 4   Age          332 non-null    float64
 5   SibSp        418 non-null    int64  
 6   Parch        418 non-null    int64  
 7   Ticket       418 non-null    object 
 8   Fare         417 non-null    float64
 9   Cabin        91 non-null     object 
 10  Embarked     418 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB


In [57]:
test_data['Age'].fillna(test_data['Age'].median(),inplace =True)
test_data['Fare'].fillna(test_data['Fare'].mean(),inplace =True)
test_data['Cabin'].fillna('None',inplace =True)

In [58]:
test_data.isna().sum()

PassengerId    0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          0
Embarked       0
dtype: int64

In [59]:
test_data.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [60]:
#title
def gettitle(df):
    Title = []
    for i in range(len(df)):
        if df['Sex'][i] == 'male':
            Title.append('Mr')
        else:
            if df['Age'][i] >=30:
                Title.append('Mrs')
            else:
                Title.append('Miss')
    return Title

Title  = gettitle(test_data)
    
test_data['Title'] = Title
test_data.drop('Name',axis =1,inplace = True)

In [61]:
test_data.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title
0,892,3,male,34.5,0,0,330911,7.8292,,Q,Mr
1,893,3,female,47.0,1,0,363272,7.0,,S,Mrs
2,894,2,male,62.0,0,0,240276,9.6875,,Q,Mr
3,895,3,male,27.0,0,0,315154,8.6625,,S,Mr
4,896,3,female,22.0,1,1,3101298,12.2875,,S,Miss


In [62]:
test_data.head()

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Title
0,892,3,male,34.5,0,0,330911,7.8292,,Q,Mr
1,893,3,female,47.0,1,0,363272,7.0,,S,Mrs
2,894,2,male,62.0,0,0,240276,9.6875,,Q,Mr
3,895,3,male,27.0,0,0,315154,8.6625,,S,Mr
4,896,3,female,22.0,1,1,3101298,12.2875,,S,Miss


In [63]:
test_data = pd.get_dummies(test_data,columns=['Title'])

In [64]:
test_data = pd.get_dummies(test_data,columns=['Embarked','Sex'])

In [65]:
cabin_list = ['A', 'B', 'C', 'D', 'E', 'F', 'T', 'G']
deck =[]

for i in range(len(test_data["Cabin"])):
    for j in cabin_list :
        if j in test_data["Cabin"][i] :
            deck.append(j)
            break
    
    else:
        deck.append("None")

In [66]:
test_data['Deck'] = deck
test_data.drop('Cabin',axis=1,inplace=True)

In [67]:
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 16 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Age          418 non-null    float64
 3   SibSp        418 non-null    int64  
 4   Parch        418 non-null    int64  
 5   Ticket       418 non-null    object 
 6   Fare         418 non-null    float64
 7   Title_Miss   418 non-null    uint8  
 8   Title_Mr     418 non-null    uint8  
 9   Title_Mrs    418 non-null    uint8  
 10  Embarked_C   418 non-null    uint8  
 11  Embarked_Q   418 non-null    uint8  
 12  Embarked_S   418 non-null    uint8  
 13  Sex_female   418 non-null    uint8  
 14  Sex_male     418 non-null    uint8  
 15  Deck         418 non-null    object 
dtypes: float64(2), int64(4), object(2), uint8(8)
memory usage: 29.5+ KB


In [68]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 24 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Age          891 non-null    float64
 4   SibSp        891 non-null    int64  
 5   Parch        891 non-null    int64  
 6   Ticket       891 non-null    int16  
 7   Fare         891 non-null    float64
 8   Title_Miss   891 non-null    uint8  
 9   Title_Mr     891 non-null    uint8  
 10  Title_Mrs    891 non-null    uint8  
 11  Embarked_C   891 non-null    uint8  
 12  Embarked_Q   891 non-null    uint8  
 13  Embarked_S   891 non-null    uint8  
 14  Sex_female   891 non-null    uint8  
 15  Sex_male     891 non-null    uint8  
 16  Deck_A       891 non-null    uint8  
 17  Deck_B       891 non-null    uint8  
 18  Deck_C       891 non-null    uint8  
 19  Deck_D  

In [69]:
test_data = pd.get_dummies(test_data,columns=['Deck'])

In [70]:
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 23 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Age          418 non-null    float64
 3   SibSp        418 non-null    int64  
 4   Parch        418 non-null    int64  
 5   Ticket       418 non-null    object 
 6   Fare         418 non-null    float64
 7   Title_Miss   418 non-null    uint8  
 8   Title_Mr     418 non-null    uint8  
 9   Title_Mrs    418 non-null    uint8  
 10  Embarked_C   418 non-null    uint8  
 11  Embarked_Q   418 non-null    uint8  
 12  Embarked_S   418 non-null    uint8  
 13  Sex_female   418 non-null    uint8  
 14  Sex_male     418 non-null    uint8  
 15  Deck_A       418 non-null    uint8  
 16  Deck_B       418 non-null    uint8  
 17  Deck_C       418 non-null    uint8  
 18  Deck_D       418 non-null    uint8  
 19  Deck_E  

In [71]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 24 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Age          891 non-null    float64
 4   SibSp        891 non-null    int64  
 5   Parch        891 non-null    int64  
 6   Ticket       891 non-null    int16  
 7   Fare         891 non-null    float64
 8   Title_Miss   891 non-null    uint8  
 9   Title_Mr     891 non-null    uint8  
 10  Title_Mrs    891 non-null    uint8  
 11  Embarked_C   891 non-null    uint8  
 12  Embarked_Q   891 non-null    uint8  
 13  Embarked_S   891 non-null    uint8  
 14  Sex_female   891 non-null    uint8  
 15  Sex_male     891 non-null    uint8  
 16  Deck_A       891 non-null    uint8  
 17  Deck_B       891 non-null    uint8  
 18  Deck_C       891 non-null    uint8  
 19  Deck_D  

In [72]:
#Categorical data

for label, content in test_data.items():
    if pd.api.types.is_string_dtype(content):
        test_data[label] = content.astype("category").cat.as_ordered()

# Turn categorical variables into numbers
for label, content in test_data.items():
    # Check columns which *aren't* numeric
    if not pd.api.types.is_numeric_dtype(content):
        # We add the +1 because pandas encodes missing categories as -1
        test_data[label] = pd.Categorical(content).codes+1
    


In [73]:
test_data.head()

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Ticket,Fare,Title_Miss,Title_Mr,Title_Mrs,...,Sex_female,Sex_male,Deck_A,Deck_B,Deck_C,Deck_D,Deck_E,Deck_F,Deck_G,Deck_None
0,892,3,34.5,0,0,153,7.8292,0,1,0,...,0,1,0,0,0,0,0,0,0,1
1,893,3,47.0,1,0,222,7.0,0,0,1,...,1,0,0,0,0,0,0,0,0,1
2,894,2,62.0,0,0,74,9.6875,0,1,0,...,0,1,0,0,0,0,0,0,0,1
3,895,3,27.0,0,0,148,8.6625,0,1,0,...,0,1,0,0,0,0,0,0,0,1
4,896,3,22.0,1,1,139,12.2875,1,0,0,...,1,0,0,0,0,0,0,0,0,1


In [74]:
loaded_model = pickle.load(open('model.pkl', 'rb'))

In [75]:
loaded_model.predict(test_data)

array([0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1,
       1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0,
       1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1,
       0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1,
       0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0,
       0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,

In [76]:
result = pd.DataFrame({"PassengerId" : test_data["PassengerId"] , "Survived" : model.predict(test_data)})

In [77]:
result

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


In [78]:
result.to_csv("test_data_result.csv",index=False)