## Supervised Learning
## Project: Titanic: Machine Learning from Disaster (Kaggle)

This is the legendary Titanic ML competition – the best, first challenge for you to dive into ML competitions and familiarize yourself with how the Kaggle platform works.

The competition is simple: use machine learning to create a model that predicts which passengers survived the Titanic shipwreck.

So we will be implementing basic Techniques of Machine Learning to predict the Survival of passengers of Titanic

### 1. Data Exploration

In [484]:
#Importing all the necassary Libraries to be used in the Code
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score 

In [485]:
df = pd.read_csv('train.csv')
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [486]:
#To Identify if there are any Null values in the input Columns
df.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

### 2. Preparing the Data

In [487]:
#As we see Cabin column as majority as null values we will drop this column from the dataframe
df.drop(columns=['Cabin'],inplace=True)

In [488]:
#Updating the values of Fare and Age to mean value of the complete column
df['Fare'].fillna(df['Fare'].mean(),inplace = True)
df['Age'].fillna(df['Age'].mean(),inplace = True)
df.isnull().sum()

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Embarked       2
dtype: int64

In [489]:
#Now Two Columns are String with various Alphanumerics. We will create Count Vectorizer to create meaning full information from them
#First column is 'NAME'

count_vector_name = CountVectorizer(stop_words='english')

#This will give a matrix 0 and 1 according to the columns created

matrix_name = count_vector_name.fit_transform(df['Name'])

In [490]:
#Second column is 'Ticket'

count_vector_Ticket = CountVectorizer(stop_words='english')

#This also will give a matrix 0 and 1 according to the columns created

matrix_ticket = count_vector_Ticket.fit_transform(df['Ticket'])

In [491]:
#Now we will create a Dataframe from Vectorised data columns 'NAME'

matrix_name = count_vector_name.transform(df['Name']).toarray()
frequency_matrix_name = pd.DataFrame(matrix_name,columns=count_vector_name.get_feature_names())

In [493]:
#Similary  we will create a Dataframe from Vectorised data columns 'Ticket'

matrix_Ticket = count_vector_Ticket.transform(df['Name']).toarray()
frequency_matrix_Ticket = pd.DataFrame(matrix_Ticket,columns=count_vector_Ticket.get_feature_names())

In [494]:
#Here we will merge the Dataframe to original dataframe

df = pd.merge(df, frequency_matrix_name, how='inner', left_index=True, right_index=True)
df = pd.merge(df, frequency_matrix_Ticket, how='inner', left_index=True, right_index=True)

In [497]:
# As we have remove 'Name', 'Ticket' columns from the data frame

df.drop(columns=['Name','Ticket'],inplace=True)

In [499]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,aaron,...,o2,oq,paris,pc,pp,sc,sco,soton,ston,sw
0,1,0,3,male,22.0,1,0,7.25,S,0,...,0,0,0,0,0,0,0,0,0,0
1,2,1,1,female,38.0,1,0,71.2833,C,0,...,0,0,0,0,0,0,0,0,0,0
2,3,1,3,female,26.0,0,0,7.925,S,0,...,0,0,0,0,0,0,0,0,0,0
3,4,1,1,female,35.0,1,0,53.1,S,0,...,0,0,0,0,0,0,0,0,0,0
4,5,0,3,male,35.0,0,0,8.05,S,0,...,0,0,0,0,0,0,0,0,0,0


In [500]:
#Now creating an Index of dataframe using Passenger Ids

df.set_index('PassengerId', inplace =True)

In [501]:
df.isnull().sum()

Survived    0
Pclass      0
Sex         0
Age         0
SibSp       0
           ..
sc          0
sco         0
soton       0
ston        0
sw          0
Length: 2205, dtype: int64

### 3. Preprocessing the Data

In [502]:
#Now we will be Feature scaling certain features for better extraction of Information
scaler = MinMaxScaler()
numerical = ['Fare']
df[numerical] = scaler.fit_transform(df[numerical])

In [503]:
#Creating a Label and features separately
df_survived = df['Survived']
df.drop(columns=['Survived'],inplace=True)

In [None]:
#This step is One step Encoding to convert the dataframe in the machine readable format
df_final = pd.get_dummies(df)

### 4. Shuffle and Split Data

In [505]:
# Split the 'features' and 'income' data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df_final, 
                                                    df_survived,
                                                    random_state = 0)

### 5.  Evaluating Model Performance

In [507]:
#Here we will compute and compare accuracies of various Supervised Learning Algorithms and see which one performs the best
clf1 = DecisionTreeClassifier(random_state  = 10 )
clf2 = GaussianNB()
clf3 = RandomForestClassifier()
clf4 = LogisticRegression()

learner1 = clf1.fit(X_train , y_train)
learner2 = clf2.fit(X_train , y_train)
learner3 = clf3.fit(X_train , y_train)
learner4 = clf4.fit(X_train , y_train)

predict1 = learner1.predict(X_test)
predict2 = learner2.predict(X_test)
predict3 = learner3.predict(X_test)
predict4 = learner4.predict(X_test)

accuracy1=  accuracy_score(y_test,predict1 )
accuracy2=  accuracy_score(y_test,predict2 )
accuracy3=  accuracy_score(y_test,predict3 )
accuracy4=  accuracy_score(y_test,predict4 )
print (accuracy1)
print (accuracy2)
print (accuracy3)
print (accuracy4)

0.8116591928251121
0.5246636771300448
0.8295964125560538
0.8251121076233184


### 6. Implementing the Model 

In [508]:
#Here we read the Testing Dataset provided
testing_df = pd.read_csv('test.csv')
testing_df.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [509]:
def test_transform(testing_df):
    
    # Removing unwanted Columns
    
    testing_df.drop(columns=['Cabin'],inplace=True)
    
    # Updating missing column values
    
    testing_df['Age'].fillna(testing_df['Age'].mean(),inplace = True)
    testing_df['Fare'].fillna(testing_df['Fare'].mean(),inplace = True)
    
    # Creating Count Vectors for columns with string columns
    
    matrix_test_name = count_vector_name.transform(testing_df['Name']).toarray()
    matrix_test_ticket = count_vector_Ticket.transform(testing_df['Ticket']).toarray()
    
    # Creating dataframe for 'Name' and 'Ticket' columns which have been vectorised now
    
    frequency_matrix_test_name = pd.DataFrame(matrix_test_name,columns=count_vector_name.get_feature_names())
    frequency_matrix_test_Ticket = pd.DataFrame(matrix_test_ticket,columns=count_vector_Ticket.get_feature_names())
    
    # Merging the dataframes together
    
    testing_df = pd.merge(testing_df, frequency_matrix_test_name, how='inner', left_index=True, right_index=True)
    testing_df = pd.merge(testing_df, frequency_matrix_test_Ticket, how='inner', left_index=True, right_index=True)
    
    # Dropping Name and Ticket Columns
    
    testing_df.drop(columns=['Name','Ticket'],inplace=True)
    
    # One step encoding data
    
    testing_dummied = pd.get_dummies(testing_df)
    testing_dummied.set_index('PassengerId',inplace = True)
    
    return testing_dummied

In [510]:
testing_dummied = test_transform(testing_df)
testing_dummied

Unnamed: 0_level_0,Pclass,Age,SibSp,Parch,Fare,aaron,abbing,abbott,abelson,abraham,...,sc,sco,soton,ston,sw,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
892,3,34.50000,0,0,7.8292,0,0,0,0,0,...,0,0,0,0,0,0,1,0,1,0
893,3,47.00000,1,0,7.0000,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,1
894,2,62.00000,0,0,9.6875,0,0,0,0,0,...,0,0,0,0,0,0,1,0,1,0
895,3,27.00000,0,0,8.6625,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,1
896,3,22.00000,1,1,12.2875,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1305,3,30.27259,0,0,8.0500,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,1
1306,1,39.00000,0,0,108.9000,0,0,0,0,0,...,0,0,0,0,0,1,0,1,0,0
1307,3,38.50000,0,0,7.2500,0,0,0,0,0,...,0,0,1,0,0,0,1,0,0,1
1308,3,30.27259,0,0,8.0500,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,1


### 7. Predicting the values

In [512]:
test_predicted = learner4.predict(testing_dummied)
test_predicted = pd.DataFrame(test_predicted, columns = ['Survived'])

In [514]:
test_predicted.head()

Unnamed: 0,Survived
0,0
1,1
2,0
3,0
4,1


### 8. Creating output to be submitted

In [518]:
test_predicted.reset_index(inplace= True)
testing_dummied.reset_index(inplace= True)
new_df = pd.merge(testing_dummied, test_predicted,how = 'outer',left_on=None, right_on=None, left_index=True, right_index=True)
new_df.head()

Unnamed: 0,index_x,PassengerId,Pclass,Age,SibSp,Parch,Fare,aaron,abbing,abbott,...,ston,sw,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S,level_0,index_y,Survived
0,0,892,3,34.5,0,0,7.8292,0,0,0,...,0,0,0,1,0,1,0,0,0,0
1,1,893,3,47.0,1,0,7.0,0,0,0,...,0,0,1,0,0,0,1,1,1,1
2,2,894,2,62.0,0,0,9.6875,0,0,0,...,0,0,0,1,0,1,0,2,2,0
3,3,895,3,27.0,0,0,8.6625,0,0,0,...,0,0,0,1,0,0,1,3,3,0
4,4,896,3,22.0,1,1,12.2875,0,0,0,...,0,0,1,0,0,0,1,4,4,1


In [522]:
#Extracting only PassengerId and Survived Columns
final_df = new_df[['PassengerId','Survived']]

In [523]:
final_df.to_csv('output.csv', index = None)