<a href="https://www.kaggle.com/code/drewftw260/machine-learning-titanic-resubmisssion?scriptVersionId=175109670" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [None]:
#!/usr/bin/env python
# coding: utf-8

# In[1]:


## Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


# In[2]:


# Load train.csv (path is relative to the working directory) into `ttrn`
# and preview the first 20 rows.
ttrn = pd.read_csv("train.csv")
ttrn.head(20)


# In[3]:


## Getting information about the dataset (output of DataFrame.info())
ttrn.info()


# In[4]:


## Number of rows and columns, as a (rows, columns) tuple
ttrn.shape


# In[5]:


## Summary statistics from DataFrame.describe()
ttrn.describe()


# # Notes:
# Survived = 1
# Deceased = 0
# The Cabin and Age columns have missing values.
# SibSp = Siblings/spouse
# Parch = Parent/children
# Embarked: C = Cherbourg, Q = Queenstown, S = Southampton

# ## - Data Cleaning - 

# In[6]:


## Dropping the columns that are not used as model features:
## Cabin, Name, PassengerId and Ticket
ttrn.drop(['Cabin', 'Name', 'PassengerId', 'Ticket'], axis = 1, inplace = True)


# In[7]:


ttrn.head()


# In[8]:


## Mean of the Age column (used below to fill missing ages);
## missing Embarked values are filled with 'S'
mn = ttrn['Age'].mean()
ttrn['Embarked'] = ttrn['Embarked'].fillna('S')


# In[9]:


print(mn)


# In[10]:


## Assign the filled column back to the frame. Calling
## fillna(inplace=True) on the ttrn['Age'] selection is a chained in-place
## update: pandas 2.x warns about it and, with Copy-on-Write enabled, it
## leaves ttrn unchanged.
ttrn['Age'] = ttrn['Age'].fillna(mn)


# In[11]:


ttrn.head()


# In[12]:


# Changing float values to int (the cast drops the fractional part)
ttrn['Age'] = ttrn['Age'].astype('int64')
ttrn['Fare'] = ttrn['Fare'].astype('int64')


# In[13]:


ttrn.head()


# # - Data Visualization -

# In[14]:


# Passenger count per outcome: 1 = survived, 0 = died
ttrn['Survived'].value_counts()


# In[15]:


# Bar chart of the outcome counts
ttrn['Survived'].value_counts().plot(kind = 'bar', color = 'yellow')


# In[16]:


# Outcome counts for each gender
ttrn.groupby('Sex')['Survived'].value_counts().plot.bar()


# More females survived and more males died.

# In[17]:


# Outcome counts for each passenger class
ttrn.groupby('Pclass')['Survived'].value_counts().plot.bar(color = 'green')


# More people survived in first class and more people died in third class.

# In[18]:


# Outcome counts by number of siblings/spouses aboard
ttrn.groupby('SibSp')['Survived'].value_counts().plot.bar(color = 'pink')


# People with 2 or more siblings or spouses had a higher chance of surviving than people with 0 to 1 siblings/spouses.

# In[19]:


# Outcome counts by number of parents/children aboard
ttrn.groupby('Parch')['Survived'].value_counts().plot.bar(color = 'purple')


# People with 2 or more children had a higher survival chance than those with 0 to 1.

# In[20]:


# Outcome counts by port of embarkation
ttrn.groupby('Embarked')['Survived'].value_counts().plot.bar(color = 'red')


# People who embarked from Cherbourg have a higher survival rate than those from the other ports.

# # - Feature Engineering -

# In[21]:


# Replace the text categories with integer codes; Embarked is additionally
# cast to int64 in the same expression.
ttrn['Sex'] = ttrn['Sex'].map({'male':1, 'female': 2})
ttrn['Embarked'] = ttrn['Embarked'].map({'S':1, 'C':2, 'Q':3}).astype('int64')


# In[22]:


from sklearn.model_selection import train_test_split


# In[23]:


# Target is the Survived column; features are every remaining column.
y = ttrn['Survived']
x = ttrn.drop(columns = 'Survived')


# In[24]:


## Splitting the data: half of the rows are held out for evaluation,
## with a fixed random_state so the split is repeatable
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size = 0.5,
    random_state = 1,
)


# In[25]:


from sklearn.linear_model import LogisticRegression


# In[26]:


ttrn.head()


# In[27]:


# Fit a logistic-regression classifier with default settings, then show its
# score on the held-out split. `mdl` is reused later for the submission.
mdl = LogisticRegression().fit(x_train, y_train)
mdl.score(x_test, y_test)


# In[28]:


from sklearn.svm import SVC


# In[29]:


# Fit a support-vector classifier with default settings and show its score
# on the held-out split.
svc = SVC().fit(x_train, y_train)
svc.score(x_test, y_test)


# In[30]:


from sklearn.neighbors import KNeighborsClassifier


# In[31]:


# Fit a 3-nearest-neighbours classifier and show its score on the held-out
# split.
kn = KNeighborsClassifier(n_neighbors = 3).fit(x_train, y_train)
kn.score(x_test, y_test)


# In[32]:


# Logistic-regression predictions for the held-out split
pred = mdl.predict(X = x_test)


# In[33]:


from sklearn import metrics


# In[34]:


# Accuracy of those predictions against the held-out labels
print('Accuracy:', metrics.accuracy_score(y_true = y_test, y_pred = pred))


# ## - Importing and Processing - 

# In[35]:


# Load test.csv (path is relative to the working directory) into `ttst`
ttst = pd.read_csv('test.csv')


# In[36]:


ttst.head()


# In[37]:


# Missing values per column
ttst.isna().sum()


# In[38]:


# Keep the passenger ids: process() drops this column, but the submission
# file needs it.
pID = ttst['PassengerId'].to_numpy()


# In[39]:


# Number of rows in the test set
ttst.shape[0]


# In[40]:


def process(tst, age_fill=None):
    """Prepare a raw passenger frame for prediction.

    Applies the same steps the notebook applies to the training data: drops
    the Cabin, Name, PassengerId and Ticket columns, fills missing Embarked,
    Age and Fare values, casts Age and Fare to int64 and encodes Sex and
    Embarked as integers.

    Parameters
    ----------
    tst : pandas.DataFrame
        Frame holding at least the Cabin, Name, PassengerId, Ticket, Age,
        Fare, Sex and Embarked columns. It is left unmodified.
    age_fill : float, optional
        Value used for missing ages. When omitted, the mean of ``tst['Age']``
        is used. Pass the value the training data was imputed with to treat
        both sets identically.

    Returns
    -------
    pandas.DataFrame
        A new frame containing the remaining, cleaned columns.

    Raises
    ------
    KeyError
        If one of the four dropped columns is missing from ``tst``.
    """
    # drop() without inplace returns a new frame, so the caller's data is
    # never mutated.
    out = tst.drop(columns=['Cabin', 'Name', 'PassengerId', 'Ticket'])

    # Fill missing values. Columns are assigned back rather than updated with
    # fillna(inplace=True) on a column selection: that chained form warns in
    # pandas 2.x and does nothing under Copy-on-Write.
    if age_fill is None:
        age_fill = out['Age'].mean()
    out['Embarked'] = out['Embarked'].fillna('S')
    out['Age'] = out['Age'].fillna(age_fill)
    out['Fare'] = out['Fare'].fillna(0)

    # Changing float to int (the cast drops the fractional part)
    out['Age'] = out['Age'].astype('int64')
    out['Fare'] = out['Fare'].astype('int64')

    # Mapping the text categories to integer codes
    out['Sex'] = out['Sex'].map({'male':1, 'female': 2})
    out['Embarked'] = out['Embarked'].map({'S':1, 'C':2, 'Q':3})
    out['Embarked'] = out['Embarked'].astype('int64')

    return out
    
    


# In[41]:


# Missing values per column before preprocessing
ttst.isna().sum()


# In[42]:


# Clean the test frame with process()
ttst = process(ttst)


# In[43]:


ttst.head()


# In[44]:


# Predict with the logistic-regression model fitted earlier
y_prediction = mdl.predict(X = ttst)


# In[45]:


# Pair each saved passenger id with its predicted outcome
tsub = pd.DataFrame({'PassengerId':pID}).assign(Survived = y_prediction)


# In[46]:


tsub.head()


# In[47]:


# Write the submission file without the index column
tsub.to_csv('Titanic_resub.csv', index = False)


# In[ ]: