In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

In [None]:
train = pd.read_csv('/kaggle/input/train.csv')
test = pd.read_csv('/kaggle/input/test.csv')
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import statistics


In [None]:
# Only the last expression of a cell is displayed, so the shape is printed
# explicitly and the preview is left as the final expression; previously the
# preview was computed and silently discarded.
print(train.shape)
# Column notes:
#   Parch - total count of parents/children
#   SibSp - total count of siblings/spouses
train.head()

In [None]:
# Problem statement: based on the given features, predict whether each
# passenger survived.
# Exploratory data analysis starts with the fraction of null values per column.
train.isna().mean()
# Cabin > Age > Embarked are the only columns that contain null values.


In [None]:
# In the boolean mask, True marks a null value. Reading that mask directly is
# impractical for a large dataset (and the notebook truncates the rows it
# shows), so it is drawn as a heatmap instead.
null_mask = train.isnull()
fig, ax = plt.subplots(figsize=(8, 7))
# cbar=False: the gradient bar is hidden because intensity carries no meaning here.
# yticklabels=False: row numbers are not needed.
sns.heatmap(null_mask, yticklabels=False, cbar=False, ax=ax)

In [None]:
#Interpretation: 1. The coloured lines show the null values. 2. Most of the null values are in the Age and Cabin columns, with a few in the Embarked column.

In [None]:
# Percentage of rows whose Age is missing. The denominator comes from the
# frame itself rather than a hard-coded row count, so the figure stays correct
# if the data changes.
n_age_missing = train['Age'].isnull().sum()
c = n_age_missing / len(train) * 100
print(c)
# Almost 20% of the Age data is missing, so some form of imputation can be applied.

In [None]:
# A count plot is a histogram over a categorical (rather than quantitative)
# variable; here it shows survival counts split by sex.
sns.set_style('whitegrid')
fig, ax = plt.subplots(figsize=(8, 7))
sns.countplot(data=train, x='Survived', hue='Sex', palette='RdBu_r', ax=ax)

In [None]:
# Survival counts split by passenger class.
# Reference: https://seaborn.pydata.org/generated/seaborn.countplot.html
plt.figure(figsize=(7, 7))
sns.set_style('whitegrid')
# Grab the axes only after the style is set, then draw on it explicitly.
class_axes = plt.gca()
sns.countplot(data=train, x='Survived', hue='Pclass', palette='rainbow', ax=class_axes)

In [None]:
#Since the percentage of missing Age values is small, we can drop the nulls when plotting the distribution.
#https://seaborn.pydata.org/generated/seaborn.distplot.html
#sns.distplot(train['Age'].dropna(),kde=False,color='darkblue',bins=40)
#Here we look at the distribution of Age to get a sense of the typical passenger age. On observation, this plot looks roughly like a normal distribution.
#kde=False because we do not want the probability density curve drawn over the histogram.

In [None]:
# Number of passengers for each SibSp value.
sns.countplot(data=train, x='SibSp')

In [None]:
# Histogram of ticket fares using 40 bins.
fare = train['Fare']
fare.hist(bins=40, color='green', figsize=(8, 4))

In [None]:
# Impute missing ages with the median learned from the training data only.
# The same value is applied to the test set so that no statistic is derived
# from test data and both splits are preprocessed identically.
age_median = train['Age'].median()
train['Age'] = train['Age'].fillna(age_median)
test['Age'] = test['Age'].fillna(age_median)

In [None]:
# Re-check the fraction of null values per column after filling Age.
train.isna().mean()

In [None]:
# Remove the Cabin column from both splits.
train = train.drop(columns='Cabin')
test = test.drop(columns='Cabin')

In [None]:
# Preview the first five rows of the training frame.
train.head(5)

In [None]:
# Converting categorical features.
# Categorical features need to be converted to dummy variables; the summary
# printed by info() lists each column with its dtype and non-null count.
train.info()

In [None]:
# One-hot encode Sex (first level dropped) and Embarked on the training data.
sex_train = pd.get_dummies(train['Sex'], drop_first=True)
embark_train = pd.get_dummies(train['Embarked'])

# Encode the test data against the training columns. Encoding each split
# independently yields mismatched columns whenever a category is absent from
# (or only present in) one split; reindexing drops unseen levels and fills
# absent ones with 0 so both splits share the same dummy columns.
sex_test = pd.get_dummies(test['Sex']).reindex(columns=sex_train.columns, fill_value=0)
embark_test = pd.get_dummies(test['Embarked']).reindex(columns=embark_train.columns, fill_value=0)

In [None]:
# Remove the raw categorical/text columns from both splits; the encoded
# replacements are attached in the next step.
raw_text_cols = ['Sex', 'Embarked', 'Name', 'Ticket']
train = train.drop(columns=raw_text_cols)
test = test.drop(columns=raw_text_cols)

In [None]:
# Append the dummy columns to each split, side by side (column-wise).
train_parts = [train, sex_train, embark_train]
test_parts = [test, sex_test, embark_test]
train = pd.concat(train_parts, axis=1)
test = pd.concat(test_parts, axis=1)

In [None]:
# Preview the first five rows of the processed test frame.
test.head(5)

In [None]:
# First few values of the target column.
target_preview = train['Survived']
target_preview.head()

In [None]:
from sklearn.model_selection import train_test_split

# Separate the feature matrix from the Survived target.
X_train = train.drop(columns='Survived')
Y_train = train['Survived']
# Fraction of null values remaining in each feature column.
X_train.isna().mean()

In [None]:
# Fill missing test fares with the mean fare of the training data, so the
# imputation value is not derived from the test set itself.
test["Fare"] = test["Fare"].fillna(train["Fare"].mean())
# Fraction of null values remaining in each test column.
test.isnull().mean()


In [None]:
from sklearn.linear_model import LogisticRegression

# PassengerId is a row identifier, not a passenger attribute, so it is kept
# out of the model inputs (it remains in `test` for the submission file).
feature_cols = [col for col in X_train.columns if col != 'PassengerId']

# The features are unscaled, so the solver gets more iterations than the
# default 100 to reduce the risk of stopping before convergence.
model = LogisticRegression(max_iter=1000)
model.fit(X_train[feature_cols], Y_train)

# Select the test columns by name so they match the training feature set and order.
predictions = model.predict(test[feature_cols])
predictions

In [None]:
# Pair each test PassengerId with its predicted survival label.
ids = test['PassengerId']
# Fail loudly if the ids and predictions are misaligned; the former bare
# len(...) expressions were neither displayed nor checked.
assert len(ids) == len(predictions), "ids and predictions differ in length"

# Assemble the output as a dataframe and write it to a csv file named submission.csv.
output = pd.DataFrame({'PassengerId': ids, 'Survived': predictions})
output.to_csv('submission.csv', index=False)