# TITANIC PROJECT

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the training data; the path is relative to the current working directory.
titanic = pd.read_csv('titanic_train.csv')
print(titanic.shape)
# Last expression of the cell: renders the first 10 rows.
titanic.head(10)

In [None]:
# Load the hold-out file; the path is relative to the current working directory.
# NOTE(review): titanic_test is not referenced by any later cell visible in this
# notebook - confirm whether it is still needed.
titanic_test = pd.read_csv('titanic_test.csv')
print(titanic_test.shape)
titanic_test.head(10)

In [None]:
# Column names of the training frame.
titanic.columns.values

In [None]:
# Per-column dtype and non-null count summary.
titanic.info()

In [None]:
####Checking Null Values in Training Dataset

In [None]:
# Number of missing values in each column.
titanic.isnull().sum()

In [None]:
# Impute missing ages with the mean age. fillna() replaces the original
# replace(np.NaN, ...) call because the np.NaN alias was removed in NumPy 2.0.
titanic['Age'] = titanic['Age'].fillna(titanic['Age'].mean())

In [None]:
# Cabin is removed entirely; `columns=` already selects the axis, so no axis
# argument is needed.
titanic.drop(columns=['Cabin'], inplace=True)

In [None]:
# Frequency of each embarkation port (used to pick the fill value for gaps).
titanic['Embarked'].value_counts()

In [None]:
# Fill missing embarkation ports with 'S'. The result is assigned back to the
# column: calling fillna(inplace=True) on the intermediate Series
# titanic['Embarked'] is a chained in-place update that pandas warns about and
# that does not modify the frame under Copy-on-Write.
titanic['Embarked'] = titanic['Embarked'].fillna('S')

In [None]:
# Re-check for gaps after imputation: per-column null counts, then a heatmap of
# the boolean null mask.
null_mask = titanic.isnull()
print(null_mask.sum())
sns.heatmap(null_mask)

In [None]:
#### Checking For unique and non-unique Values

In [None]:
# Show the distinct values of each low-cardinality column, one line per column.
for column in ('Parch', 'Sex', 'Embarked', 'Survived'):
    print(titanic[column].unique())

In [None]:
# Preview the frame after the missing-value handling above.
titanic.head()

In [None]:
# Use PassengerId as the row index. This mutates `titanic` in place, so running
# the cell a second time raises KeyError (the column has already been moved
# into the index).
titanic.set_index('PassengerId',inplace=True)

In [None]:
# First five rows by position, now indexed by PassengerId.
titanic.iloc[:5]

In [None]:
#### Doing some feature engineering and selection:

In [None]:
# Number of relatives aboard = siblings/spouses + parents/children.
titanic['Family Size'] = titanic['SibSp'].add(titanic['Parch'])
titanic.head()

In [None]:
def family_type(num):
    """Turn a relatives-aboard count into a coarse family-size label.

    Args:
        num: Value compared against 0 and 1 (the SibSp + Parch sum in this
            notebook).

    Returns:
        'Alone' when num equals 0, 'Medium' when num equals 1, and 'Large'
        for every other value.
    """
    if num == 0:
        return 'Alone'
    # Exactly one relative is 'Medium'; anything else falls through to 'Large'.
    return 'Medium' if num == 1 else 'Large'

In [None]:
# Replace the numeric count with its label. Not re-runnable: a second pass
# would feed the labels back through family_type and yield 'Large' everywhere.
titanic['Family Size'] = titanic['Family Size'].map(family_type)

In [None]:
# SibSp and Parch are folded into 'Family Size'; Name is dropped as well.
# `columns=` already selects the axis, so no axis argument is needed.
titanic.drop(columns=['SibSp', 'Parch', 'Name'], inplace=True)

In [None]:
# Preview the frame after adding 'Family Size' and dropping its source columns.
titanic.head()

In [None]:
# Binary-encode Sex: 'female' -> 0, every other value -> 1.
titanic['Sex'] = (titanic['Sex'] != 'female').astype('int64')

In [None]:
# Class balance of the encoded Sex column (0 = female, 1 = everything else).
titanic['Sex'].value_counts()

In [None]:
# Preview the frame after encoding Sex.
titanic.head()

In [None]:
#### Removing Outliers from the dataset:
# Start with summary statistics of the numeric columns to see their ranges.
titanic.describe()

In [None]:
# Box plot of Age to eyeball potential outliers.
figure=titanic.boxplot(column="Age")

In [None]:
# Summary statistics for Age (after mean imputation of the missing values).
titanic['Age'].describe()

In [None]:
# Histogram of Age in 10 bins, with a title and axis labels.
figure=titanic.Age.hist(bins=10)
figure.set_title('Age')
figure.set_xlabel('Age')
# Last expression of the cell, so its return value is echoed as the cell output.
figure.set_ylabel('No of passenger')

In [None]:
##### Assuming Age follows a Gaussian distribution, values further than three
##### standard deviations from the mean are treated as outliers.
age_mean = titanic['Age'].mean()
age_std = titanic['Age'].std()

# The misspelt name `uppper_boundary` is kept so any cell relying on it still works.
uppper_boundary = age_mean + 3 * age_std
lower_boundary = age_mean - 3 * age_std

# One print per statement: the original comma-joined prints formed a tuple, so
# the cell also echoed a stray (None, None, None) as its output.
print(lower_boundary)
print(uppper_boundary)
print(age_mean)

In [None]:
# Cap Age at 73: every value at or above the threshold becomes exactly 73.
# NOTE(review): 73 is hard-coded rather than read from the computed boundary.
titanic['Age'] = titanic['Age'].clip(upper=73)

In [None]:
# Quick look at the Age column after capping.
titanic['Age'].head()

In [None]:
# Histogram of Fare in 10 bins, with a title and axis labels.
figure=titanic['Fare'].hist(bins=10)
figure.set_title('Fare')
figure.set_xlabel('Fare')
# Last expression of the cell, so its return value is echoed as the cell output.
figure.set_ylabel('No of passenger')

In [None]:
#### Compute the interquartile range of Fare; it sets the outlier boundaries.
fare_q1, fare_q3 = titanic['Fare'].quantile([0.25, 0.75])
IQR = fare_q3 - fare_q1

In [None]:
# Standard Tukey fences: 1.5 * IQR beyond the first and third quartiles.
lower_bridge = titanic['Fare'].quantile(0.25) - (IQR * 1.5)
upper_bridge = titanic['Fare'].quantile(0.75) + (IQR * 1.5)
# One print per statement: the original `print(a), print(b)` formed a tuple,
# so the cell also echoed a stray (None, None) as its output.
print(lower_bridge)
print(upper_bridge)

In [None]:
#### Extreme outliers
# Wider fences at 3 * IQR; this overwrites lower_bridge / upper_bridge from the
# 1.5 * IQR computation.
lower_bridge = titanic['Fare'].quantile(0.25) - (IQR * 3)
upper_bridge = titanic['Fare'].quantile(0.75) + (IQR * 3)
# One print per statement: the original `print(a), print(b)` formed a tuple,
# so the cell also echoed a stray (None, None) as its output.
print(lower_bridge)
print(upper_bridge)

In [None]:
# Cap Fare at 100: every value at or above the threshold becomes exactly 100.
# NOTE(review): 100 is hard-coded rather than read from upper_bridge.
titanic['Fare'] = titanic['Fare'].clip(upper=100)

In [None]:
# Preview the frame after capping Age and Fare.
titanic.head()

In [None]:
#### Descriptive Statistics EDA:

In [None]:
#### Checking how many people lost their lives.
# `x=` is passed by keyword: since seaborn 0.12 the first positional argument
# of countplot is `data`, so a bare Series is no longer read as the x variable.
sns.countplot(x='Survived', data=titanic)

# Count deaths explicitly (Survived == 0) rather than trusting
# value_counts().values[0] to be the "died" class, and derive the denominator
# from the frame instead of hard-coding 891.
total_passengers = len(titanic)
died_count = int((titanic['Survived'] == 0).sum())
death_percent = round((died_count / total_passengers) * 100)
# The original message printed the percentage as if it were a head-count.
print("out of {} passengers, {} ({}%) died in the accident".format(total_passengers, died_count, death_percent))

In [None]:
#### Checking Pclass Percentages
# `x=` by keyword: seaborn >= 0.12 treats the first positional argument as `data`.
sns.countplot(x='Pclass', data=titanic)
# Share of passengers per class, using the actual row count rather than a
# hard-coded 891.
print((titanic['Pclass'].value_counts() / len(titanic)) * 100)

In [None]:
#### Checking Male and Female Percentages
# `x=` by keyword: seaborn >= 0.12 treats the first positional argument as `data`.
sns.countplot(x='Sex', data=titanic)
# Share of each encoded Sex value (0 = female, 1 = otherwise), using the actual
# row count rather than a hard-coded 891.
print((titanic['Sex'].value_counts() / len(titanic)) * 100)

In [None]:
# Passengers per embarkation port.
# `x=` by keyword: seaborn >= 0.12 treats the first positional argument as `data`.
sns.countplot(x='Embarked', data=titanic)
# Share per port, using the actual row count rather than a hard-coded 891.
print((titanic['Embarked'].value_counts() / len(titanic)) * 100)

In [None]:
#### Doing Some More Analysis:

In [None]:
# Passengers strictly older than 60 and strictly younger than 70.
age_60_70 = (titanic['Age'] > 60) & (titanic['Age'] < 70)
print("People with age inbetween 60 and 70 are", int(age_60_70.sum()))

In [None]:
# Passengers strictly older than 70 and strictly younger than 75. The label now
# says "inbetween" to match the filter (it previously read "greater than 70
# and 75").
age_70_75 = (titanic['Age'] > 70) & (titanic['Age'] < 75)
print("People with age inbetween 70 and 75 are", int(age_70_75.sum()))

In [None]:
# Passengers strictly older than 20 and strictly younger than 50.
age_20_50 = (titanic['Age'] > 20) & (titanic['Age'] < 50)
print("People with age inbetween 20 to 50 are", int(age_20_50.sum()))

In [None]:
# NOTE(review): an earlier cell caps Age at 73
# (titanic.loc[titanic['Age']>=73,'Age']=73), so when the notebook is run top
# to bottom this count is always 0. Run it on the uncapped data if the real
# figure is wanted.
print("People with age greater than 75 are",titanic[titanic['Age']>75].shape[0])

In [None]:
# Infants: passengers whose recorded age is below 1.
under_one = titanic['Age'] < 1
print("children with age inbetween 0 to 1 are", int(under_one.sum()))

In [None]:
# NOTE(review): an earlier cell caps Fare at 100
# (titanic.loc[titanic['Fare']>=100,'Fare']=100), so when the notebook is run
# top to bottom both counts below are always 0. Run them on the uncapped data
# if the real figures are wanted.
print("People with fare inbetween $200 and $300 are",titanic[(titanic['Fare']>200) & (titanic['Fare']<300)].shape[0])
print("People with fare greater than $300 are",titanic[titanic['Fare']>300].shape[0])

In [None]:
#### Checking Gender wise people who survived.
# Variables are passed by keyword: seaborn >= 0.12 treats the first positional
# argument of countplot as `data`, not as the x variable.
sns.countplot(x='Survived', hue='Sex', data=titanic)
# Row-wise survival percentages.
# NOTE(review): the plot is split by Sex but this table is grouped by Pclass -
# confirm which grouping was intended.
pd.crosstab(titanic['Pclass'], titanic['Survived']).apply(lambda r: round((r/r.sum())*100,1),axis=1)

In [None]:
# Passengers per embarkation port, split by Sex.
# Variables are passed by keyword: seaborn >= 0.12 treats the first positional
# argument of countplot as `data`, not as the x variable.
sns.countplot(x='Embarked', hue='Sex', data=titanic)
# Row-wise survival percentages per embarkation port.
pd.crosstab(titanic['Embarked'], titanic['Survived']).apply(lambda r: round((r/r.sum())*100,1),axis=1)

In [None]:
#### People survival on basis of Age:
plt.figure(figsize=(15,6))
# Label each curve and draw a legend; without them the two overlaid
# distributions cannot be told apart.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases; consider
# histplot/kdeplot once the minimum seaborn version is known.
sns.distplot(titanic[titanic['Survived'] == 0]['Age'], label='Did not survive')
sns.distplot(titanic[titanic['Survived'] == 1]['Age'], label='Survived')
plt.legend();

In [None]:
# Number of passengers with Survived == 0.
not_survived = titanic['Survived'] == 0
print("people who didnt survived:", int(not_survived.sum()))

In [None]:
# Number of passengers with Survived == 1.
survived = titanic['Survived'] == 1
print("People who survived:", int(survived.sum()))

In [None]:
# Fare distribution for non-survivors vs survivors.
plt.figure(figsize=(15,6))
# Label each curve and draw a legend; without them the two overlaid
# distributions cannot be told apart.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases; consider
# histplot/kdeplot once the minimum seaborn version is known.
sns.distplot(titanic[titanic['Survived'] == 0]['Fare'], label='Did not survive')
sns.distplot(titanic[titanic['Survived'] == 1]['Fare'], label='Survived')
plt.legend();

In [None]:
# The label talks about fare, but the original printed the number of rows
# (.shape[0]). Report the mean fare of non-survivors so the value matches the label.
print("Mean fare of the people who didnt survive :", titanic.loc[titanic['Survived'] == 0, 'Fare'].mean())

In [None]:
# The label talks about fare, but the original printed the number of rows
# (.shape[0]). Report the mean fare of survivors so the value matches the label.
print("Mean fare of the people who did survive :", titanic.loc[titanic['Survived'] == 1, 'Fare'].mean())

In [None]:
# Survival counts per family-size bucket.
# Variables are passed by keyword: seaborn >= 0.12 treats the first positional
# argument of countplot as `data`, not as the x variable.
sns.countplot(x='Family Size', hue='Survived', data=titanic)
# Row-wise survival percentages per family-size bucket.
pd.crosstab(titanic['Family Size'], titanic['Survived']).apply(lambda r: round((r/r.sum())*100,1),axis=1)

In [None]:
#### Checking the correlations:
# Restrict to numeric columns first: Ticket, Embarked and Family Size still
# hold strings at this point, and DataFrame.corr() raises on non-numeric
# columns in pandas >= 2.0 (older versions silently skipped them).
corrmat = titanic.select_dtypes(include='number').corr()
top_corr_features = corrmat.index
plt.figure(figsize=(20,20))
# Plot the matrix computed above instead of recomputing the same correlation.
g=sns.heatmap(corrmat,annot=True,cmap="RdYlGn")

In [None]:
#### Dropping unnecessary columns
# `columns=` already selects the axis, so no axis argument is needed.
# NOTE(review): this also discards the engineered 'Family Size' feature.
titanic.drop(columns=['Ticket', 'Embarked', 'Family Size'], inplace=True)

In [None]:
####Label Encoding

In [None]:
from sklearn.preprocessing import LabelEncoder
# Columns whose values are replaced by integer codes.
cols = ["Age","Fare"]
# A single encoder is re-fitted for each column, so after the loop `le` only
# holds the classes of the last column ("Fare").
le = LabelEncoder()
for col in cols:
    # NOTE(review): Age and Fare are continuous; LabelEncoder maps each distinct
    # value to its sorted position, and sklearn documents it as a target (y)
    # encoder. Confirm this is intended rather than scaling or binning.
    titanic[col] = le.fit_transform(titanic[col])

In [None]:
#### Selecting dependent and independent variables
# Target is the Survived column; features are every remaining column.
y = titanic['Survived']
X = titanic.drop(columns=['Survived'])
print(X.shape)
print(y.shape)

In [None]:
#### Doing Training Testing split

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 25% of the rows for evaluation; random_state pins the shuffle so the
# split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 5)

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report,confusion_matrix

In [None]:
# Candidate estimators and the hyper-parameter grid searched for each one.
# Each entry maps a display name to {'model': estimator, 'param': grid}.
model_param = {
    'DecisionTreeClassifier': {
        # Seeded so that ties between equally good splits are broken the same
        # way on every run; without it the grid-search scores can differ
        # between executions.
        'model': DecisionTreeClassifier(random_state=5),
        'param': {
            'criterion': ['gini', 'entropy'],
            'max_depth': [1, 2, 3, 4, 5, 6, 7],
            'min_samples_split': [2, 5, 10, 15, 100],
            'min_samples_leaf': [1, 2, 5, 10]
        }
    },
    'KNeighborsClassifier': {
        'model': KNeighborsClassifier(),
        'param': {
            'n_neighbors': [5, 10, 15, 20, 25]
        }
    },
    'SVC': {
        'model': SVC(),
        'param': {
            'kernel': ['rbf', 'linear', 'sigmoid'],
            'C': [0.1, 1, 10, 100]
        }
    }
}

In [None]:
import warnings
warnings.filterwarnings('ignore')

# Run a 5-fold grid search for every candidate and record its best
# cross-validated score together with the winning parameter combination.
scores = []
for model_name, mp in model_param.items():
    model_selection = GridSearchCV(
        estimator=mp['model'],
        param_grid=mp['param'],
        cv=5,
        return_train_score=False,
    )
    model_selection.fit(X_train, y_train)
    scores.append(dict(
        model=model_name,
        best_score=model_selection.best_score_,
        best_params=model_selection.best_params_,
    ))

In [None]:
# Tabulate the grid-search results: one row per model with its best
# cross-validated score and the parameters that achieved it.
titanic_model_score = pd.DataFrame(scores,columns=['model','best_score','best_params'])
titanic_model_score

In [None]:
# Fit the final model and evaluate it on the held-out split.
# NOTE(review): C and kernel are hard-coded rather than read from the grid
# search's best_params_ - confirm they still match the search result.
svc = SVC(C=0.1,kernel='linear')
svc.fit(X_train,y_train)
# The training score was previously computed and discarded; print it so it can
# be compared with the test accuracy (a large gap indicates over-fitting).
print("train accuracy_score:", svc.score(X_train,y_train))
y_pred= svc.predict(X_test)
print("accuracy_score:",accuracy_score(y_test,y_pred))
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))

In [None]:
# Confusion matrix of the test-set predictions, drawn as an annotated heatmap.
cm = confusion_matrix(y_test,y_pred)
# fmt='d' prints the cell counts as integers; the default format switches to
# scientific notation for counts of 100 or more.
ax = sns.heatmap(cm, annot=True, fmt='d')
# confusion_matrix puts true labels on rows and predicted labels on columns.
ax.set_xlabel('Predicted label')
ax.set_ylabel('True label');