In [None]:
import pandas as pd
import numpy as np
import random as rd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Read the Titanic train and test CSVs from the working directory.
train_df, test_df = (
    pd.read_csv(csv_name)
    for csv_name in ('train_Titanic.csv', 'test_Titanic.csv')
)
# Keep both frames in one list so per-frame steps can loop over them.
combine = [train_df, test_df]

In [None]:
# (rows, columns) of the training frame.
print(train_df.shape)

In [None]:
# (rows, columns) of the test frame.
print(test_df.shape)

In [None]:
# Column labels available in the training frame.
print(train_df.columns)

In [None]:
# Categorical: Survived, Sex, Embarked, Pclass | Ordinal: Pclass

# Numerical: Age, Fare, SibSp, Parch | Continuous: Age, Fare | Discrete: SibSp, Parch

In [None]:
# Ticket - alphanumeric
# Cabin - alphanumeric

In [None]:
# First rows of the training frame.
train_df.head()

In [None]:
# Last rows of the training frame.
train_df.tail()

In [None]:
# Per-column dtype and non-null count for the training frame.
train_df.info()

In [None]:
# Per-column dtype and non-null count for the test frame.
test_df.info()

In [None]:
# Summary statistics (count, mean, std, quartiles, ...) for the training frame.
train_df.describe()

In [None]:
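# Normal-distribution rule of thumb (share of data below each point):
#   mean   -> 50%
#   +1 std -> 50% + 34.1%
#   +2 std -> 50% + 34.1% + 13.6%
# About 95% of the data lies within 2 standard deviations of the mean.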

In [None]:
# Summary of the object-dtype columns only ('O' selects object dtype).
train_df.describe(include =['O'])

In [None]:
# Number of rows per distinct Embarked value.
train_df['Embarked'].value_counts()

In [None]:
# Number of rows per distinct Sex value.
train_df['Sex'].value_counts()

In [None]:
# Number of rows per distinct Cabin value.
train_df['Cabin'].value_counts()

In [None]:
# Survival overview: share of each outcome (pie, left) next to raw counts (bars, right).
f, ax = plt.subplots(1, 2, figsize=(18, 8))

train_df['Survived'].value_counts().plot.pie(
    explode=[0, 0.1],
    autopct='%1.1f%%',
    ax=ax[0],
    shadow=True,
)
ax[0].set_title('Survived')
ax[0].set_ylabel('')
# Pass the column as `x=`: recent seaborn releases reject positional data arguments.
sns.countplot(x='Survived', data=train_df, ax=ax[1])
ax[1].set_title('Survived')
plt.show()


In [None]:
# Categorical - nominal
# Ordinal
# Continuous

In [None]:
# Row count for every (Sex, Survived) combination.
train_df.groupby(['Sex', 'Survived'])['Survived'].count()

In [None]:
# Left: mean of Survived per Sex. Right: passenger counts per Sex, split by Survived.
f, ax = plt.subplots(1, 2, figsize=(18, 8))
train_df[['Sex', 'Survived']].groupby(['Sex']).mean().plot.bar(ax=ax[0])
ax[0].set_title('Survived vs Sex')
# Pass the column as `x=`: recent seaborn releases reject positional data arguments.
sns.countplot(x='Sex', hue='Survived', data=train_df, ax=ax[1])
ax[1].set_title('Sex:Survived vs Dead')
plt.show()

## Extend this to other columns - 

In [None]:
# Counts of (Sex, Survived) rows against Pclass columns, with totals,
# shaded with the reversed 'summer' colormap.
pd.crosstab(
    index=[train_df['Sex'], train_df['Survived']],
    columns=train_df['Pclass'],
    margins=True,
).style.background_gradient(cmap='summer_r')

In [None]:
# Mean survival per Pclass, one line per Sex.
# `factorplot` was renamed to `catplot` and later removed from seaborn;
# kind='point' keeps the point-plot style that factorplot drew by default.
sns.catplot(x='Pclass', y='Survived', hue='Sex', data=train_df, kind='point')
plt.show()

In [None]:
# Mean of Survived per Pclass, highest first.
(
    train_df[['Pclass', 'Survived']]
    .groupby(['Pclass'], as_index=False)
    .mean()
    .sort_values(by='Survived', ascending=False)
)

In [None]:
# Age distributions split by Survived: per Pclass (left) and per Sex (right).
f, ax = plt.subplots(1, 2, figsize=(18, 8))
# x/y are passed by keyword: recent seaborn releases reject positional data arguments.
sns.violinplot(x="Pclass", y="Age", hue="Survived", data=train_df, split=True, ax=ax[0])
ax[0].set_title('Pclass and Age vs Survived')
ax[0].set_yticks(range(0, 110, 10))
sns.violinplot(x="Sex", y="Age", hue="Survived", data=train_df, split=True, ax=ax[1])
ax[1].set_title('Sex and Age vs Survived')
ax[1].set_yticks(range(0, 110, 10))
plt.show()

In [None]:
# Mean of Survived per SibSp value, highest first.
(
    train_df[['SibSp', 'Survived']]
    .groupby(['SibSp'], as_index=False)
    .mean()
    .sort_values(by='Survived', ascending=False)
)

In [None]:
# Correlating numeric features
# Correlating numerical and ordinal features
# Correlating between categorical features

In [None]:
# Correlating numeric features:
# one Age histogram (20 bins) per value of Survived, side by side.
g = sns.FacetGrid(data=train_df, col='Survived')
g.map(plt.hist, 'Age', bins=20)

In [None]:
# Correlating numerical and ordinal features:
# Age histograms on a Pclass (rows) x Survived (columns) grid.

# `height` is the current name of FacetGrid's per-facet size argument;
# the old `size` keyword is no longer accepted.
grid = sns.FacetGrid(train_df, col='Survived', row='Pclass', height=2.2, aspect=1.6)
grid.map(plt.hist, 'Age', alpha=0.5, bins=20)
grid.add_legend()

In [None]:
# Correlating between categorical features:
# mean survival per Pclass, one line per Sex, one facet row per Embarked value.

# `height` is the current name of FacetGrid's per-facet size argument;
# the old `size` keyword is no longer accepted.
grid = sns.FacetGrid(train_df, row='Embarked', height=2.2, aspect=1.6)
# Fix the category order explicitly: each facet only sees its own subset of
# rows, so without `order`/`hue_order` the facets could place or colour the
# same category differently.
grid.map(
    sns.pointplot,
    'Pclass',
    'Survived',
    'Sex',
    palette='deep',
    order=sorted(train_df['Pclass'].dropna().unique()),
    hue_order=sorted(train_df['Sex'].dropna().unique()),
)
grid.add_legend()

In [None]:
#Data wrangling
#categorical

#convert to a number
#convert to number bin a number

#combine levels
#using frequency of response rate

#Dummy variables

In [None]:
#high cardinality

In [None]:
# Example levels (columns presumably: level - response rate - row count; confirm):
# a - 90% - 3
# b - 80% - 5
# c - 70% - 12

In [None]:
#completing numerical features

In [None]:
#completing category features

In [None]:
# SESSION - 10

# Remove the Ticket column from both frames, printing shapes before and after.
# NOTE(review): `drop` returns new frames, so the `combine` list built when the
# data was loaded still holds the frames from before this step.
print(train_df.shape, test_df.shape)
train_df = train_df.drop(columns=['Ticket'])
test_df = test_df.drop(columns=['Ticket'])
print(train_df.shape, test_df.shape)


In [None]:
# Extract each passenger's title: the first run of letters in Name that is
# immediately followed by a dot character.
import re
import pandas as pd

# NOTE(review): `combine` was built before train_df/test_df were reassigned by
# the Ticket drop, so this loop appears to add Title to the originally loaded
# frames rather than to the current train_df/test_df -- confirm this is intended.
for dataset in combine:
    # Raw string so that `\.` reaches the regex engine as an escaped dot instead
    # of being treated as an (invalid) string-literal escape sequence.
    dataset['Title'] = dataset.Name.str.extract(r'([A-Za-z]+)\.', expand=False)

In [None]:
# Displays both frames held in the `combine` list in full.
# NOTE(review): consider showing only `.head()` of each frame to keep the output short.
combine

In [None]:
# Column labels of the current test frame.
test_df.columns

In [None]:
# Collapse uncommon titles into 'Rare' and fold spelling variants into
# Miss / Mrs, using a single lookup table per frame.
rare_titles = ['Lady', 'Countess', 'Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev', 'Sir',
               'Jonkheer', 'Dona']
title_aliases = {rare: 'Rare' for rare in rare_titles}
title_aliases.update({'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs'})

for dataset in combine:
    dataset['Title'] = dataset['Title'].replace(title_aliases)

#train_df[['Title', 'Survived']].groupby(['Title'], as_index=False).mean()

In [None]:
# Displays both frames held in the `combine` list in full.
# NOTE(review): consider showing only `.head()` of each frame to keep the output short.
combine

In [None]:
# Shows the container type of `combine`.
type(combine)

In [None]:
# Stack the frames in `combine` into a single DataFrame with a fresh index.
# `DataFrame.from_records` treats each list element as one record, and
# iterating a DataFrame yields its column labels, so it would build rows of
# labels rather than passenger rows; `pd.concat` stacks the actual rows.
df = pd.concat(combine, ignore_index=True)

In [None]:
# Column labels of `df`.
df.columns

In [None]:
# First rows of `df`.
df.head()

In [None]:
# Displays `df` itself (the notebook renders the frame's repr).
df

In [None]:
# (rows, columns) of `df`.
df.shape

In [None]:
# Displays both frames held in the `combine` list in full.
# NOTE(review): consider showing only `.head()` of each frame to keep the output short.
combine

In [None]:
# Remove the PassengerId column from the training frame, then show its new shape.
train_df = train_df.drop(columns=['PassengerId'])
train_df.shape

In [None]:
# Column labels remaining in the training frame.
train_df.columns

In [None]:
# Remove the PassengerId column from the test frame, then show its new shape.
test_df = test_df.drop(columns=['PassengerId'])
test_df.shape

In [None]:
# Categorical variables (Pclass, Sex) against a numerical one (Age):
# Age histograms on a Pclass (rows) x Sex (columns) grid.

# `height` is the current name of FacetGrid's per-facet size argument;
# the old `size` keyword is no longer accepted.
grid = sns.FacetGrid(train_df, row='Pclass', col='Sex', height=2.2, aspect=1.0)
grid.map(plt.hist, 'Age', alpha=0.5, bins=20)
grid.add_legend()


In [None]:
# dtype of every column in the training frame.
train_df.dtypes

In [None]:
# Remove the Name, Sex, Cabin and Embarked columns from both frames.
train_df = train_df.drop(columns=['Name', 'Sex', 'Cabin', 'Embarked'])
test_df = test_df.drop(columns=['Name', 'Sex', 'Cabin', 'Embarked'])

In [None]:
# dtype of every column still present in the training frame.
train_df.dtypes

In [None]:
# Column labels still present in the test frame.
test_df.columns

In [None]:
# Split the training frame into features (everything except Survived) and
# the Survived target; the test frame is used as the test features as-is.
x_train = train_df.drop(['Survived'], axis=1)
y_train = train_df['Survived']

x_test = test_df

# Show shapes only: the bare `y_train` would print the whole target Series
# in between the two feature shapes.
x_train.shape, y_train.shape, x_test.shape

In [None]:
# Replace every missing value in both feature frames with the sentinel -1.
x_train, x_test = (features.fillna(-1) for features in (x_train, x_test))

In [None]:
# Logistic regression baseline.
# The iteration cap is raised above the library default so the solver has room
# to converge on these unscaled features instead of stopping early.
logreg = LogisticRegression(max_iter=1000)
logreg.fit(x_train, y_train)
y_pred = logreg.predict(x_test)
# NOTE: this is accuracy (in percent) on the same rows the model was fitted on,
# not a held-out estimate.
acc_log = round(logreg.score(x_train, y_train) * 100, 2)
acc_log

In [None]:
# Linear support-vector classifier.
# random_state pins the solver's internal shuffling so reruns give the same
# model; max_iter is raised to give the solver room on the unscaled features.
clf = LinearSVC(random_state=0, max_iter=10000)

clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)
# NOTE: accuracy (in percent) on the training rows themselves.
# NOTE(review): `acc_log` is reused by every model cell, so it always holds the
# score of whichever model cell ran last.
acc_log = round(clf.score(x_train, y_train) * 100, 2)
acc_log

In [None]:
# Decision tree classifier.
# random_state makes tie-breaking between equally good splits repeatable,
# so reruns build the same tree.
clf = DecisionTreeClassifier(random_state=0)

clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)
# NOTE: accuracy (in percent) on the training rows themselves.
# NOTE(review): `acc_log` is reused by every model cell, so it always holds the
# score of whichever model cell ran last.
acc_log = round(clf.score(x_train, y_train) * 100, 2)
acc_log

In [None]:
# Random forest classifier.
# random_state fixes the bootstrap samples and feature subsets so reruns
# produce the same forest and the same score.
clf = RandomForestClassifier(random_state=0)

clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)
# NOTE: accuracy (in percent) on the training rows themselves.
# NOTE(review): `acc_log` is reused by every model cell, so it always holds the
# score of whichever model cell ran last.
acc_log = round(clf.score(x_train, y_train) * 100, 2)
acc_log

In [None]:
#cross validation