IMPORTING LIBRARIES

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

READING FILE

In [2]:
df = pd.read_csv('titanic.csv')

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


DROPPING UNWANTED COLUMNS

In [4]:
cols = ['Name','Ticket','Cabin']
df = df.drop(cols,axis=1)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Sex          891 non-null    object 
 4   Age          714 non-null    float64
 5   SibSp        891 non-null    int64  
 6   Parch        891 non-null    int64  
 7   Fare         891 non-null    float64
 8   Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(2)
memory usage: 62.8+ KB


DROPPING UNWANTED ROWS

In [6]:
df = df.dropna()

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 712 entries, 0 to 890
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  712 non-null    int64  
 1   Survived     712 non-null    int64  
 2   Pclass       712 non-null    int64  
 3   Sex          712 non-null    object 
 4   Age          712 non-null    float64
 5   SibSp        712 non-null    int64  
 6   Parch        712 non-null    int64  
 7   Fare         712 non-null    float64
 8   Embarked     712 non-null    object 
dtypes: float64(2), int64(5), object(2)
memory usage: 55.6+ KB


CREATING DUMMY VARIABLES

In [8]:
dummies = []
cols = ['Pclass','Sex','Embarked']
for col in cols:
    dummies.append(pd.get_dummies(df[col]))

In [9]:
titanic_dummies = pd.concat(dummies, axis=1)

In [10]:
df = pd.concat((df,titanic_dummies),axis = 1)

In [11]:
df = df.drop(['Pclass','Sex','Embarked'],axis=1)

In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 712 entries, 0 to 890
Data columns (total 14 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  712 non-null    int64  
 1   Survived     712 non-null    int64  
 2   Age          712 non-null    float64
 3   SibSp        712 non-null    int64  
 4   Parch        712 non-null    int64  
 5   Fare         712 non-null    float64
 6   1            712 non-null    uint8  
 7   2            712 non-null    uint8  
 8   3            712 non-null    uint8  
 9   female       712 non-null    uint8  
 10  male         712 non-null    uint8  
 11  C            712 non-null    uint8  
 12  Q            712 non-null    uint8  
 13  S            712 non-null    uint8  
dtypes: float64(2), int64(4), uint8(8)
memory usage: 44.5 KB


In [13]:
df['Age'] = df['Age'].interpolate()

In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 712 entries, 0 to 890
Data columns (total 14 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  712 non-null    int64  
 1   Survived     712 non-null    int64  
 2   Age          712 non-null    float64
 3   SibSp        712 non-null    int64  
 4   Parch        712 non-null    int64  
 5   Fare         712 non-null    float64
 6   1            712 non-null    uint8  
 7   2            712 non-null    uint8  
 8   3            712 non-null    uint8  
 9   female       712 non-null    uint8  
 10  male         712 non-null    uint8  
 11  C            712 non-null    uint8  
 12  Q            712 non-null    uint8  
 13  S            712 non-null    uint8  
dtypes: float64(2), int64(4), uint8(8)
memory usage: 44.5 KB


In [15]:
X = df.values
y = df['Survived'].values

In [16]:
x = np.delete(X,1,axis=1)

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

In [18]:
print("X_train:\n", X_train[:5])
print("X_test:\n", X_test[:5])
print("y_train:\n", y_train[:5])
print("y_test:\n", y_test[:5])

X_train:
 [[203.       0.      34.       0.       0.       6.4958   0.       0.
    1.       0.       1.       0.       0.       1.    ]
 [440.       0.      31.       0.       0.      10.5      0.       1.
    0.       0.       1.       0.       0.       1.    ]
 [103.       0.      21.       0.       1.      77.2875   1.       0.
    0.       0.       1.       0.       0.       1.    ]
 [119.       0.      24.       0.       1.     247.5208   1.       0.
    0.       0.       1.       1.       0.       0.    ]
 [626.       0.      61.       0.       0.      32.3208   1.       0.
    0.       0.       1.       0.       0.       1.    ]]
X_test:
 [[424.      0.     28.      1.      1.     14.4     0.      0.      1.
    1.      0.      0.      0.      1.   ]
 [179.      0.     30.      0.      0.     13.      0.      1.      0.
    0.      1.      0.      0.      1.   ]
 [306.      1.      0.92    1.      2.    151.55    1.      0.      0.
    0.      1.      0.      0.      1.   ]
 [2