In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
dataset=pd.read_csv('train.csv')
dataset.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


## Calculating the Null Values per Column

In [3]:
dataset.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

## Dropping the non required Columns

Like Passenger ID, Name, Ticket, Cabin

In [6]:
dataset_1=dataset.drop(['PassengerId','Name','Ticket','Cabin'],axis=1)
dataset_1.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S
3,1,1,female,35.0,1,0,53.1,S
4,0,3,male,35.0,0,0,8.05,S


## Dropping the Columns with NaN values

In [8]:
dataset_1.dropna(inplace=True)
dataset_1.isnull().sum()

Survived    0
Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Embarked    0
dtype: int64

In [9]:
dataset_1.describe()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare
count,712.0,712.0,712.0,712.0,712.0,712.0
mean,0.404494,2.240169,29.642093,0.514045,0.432584,34.567251
std,0.491139,0.836854,14.492933,0.930692,0.854181,52.938648
min,0.0,1.0,0.42,0.0,0.0,0.0
25%,0.0,1.0,20.0,0.0,0.0,8.05
50%,0.0,2.0,28.0,0.0,0.0,15.64585
75%,1.0,3.0,38.0,1.0,1.0,33.0
max,1.0,3.0,80.0,5.0,6.0,512.3292


## Splitting the Dataset into X and y Matrix

In [10]:
X=dataset_1.iloc[:,1:].values
y=dataset_1.iloc[:,0:1].values

In [11]:
X[0:5,:]

array([[3, 'male', 22.0, 1, 0, 7.25, 'S'],
       [1, 'female', 38.0, 1, 0, 71.2833, 'C'],
       [3, 'female', 26.0, 0, 0, 7.925, 'S'],
       [1, 'female', 35.0, 1, 0, 53.1, 'S'],
       [3, 'male', 35.0, 0, 0, 8.05, 'S']], dtype=object)

## Its time to deal with Categorical Features

In [12]:
from sklearn.preprocessing import LabelEncoder
LE_Sex = LabelEncoder()
X[:,1]=LE_Sex.fit_transform(X[:,1])

In [13]:
X[0:5,:]

array([[3, 1, 22.0, 1, 0, 7.25, 'S'],
       [1, 0, 38.0, 1, 0, 71.2833, 'C'],
       [3, 0, 26.0, 0, 0, 7.925, 'S'],
       [1, 0, 35.0, 1, 0, 53.1, 'S'],
       [3, 1, 35.0, 0, 0, 8.05, 'S']], dtype=object)

In [14]:
LE_Emb=LabelEncoder()
X[:,6]=LE_Emb.fit_transform(X[:,6])

In [15]:
X[0:5,:]

array([[3, 1, 22.0, 1, 0, 7.25, 2],
       [1, 0, 38.0, 1, 0, 71.2833, 0],
       [3, 0, 26.0, 0, 0, 7.925, 2],
       [1, 0, 35.0, 1, 0, 53.1, 2],
       [3, 1, 35.0, 0, 0, 8.05, 2]], dtype=object)

In [16]:
from sklearn.preprocessing import OneHotEncoder
ohe=OneHotEncoder(categorical_features=[0,6])
X=ohe.fit_transform(X).toarray()

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


In [17]:
X[0:5,:]

array([[ 0.    ,  0.    ,  1.    ,  0.    ,  0.    ,  1.    ,  1.    ,
        22.    ,  1.    ,  0.    ,  7.25  ],
       [ 1.    ,  0.    ,  0.    ,  1.    ,  0.    ,  0.    ,  0.    ,
        38.    ,  1.    ,  0.    , 71.2833],
       [ 0.    ,  0.    ,  1.    ,  0.    ,  0.    ,  1.    ,  0.    ,
        26.    ,  0.    ,  0.    ,  7.925 ],
       [ 1.    ,  0.    ,  0.    ,  0.    ,  0.    ,  1.    ,  0.    ,
        35.    ,  1.    ,  0.    , 53.1   ],
       [ 0.    ,  0.    ,  1.    ,  0.    ,  0.    ,  1.    ,  1.    ,
        35.    ,  0.    ,  0.    ,  8.05  ]])

## Splitting into train and test set

In [18]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=0)
X_test[0:5,:]

array([[  0.   ,   0.   ,   1.   ,   0.   ,   0.   ,   1.   ,   0.   ,
         28.   ,   1.   ,   1.   ,  14.4  ],
       [  0.   ,   1.   ,   0.   ,   0.   ,   0.   ,   1.   ,   1.   ,
         30.   ,   0.   ,   0.   ,  13.   ],
       [  1.   ,   0.   ,   0.   ,   0.   ,   0.   ,   1.   ,   1.   ,
          0.92 ,   1.   ,   2.   , 151.55 ],
       [  0.   ,   1.   ,   0.   ,   1.   ,   0.   ,   0.   ,   1.   ,
         36.   ,   0.   ,   0.   ,  12.875],
       [  0.   ,   0.   ,   1.   ,   0.   ,   0.   ,   1.   ,   1.   ,
         47.   ,   0.   ,   0.   ,   7.25 ]])

## Feature Scaling

In [20]:
from sklearn.preprocessing import StandardScaler
sc_X=StandardScaler()
X_train=sc_X.fit_transform(X_train)
X_test=sc_X.transform(X_test)

# Time to fit the Logistic Regression

In [21]:
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(random_state=0)
classifier.fit(X_train,y_train)

  y = column_or_1d(y, warn=True)


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=0, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [22]:
y_pred=classifier.predict(X_test)

In [23]:
from sklearn.metrics import confusion_matrix
cm=confusion_matrix(y_test,y_pred)
cm

array([[81,  1],
       [55,  6]])

In [None]:
y