# Build a Logistic Regression model
* 1. Start with the Pclass column only (because it is numerical and complete).
* 2. Print the coefficients calculated by the model.
` m.coef_, m.intercept_`
* 3. Calculate the probability that each of your data points belongs to the positive class.
` m.predict_proba(X) `
* 4. Suppose you classify all points with a probability > 0.9 as positive.
     * How does the result of your prediction change?
     * How does it change if you change the threshold to > 0.1?

### Step 1

In [4]:
# Import the necessary packages
import os

import pandas as pd

# Import the logistic regression and the train/test splitting helper
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [5]:
# Import the dataset
# The CSV location can be overridden through the TITANIC_TRAIN_CSV environment
# variable so the notebook also runs on machines that do not have this local
# path; when the variable is unset the original location is used.
DATA_PATH = os.environ.get('TITANIC_TRAIN_CSV', '/Users/braulio/Documents/data/train.csv')
# index_col=0 turns the first CSV column (PassengerId) into the row index
df = pd.read_csv(DATA_PATH, index_col=0)
df.head()

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [6]:
# Compare the two selection styles: a list of column names yields a DataFrame,
# a single column name yields a Series.
tuple(type(selection) for selection in (df[['Pclass']], df['Pclass']))

(pandas.core.frame.DataFrame, pandas.core.series.Series)

In [7]:
# Define X and y

# Feature matrix: selecting with a list of column names keeps the result
# two-dimensional (a one-column DataFrame), which is what scikit-learn
# estimators require for X.
X = df.loc[:, ['Pclass']]
# Target vector: a one-dimensional Series holding the 0/1 survival labels.
y = df.loc[:, 'Survived']

In [8]:
# Split the data into a training set and a test set.
# No test_size is given, so train_test_split falls back to its default
# hold-out fraction; random_state pins the shuffle so that the same rows end
# up in each split on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=10)

# Sanity check: test features and test labels must have the same number of rows
X_test.shape, y_test.shape

((223, 1), (223,))

In [9]:
# Create a model
# Unfitted estimator; every hyperparameter except random_state keeps its default.
# NOTE(review): per the scikit-learn docs random_state is only used by the
# 'sag', 'saga' and 'liblinear' solvers — presumably it has no effect with the
# default solver; confirm before relying on it for reproducibility.
m = LogisticRegression(random_state=10)

In [10]:
# Train a model
# Only the training split is passed in; the test split stays unseen.
m.fit(X_train, y_train) # <-- this is the process of finding parameters

LogisticRegression(random_state=10)

### Step 2

In [11]:
# Parameters/coefficients (b and w)
# coef_ is indexed with [0] to take its first row; with Pclass as the only
# feature that row holds a single weight. The value displayed below is
# negative, i.e. a higher class number lowers the predicted odds of survival.

w = m.coef_[0]
w

array([-0.80400439])

In [12]:
# Intercept (bias term b) learned by the fitted model
b = m.intercept_
b

array([1.40871638])

### Step 3

In [27]:
# Class probabilities for every passenger in X (training and test rows alike):
# one row per passenger and one column per class, each row summing to 1.
# The second column is the probability of the positive class (Survived = 1).
m.predict_proba(X)

array([[0.73170627, 0.26829373],
       [0.35326641, 0.64673359],
       [0.73170627, 0.26829373],
       ...,
       [0.73170627, 0.26829373],
       [0.35326641, 0.64673359],
       [0.73170627, 0.26829373]])

In [17]:
# Use the model to make predictions on the seen data
# predict() returns hard class labels (not probabilities) for the training rows
ypred_train = m.predict(X_train)

In [23]:
# Select the probability that Survived = 1 for each data point in X_test:
# [:, 1] keeps only the second column of predict_proba's output.
X_test_probabilities = m.predict_proba(X_test)[:,1]

# Display the resulting one-dimensional array of probabilities
X_test_probabilities

array([0.26829373, 0.26829373, 0.26829373, 0.64673359, 0.64673359,
       0.26829373, 0.26829373, 0.45034115, 0.26829373, 0.26829373,
       0.45034115, 0.45034115, 0.45034115, 0.64673359, 0.45034115,
       0.45034115, 0.26829373, 0.26829373, 0.26829373, 0.26829373,
       0.26829373, 0.45034115, 0.45034115, 0.26829373, 0.26829373,
       0.26829373, 0.26829373, 0.64673359, 0.45034115, 0.64673359,
       0.64673359, 0.45034115, 0.26829373, 0.26829373, 0.26829373,
       0.26829373, 0.26829373, 0.64673359, 0.26829373, 0.26829373,
       0.26829373, 0.45034115, 0.26829373, 0.26829373, 0.64673359,
       0.64673359, 0.26829373, 0.26829373, 0.64673359, 0.64673359,
       0.64673359, 0.45034115, 0.26829373, 0.64673359, 0.45034115,
       0.64673359, 0.26829373, 0.26829373, 0.64673359, 0.26829373,
       0.26829373, 0.64673359, 0.26829373, 0.26829373, 0.26829373,
       0.26829373, 0.45034115, 0.26829373, 0.26829373, 0.45034115,
       0.45034115, 0.26829373, 0.26829373, 0.26829373, 0.45034

### Step 4

In [29]:
# Share of training rows that predict() labels as survivors
# (the mean of 0/1 labels is the fraction of ones).
# NOTE(review): this figure comes from the training split, whereas the
# threshold experiments that follow use the test split, so the numbers are
# only roughly comparable.
ypred_train.mean()

0.2410179640718563

In [34]:
# Share of test rows classified as survivors with a 0.5 cut-off;
# the result is similar to the share predict() produced on the training rows
(X_test_probabilities >= 0.5).mean()

0.24663677130044842

In [33]:
# Here we change the selection criteria.
# Raising the cut-off to 0.9 leaves no test passenger above the threshold,
# so the model returns 0 survivors.
(X_test_probabilities > 0.9).mean()

0.0

In [35]:
# Lowering the cut-off to 0.1 puts every test passenger above the threshold,
# so the model returns 100% survivors.
(X_test_probabilities > 0.1).mean()

1.0