In [0]:
#First, load packages
import pandas as pd


In [0]:
# Read the training set from disk and report its (rows, columns) size
train = pd.read_csv(filepath_or_buffer="train.csv")
train_shape = train.shape
print(train_shape)

In [0]:
# Same (rows, columns) tuple as above, but without print():
# a bare last expression is displayed as the cell's output.
train.shape

In [0]:
# Preview every column for the first 10 rows
train.head(n=10)

In [0]:
##Data exploration 1: mean of "Survived" for each value of "Sex", as a table
import matplotlib.pyplot as plt

sex_pivot = pd.pivot_table(train, values="Survived", index="Sex")
sex_pivot

In [0]:
##Data exploration 2: mean of "Survived" for each "Pclass", drawn as a bar chart
pclass_pivot = pd.pivot_table(train, values="Survived", index="Pclass")
pclass_pivot.plot(kind="bar")
plt.show()

In [0]:
##Data exploration 3: summary statistics (count, mean, quartiles, ...) for "Age"
train.loc[:, 'Age'].describe()

In [0]:
##Data exploration 4: number of rows for each value of "Pclass"
train.loc[:, 'Pclass'].value_counts()

In [0]:
##One-hot encode a single column by hand: one indicator column per distinct value,
##each named "<column_name>_<value>"
column_name = "Pclass"
df = train
dummies = pd.get_dummies(df.loc[:, column_name], prefix=column_name)
dummies.head(n=5)

In [0]:
##Building a function to create dummy variables
def create_dummies(df,column_name):
    """Return a copy of ``df`` with one-hot indicator columns for ``column_name``.

    One column is appended per distinct value in ``df[column_name]``, named
    ``"<column_name>_<value>"``. The source column is kept and ``df`` itself
    is not modified.

    Calling this again on a frame that already carries those indicator columns
    replaces them instead of appending duplicates, so re-running a notebook
    cell that calls it does not produce repeated column names.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``column_name``.
    column_name : str
        Column to encode; also used as the prefix of the new column names.

    Returns
    -------
    pandas.DataFrame
        New frame: the original columns followed by the indicator columns.

    Raises
    ------
    KeyError
        If ``column_name`` is not a column of ``df``.
    """
    dummies = pd.get_dummies(df[column_name],prefix=column_name)
    # Indicator columns left over from an earlier call would otherwise be
    # duplicated by the concat below.
    stale = [name for name in dummies.columns if name in df.columns]
    df = pd.concat([df.drop(columns=stale),dummies],axis=1)
    return df

# Rebind train to the frame that also carries the Pclass indicator columns, then preview it
train = create_dummies(df=train, column_name="Pclass")
train.head(n=5)

In [0]:
##Building a function and creating categorical variables 1
def process_age(df,cut_points,label_names):
    """Return a copy of ``df`` with missing ages filled and an age-band column added.

    Missing values in ``"Age"`` are replaced by the sentinel ``-0.5`` so that a
    bin covering that value can collect them. ``"Age"`` is then binned with
    ``pd.cut`` into a new ``"Age_categories"`` column. The caller's frame is
    left untouched; all changes are made on a copy.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a numeric ``"Age"`` column.
    cut_points : sequence of numbers
        Bin edges passed to ``pd.cut``.
    label_names : sequence of str
        One label per bin, i.e. ``len(cut_points) - 1`` entries.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with ``"Age"`` filled and ``"Age_categories"`` added.
        Ages that fall outside every bin get a missing category.
    """
    # Work on a copy: assigning columns on the argument would silently alter
    # the caller's DataFrame as well.
    df = df.copy()
    df["Age"] = df["Age"].fillna(-0.5)
    df["Age_categories"] = pd.cut(df["Age"],cut_points,labels=label_names)
    return df

# Bin edges: the first bin spans -1 to 0, the rest are successive age ranges up to 100
cut_points = [-1, 0, 5, 12, 18, 35, 60, 100]
# One label per bin, in the same order as the bins above
label_names = [
    "Missing",
    "Infant",
    "Child",
    "Teenager",
    "Young Adult",
    "Adult",
    "Senior",
]

train = process_age(df=train, cut_points=cut_points, label_names=label_names)

In [0]:
##Look at the categories you just made: mean of "Survived" per age band
# "Age_categories" is a categorical column. observed=False is passed explicitly so
# that every age band keeps a row (and a bar) even when no passenger falls into it,
# rather than depending on the library default for categorical groupers.
age_cat_pivot = train.pivot_table(index="Age_categories",values="Survived",observed=False)
age_cat_pivot.plot.bar()
plt.show()

In [0]:
##Add indicator columns for the remaining categorical features with create_dummies
for dummy_source in ("Sex", "Age_categories"):
    train = create_dummies(train, dummy_source)

In [0]:
##Preview the first five rows now that the indicator columns have been added
train.head(n=5)

In [0]:
##package to create machine learning models
from sklearn.linear_model import LogisticRegression

In [0]:
##Assemble the feature matrix and the target, then hold out 20% of the rows for testing

from sklearn.model_selection import train_test_split

# Indicator columns produced by the create_dummies calls above
columns = [
    'Pclass_1', 'Pclass_2', 'Pclass_3',
    'Sex_female', 'Sex_male',
    'Age_categories_Missing', 'Age_categories_Infant', 'Age_categories_Child',
    'Age_categories_Teenager', 'Age_categories_Young Adult',
    'Age_categories_Adult', 'Age_categories_Senior',
]

all_X = train.loc[:, columns]
all_y = train.loc[:, 'Survived']

# random_state is fixed so the split is the same on every run
train_X, test_X, train_y, test_y = train_test_split(
    all_X,
    all_y,
    random_state=0,
    test_size=0.2,
)

In [0]:
##Fit a logistic regression on the training split and measure accuracy on the held-out split
from sklearn.metrics import accuracy_score

lr = LogisticRegression().fit(train_X, train_y)
predictions = lr.predict(test_X)

accuracy = accuracy_score(y_true=test_y, y_pred=predictions)

In [0]:
## How did we do? Display the accuracy score computed on the held-out split.
accuracy

In [0]:
## Cross validation: score a fresh, unfitted model on 10 folds of the full data set
from sklearn.model_selection import cross_val_score
import numpy as np

lr = LogisticRegression()
scores = cross_val_score(estimator=lr, X=all_X, y=all_y, cv=10)

In [0]:
## All the scores: one value per cross-validation fold (cv=10 above)
scores


In [0]:
## Average of the per-fold scores: the single summary figure for this model

scores.mean()