# For Midterm Project: An Example with Standard Scaler

In [4]:
import numpy as np
import pandas as pd

In [5]:
titanic = pd.read_csv("Data/Titanic.csv")
titanic.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [6]:
# Extract numerical columns
numerical_cols = ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare']
data = titanic[numerical_cols]
data.head()

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare
0,3,22.0,1,0,7.25
1,1,38.0,1,0,71.2833
2,3,26.0,0,0,7.925
3,1,35.0,1,0,53.1
4,3,35.0,0,0,8.05


In [7]:
# Are there missing values?
data.isnull().sum()

Pclass      0
Age       177
SibSp       0
Parch       0
Fare        0
dtype: int64

In [11]:
# Use average age for the missing ages
data.loc[:, 'Age'] = data.loc[:, 'Age'].fillna(data['Age'].mean())
data.isnull().sum()

Pclass    0
Age       0
SibSp     0
Parch     0
Fare      0
dtype: int64

In [12]:
# Build a linear svm on the numerical variables
from sklearn.svm import LinearSVC
survival = titanic['Survived']
model = LinearSVC()
model.fit(data, survival)



LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
          verbose=0)

In [13]:
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(survival, model.predict(data))
print("Accuracy on training set:", accuracy)

Accuracy on training set: 0.6206509539842873


In [15]:
# Scale the data and see if it improves accuracy
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
data_scaled = scaler.fit_transform(data)
data_scaled.shape

(891, 5)

In [20]:
data_scaled

array([[ 0.82737724, -0.5924806 ,  0.43279337, -0.47367361, -0.50244517],
       [-1.56610693,  0.63878901,  0.43279337, -0.47367361,  0.78684529],
       [ 0.82737724, -0.2846632 , -0.4745452 , -0.47367361, -0.48885426],
       ...,
       [ 0.82737724,  0.        ,  0.43279337,  2.00893337, -0.17626324],
       [-1.56610693, -0.2846632 , -0.4745452 , -0.47367361, -0.04438104],
       [ 0.82737724,  0.17706291, -0.4745452 , -0.47367361, -0.49237783]])

In [16]:
model2 = LinearSVC()
model2.fit(data_scaled, survival)



LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
          verbose=0)

In [17]:
accuracy = accuracy_score(survival, model2.predict(data_scaled))
print("Accuracy on training set:", accuracy)

Accuracy on training set: 0.7048260381593715


In [18]:
# What about Logistic Regression?
from sklearn.linear_model import LogisticRegression
model_log = LogisticRegression()
model_log.fit(data_scaled, survival)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [19]:
accuracy = accuracy_score(survival, model_log.predict(data_scaled))
print("Accuracy on training set:", accuracy)

Accuracy on training set: 0.7059483726150393


# Week 10 Introduction to Neural Networks

Slides can be found [here](https://drive.google.com/file/d/1Ae5ancx-CW1eah51cNgKzfu87VDUTK_n/view?usp=sharing)

# Getting Started with TensorFlow 2

## Installation

### Create a virtual environment in Anaconda

Since TensorFlow requires many dependencies with specific versions, it is best pratice to install tensorflow in a clean python environment.

- Open Anaconda Navigator
- Go to "Environments"
- Click "Create", name the new environment with something like "tensorflow"


## Install TensorFlow 2

We will use Python's package management software `pip` to install TensorFlow
- Start "Anaconda Prompt (tensorflow)"
- Follow instructions from [TensorFlow Website](https://www.tensorflow.org/install/pip)

# Build a Classifier for Hand-Written Digits

Adapted from [TensorFlow tutorial](https://www.tensorflow.org/tutorials/quickstart/beginner)

1. Build a neural network that classifies images.
2. Train this neural network.
3. Evaluate the accuracy of the model.

In [None]:
# import tensorflow
import numpy as np
# import matplotlib
import matplotlib.pyplot as plt
import tensorflow as tf
print(tf.__version__)

# Make numpy values easier to read.
np.set_printoptions(precision=3, suppress=True)

In [None]:
# Load and prepare the MNIST dataset.
mnist = tf.keras.datasets.mnist
(x_train, y_train), (x_test, y_test) = mnist.load_data()

# Convert the data from integers to floating-point numbers
x_train, x_test = x_train / 255.0, x_test / 255.0

print(x_train.shape, x_test.shape)

In [None]:
# Build a neural network model by stacking layers.
model = tf.keras.models.Sequential([
    tf.keras.layers.Flatten(input_shape=(28, 28)),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(10)                                   
])

In [None]:
# For each example the model returns a vector of "logits", one for each class.
index = 1234
predictions = model(x_train[index:(index+1)]).numpy()
print(predictions)

In [None]:
# The tf.nn.softmax function converts these logits to probabilities for each class
probs = tf.nn.softmax(predictions).numpy()
print(probs)

In [None]:
# The model makes prediction based on the largest probability
class_prediction = np.argmax(probs)
print(class_prediction)

In [None]:
# Visualize this image
plt.imshow(x_train[index].reshape([28, 28]), cmap=plt.cm.binary)

In [None]:
# The prediction accuracy is low, since no training has been performed yet.
# Let's introduce a function that measures the prediction error.
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
loss_fn(y_train[index:(index+1)], predictions).numpy()

In [None]:
# Set up the training environment
model.compile(optimizer='adam',
              loss=loss_fn,
              metrics=['accuracy'])

In [None]:
# The Model.fit method adjusts the model parameters to minimize the loss
model.fit(x_train, y_train, epochs=5)

In [None]:
# The above loss and accuracy is for the training data. Let's evaluate the model on the test set.
model.evaluate(x_test, y_test)

In [None]:
# The image classifier is not trained to ~98% accuracy on this dataset.
# Let's create a test case ourselves.
# I use MS Paint to draw a digit. Remember to resize the canvas to 28*28 pixels


In [None]:
# import pillow
import PIL
img_file = # Enter the name of your file
img = PIL.Image.open(img_file)
img = img.convert('1') # convert image to black and white
print(img.size)

In [None]:
img

In [None]:
img_np = 1 - np.asarray(img).astype(float)

In [None]:
predictions = model(img_np.reshape([-1, 28, 28])).numpy()
probs = tf.nn.softmax(predictions).numpy()
class_prediction = np.argmax(probs)
print(class_prediction)

In [None]:
probs

In [None]:
plt.imshow(img_np, cmap=plt.cm.binary)