## Activity 1: Implement PropensityNet 

## Import the libraries

In [29]:
import pandas as pd
from scipy.stats import uniform
import numpy as np 
from sklearn.model_selection import train_test_split
import tensorflow as tf 
from tensorflow.keras.layers import Dropout
import matplotlib.pylab as plt
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
import seaborn as sns
from sklearn import metrics
sns.set(style="white")
sns.set(style="whitegrid", color_codes=True)

## Generate data 

### Create Treatment Data

In [2]:
"""
We generate treatment data using a random sample.
X_1 and X_2 are two covarients. Y_t is the outcome.
and W_t is the treatment assignment.
"""

x_1_t = uniform.rvs(loc=5, scale=1, size=1000).reshape(-1,1)
x_2_t = uniform.rvs(loc=1, scale=0, size=1000).reshape(-1,1)
y_T = uniform.rvs(loc=8, scale=1, size=1000).reshape(-1,1)
w_t = np.ones(1000).reshape(-1,1)


In [3]:
#combine all of the columns to form a dataframe
treament_df = pd.DataFrame(np.concatenate([x_1_t,x_2_t,y_T,w_t], axis=1), columns=['X_1', 'X_2', 'Y', 'W'])

In [4]:
treament_df.head(5)

Unnamed: 0,X_1,X_2,Y,W
0,5.761255,1.0,8.715843,1.0
1,5.926276,1.0,8.379799,1.0
2,5.332234,1.0,8.796563,1.0
3,5.948696,1.0,8.219883,1.0
4,5.16627,1.0,8.618104,1.0


### Create Control Data

In [35]:
#drop the treatment assignment column
control_data = treament_df.copy()
control_data = treament_df.drop(['W'], axis=1)

In [6]:
#add noise to the treatment data to generate control data
mu, sigma = 0, 0.4
noise = np.random.normal(mu, sigma, [1000,3]) 
control_data = control_data + noise

In [37]:
control_data["W"] = (np.zeros(1000).reshape(-1,1))

In [38]:
control_data.head(5)


Unnamed: 0,X_1,X_2,Y,W
0,5.761255,1.0,8.715843,0.0
1,5.926276,1.0,8.379799,0.0
2,5.332234,1.0,8.796563,0.0
3,5.948696,1.0,8.219883,0.0
4,5.16627,1.0,8.618104,0.0


In [12]:
#Combine both the control and the test data
frames = [control_data, treament_df]
full_data = pd.concat(frames)

In [14]:
df = full_data.copy()

## Create Traning and Test sets

In [45]:
X = df[df.columns[0:3]]
Y = df[df.columns[3]]
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.25, random_state=42)

In [46]:
model = tf.keras.Sequential()
model.add(tf.keras.layers.Dense(10, input_shape=(10,3), activation='relu'))
model.add(Dropout(0.3))
model.add(tf.keras.layers.Dense(10, activation='relu'))
model.add(Dropout(0.3))
model.add(tf.keras.layers.Dense(10, activation='relu'))
model.add(Dropout(0.3))
model.add(tf.keras.layers.Dense(10, activation='relu'))
model.add(Dropout(0.3))
model.add(tf.keras.layers.Dense(2, activation='softmax'))
model.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_11 (Dense)             (None, 10, 10)            40        
_________________________________________________________________
dropout_8 (Dropout)          (None, 10, 10)            0         
_________________________________________________________________
dense_12 (Dense)             (None, 10, 10)            110       
_________________________________________________________________
dropout_9 (Dropout)          (None, 10, 10)            0         
_________________________________________________________________
dense_13 (Dense)             (None, 10, 10)            110       
_________________________________________________________________
dropout_10 (Dropout)         (None, 10, 10)            0         
_________________________________________________________________
dense_14 (Dense)             (None, 10, 10)           

In [47]:
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

## Train the Model

In [48]:
model.fit(X_train, y_train, epochs=100, batch_size=5)


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<tensorflow.python.keras.callbacks.History at 0x7fbf66c34990>

## Evaluate the Model on a test set

In [54]:
_, accuracy = model.evaluate(X_test, y_test)
print('Accuracy of PropensityNet classifier on test set: %.2f' % (accuracy*100))

Accuracy of PropensityNet classifier on test set: 78.67


## Train a Logistic Regression Model 

In [51]:
logreg = LogisticRegression()
logreg.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [52]:
y_pred = logreg.predict(X_test)
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(logreg.score(X_test, y_test)))

Accuracy of logistic regression classifier on test set: 0.67


## Task 1: The dataset used is has an equal number of control and test datapoints. Use a sample of the test data to generate the control data. This will generate inbalance in the dataset.

In [58]:
sample_size =  #specify a number between 0 and 1000 for the number of samples.
mu = 0
sigma = 0.4
control_data = treament_df.copy()
control_data = control_data.sample(n = sample_size, replace=True, random_state=1)
noise = np.random.normal(mu, sigma, [5,4]) 
control_data = control_data + noise
control_data["W"] = (np.zeros(sample_size).reshape(-1,1))

### Now, split your new data into train and test sets and re-train your models. Report the test accuracy for both models.

## Task 2: Mu is the mean and Sigma is the standard deviation of a Gaussian noise. Change the values of Mu and Sigma, and re-retrain your models. What do you observe?