## Activity 1: Implement PropensityNet 

## Import the libraries

In [None]:
import pandas as pd
from scipy.stats import uniform
import numpy as np 
from sklearn.model_selection import train_test_split
import tensorflow as tf 
from tensorflow.keras.layers import Dropout
import matplotlib.pylab as plt
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
import seaborn as sns
from sklearn import metrics
sns.set(style="white")
sns.set(style="whitegrid", color_codes=True)

## Generate data 

### Create Treatment Data

In [None]:
"""
We generate the treatment data by random sampling.
X_1 and X_2 are the two covariates, Y is the outcome,
and W is the treatment assignment.
"""

# Number of treated units. Defined once so that all four column vectors are
# guaranteed to have the same length.
N_SAMPLES = 1000

# First covariate: uniform on [5, 6].
x_1_t = uniform.rvs(loc=5, scale=1, size=N_SAMPLES).reshape(-1, 1)
# NOTE(review): scale=0 gives a zero-width interval, so this covariate is the
# constant 1 for every treated row -- confirm that this is intended.
x_2_t = uniform.rvs(loc=1, scale=0, size=N_SAMPLES).reshape(-1, 1)
# Outcome: uniform on [8, 9].
y_T = uniform.rvs(loc=8, scale=1, size=N_SAMPLES).reshape(-1, 1)
# Treatment assignment: all ones, because every row generated here is treated.
w_t = np.ones(N_SAMPLES).reshape(-1, 1)


In [None]:
# Stack the four column vectors side by side and label them to form the
# treatment DataFrame.
treatment_columns = ['X_1', 'X_2', 'Y', 'W']
treatment_matrix = np.hstack([x_1_t, x_2_t, y_T, w_t])
treament_df = pd.DataFrame(treatment_matrix, columns=treatment_columns)

In [None]:
# Preview the first five rows of the treatment data.
treament_df.head(n=5)

### Create Control Data

In [None]:
# Start the control data from the treatment covariates and outcome, without
# the treatment-assignment column. drop() returns a new DataFrame, so
# treament_df is left untouched and no separate copy is needed.
control_data = treament_df.drop(columns=['W'])

In [None]:
# Add Gaussian noise to the treatment data to generate the control data.
# mu is the mean and sigma the standard deviation of the noise.
mu, sigma = 0, 0.4
# Take the noise shape from the frame itself so it always matches the data,
# whatever the number of rows or columns.
noise = np.random.normal(mu, sigma, control_data.shape)
control_data = control_data + noise

In [None]:
# Mark every control row as untreated. The scalar is broadcast to all rows,
# so this does not depend on a hard-coded row count.
control_data["W"] = 0.0

In [None]:
# Preview the first five rows of the control data.
control_data.head(n=5)


In [None]:
# Combine the control and the treatment data into a single dataset.
# ignore_index=True gives the stacked rows a fresh 0..n-1 index; without it
# each frame keeps its own row labels and the labels repeat in the result.
frames = [control_data, treament_df]
full_data = pd.concat(frames, ignore_index=True)

In [None]:
# Work on an independent copy so that full_data itself is never modified.
df = full_data.copy(deep=True)

## Create Training and Test sets

In [None]:
# The first three columns are the model inputs; the fourth column is the label.
# NOTE(review): the inputs include the outcome column Y. Propensity models are
# normally fit on pre-treatment covariates only -- confirm this is intended.
feature_columns = df.columns[:3]
label_column = df.columns[3]
X, Y = df[feature_columns], df[label_column]
# Hold out 25% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.25, random_state=42
)

In [None]:
# PropensityNet: four hidden Dense(10, relu) layers, each followed by 30%
# dropout, and a 2-way softmax output (one unit per value of W).
#
# Each training example is a flat vector of features, so the input shape is
# (n_features,). The batch dimension is implicit and must not be included.
n_features = X_train.shape[1]

model = tf.keras.Sequential()
model.add(tf.keras.Input(shape=(n_features,)))
for _ in range(4):
    model.add(tf.keras.layers.Dense(10, activation='relu'))
    model.add(Dropout(0.3))
model.add(tf.keras.layers.Dense(2, activation='softmax'))
model.summary()

In [None]:
# Configure training: Adam optimizer, sparse categorical cross-entropy loss
# and accuracy as the reported metric.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

## Train the Model

In [None]:
# Train for 100 passes over the training set in mini-batches of 5 rows.
model.fit(x=X_train, y=y_train, batch_size=5, epochs=100)


## Evaluate the Model on a test set

In [None]:
# evaluate() returns the loss followed by the compiled metrics; only the
# accuracy is reported here.
_, accuracy = model.evaluate(X_test, y_test)
# Print the accuracy as a fraction with two decimals, the same scale and
# format the logistic-regression cell uses, so the two numbers compare directly.
print('Accuracy of PropensityNet classifier on test set: {:.2f}'.format(accuracy))

## Train a Logistic Regression Model 

In [None]:
# Baseline: a logistic-regression classifier with default settings, fit on
# the same training split as the network.
logreg = LogisticRegression()
logreg.fit(X=X_train, y=y_train)

In [None]:
# Predict once and score those predictions, instead of discarding y_pred and
# letting logreg.score() run the prediction a second time.
y_pred = logreg.predict(X_test)
logreg_accuracy = metrics.accuracy_score(y_test, y_pred)
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(logreg_accuracy))

## Task 1: The dataset used has an equal number of control and treatment datapoints. Use a sample of the treatment data to generate the control data. This will create an imbalance in the dataset.

In [None]:
sample_size =  #specify a number between 0 and 1000 for the number of samples.
mu = 0
sigma = 0.4
control_data = treament_df.copy()
control_data = control_data.sample(n = sample_size, replace=True, random_state=1)
noise = np.random.normal(mu, sigma, [5,4]) 
control_data = control_data + noise
control_data["W"] = (np.zeros(sample_size).reshape(-1,1))

### Now, split your new data into train and test sets and re-train your models. Report the test accuracy for both models.

## Task 2: Mu is the mean and Sigma is the standard deviation of the Gaussian noise. Change the values of Mu and Sigma, and retrain your models. What do you observe?