# Baseline_model.ipynb

### setting up features

In [19]:
import pandas as pd
import numpy as np
import scipy
from scipy.sparse import csc_matrix
from sklearn.preprocessing import LabelEncoder 
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import cross_val_score

pd.set_option('precision', 5)



In [2]:
# LOAD IN CLEANED DATASET

path = '/home/emilyboeke/'
master_app = pd.read_csv(path + 'merged2_master_app.csv', low_memory=False)

In [3]:
# ADDITIONAL LAST MINUTE DATA CLEANING

# forgot to output excluding the auto-generated index column.... so now I'm dropping it. Can be optimized later
master_app = master_app.drop(master_app.columns[0], axis=1)

# change dec to binary number, so we can do summary stats on it
master_app.loc[(master_app["dec"] == 'DENY'),'dec'] = 0
master_app.loc[(master_app["dec"] == 'GRANT'),'dec'] = 1

# delete NTA dates before 1984. 
master_app['osc_date'] = pd.to_datetime(master_app['osc_date'],infer_datetime_format = True)
master_app = master_app[master_app.osc_date.dt.year>1984]

#master_app.describe()

In [4]:
# selecting relevant features and target variable.
df = master_app[['osc_date', 'tracid', 'nat', 'dec']]
df.shape

#print(df.head(10))
#df.describe()

In [7]:
# CHANGE OSC_DATE TO CONTINUOUS NUMBER.

osc_date_cont = []
startdate = np.datetime64('1984-01-01') # earliest date. from which timedelta is calculated

# change osc_date to continuous number
for i in df.index:
    x = df.loc[i,'osc_date'] - startdate
    osc_date_cont.append(x.days)
    
osc_date_cont = np.array(osc_date_cont)

In [8]:
# ONE HOT ENCODE CATEGORICAL VARIABLES

# change string nationalities to integer categories 
le = LabelEncoder()
nat_int = le.fit_transform(df['nat'])
nat_int = np.reshape(nat_int,[len(nat_int),1])

# get N x 2 array of features of interest
feat_int = np.concatenate((df[['tracid']], nat_int), axis=1)
feat_int.shape

# get one hot encoder of features
enc = OneHotEncoder()
enc.fit(feat_int)  
enc.feature_indices_

# create sparse matrix of all observations in Compressed Sparse Row format
blah = enc.transform(feat_int)

In [9]:
# CONCATENATE ONE HOT ENCODED FEATURES WITH CONTINUOUS FEATURE

# changes csr to csc, bc simpler to work with columns than rows
blah = scipy.sparse.csr_matrix.tocsc(blah)

# concatenating relevant fields
new_data = np.concatenate((blah.data, osc_date_cont)) # non-zero values in matrix
new_indices = np.concatenate((blah.indices, range(len(osc_date_cont)))) # row indices for each column
new_ind_ptr = np.append(blah.indptr, blah.indptr[-1]+len(osc_date_cont))

# making new matrix
final_feature_mat = csc_matrix((new_data, new_indices, new_ind_ptr))

### splitting into train, validation, and test data sets

In [21]:
# set random seed + shuffle data
np.random.seed(0)
# !! NEED TO PUT IN SHUFFLING CODE !!

# take out last 20% for test
idx = round(final_feature_mat.shape[0] * .8)
X_test_donttouch = final_feature_mat[idx:]
y_test_donttouch = df['dec'][idx:]

# define rest of data that we will be fitting/validating
X = final_feature_mat[0:idx]
y = df['dec'][0:idx]

### implementing logistic regression with ridge penalty

In [20]:
# evaluate the model using 10-fold cross-validation
scores = cross_val_score(LogisticRegression(), X, y, scoring='accuracy', cv=10)
print(scores)
print(scores.mean())

[0.68765487 0.67325328 0.71465786 0.67490522 0.68475337 0.7080227
 0.70217723 0.71973484 0.73701154 0.74605528]
0.7048226191168687
