In [1]:
#1) import relevant libraries and stuff
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import FunctionTransformer, LabelBinarizer
from sklearn.pipeline import make_pipeline, make_union

In [2]:
# Show up to 100 columns and 100 rows when a frame is displayed
for display_option in ("display.max_columns", "display.max_rows"):
    pd.set_option(display_option, 100)

In [3]:
#2) load the train and test data from the CSV files next to the notebook
TRAIN_CSV, TEST_CSV = 'university_train.csv', 'university_test.csv'
aid_train = pd.read_csv(TRAIN_CSV)
aid_test = pd.read_csv(TEST_CSV)

In [4]:
#3) operationalize y (y = the "percent_on_student_loan" column of the training data)
target_train = aid_train.loc[:, "percent_on_student_loan"]

In [5]:
#4) write all my definitions

#column returns
def female_col(df):
    """Return the ``FEMALE`` column of ``df`` as an array of shape (n_rows, 1)."""
    female_values = df["FEMALE"].values
    # -1 lets NumPy infer the row count; the trailing 1 makes it a single column
    return female_values.reshape((-1, 1))

def faminc_col(df):
    """Return the ``MD_FAMINC`` column of ``df`` as an array of shape (n_rows, 1)."""
    faminc_values = df["MD_FAMINC"].values
    # -1 lets NumPy infer the row count; the trailing 1 makes it a single column
    return faminc_values.reshape((-1, 1))

def control_col(df):
    """Return the ``CONTROL`` column of ``df`` as an array of shape (n_rows, 1)."""
    control_values = df["CONTROL"].values
    # -1 lets NumPy infer the row count; the trailing 1 makes it a single column
    return control_values.reshape((-1, 1))

def locale_col(df):
    """Return the ``LOCALE`` column of ``df`` as an array of shape (n_rows, 1)."""
    locale_values = df["LOCALE"].values
    # -1 lets NumPy infer the row count; the trailing 1 makes it a single column
    return locale_values.reshape((-1, 1))

def state_col(df):
    """Return the ``STABBR`` column of ``df`` as an array of shape (n_rows, 1)."""
    state_values = df["STABBR"].values
    # -1 lets NumPy infer the row count; the trailing 1 makes it a single column
    return state_values.reshape((-1, 1))

def ccugprof_col(df):
    """Return the ``CCUGPROF`` column of ``df`` as an array of shape (n_rows, 1)."""
    ccugprof_values = df["CCUGPROF"].values
    # -1 lets NumPy infer the row count; the trailing 1 makes it a single column
    return ccugprof_values.reshape((-1, 1))

def preddeg_col(df):
    """Return the ``PREDDEG`` column of ``df`` as an array of shape (n_rows, 1)."""
    preddeg_values = df["PREDDEG"].values
    # -1 lets NumPy infer the row count; the trailing 1 makes it a single column
    return preddeg_values.reshape((-1, 1))

def highdeg_col(df):
    """Return the ``HIGHDEG`` column of ``df`` as an array of shape (n_rows, 1)."""
    highdeg_values = df["HIGHDEG"].values
    # -1 lets NumPy infer the row count; the trailing 1 makes it a single column
    return highdeg_values.reshape((-1, 1))

def drop_col(dummies_df):
    """Drop the last indicator column of a 2-D dummy-variable array.

    Removing one indicator per categorical feature avoids a redundant
    (perfectly collinear) column when the indicators are used alongside
    an intercept.

    Parameters
    ----------
    dummies_df : 2-D array
        Indicator matrix with one column per category (despite the name,
        this is indexed like a NumPy array, not a DataFrame).

    Returns
    -------
    2-D array
        ``dummies_df`` without its last column, or ``dummies_df`` unchanged
        when it has at most one column.
    """
    # A two-class LabelBinarizer already emits a single column; slicing that
    # away would leave a zero-width array and silently delete the feature.
    if dummies_df.shape[1] <= 1:
        return dummies_df
    return dummies_df[:, :-1]

In [6]:
#5) make pipelines to clean/transform data column by column for x
def _dummy_pipe(column_getter):
    """Build a pipeline: extract one column, binarize it, drop the last dummy."""
    # NOTE(review): LabelBinarizer is documented as a target (y) transformer;
    # newer scikit-learn releases may reject it as a Pipeline step -- confirm
    # against the installed version.
    return make_pipeline(
        FunctionTransformer(column_getter, validate=False),
        LabelBinarizer(),
        FunctionTransformer(drop_col, validate=False)
    )

# These two columns are only extracted; no encoding step follows
female_pipe = make_pipeline(FunctionTransformer(female_col, validate=False))
faminc_pipe = make_pipeline(FunctionTransformer(faminc_col, validate=False))

# Categorical columns are expanded into indicator columns
control_pipe = _dummy_pipe(control_col)
locale_pipe = _dummy_pipe(locale_col)
state_pipe = _dummy_pipe(state_col)
ccugprof_pipe = _dummy_pipe(ccugprof_col)
preddeg_pipe = _dummy_pipe(preddeg_col)
highdeg_pipe = _dummy_pipe(highdeg_col)

In [7]:
#6) create X with make_union
# Column order of X follows the order of the pipelines listed here
feature_pipes = [
    female_pipe, faminc_pipe, control_pipe, locale_pipe,
    state_pipe, ccugprof_pipe, preddeg_pipe, highdeg_pipe,
]
union = make_union(*feature_pipes)
union.fit_transform(aid_train)

array([[  7.06852792e-01,   1.95885434e+04,   1.00000000e+00, ...,
          0.00000000e+00,   1.00000000e+00,   0.00000000e+00],
       [  7.32673267e-01,   1.81576697e+04,   0.00000000e+00, ...,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00],
       [  5.43689320e-01,   5.04378394e+04,   0.00000000e+00, ...,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00],
       ..., 
       [  5.87945879e-01,   1.98868241e+04,   1.00000000e+00, ...,
          0.00000000e+00,   1.00000000e+00,   0.00000000e+00],
       [  9.19117647e-01,   2.11323314e+04,   0.00000000e+00, ...,
          1.00000000e+00,   0.00000000e+00,   0.00000000e+00],
       [  9.54625293e-01,   2.11292344e+04,   0.00000000e+00, ...,
          1.00000000e+00,   0.00000000e+00,   0.00000000e+00]])

In [8]:
# Number of feature columns produced for the training data. The union was
# already fitted on aid_train in the previous cell, so transform() is enough
# and avoids refitting every encoder just to count columns.
union.transform(aid_train).shape[1]

96

In [9]:
#7) fit train to a model
# Build the training feature matrix first, then fit the regression on it
train_features = union.fit_transform(aid_train)
lr = LinearRegression()
lr.fit(train_features, target_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [10]:
# R^2 on the training data. The union is already fitted on aid_train, so
# transform() yields the same matrix without refitting the encoders as a
# side effect of a cell that is only meant to score.
lr.score(union.transform(aid_train), target_train)

0.54876649765320495

In [11]:
# Encode the test rows with the union that was fitted on the training data
test_matrix = union.transform(aid_test)
test_matrix

array([[  6.00301040e-01,   3.21199462e+04,   1.00000000e+00, ...,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00],
       [  5.63186813e-01,   1.89076581e+04,   1.00000000e+00, ...,
          1.00000000e+00,   0.00000000e+00,   0.00000000e+00],
       [  4.41208199e-01,   3.01987783e+04,   1.00000000e+00, ...,
          0.00000000e+00,   1.00000000e+00,   0.00000000e+00],
       ..., 
       [  5.30716345e-01,   6.91647628e+04,   1.00000000e+00, ...,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00],
       [  4.93223620e-01,   1.46061979e+04,   0.00000000e+00, ...,
          1.00000000e+00,   0.00000000e+00,   0.00000000e+00],
       [  5.66319972e-01,   2.02854816e+04,   1.00000000e+00, ...,
          0.00000000e+00,   1.00000000e+00,   0.00000000e+00]])

In [12]:
# Width of the encoded test matrix, measured on its first row
first_test_row = union.transform(aid_test)[0]
len(first_test_row)

96

In [13]:
#8) predict the model with the test dataset
# transform() only: the encoders keep the categories learned from the training data
test_features = union.transform(aid_test)
predictions = lr.predict(test_features)

In [14]:
#9) export results
def evaluation_transformation(dataset, predictions, path='submission_aid.csv'):
    """Write an ``id_number`` / ``Prediction`` submission CSV.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame with an ``id_number`` column, one row per prediction.
    predictions : array-like
        One predicted value per row of ``dataset``, in row order. A 1-D
        sequence or an (n_rows, 1) array is accepted.
    path : str, optional
        Destination CSV file. Defaults to ``'submission_aid.csv'``.

    Raises
    ------
    ValueError
        If the number of predictions differs from the number of rows.
    """
    # Attach by position rather than by index label: joining on the index
    # misaligns rows (and silently writes NaN) whenever `dataset` does not
    # carry a default 0..n-1 index or the lengths differ.
    values = np.asarray(predictions).ravel()
    submission = dataset[['id_number']].assign(Prediction=values)
    submission.to_csv(path, index=False)

# Write the submission file for the test-set predictions
evaluation_transformation(dataset=aid_test, predictions=predictions)