In [1]:
import pandas as pd
import sklearn.base as base
from sklearn.pipeline import Pipeline
from sklearn.grid_search import GridSearchCV
# Models
from sklearn.linear_model import LinearRegression
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import FeatureUnion
from sklearn import svm
from sklearn.decomposition import PCA
import dill

## Import all custom transformer and estimator methods

In [2]:
from transformers import *
from estimators import *

### Load the Data Set

In [4]:
# Read the training and test tables (in that order) with pandas' default CSV settings.
train, test = (pd.read_csv(csv_path) for csv_path in ('data/train.txt', 'data/test.txt'))

#### Find the columns which are not numeric

In [5]:
for index, i in enumerate(train.dtypes):
    if i == object:
        print index, i

62 object
122 object
216 object
238 object


### A Model using linear regression

In [17]:
y_train = train.values[:,0]
y = y_train
M = train

#Create a pipeline to run linear regression on the data set
combined_features  =   FeatureUnion([
                                 ('reg_num', Pipeline([   ('cm', ColumnRemoveTransformer(remove_cols=[0, 62, 122, 216, 238])),
                                                        ('neigh',  ModelTransformer(LinearRegression()))
                                                    ])),
                                  ('reg_str', Pipeline([   ('cm', ColumnSelectTransformer(select_cols=[62, 122, 216, 238])),
                                                    ('dm',  DictMassagerTransformer()),
                                                    ('dv',  DictVectorizer()),
                                                    ('neigh',  ModelTransformer(LinearRegression()))
                                                     ]))
                                 ])

features = combined_features.fit(M, y)
pipeline = Pipeline([("concat", ConcantTransformer(num=2)),
                     ("ln", LinearRegression())])
full_model = pipeline.fit(features.transform(M),y)

print 'the score for this model is'
full_model.score(features.transform(M), y)

the score for this model is


0.60390226607745601

### A Model using support vector machines

In [6]:
y_train = train.values[:,0]
y = y_train
M = train

combined_features  =   FeatureUnion([
     ('reg_num', Pipeline([   ('cm', ColumnRemoveTransformer(remove_cols=[0, 62, 122, 216, 238])),
                            ('neigh',  ModelTransformer(svm.SVR()))
                        ])),
     ('reg_str', Pipeline([   ('cm', ColumnSelectTransformer(select_cols=[62, 122, 216, 238])),
                         ('dm',  DictMassagerTransformer()),
                         ('dv',  DictVectorizer()),
                         ('neigh',  ModelTransformer(svm.SVR()))
                         ]))
                                 ])
            
features = combined_features.fit(M, y)
pipeline = Pipeline([("concat", ConcantTransformer(num=2)),
                     ("ln", LinearRegression())])
full_model = pipeline.fit(features.transform(M),y)

print 'the score for this model is'
full_model.score(features.transform(M), y)

the score for this model is


0.68862520360204083

### Make predictions on the test set and write out the results

In [10]:
# Persist the fitted second-stage model. A serialized object is a byte stream,
# so the file is opened in binary mode, and the `with` block guarantees the
# handle is flushed and closed once the dump finishes.
with open('lib/svm_model.dll', 'wb') as model_file:
    dill.dump(full_model, model_file)

# Run the test frame through the fitted feature branches, then through the
# second-stage model.
# NOTE(review): the column positions baked into `features` were derived from
# `train`; confirm `test` has the same column layout.
p = full_model.predict(features.transform(test))

# Write one prediction per line.
with open('part1_prediction.dat', 'w') as out:
    for prediction in p:
        out.write('{}\n'.format(prediction))

### Using GridSearchCV to train different pipelines

In [None]:
pipeline =  Pipeline([   ('cm', ColumnRemoveTransformer(remove_cols=[0, 62, 122, 216, 238])),
                            ('reg',  svm.SVR(kernel='linear'))
                        ])

grid_search = GridSearchCV(pipeline,  param_grid=parameters, cv= 3)
grid_search.fit(M,y)

print grid_search.score(M, y)
print grid_search.best_params_

In [65]:
pipeline =  Pipeline([   ('cm', ColumnSelectTransformer(select_cols=[62, 122, 216, 238])),
                         ('dm',  DictMassagerTransformer()),
                         ('dv',  DictVectorizer()),
                         ('reg',  svm.SVR())
                         ])

#parameters = {'reg__kernel':['rbf','linear','poly']} #linear is best
parameters = {'reg__C':[1,1e3,1e5]}
grid_search = GridSearchCV(pipeline,  param_grid=parameters, cv= 3)
grid_search.fit(M,y)

print grid_search.score(M, y)
print grid_search.best_params_

0.0949670203631
{'reg__C': 1}


In [9]:
pipeline =  Pipeline([   ('cm', ColumnRemoveTransformer(remove_cols=[0, 62, 122, 216, 238])),
                         ('pca', PCA()),      
                         ('reg',  svm.SVR())
                        ])
parameters = {'pca__n_components':[10, 50,100,200]} #linear is best
grid_search = GridSearchCV(pipeline,  param_grid=parameters, cv= 3)
grid_search.fit(M,y)

print grid_search.score(M, y)
print grid_search.best_params_

0.539898389211
{'pca__n_components': 200}


### See if PCA is helpful

In [19]:
pipeline =  Pipeline([   ('cm', ColumnRemoveTransformer(remove_cols=[0, 62, 122, 216, 238])),
                         ('pca', PCA()),
                         ('reg',  svm.SVR())
                        ])
parameters = {'pca__n_components':[10,25,100,125,150,200]} #linear is best
grid_search = GridSearchCV(pipeline,  param_grid=parameters, cv= 3)
grid_search.fit(M,y)

print grid_search.score(M, y)
print grid_search.best_params_

0.542064655406
{'pca__n_components': 125}


In [11]:
pipeline =  Pipeline([   ('cm', ColumnRemoveTransformer(remove_cols=[0, 62, 122, 216, 238])),   
                         ('reg',  svm.SVR())
                        ])
#parameters = {'pca__n_components':[10, 50,100,150]} #linear is best
grid_search = GridSearchCV(pipeline,  param_grid={}, cv= 3)
grid_search.fit(M,y)

print grid_search.score(M, y)
print grid_search.best_params_

0.54604973821
{}
