In [246]:
import os 
import sys
import numpy as np
import gzip
import simplejson as json
from sklearn.cross_validation import train_test_split
import pandas as pd
import dill
import sklearn.base as base
from sklearn.grid_search import GridSearchCV

- This notebook trains models on the Yelp academic business dataset to predict star ratings for businesses.

In [None]:
# Read the gzipped business dataset, one raw record per line.
# The context manager closes the file handle once the lines are in memory.
with gzip.open('miniprojects/questions/data/ml/yelp_train_academic_dataset_business.json.gz') as gz:
    M = gz.readlines()
#get a test and train set
# Shuffle the raw records. The "y" lists built in this cell hold the same raw
# records as the "M" lists (placeholders so train_test_split returns four
# pieces); the star targets are extracted from the records in later cells.
indices = np.random.permutation(len(M))
M_random_order = [M[i] for i in indices]
y_random_order = list(M_random_order)
M = M_random_order
y = y_random_order
M_train, M_test, y_train, y_test = train_test_split(M_random_order, y_random_order, test_size=0.05)

In [127]:
#Establish a baseline
class MeanCityEstimator(base.BaseEstimator, base.RegressorMixin):
    """Baseline regressor that predicts the mean target of each city.

    Parameters
    ----------
    default_rating : float, default 3.75
        Value predicted for a city that was not seen during ``fit``.

    Attributes
    ----------
    cities : dict
        Maps each city seen in ``fit`` to the mean of its targets.
    """

    def __init__(self, default_rating=3.75):
        self.default_rating = default_rating
        self.cities = {}

    def fit(self, X, y):
        """Compute the mean of ``y`` for every distinct city in ``X``.

        X : iterable of city labels, one per sample.
        y : iterable of numeric targets, aligned with ``X``.

        Returns ``self``.
        """
        frame = pd.DataFrame({'city': list(X), 'stars': list(y)})
        means = frame.groupby('city')['stars'].mean()
        # Rebuild the mapping from scratch so a refit does not keep cities
        # that only appeared in an earlier call to fit.
        self.cities = {city: float(mean)
                       for city, mean in zip(means.index, means.values)}
        return self

    def predict(self, X):
        """Return a list with the stored mean for each city in ``X``.

        Cities without a stored mean get ``default_rating``.
        """
        p_city = []
        for city in X:
            try:
                p_city.append(self.cities[city])
            except (KeyError, TypeError):
                # KeyError: city not seen in fit; TypeError: unhashable label.
                p_city.append(self.default_rating)
        return p_city
    

Column Transformer Classes

In [40]:
#Select one column from json record and return an array
class ColumnSelectTransformer(base.BaseEstimator, base.TransformerMixin):
    """Pick a single field out of every record.

    Parameters
    ----------
    column_name : str
        Key looked up in each record.
    """

    def __init__(self, column_name=''):
        self.column_name = column_name

    def fit(self, X, y=None):
        """Nothing to learn; return ``self``."""
        return self

    def transform(self, X):
        """Return a list with ``record[column_name]`` for each record in ``X``.

        Each record is first passed to ``json.loads``; if decoding fails the
        record is used as-is (so already-decoded records are accepted too).
        """
        values = []
        for record in X:
            try:
                parsed = json.loads(record)
            except Exception:
                # Decoding failed (e.g. the record is not JSON text): fall back
                # to the record itself. ``Exception`` rather than a bare except
                # so KeyboardInterrupt/SystemExit still propagate.
                parsed = record
            values.append(parsed[self.column_name])
        return values

#Select two columns from json record and return an array [c1,c2]
class ColumnSelectTransformer2(base.BaseEstimator, base.TransformerMixin):
    """Pick two fields out of every record and return them as a pair.

    Parameters
    ----------
    column1_name : str
        Key of the first value in each output pair.
    column2_name : str
        Key of the second value in each output pair.
    """

    def __init__(self, column1_name='', column2_name=''):
        self.column1_name = column1_name
        self.column2_name = column2_name

    def fit(self, X, y=None):
        """Nothing to learn; return ``self``."""
        return self

    def transform(self, X):
        """Return ``[[record[column1_name], record[column2_name]], ...]``.

        Each record is first passed to ``json.loads``; if decoding fails the
        record is used as-is (so already-decoded records are accepted too).
        """
        pairs = []
        for record in X:
            try:
                parsed = json.loads(record)
            except Exception:
                # Decoding failed (e.g. the record is not JSON text): fall back
                # to the record itself. ``Exception`` rather than a bare except
                # so KeyboardInterrupt/SystemExit still propagate.
                parsed = record
            pairs.append([parsed[self.column1_name], parsed[self.column2_name]])
        return pairs
    

In [181]:
#Select one column and turn it into an unweighted dictionary of terms
class DictMassagerTransformer(base.BaseEstimator, base.TransformerMixin):
    """Convert each iterable of terms into a ``{term: 1}`` dictionary."""

    def __init__(self):
        # No parameters to store.
        pass

    def fit(self, X, y=None):
        """Nothing to learn; return ``self``."""
        return self

    def transform(self, X):
        """Return one dictionary per element of ``X``, every key mapped to 1."""
        return [dict.fromkeys(terms, 1) for terms in X]

    
#Flatten one level of nested dictionaries into a flat dictionary of 0/1 indicators
class NestFlattenTransformer(base.BaseEstimator, base.TransformerMixin):
    """Flatten one level of nesting into a dictionary of 0/1 indicators.

    For every key whose value is a ``dict``, each inner key produces an entry
    ``outer_inner`` set to 1 when the inner value is truthy and 0 otherwise.
    Every other key produces an entry set to 1.
    """

    def __init__(self):
        # No parameters to store.
        pass

    def fit(self, X, y=None):
        """Nothing to learn; return ``self``."""
        return self

    def transform(self, X):
        """Return one flattened indicator dictionary per record in ``X``."""
        flattened = []
        for record in X:
            indicators = {}
            for name in record:
                value = record[name]
                if not isinstance(value, dict):
                    # NOTE(review): non-dict values are recorded as 1 whatever
                    # the value is (including falsy ones) - confirm intended.
                    indicators[name] = 1
                    continue
                for inner_name in value:
                    indicators[name + '_' + inner_name] = 1 if value[inner_name] else 0
            flattened.append(indicators)
        return flattened

    
#Take a model and add a transform method that returns a prediction
class ModelTransformer(base.BaseEstimator, base.TransformerMixin):
    """Wrap a predictive model so its predictions can be used as a transform.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit(X, y)``, ``predict(X)`` and ``score(X, y)``.
    """

    def __init__(self, model):
        self.model = model

    def fit(self, X, y=None):
        """Fit the wrapped model and return ``self``.

        Returning the wrapper (not the wrapped model) keeps
        ``fit(...).transform(...)`` chaining on this transformer.
        """
        self.model.fit(X, y)
        return self

    def transform(self, X, **transform_params):
        """Return ``model.predict(X)``; extra keyword arguments are ignored."""
        return self.model.predict(X)

    def score(self, X, y=None):
        """Return ``model.score(X, y)``."""
        return self.model.score(X, y)


In [214]:
#Regroup a flat sequence made of `num` equal-length segments into rows of `num` values
class ConcantTransformer(base.BaseEstimator, base.TransformerMixin):
    """Regroup a flat sequence of ``num`` equal-length segments into rows.

    Parameters
    ----------
    num : int, default 3
        Number of segments laid end-to-end in the input.
    """

    def __init__(self, num=3):
        self.num = num

    def fit(self, X, y=None):
        """Nothing to learn; return ``self``."""
        return self

    def transform(self, X):
        """Return a list of rows, row ``i`` holding element ``i`` of each segment.

        The segment length is ``len(X) // num``; trailing elements that do not
        fill a whole segment are ignored.
        """
        # Floor division gives an integer segment length on Python 2 and 3 alike.
        segment_len = len(X) // self.num
        return [[X[row + segment_len * segment] for segment in range(self.num)]
                for row in range(segment_len)]


Location-based model

In [357]:
from sklearn.neighbors import KNeighborsRegressor
# Pipeline is used below; import it here so this cell does not depend on a
# later cell having been executed first.
from sklearn.pipeline import Pipeline

# Extract the star ratings that serve as regression targets.
y_train = ColumnSelectTransformer(column_name='stars').transform(M_train)
y_test = ColumnSelectTransformer(column_name='stars').transform(M_test)
y = ColumnSelectTransformer(column_name='stars').transform(M)

# Candidate neighbourhood sizes for the grid search.
parameters = {'neigh__n_neighbors': list(range(40, 75, 2))}

pipeline = Pipeline([('cm', ColumnSelectTransformer2(column1_name='latitude', column2_name='longitude')),
               ('neigh',  KNeighborsRegressor())])

grid_search = GridSearchCV(pipeline, param_grid=parameters, cv=3)
# NOTE(review): the search is fitted on all of M, which contains M_test, so the
# score printed below is not a held-out estimate - confirm this is intended.
grid_search.fit(M, y)

# Serialise in binary mode and close the file deterministically.
with open('../lib/ml/pipeline_long.dll', 'wb') as model_file:
    dill.dump(grid_search, model_file)
print(grid_search.score(M_test, y_test))
print(grid_search.best_params_)

0.0524583772093
{'neigh__n_neighbors': 62}


In [None]:
# Import what this cell uses so it does not rely on other cells' imports.
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import Pipeline

# Sweep the number of neighbours and print "<score on M_test> <n_neighbors>".
for i in range(1, 75, 5):
    pipeline = Pipeline([('cm', ColumnSelectTransformer2(column1_name='latitude', column2_name='longitude')),
                   ('neigh',  KNeighborsRegressor(n_neighbors=i))])
    pipeline.fit(M_train, y_train)
    print('%s %s' % (pipeline.score(M_test, y_test), i))

In [336]:
#Train on categories
from sklearn.linear_model import LinearRegression
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import Pipeline


# Star ratings for every record are the regression targets.
y = ColumnSelectTransformer(column_name='stars').transform(M)
# categories -> {category: 1} dictionaries -> vectorised features -> linear model
pi = Pipeline([('cm', ColumnSelectTransformer('categories')),
               ('dm', DictMassagerTransformer()),
               ('dv', DictVectorizer()),
               ('ln', LinearRegression())])

# NOTE(review): fitted on all of M, which contains M_test, so the score shown
# below is not a held-out estimate - confirm this is intended.
pi.fit(M, y)
# Serialise in binary mode and close the file deterministically.
with open('../lib/ml/pipeline_categories.dll', 'wb') as model_file:
    dill.dump(pi, model_file)
pi.score(M_test, y_test)

0.19698694875714862

In [355]:
#train on attributes
from sklearn.linear_model import LinearRegression
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import Pipeline


# Star ratings for every record are the regression targets.
y = ColumnSelectTransformer(column_name='stars').transform(M)

# attributes -> flattened 0/1 indicator dictionaries -> vectorised features -> linear model
pi = Pipeline([('cm', ColumnSelectTransformer('attributes')),
               ('dm', NestFlattenTransformer()),
               ('dv', DictVectorizer()),
               ('ln', LinearRegression())])

# NOTE(review): fitted on all of M, which contains M_test, so the score shown
# below is not a held-out estimate - confirm this is intended.
pi.fit(M, y)
prediction = pi.predict(M)
# Serialise in binary mode and close the file deterministically.
with open('../lib/ml/pipeline_attributes.dll', 'wb') as model_file:
    dill.dump(pi, model_file)
pi.score(M_test, y_test)

0.068386414195693357

In [356]:
#Combine all models
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion


# Star ratings are the regression targets.
y = ColumnSelectTransformer(column_name='stars').transform(M)
y_test = ColumnSelectTransformer(column_name='stars').transform(M_test)

# Four sub-models (city mean, lat/long neighbours, categories, attributes).
# Each branch ends in a ModelTransformer, so each branch's transform output is
# that sub-model's predictions.
combined_features  =   FeatureUnion([
    ('mean', Pipeline([     ('cm', ColumnSelectTransformer('city')),
                            ('dm', ModelTransformer(MeanCityEstimator()))])),
    ('latlong', Pipeline([  ('cm', ColumnSelectTransformer2(column1_name='latitude', column2_name='longitude')),
                            ('neigh',  ModelTransformer(KNeighborsRegressor(n_neighbors=62)))])),
    ('category' , Pipeline([('cm', ColumnSelectTransformer('categories')),
                            ('dm', DictMassagerTransformer()),
                            ('dv', DictVectorizer()),
                            ('ln',  ModelTransformer(LinearRegression()))])),
    ('attribute', Pipeline([('cm', ColumnSelectTransformer('attributes')),
                            ('dm', NestFlattenTransformer()),
                            ('dv', DictVectorizer()),
                            ('ln',  ModelTransformer(LinearRegression()))]))])


# NOTE(review): everything here is fitted on all of M, which contains M_test,
# so the score printed below is not a held-out estimate - confirm intended.
features = combined_features.fit(M, y)
# NOTE(review): presumably the union lays the four prediction vectors
# end-to-end and ConcantTransformer(num=4) regroups them into one row of four
# predictions per record - confirm against the FeatureUnion output shape.
pipeline = Pipeline([("concat", ConcantTransformer(num=4)),
                     ("ln", LinearRegression())])
full_model = pipeline.fit(features.transform(M), y)

# Serialise in binary mode and close each file deterministically.
with open('../lib/ml/pipeline_full.dll', 'wb') as model_file:
    dill.dump(full_model, model_file)
with open('../lib/ml/features_full.dll', 'wb') as features_file:
    dill.dump(features, features_file)
print(full_model.score(features.transform(M_test), y_test))

0.221688109967
