In [238]:
import gzip, ujson, re, json
import pandas as pd
import numpy as np
from sklearn import cross_validation
from sklearn import metrics, base, neighbors, grid_search
import dill
import os
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy import sparse
from sklearn import linear_model
from sklearn.pipeline import Pipeline, FeatureUnion

import toolz
def pick(whitelist, dicts):
    return [toolz.keyfilter(lambda k: k in whitelist, d)
            for d in dicts]
# load data into a yelp(a list of dict)
yelp = []
with gzip.open('yelp_train_academic_dataset_business.json.gz', 'rb') as f:
    for line in f:
        yelp.append(ujson.loads(line))
X = yelp
y = [i['stars'] for i in yelp]

In [239]:
class Q2Transformer(base.BaseEstimator, base.TransformerMixin):
    def __init__(self):
        self.trans_=[]

    def fit(self, X, y=None):
        return self
        
    def transform(self, X):
        if type(X) is list:
            return [[i["latitude"], i["longitude"]] for i in X]
        else:
            return [X["latitude"], X["longitude"]]


In [240]:
class Q3Transformer(base.BaseEstimator, base.TransformerMixin):
    '''
        class variable: self.col; self.vectorizer
    '''
    def __init__(self):      
        self.col = 'categories' # initialize the column name

    def fit(self, X, y=None):
        # pick the column
        pick_category = pick(self.col, X)
        category_train = [' '.join(pick_category[i].values()[0]) for i in range(0,len(pick_category))]
        
        # transform the training records
        self.vectorizer = TfidfVectorizer(min_df=1)  
        self.vectorizer.fit_transform(category_train)
        
        return self

    
    def transform(self, X):
        # transform the test record
        if type(X) is list:
            pick_category = pick(self.col, X)
            category_X = [' '.join(pick_category[i].values()[0]) for i in range(0,len(pick_category))]
        else:
            category_X = [' '.join(X[self.col])]
        
        X_trans = self.vectorizer.transform(category_X)
        return X_trans 


In [241]:
class Q4Transformer(base.BaseEstimator, base.TransformerMixin):
    '''
        class variable: self.col; self.vectorizer
    '''
    def __init__(self):      
        self.col = 'attributes' # initialize the column name
   
    # flatten out dics of dicts
    def flatten_dict(self, Xdict):
        p_dict = Xdict.copy()
        for key in p_dict.keys():
            #print key, p_dict[key], type(p_dict[key])
            if type(p_dict[key]) == dict: 
                # son is a dict, flatten
                son_dict = self.flatten_dict(p_dict[key]).copy()
                for son_key in son_dict.keys():
                    son_dict[key+'_'+son_key] = son_dict.pop(son_key)
                del p_dict[key]
                p_dict.update(son_dict)
            
            elif type(p_dict[key]) in [unicode,str]:
                # son is a string, concatate to key
                son_str = p_dict[key]
                p_dict[key] = 1
                p_dict[key+'_'+son_str] = p_dict.pop(key)
            
            elif type(p_dict[key]) not in [bool, int, float]:    
                raise ValueError("type error in flatten_dict!")
        return p_dict


    def fit(self, X, y=None):
        # flatten the train dict
        attr_train = [self.flatten_dict(record[self.col]) for record in X]
        
        # transform the training records
        self.vectorizer = DictVectorizer(sparse=False)
        self.vectorizer.fit_transform(attr_train)
        return self

    def transform(self, X):
        # transform the test record
        if type(X) is list:
            attr_X = [self.flatten_dict(record[self.col]) for record in X]
        else:
            attr_X = self.flatten_dict(X[self.col])
        X_trans = self.vectorizer.transform(attr_X)
        return X_trans 


In [242]:
class Q5Transformer(base.BaseEstimator, base.TransformerMixin):
    '''
        class variable: self.col; self.vectorizer
    '''
    def __init__(self):      
        self.col = []

    def fit(self, X, y=None):
        Trans2 = Q2Transformer()
        Trans3 = Q3Transformer()
        Trans4 = Q4Transformer()
        combined_features = FeatureUnion([("Q2", Trans2), ("Q3", Trans3), ("Q4", Trans4)])
        self.fit = combined_features.fit(X)
        return self

    def transform(self, X):
        return self.fit.transform(X).A
       
        
        
        
class EstimatorQ5(base.BaseEstimator, base.RegressorMixin):
    '''
        self.col, self.transformer, self.clf
    '''
    def __init__(self):
        Q5transformer = Q5Transformer()  # initialize
        self.transformer = Q5transformer

    def fit(self, X, y):
        # transform training data        
        self.transformer.fit(X)
        X_train = self.transformer.transform(X)
        
        # train the model
        clf = linear_model.RidgeCV(alphas=[0.1, 1.0, 10.0])
        self.clf = clf.fit(X_train, y)        
        return self.clf

    def predict(self, X):       
        X_test = self.transformer.transform(X) # transform test data
        
        return float(self.clf.predict(X_test)) # predice test result


In [243]:
Q5estimator = EstimatorQ5()  # initialize
Q5estimator.fit(X, y)  # fit data

RidgeCV(alphas=[0.1, 1.0, 10.0], cv=None, fit_intercept=True, gcv_mode=None,
    normalize=False, scoring=None, store_cv_values=False)

In [244]:
dill.dump(Q5estimator, open("myQ5", "w"))