## Part II: Predictive Model

In [29]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.cluster import MiniBatchKMeans
from sklearn.linear_model import SGDRegressor
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.cross_validation import KFold
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.base import TransformerMixin
from bs4 import BeautifulSoup 
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import RegexpTokenizer
import random
from collections import defaultdict
%matplotlib inline

In [6]:
# Notebook-wide configuration.
# Seed value; presumably meant to be passed as `random_state` to the
# stochastic steps further down so runs are repeatable -- TODO confirm usage.
random_state = 2017

In [3]:
# build the transformers
class VariableExtractor(TransformerMixin):
    '''Select one or more columns from a data set.

    Parameters
    ----------
    variables : label or list of labels
        Passed unchanged to ``dataset[...]``; a single label yields a Series,
        a list of labels yields a DataFrame.
    '''
    def __init__(self, variables):
        self.variables = variables

    def fit(self, dataset=None, y=None, **fit_params):
        '''Nothing is learned; the arguments exist so that Pipeline and
        fit_transform can call ``fit(X, y)``. Returns ``self``.'''
        return self

    def transform(self, dataset):
        '''Return ``dataset[self.variables]``.'''
        return dataset[self.variables]




In [4]:
class RatioCreator(TransformerMixin):
    '''Create a new variable as the ratio of two variables.

    Parameters
    ----------
    variable1 : label
        Column used as the numerator.
    variable2 : label
        Column used as the denominator. Zeros are replaced by 1 before
        dividing so the ratio never divides by zero.
    '''
    def __init__(self, variable1, variable2):
        self.variable1 = variable1
        self.variable2 = variable2

    def fit(self, dataset=None, y=None, **fit_params):
        '''Nothing is learned; accepts ``X``/``y`` for Pipeline compatibility.'''
        return self

    def transform(self, dataset):
        '''Return ``dataset[variable1] / dataset[variable2]`` as a Series.'''
        denominator = dataset[self.variable2]
        # Keep every non-zero denominator, substitute 1 where it is exactly 0.
        safe_denominator = denominator.where(denominator != 0, 1)
        return dataset[self.variable1] / safe_denominator

In [5]:
class DimOneUp(TransformerMixin):
    '''Turn a 1-D Series (or array) into an array with 2 dimensions.'''

    def fit(self, series=None, y=None, **fit_params):
        '''Nothing is learned; accepts ``X``/``y`` for Pipeline compatibility.'''
        return self

    def transform(self, series):
        '''Return the values as a column vector of shape ``(n, 1)``.

        ``Series.reshape`` is not available in current pandas, so the values
        are converted to a NumPy array first.
        '''
        values = np.asarray(series)
        # Explicit (n, 1) target keeps the original failure for inputs that
        # are not one value per row instead of silently flattening them.
        return values.reshape((values.shape[0], 1))
        

In [7]:
# cluster longitude and latitude
class LLCluster(TransformerMixin):
    '''Cluster longitude and latitude with MiniBatchKMeans.

    Parameters
    ----------
    n_cluster : int
        Number of clusters; forwarded as the estimator's first argument
        (``n_clusters``).
    *args, **kwargs
        Forwarded unchanged to ``MiniBatchKMeans`` (e.g. ``random_state=...``).
    '''
    def __init__(self, n_cluster, *args, **kwargs):
        # Passed positionally: the estimator's keyword is `n_clusters`, and a
        # keyword here would collide with any extra positional arguments.
        self.model = MiniBatchKMeans(n_cluster, *args, **kwargs)

    def fit(self, dataset, y=None, **fit_params):
        '''Fit the clustering model on ``dataset`` and return ``self``.'''
        self.model.fit(dataset)
        return self

    def transform(self, dataset):
        '''Return the cluster label predicted for every row of ``dataset``.'''
        return self.model.predict(dataset)
 

In [8]:
class VariableLength(TransformerMixin):
    '''Get the length of the variable when it is a list.

    Parameters
    ----------
    variable : label
        Column whose entries support ``len()``.
    '''
    def __init__(self, variable):
        self.variable = variable

    def fit(self, dataset=None, y=None, **fit_params):
        '''Nothing is learned; accepts ``X``/``y`` for Pipeline compatibility.'''
        return self

    def transform(self, dataset):
        '''Return a Series holding ``len(entry)`` for each row of the column.'''
        return dataset[self.variable].apply(len)

In [None]:
class FeatureCleanser(TransformerMixin):
    '''Clean the features
       Typical features in the data set: ['featureA', 'featureB']
       But some features are like ['featureA**featureB'].
       Turn those features into ['featureA', 'featureB']

       Parameters
       ----------
       spliter : iterable of str
           Separator strings. Any run of consecutive separators splits a
           feature into several features.
    '''

    def __init__(self, spliter=('*', '.', '^')):
        # Copy into a list so a caller-owned (or default) sequence is never shared.
        self.spliter = list(spliter)

    def fit(self, dataset=None, y=None, **fit_params):
        '''Nothing is learned; accepts ``X``/``y`` for Pipeline compatibility.'''
        return self

    def transform(self, dataset):
        '''Return a Series with the cleaned list for every 'features' entry.'''
        return dataset['features'].apply(self.feature_clean)

    def feature_clean(self, feature_list):
        '''Clean the features.

        Returns a NEW list (the input list is left untouched) in which every
        entry is stripped and lower-cased, and entries containing a separator
        are replaced by their non-empty parts.
        '''
        splitter = self._splitter_regex()
        cleaned = []
        for raw in feature_list:
            pieces = [raw] if splitter is None else splitter.split(raw)
            if len(pieces) == 1:
                # No separator present: keep the entry, only normalise it.
                cleaned.append(raw.strip().lower())
            else:
                # Leading/trailing separators produce empty pieces; drop them.
                cleaned.extend(p.strip().lower() for p in pieces if p.strip())
        return cleaned

    def _splitter_regex(self):
        '''Compile a pattern matching one or more consecutive separators.'''
        # re.escape keeps '*', '.', '^' literal; alternation (rather than a
        # character class joined with '|') avoids treating '|' as a separator.
        tokens = [re.escape(s) for s in self.spliter if s]
        if not tokens:
            return None
        return re.compile('(?:{})+'.format('|'.join(tokens)))



In [41]:
class DiffFeatCounts(TransformerMixin):
    '''For the Feature record, create a data set to count the most different features across classes.

    Parameters
    ----------
    sample_size : int
        Number of rows sampled per interest level in every iteration.
    min_freq : int
        Minimum total number of sampled occurrences for a feature to qualify.
    n_iter : int
        Number of balanced sampling rounds.
    threshold : float
        Minimum share of a feature's occurrences that must fall into a single
        interest level for the feature to qualify.
    random_state : int
        Seed for the sampling.
    '''
    def __init__(self, sample_size, min_freq, n_iter=10, threshold=0.5, random_state=0):
        self.sample_size = sample_size
        self.min_freq = min_freq
        self.n_iter = n_iter
        self.threshold = threshold
        self.random_state = random_state

    def fit(self, dataset, y, *args):
        '''Remember the training data and derive ``self.diff_feat``.

        ``dataset`` needs a 'features' column of lists; ``y`` holds one
        interest level per row (a Series aligned on the index of ``dataset``,
        or any sequence in row order).
        '''
        self.fit_set = dataset
        self.y = y
        self.diff_feat = self.find_DiffFeat()
        return self

    def transform(self, dataset):
        '''Return a DataFrame with one row per record and one column per
        feature in ``self.diff_feat``, holding how often it occurs.'''
        rows = [[features.count(f) for f in self.diff_feat]
                for features in dataset['features']]
        # The explicit reshape keeps a well-formed (n_rows, n_features) array
        # even when there are no rows or no selected features.
        counts = np.array(rows, dtype=int).reshape(len(rows), len(self.diff_feat))
        return pd.DataFrame(counts, index=dataset.index, columns=self.diff_feat)

    def feature_counts(self, features):
        '''For each 'features' record, count the frequency of the different features in the feature record and
           return the counts as a Series indexed by ``self.diff_feat``.
        '''
        return pd.Series([features.count(f) for f in self.diff_feat],
                         index=self.diff_feat, dtype=int)

    def find_DiffFeat(self):
        '''Find the most different features across the interest levels.
           Criteria: features appear >= min_freq
                     any(%interest_level >= threshold)
           Return a list of different features.

           Raises ValueError (from pandas) when an interest level has fewer
           than ``sample_size`` rows.
        '''
        # feature_df = {'featureA': {'low':30, 'medium':10, 'high':2}, 'featureB': ...}
        feature_df = defaultdict(lambda: defaultdict(int))
        levels = self.y
        if not isinstance(levels, pd.Series):
            levels = pd.Series(np.asarray(levels), index=self.fit_set.index)
        # DataFrame.sample draws from NumPy, not from the `random` module, so
        # the seed has to be handed to it explicitly.
        rng = np.random.RandomState(self.random_state)
        # Iterate the process. In each iteration, sample a subset with equal number of each interest level
        for _ in range(self.n_iter):
            for level in levels.unique():
                level_features = self.fit_set.loc[levels == level, 'features']
                sampled = level_features.sample(n=self.sample_size, random_state=rng)
                for feature_list in sampled:
                    for feature in feature_list:
                        feature_df[feature][level] += 1

        diff_feat = []
        for feature, level_counts in feature_df.items():
            total = sum(level_counts.values())
            # float() keeps this a true division under Python 2 as well.
            if total >= self.min_freq and max(level_counts.values()) / float(total) >= self.threshold:
                diff_feat.append(feature)
        return diff_feat

In [10]:
class DescriptionProcessor(TransformerMixin):
    '''Process the description.

    Each description has its digits removed, is tokenized into word tokens
    (punctuation dropped), stemmed, and then vectorized with TF-IDF.

    Parameters
    ----------
    *args, **kwargs
        Forwarded to ``TfidfVectorizer``. Unless the caller supplies its own
        ``preprocessor``, this object's ``preprocessor`` method is used.
    '''
    def __init__(self, *args, **kwargs):
        self.stemmer = SnowballStemmer('english')
        self.tokenizer = RegexpTokenizer(r'\w+') #only keep the words, do not keep the punctuations
        # A bound method (instead of a lambda) keeps the object picklable and
        # avoids referring to names that are not in scope.
        kwargs.setdefault('preprocessor', self.preprocessor)
        self.vectorizer = TfidfVectorizer(*args, **kwargs)

    def fit(self, description, y=None, **fit_params):
        '''Fit the TF-IDF vectorizer on the descriptions and return ``self``.'''
        self.vectorizer.fit(description)
        return self

    def transform(self, description):
        '''Return the TF-IDF matrix of the descriptions as a dense array.'''
        return self.vectorizer.transform(description).toarray()

    def preprocessor(self, text, stemmer=None, tokenizer=None):
        '''Preprocess the description

        ``stemmer`` / ``tokenizer`` default to the instances created in
        ``__init__``; they may be overridden per call.
        '''
        stemmer = self.stemmer if stemmer is None else stemmer
        tokenizer = self.tokenizer if tokenizer is None else tokenizer
        # remove numbers
        text = re.sub('[0-9]+', '', text)
        # tokenize the description, stem every token and re-join with spaces
        return ' '.join(stemmer.stem(token) for token in tokenizer.tokenize(text))

    
    
    
    
    
    
    

In [11]:
class CatFeatureCounts(TransformerMixin):
    '''Count number of list for each categorical variable.'''
    def fit(self, dataseries, y=None, **fit_params):
        '''Store how often every category occurs in ``dataseries``.'''
        self.catcounts = dataseries.value_counts()
        return self

    def transform(self, dataseries):
        '''Return, for every entry, the count learned in ``fit``
        (0 for categories that were not seen during ``fit``).'''
        return dataseries.map(self.catcounts).fillna(0).astype(int)
        
        
        

In [39]:
class CatFeatureIndicator(TransformerMixin):
    '''Indicate if the categorical variable has more low, medium or high.
       Criteria: Frequency of the category >= min_list
                 For a category, the percent of any interest_level greater than
                 or equal to the corresponding threshold.

       Parameters
       ----------
       min_list : int
           Minimum number of listings a category needs to be considered.
       threshold : float or dict
           Either one share applied to every level, or a mapping
           ``{level: share}``; levels missing from the mapping never match.
    '''
    def __init__(self, min_list, threshold):
        self.min_list = min_list
        self.threshold = threshold
        self.ylevels = []
        self.hml_features = {}

    def fit(self, dataseries, y, **fit_params):
        '''Learn, per level of ``y``, the categories dominated by that level.

        A category is assigned to the first level (in order of appearance in
        ``y``) whose share reaches the threshold, and to at most one level.
        '''
        if not isinstance(y, pd.Series):
            y = pd.Series(np.asarray(y), index=dataseries.index)
        cat_counts = dataseries.value_counts()
        self.ylevels = list(y.unique())
        # Rebuilt on every fit so repeated fits do not accumulate categories,
        # and so every level has an entry even when no category matches it.
        self.hml_features = {level: [] for level in self.ylevels}
        # restrict to categories with at least min_list listings
        eligible = cat_counts[cat_counts >= self.min_list].index
        for category in eligible:
            y_pectages = self.y_pect(dataseries, y, category)
            for ylevel in self.ylevels:
                # A level that never occurs for this category has share 0.
                if y_pectages.get(ylevel, 0.0) >= self._threshold_for(ylevel):
                    self.hml_features[ylevel].append(category)
                    break
        return self

    def transform(self, dataseries):
        '''Return a boolean DataFrame with one column per level, True where
        the entry's category was assigned to that level during ``fit``.'''
        indicators = {level: dataseries.isin(self.hml_features[level])
                      for level in self.ylevels}
        return pd.DataFrame(indicators, index=dataseries.index, columns=self.ylevels)

    def y_pect(self, dataseries, y, category):
        '''Return the share of every level of ``y`` among rows of ``category``.'''
        return y[dataseries == category].value_counts(normalize=True)

    def single_transform(self, category):
        '''Return the boolean indicators (ordered like ``self.ylevels``) for one category.'''
        return np.array([category in self.hml_features[level] for level in self.ylevels])

    def _threshold_for(self, level):
        '''Resolve the threshold that applies to ``level``.'''
        if isinstance(self.threshold, dict):
            return self.threshold.get(level, float('inf'))
        return self.threshold
        
        
        
        
        
        
        
        
        
        
        
        

In [38]:
class DateProcessor(TransformerMixin):
    '''Returns the year, month and hour of the date

    The parts are cut out of the textual form of the 'created' column by
    position (characters 0-3, 5-6 and 11-12), so they are returned as strings.

    Parameters
    ----------
    wantyear, wantmonth, wanthour : bool
        Select which of the 'year', 'month' and 'hour' columns are returned.
    '''
    def __init__(self, wantyear=False, wantmonth=False, wanthour=True):
        self.wantyear = wantyear
        self.wantmonth = wantmonth
        self.wanthour = wanthour

    def fit(self, dataset=None, y=None, **fit_params):
        '''Nothing is learned; accepts ``X``/``y`` for Pipeline compatibility.'''
        return self

    def transform(self, dataset):
        '''Return a DataFrame holding the requested parts of ``dataset['created']``.'''
        created = dataset['created'].astype(str)
        parts = pd.DataFrame(
            {'year': created.str[:4], 'month': created.str[5:7], 'hour': created.str[11:13]},
            columns=['year', 'month', 'hour'],
        )
        flags = (('year', self.wantyear), ('month', self.wantmonth), ('hour', self.wanthour))
        return parts[[name for name, wanted in flags if wanted]]

    def process_date(self, date):
        '''Split a single date into a Series with 'year', 'month' and 'hour'.'''
        date = str(date)
        year = date[:4]
        month = date[5:7]
        hour = date[11:13]
        return pd.Series([year, month, hour], index=('year', 'month', 'hour'))


In [None]:
class AddressCleanser(TransformerMixin):
    '''Clean the address.
        Strip and lowercase the address. Standardize synonyms into one expression.

        Parameters
        ----------
        synonyms : iterable of (pattern, replacement) pairs, or dict
            Every ``pattern`` is a regular expression handed to ``re.sub``;
            substitutions are applied in order on the lower-cased address.
    '''
    def __init__(self, synonyms):
        self.synonyms = synonyms

    def fit(self, dataseries=None, y=None, **fit_params):
        '''Nothing is learned; accepts ``X``/``y`` for Pipeline compatibility.'''
        return self

    def transform(self, dataseries):
        '''Return a Series with every address cleaned by ``clean_address``.'''
        return dataseries.apply(self.clean_address)

    def clean_address(self, address):
        '''Strip, lower-case and apply the synonym substitutions to one address.'''
        address = address.strip()
        address = address.lower()
        pairs = self.synonyms.items() if isinstance(self.synonyms, dict) else self.synonyms
        for s1, s2 in pairs:
            address = re.sub(s1, s2, address)
        return address
        

In [12]:
# Read in the training and testing data sets.
# Forward slashes keep the relative paths portable: they resolve on Windows
# as well as on POSIX systems, whereas '..\data\...' only works on Windows.
train_set = pd.read_json('../data/train.json')
test_set = pd.read_json('../data/test.json')
print("Read in training data set with size of {} * {}".format(*train_set.shape))
print("Read in testing data set with size of {} * {}".format(*test_set.shape))


Read in training data set with size of 49352 * 15
Read in testing data set with size of 74659 * 14
