In [1]:
#=====================================================================================================
# Author: Ben Grauer
# Purpose: General notebook for feature / model runs and submission (after all pre-processing of files)
#
#=====================================================================================================

In [2]:
import pandas as pd
import numpy as np

import datetime

import matplotlib.pyplot as plt


from sklearn.linear_model import LogisticRegression
from sklearn import linear_model
from sklearn import svm
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

from sklearn.grid_search import GridSearchCV
from sklearn.grid_search import RandomizedSearchCV
from sklearn.ensemble import GradientBoostingRegressor

import tensorflow as tf

import statsmodels.api as sm

  from numpy.core.umath_tests import inner1d


In [3]:
# Root folder of the competition data (note the trailing slash: file names are appended directly)
workDir = "D:/project/data/kg_avito_demand/"

# Input file names, relative to workDir
fileTrain = "train.csv"
fileTest = "test.csv"

In [4]:
# Standard function to output submission files
def output_submission_file(df, modelType):
    """Write `df` to a time-stamped CSV in the results folder under `workDir`.

    The file is named 'results_<modelType>_<YYYYmmdd_HHMMSS>.csv' and the
    DataFrame index is not written.

    Parameters
    ----------
    df : DataFrame to export.
    modelType : str, label embedded in the file name.
    """
    # Time-stamp taken at call time, used to tell separate runs apart
    runStamp = datetime.datetime.today().strftime('%Y%m%d_%H%M%S')
    targetPath = workDir + '/results/results_' + modelType + '_' + runStamp + '.csv'
    df.to_csv(targetPath, index=False)

## Data Loading

In [5]:
# Use the helper script (project-local module `helper`).
# fn_LoadData receives the full train and test CSV paths and returns the two
# frames used by every later cell; per the log it prints, it also adds stat
# features, cleans the data and merges image/text stat files.
import helper
dfTrain, dfTest = helper.fn_LoadData(workDir + fileTrain, workDir + fileTest)

Loading Train and Test Files
0:00:29.147204
Adding Stats Features
Total Train Records: 1503424
Total Test Records: 508438
0:00:59.490065
Cleaning data
0:01:02.468101
Loading Image + Text Data
Loading Train Image Stat Files (3-splits)
0:01:36.308608
Loading Test Image Stat Files
0:01:47.407927
Loading Text Stat Files
0:02:15.903773
Merging Image Stat Files
0:02:30.979460
Mergeing Text Stat Files
Finished Loading and Merging Image and Text Files
0:02:35.561207
:-) Train: 1503424  Super: 1503424


In [6]:
# Peek at the first two training rows to check the loaded/merged columns
dfTrain.head(2)

Unnamed: 0,item_id,user_id,region,city,parent_category_name,category_name,param_1,param_2,param_3,title,...,desc_numNumericWords,desc_numUpperCaseWords,desc_avgImportantWordLength,desc_avgAllWordLength,desc_numSentences,desc_avgWordsPerSentence,desc_avgWordLengthPerSentence,blur_quality,image_whitespace,image_present
0,b912c3c6a6ad,e00f8ff2eaf9,Sverdlovsk region,Ekaterinburg,Personal things,Goods for children and toys,Bed dress,,,Kokobi (cocoon for sleep),...,0,0,6.666667,4.818182,1,6.0,6.666667,good,0.0,1
1,2dac0150717d,39aeb48f0017,Samara Region,Samara,For home and cottages,Furniture and interior,Other,,,Rack for Clothes,...,0,0,6.0,4.0,2,2.5,6.0,good,0.0,1


In [7]:
# Peek at the first two test rows to check the loaded/merged columns
dfTest.head(2)

Unnamed: 0,item_id,user_id,region,city,parent_category_name,category_name,param_1,param_2,param_3,title,...,desc_numNumericWords,desc_numUpperCaseWords,desc_avgImportantWordLength,desc_avgAllWordLength,desc_numSentences,desc_avgWordsPerSentence,desc_avgWordLengthPerSentence,blur_quality,image_whitespace,image_present
0,6544e41a8817,dbe73ad6e4b5,Volgograd region,Volgograd,Personal things,Children's clothing and footwear,For boys,Footwear,25.0,I'll give free of charge,...,0,0,4.5,4.0,1,2.0,4.5,average,0.0,1
1,65b9484d670f,2e11806abe57,Sverdlovsk region,Lower Tour,Hobbies and Recreation,Bicycles,Road,,,Selling a bicycle,...,0,2,6.090909,4.05,3,3.666667,6.111111,,,0


## Convert category features

In [8]:
def _train_test_dummies(column):
    """One-hot encode `column` of dfTrain and dfTest with identical dummy columns.

    Encoding the two frames independently yields different column sets whenever
    a category value occurs in only one of them, and a model fitted on the train
    dummies then cannot score the test dummies. The test dummies are therefore
    re-indexed to the train columns: categories missing from test become
    all-zero columns, and categories seen only in test are dropped (no model
    could have learned them). When both frames hold the same categories the
    result is the same as two plain get_dummies calls.

    Returns
    -------
    (trainDummies, testDummies) : tuple of DataFrames
    """
    trainDummies = pd.get_dummies(dfTrain[column])
    testDummies = pd.get_dummies(dfTest[column]).reindex(columns=trainDummies.columns, fill_value=0)
    return trainDummies, testDummies


# Category Name
train_category_name_dummies, test_category_name_dummies = _train_test_dummies('category_name')

# User Type
train_user_type_dummies, test_user_type_dummies = _train_test_dummies('user_type')
# train_city_dummies = pd.get_dummies(dfTrain['city']) # won't work

# Region
train_region_dummies, test_region_dummies = _train_test_dummies('region')

# Parent Category Name
train_parent_category_name_dummies, test_parent_category_name_dummies = _train_test_dummies('parent_category_name')

# Blur Quality
train_blur_quality_dummies, test_blur_quality_dummies = _train_test_dummies('blur_quality')

## Setup the different models

In [9]:
# Running of different types of models
# Each section builds one candidate train/test feature frame by concatenating dummy
# columns and raw columns side by side. Only the pair selected at the bottom
# (trainModel / testModel) is used by the rest of the cell.

# FIRST MODEL - category_name, user_type
trainModel1 = pd.concat([train_category_name_dummies, train_user_type_dummies], axis=1)
testModel1 = pd.concat([test_category_name_dummies, test_user_type_dummies], axis=1)
model1Variables = ['category_name','user_type']


# SECOND MODEL
# categoryname, usertype, city - bad RMSE
#newTrain = pd.concat([train_category_name_dummies, train_user_type_dummies, train_city_dummies, dfTrain[['deal_probability']]], axis=1)
#newTest = pd.concat([test_category_name_dummies, test_user_type_dummies, test_city_dummies], axis=1)

# Fourth MODEL - categoryname, usertype, price
trainModel4 = pd.concat([train_category_name_dummies, train_user_type_dummies, dfTrain[['price']]], axis=1)
testModel4 = pd.concat([test_category_name_dummies, test_user_type_dummies, dfTest['price']], axis=1)
# (no scaling here - see the MinMaxScaler step further down in this cell)
model4Variables = ['category_name','user_type','price']

# MODEL 5 - parent_category_name, usertype
model5Variables = ['parent_category_name','user_type']
trainModel5 = pd.concat([train_parent_category_name_dummies, train_user_type_dummies], axis=1)
testModel5 = pd.concat([test_parent_category_name_dummies, test_user_type_dummies], axis=1)
# (no scaling here - see the MinMaxScaler step further down in this cell)


# MODEL 6 - parent_category_name, usertype, price - not real improvement
model6Variables = ['parent_category_name','user_type','price']
trainModel6 = pd.concat([train_parent_category_name_dummies, train_user_type_dummies, dfTrain[['price']]], axis=1)
testModel6 = pd.concat([test_parent_category_name_dummies, test_user_type_dummies, dfTest['price']], axis=1)
# (no scaling here - see the MinMaxScaler step further down in this cell)


# Model 7 - lots of params
# NOTE: unlike most of the other train frames, this one already includes 'deal_probability'
model7Variables = ['category_name','user_type', 'price','price_mean','item_seq_number_mean']
trainModel7 = pd.concat([train_category_name_dummies, train_user_type_dummies, dfTrain[['price','price_mean','item_seq_number_mean','deal_probability']]], axis=1)
testModel7 = pd.concat([test_category_name_dummies, test_user_type_dummies, dfTest[['price','price_mean','item_seq_number_mean']]], axis=1)


# Model 8 - same as model 7 without user_type
# NOTE: this train frame also already includes 'deal_probability'
model8Variables = ['category_name', 'price','price_mean','item_seq_number_mean']
trainModel8 = pd.concat([train_category_name_dummies, dfTrain[['price','price_mean','item_seq_number_mean','deal_probability']]], axis=1)
testModel8 = pd.concat([test_category_name_dummies, dfTest[['price','price_mean','item_seq_number_mean']]], axis=1)

# Model 9 - Include Text Features
# desc_avgWordLengthPerSentence - Need to replace or impute
# desc_wordsCondition - Need to analyze
textCols = ['desc_numWords','desc_numStopWords','desc_numNouns','desc_numVerbs','desc_numAdjs',\
'desc_numSymbols','desc_numNonASCIIWords','desc_numNumericWords','desc_numUpperCaseWords',\
'desc_avgImportantWordLength','desc_avgAllWordLength','desc_numSentences','desc_avgWordsPerSentence']

# NOTE: this train frame also already includes 'deal_probability'
model9Variables = ['category_name', 'user_type', 'text columns - minus 2']
trainModel9 = pd.concat([train_category_name_dummies, train_user_type_dummies, \
                         dfTrain[textCols], dfTrain[['deal_probability']]], axis=1)
testModel9 = pd.concat([test_category_name_dummies, test_user_type_dummies, \
                        dfTest[textCols]], axis=1)

# Model10 - with some Image Features
imageCols = ['clrchn_b_mean','clrchn_g_mean','clrchn_r_mean','blurColorScale','brightness','contrastColorScale']

# NOTE: the frames below also include the category_name dummies, which model10Variables does not list
model10Variables = ['user_type', 'beginner image columns']
trainModel10 = pd.concat([train_category_name_dummies, train_user_type_dummies, \
                         dfTrain[imageCols]], axis=1)
testModel10 = pd.concat([test_category_name_dummies, test_user_type_dummies, \
                        dfTest[imageCols]], axis=1)


# MODEL 11 - parent_category_name, usertype, price, text + image columns - not real improvement
model11Variables = ['parent_category_name','user_type','price','textcol','imagecol']
trainModel11 = pd.concat([train_parent_category_name_dummies, train_user_type_dummies, \
                          dfTrain[['price']], dfTrain[textCols], dfTrain[imageCols]], axis=1)
testModel11 = pd.concat([test_parent_category_name_dummies, test_user_type_dummies, \
                         dfTest[['price']], dfTest[textCols], dfTest[imageCols]], axis=1)

# MODEL 12 - Just Image + Text Features
model12Variables = ['textCol','imagecol']
trainModel12 = pd.concat([dfTrain[textCols], dfTrain[imageCols]], axis=1)
testModel12 = pd.concat([dfTest[textCols], dfTest[imageCols]], axis=1)

# MODEL 13 - region, category_name, text + image columns
# NOTE(review): 'region' is concatenated as the raw column here (not train_region_dummies) -
# confirm that is intended before selecting this frame.
model13Variables = ['contrast','brightness','blur']
#textCols = ['contrastColorScale','brightness','blurColorScale']
trainModel13 = pd.concat([dfTrain['region'], train_category_name_dummies, dfTrain[textCols], dfTrain[imageCols]], axis=1)
testModel13 = pd.concat([dfTest['region'], test_category_name_dummies, dfTest[textCols], dfTest[imageCols]], axis=1)

# MODEL 13 v2 - overwrites model13Variables, trainModel13 / testModel13 and imageCols from above
model13Variables = ['contrast','brightness','blur','price']
imageCols = ['contrastColorScale','brightness','blurColorScale']
trainModel13 = pd.concat([dfTrain['region'], train_parent_category_name_dummies, train_user_type_dummies, dfTrain[['contrastColorScale','brightness','blurColorScale','price']]], axis=1)
testModel13 = pd.concat([dfTest['region'], test_parent_category_name_dummies, test_user_type_dummies, dfTest[['contrastColorScale','brightness','blurColorScale','price']]], axis=1)

# Next - just try without the categories?  See which coefficients are bringing it down.
# Or group by the category name, and then do the images + price
# MODEL 14 - user_type + two image columns.
# NOTE: model13Variables and imageCols are overwritten again here; from this point on
# imageCols holds only these two columns.
model13Variables = ['brightness','blurColorScale']
imageCols = ['brightness','blurColorScale']
trainModel14 = pd.concat([train_user_type_dummies, dfTrain[imageCols]], axis=1)
testModel14 = pd.concat([test_user_type_dummies, dfTest[imageCols]], axis=1)

# MODEL 15 - parent_category_name, user_type, three text columns
trainModel15 = pd.concat([train_parent_category_name_dummies, train_user_type_dummies, \
                          dfTrain[['desc_numVerbs','desc_numAdjs','desc_avgWordsPerSentence']]], axis=1)
testModel15 = pd.concat([test_parent_category_name_dummies, test_user_type_dummies, \
                         dfTest[['desc_numVerbs','desc_numAdjs','desc_avgWordsPerSentence']]], axis=1)


# NEXT ROUND - STARTING OVER
# MODEL 20 - parent_category_name, category_name, user_type, number of verbs
trainModel20 = pd.concat([train_parent_category_name_dummies, train_category_name_dummies, train_user_type_dummies, \
                          dfTrain[['desc_numVerbs']]], axis=1)
testModel20 = pd.concat([test_parent_category_name_dummies, test_category_name_dummies, test_user_type_dummies, \
                         dfTest[['desc_numVerbs']]], axis=1)
# NOTE: re-assigns model1Variables (same value as for the first model)
model1Variables = ['category_name','user_type']

# MODEL 21 - model 20 + blur quality dummies, image white space and image-present flag
trainModel21 = pd.concat([train_parent_category_name_dummies, train_category_name_dummies, train_user_type_dummies, train_blur_quality_dummies, \
                          dfTrain[['desc_numVerbs','image_whitespace','image_present']]], axis=1)
testModel21 = pd.concat([test_parent_category_name_dummies, test_category_name_dummies, test_user_type_dummies, test_blur_quality_dummies, \
                         dfTest[['desc_numVerbs','image_whitespace','image_present']]], axis=1)

# MODEL 22 - category_name, user_type, blur quality dummies, image white space and image-present flag
trainModel22 = pd.concat([train_category_name_dummies, train_user_type_dummies, train_blur_quality_dummies, \
                          dfTrain[['image_whitespace','image_present']]], axis=1)
testModel22 = pd.concat([test_category_name_dummies, test_user_type_dummies, test_blur_quality_dummies, \
                         dfTest[['image_whitespace','image_present']]], axis=1)

# Select the candidate pair to run; these names alias the chosen frames (no copy is made here).
trainModel = trainModel22
testModel = testModel22
# Flags read by the scaling step below: set to 1 only when the selected frames
# contain the imageCols / textCols columns.
bUseImages = 0
bUseText= 0

# Fill NA - in the future - fill with median
# Not done in place: trainModel / testModel are aliases of the selected candidate
# frames, and working on new frames keeps those candidates untouched.
trainModel = trainModel.fillna(0)
testModel = testModel.fillna(0)


# Standardize the values - if we pull in something other than one-hot encoding
scaleValues = 1
if scaleValues == 1:

    scaler = MinMaxScaler(feature_range = (0, 1))

    # Fit the scaler on the training data only and apply the same min/max to the
    # test data. Re-fitting on the test set would map the two sets onto different
    # scales, so the model would see shifted feature values at prediction time.
    if bUseImages == 1:
        trainModel[imageCols] = scaler.fit_transform(trainModel[imageCols])
        testModel[imageCols] = scaler.transform(testModel[imageCols])
    if bUseText == 1:
        trainModel[textCols] = scaler.fit_transform(trainModel[textCols])
        testModel[textCols] = scaler.transform(testModel[textCols])

# Add the deal probability afterward - do not want to scale that
# Some candidate train frames already carry 'deal_probability'; drop it first so the
# target never ends up duplicated in the frame.
trainModel = pd.concat([trainModel.drop('deal_probability', axis=1, errors='ignore'),
                        dfTrain['deal_probability']], axis=1)


# Here we are splitting two types of data sets
# One built for binary classification models, and one without.
# Auto-config the Binary dataset: same features, 'deal_binary' as target instead of
# 'deal_probability' (drop returns a new frame, so trainModel itself is unchanged)
trainModelBinary = pd.concat([trainModel.drop('deal_probability',axis=1), dfTrain[['deal_binary']]],axis=1)
# Test stays the same (same object as testModel, not a copy)
testModelBinary = testModel

# check for a mis-copy here when merging above: a test frame this large means a
# train frame was concatenated into it by mistake
if len(testModel) > 1000000 or len(testModelBinary) > 1000000:
    print('!!! Bad Test Size !!! - Check concat variables')

print('finished')

finished


In [10]:
# Review Data (if need be)
# trainModel.head(5)
# testModel.head(5)

In [11]:
# trainModelBinary.head(5)
# testModelBinary.head(5)

## Generate a baseline - from the mean probability of the groups below
The means are grouped by region, city, category_name and user_type.

In [12]:
def generate_mean_submission():
    """Export a baseline submission built from group means of deal_probability.

    Three sets of training-set means are merged onto dfTest, from the most
    specific grouping to the least specific one. The level-1 value is used
    where available; remaining gaps are filled from level 2 and then level 3.
    The result (item_id, deal_probability) is written with
    output_submission_file under the label 'base'.

    Assumes helper.fn_AddStatToDataframe returns a frame holding the group
    columns plus a 'deal_probability_mean' column (inferred from the renames
    and merges below) - confirm against the helper module.
    """
    # Level 1 - 7,000 missing with this
    level1Group = ['region','city','category_name','user_type']
    level1Join = ['region','city','category_name','user_type']
    level1Means = helper.fn_AddStatToDataframe(dfTrain, level1Group, 'deal_probability', 'mean')
    baseline = pd.merge(dfTest, level1Means, how='left', on=level1Join)

    # Level 2 - 106 missing with this
    level2Group = ['region','category_name','user_type']
    level2Join = ['region','category_name','user_type']
    level2Means = helper.fn_AddStatToDataframe(dfTrain, level2Group, 'deal_probability', 'mean')
    level2Means.rename(columns={'deal_probability_mean':'deal_probability_mean_l2'}, inplace=True)
    baseline = pd.merge(baseline, level2Means, how='left', on=level2Join)

    # Level 3 - 0 missing with this
    level3Group = ['category_name','user_type']
    level3Join = ['category_name','user_type']
    level3Means = helper.fn_AddStatToDataframe(dfTrain, level3Group, 'deal_probability', 'mean')
    level3Means.rename(columns={'deal_probability_mean':'deal_probability_mean_l3'}, inplace=True)
    baseline = pd.merge(baseline, level3Means, how='left', on=level3Join)

    # Fill in for Level2, then for Level3 (the null mask is recomputed for each level)
    for fallbackCol in ('deal_probability_mean_l2', 'deal_probability_mean_l3'):
        stillMissing = baseline['deal_probability_mean'].isnull()
        baseline.loc[stillMissing, 'deal_probability_mean'] = baseline[fallbackCol]

    remainingNulls = baseline['deal_probability_mean'].isnull().sum()
    print('Number of null deal_probability: ' + str(remainingNulls))

    # Keep only the submission columns, under the name the submission format expects
    submission = baseline[['item_id','deal_probability_mean']].rename(
        columns={'deal_probability_mean':'deal_probability'})

    # Export to csv
    output_submission_file(submission, 'base')

# Uncomment the call below to generate the baseline submission file
# generate_mean_submission()

# Functions for Models

In [13]:
def run_test_model(model, binary, df, suppressMessages = False):
    """Fit `model` on a 70/30 hold-out split of `df` and report hold-out error metrics.

    Parameters
    ----------
    model : estimator
        Needs fit/predict when binary is False, fit/predict_proba when binary is True.
    binary : bool
        False - the target column is 'deal_probability' and model.predict is scored.
        True  - the target column is 'deal_binary' and each of the first two
                columns of model.predict_proba is scored against the 0/1 target.
    df : DataFrame
        Feature columns plus the target column named above.
    suppressMessages : bool
        When True nothing is printed. Honoured by both branches (the binary
        branch previously always printed).

    Returns
    -------
    (mse, r2) rounded to 4 decimals. For binary models these are the metrics of
    predict_proba column 1 (the binary branch previously returned None).
    None when `binary` is neither True nor False.
    """
    if binary == False:

        # Split (fixed random_state so every model is scored on the same hold-out rows)
        X_train, X_test, y_train, y_test = train_test_split(df.drop('deal_probability',axis=1),
                                                df['deal_probability'], test_size=0.30,
                                                random_state=101)

        model.fit(X_train,y_train)
        modelPredictions = model.predict(X_test)

        # Error metrics on the hold-out set
        mae = mean_absolute_error(y_test, modelPredictions)
        mse = np.round(mean_squared_error(y_test, modelPredictions), 4)
        r2 = np.round(r2_score(y_test, modelPredictions), 4)
        if suppressMessages == False:
            print("Mean Abs error: %.2f" % mae)
            print("Mean squared error: %.2f" % mse)
            print('Variance (r2) score: %.2f' % r2)

        return mse, r2

    elif binary == True:

        # Split (same settings as the regression branch)
        X_train, X_test, y_train, y_test = train_test_split(df.drop('deal_binary',axis=1),
                                                df['deal_binary'], test_size=0.30,
                                                random_state=101)

        model.fit(X_train,y_train)
        modelPredictions = model.predict_proba(X_test)

        # Column 0 of predict_proba scored against the 0/1 target
        # (presumably the probability of class 0 - kept as a reference figure)
        mse0 = mean_squared_error(y_test, modelPredictions[:,:1])
        var0 = r2_score(y_test, modelPredictions[:,:1])

        # Column 1 of predict_proba - closer to 1
        mse1 = mean_squared_error(y_test, modelPredictions[:,1:2])
        var1 = r2_score(y_test, modelPredictions[:,1:2])

        if suppressMessages == False:
            print("Mean squared error (0): %.2f" % mse0)
            print('Variance (r2) score (0): %.2f' % var0)
            print("Mean squared error (1): %.2f" % mse1)
            print('Variance (r2) score (1): %.2f' % var1)

        return np.round(mse1, 4), np.round(var1, 4)

    return

In [14]:
# Run each of the models one at a time - kind of struggled to see which one would perform the best in this competition
majorRule = '==================='
minorRule = '-------------------'

# Regression models: (heading, estimators scored under that heading)
regressionRuns = [
    ('Linear Regression:', [linear_model.LinearRegression()]),
    ('Ridge Regression:', [Ridge()]),
    ('Lasso Regression:', [Lasso()]),
    ('ElasticNet Regression:', [ElasticNet()]),
    ('Decision Tree Regressor:', [DecisionTreeRegressor(max_depth=10)]),
    ('Random Forest Regressor:', [RandomForestRegressor(),
                                  RandomForestRegressor(max_depth=2, random_state=0)]),
]
for heading, estimators in regressionRuns:
    print(majorRule)
    print(heading)
    for estimator in estimators:
        run_test_model(estimator, False, trainModel)


print('\n')

# Binary classifiers: (separator printed first, heading, estimator)
classifierRuns = [
    (majorRule, 'Logistic Regresion:', LogisticRegression()),
    (majorRule, 'RandomForestClassifier (depth 1):', RandomForestClassifier(max_depth=1)),
    (minorRule, 'RandomForestClassifier (depth 2):', RandomForestClassifier(max_depth=2)),
    (minorRule, 'RandomForestClassifier (depth 3):', RandomForestClassifier(max_depth=3)),
    (majorRule, 'DecisionTreeClassifier:', DecisionTreeClassifier()),
    (majorRule, 'GaussianNB:', GaussianNB()),
]
for rule, heading, estimator in classifierRuns:
    print(rule)
    print(heading)
    run_test_model(estimator, True, trainModelBinary)
print(majorRule)

print('Finsihed Running Models')

Linear Regression:
Mean Abs error: 0.17
Mean squared error: 0.06
Variance (r2) score: 0.14
Ridge Regression:
Mean Abs error: 0.17
Mean squared error: 0.06
Variance (r2) score: 0.14
Lasso Regression:
Mean Abs error: 0.19
Mean squared error: 0.07
Variance (r2) score: -0.00
ElasticNet Regression:
Mean Abs error: 0.19
Mean squared error: 0.07
Variance (r2) score: -0.00
Decision Tree Regressor:
Mean Abs error: 0.17
Mean squared error: 0.06
Variance (r2) score: 0.13
Random Forest Regressor:
Mean Abs error: 0.17
Mean squared error: 0.06
Variance (r2) score: 0.14
Mean Abs error: 0.18
Mean squared error: 0.06
Variance (r2) score: 0.07


Logistic Regresion:
Mean squared error (0): 0.47
Variance (r2) score (0): -1.05
Mean squared error (1): 0.18
Variance (r2) score (1): 0.22
RandomForestClassifier (depth 1):
Mean squared error (0): 0.33
Variance (r2) score (0): -0.45
Mean squared error (1): 0.22
Variance (r2) score (1): 0.05
-------------------
RandomForestClassifier (depth 2):
Mean squared error

In [15]:
# Run a non-binary model
def run_production_model_non_binary(model, model_name, binary, dfTr, dfTe, dfTestInput):
    """Fit a regression model on the full training frame and export test predictions.

    Parameters
    ----------
    model : estimator with fit/predict.
    model_name : str, label embedded in the submission file name.
    binary : unused; kept so existing calls keep working.
    dfTr : DataFrame of features plus the 'deal_probability' target.
    dfTe : DataFrame of test features (same feature columns as dfTr). Not modified.
    dfTestInput : DataFrame providing 'item_id' for the test rows; it is joined
        to the predictions on the index.

    Negative predictions are set to 0 before the file is written.
    """
    # Split
    X_train = dfTr.drop('deal_probability',axis=1)
    y_train = dfTr['deal_probability']
    # Work on a copy: the prediction column is added below, and writing it into the
    # caller's frame would leave 'deal_probability' among the test features for
    # every later run that re-uses that frame.
    X_test = dfTe.copy()

    model.fit(X_train,y_train)
    modelPredictions = model.predict(X_test)

    # Assign Predictions
    X_test['deal_probability'] = modelPredictions

    # Export item_id, deal_probability
    finalOutputDF = pd.concat((dfTestInput[['item_id']], X_test[['deal_probability']]), axis=1)

    # See how many need to be cleaned up
    print('Num records less than 0: ' + str(len(finalOutputDF[finalOutputDF['deal_probability'] < 0])))

    # Clean
    finalOutputDF.loc[finalOutputDF['deal_probability'] < 0, 'deal_probability'] = 0

    # Export to csv
    output_submission_file(finalOutputDF, model_name)

# Run a binary model
def run_production_model_binary(model, model_name, binary, probabilityTowardsOne, dfTr, dfTe, dfTestInput):
    """Fit a classifier on the full training frame and export a predict_proba column.

    Parameters
    ----------
    model : estimator with fit/predict_proba.
    model_name : str, label embedded in the submission file name.
    binary : unused.
    probabilityTowardsOne : bool
        True exports predict_proba column 1, False exports column 0.
    dfTr : DataFrame of features plus the 'deal_binary' target.
    dfTe : DataFrame of test features; copied, so the caller's frame is not modified.
    dfTestInput : DataFrame providing 'item_id' for the test rows; it is joined
        to the predictions on the index.
    """
    # Independent copies of everything handed to the model
    features = dfTr.drop('deal_binary',axis=1).copy()
    labels = dfTr['deal_binary'].copy()
    scored = dfTe.copy()

    # Fit the model
    model.fit(features, labels)

    # Assign Predictions - choose which predict_proba column is exported
    if probabilityTowardsOne == False:
        scored['deal_probability'] = model.predict_proba(scored)[:,:1]
    elif probabilityTowardsOne == True:
        scored['deal_probability'] = model.predict_proba(scored)[:,1:2]

    # Export item_id, deal_probability
    finalOutputDF = pd.concat((dfTestInput[['item_id']], scored[['deal_probability']]), axis=1)

    # See how many need to be cleaned up
    belowZero = finalOutputDF['deal_probability'] < 0
    print('Num records less than 0: ' + str(belowZero.sum()))

    # Clean
    finalOutputDF.loc[belowZero, 'deal_probability'] = 0

    # Export to csv
    output_submission_file(finalOutputDF, model_name)

In [16]:
# Run different variations of the model
# (one call active at a time; each call writes its own time-stamped submission file)

#run_production_model_non_binary(linear_model.LinearRegression(), 'linReg', False, trainModel, testModel, dfTest)

# The label now matches the estimator: the file was previously tagged 'DecTreeReg'
# although a random forest is being fitted.
run_production_model_non_binary(RandomForestRegressor(), 'RandForestReg', False, trainModel, testModel, dfTest)

#run_production_model_binary(model=LogisticRegression(), model_name='LogReg',\
#                            binary=True, probabilityTowardsOne=True, \
#                            dfTr=trainModelBinary, dfTe=testModelBinary, dfTestInput=dfTest)

Num records less than 0: 0
