In [4]:
import numpy as np
import scipy as sp
import pandas as pd

In [5]:
import statsmodels.api as sm

In [6]:
import math
import models

In [7]:
# import csv data

sales_df = pd.read_csv(r'..\..\data\UCBerkeley_Sales_data.csv')
services_df = pd.read_csv(r'..\..\data\UCBerkeley_After_Sales_data.csv')
survey_df = pd.read_csv(r'..\..\data\UCBerkeley_Survey_Data.csv')

In [8]:
# remove any duplicates in the data (does not consider date because some obvious duplicates have slightly different dates)

sales_df = sales_df.drop_duplicates(subset=['CUST_ID','HH_ID','MOD_YEAR', 'MOD_DESC', 'TOT_MSRP', 'TRANS_TYPE', 'CONTRACT_TYPE'])

KeyError: Index(['HH_ID'], dtype='object')

In [6]:
# generate a separate dataframe for each customer

sales_dfs = [x for _, x in sales_df.groupby(sales_df['CUST_ID'])]

In [7]:
# Calculate how many customers have purchased x cars from the start to end dates of data,
# where x is the index of the list "count"

count = [0]*39
for customer in sales_dfs:
    num_trans = len(customer)
    count[num_trans] += 1
for index in reversed(range(0,len(count)-1)):
    count[index] += count[index+1]
count[0] = "NA"
print(count)

['NA', 5000, 1814, 861, 466, 250, 142, 81, 59, 40, 30, 17, 13, 12, 9, 6, 5, 5, 5, 5, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


In [8]:
# testing

sales_dfs[4]

Unnamed: 0,CUST_ID,HH_ID,DATE,MOD_YEAR,MOD_CLASS,MOD_DESC,TOT_MSRP,CUST_VEH_ROLE,TRANS_TYPE,CONTRACT_TYPE
2420,US1000001643104169,5900000004212930,6/17/2017,2015,GL,GL350BTC,78875.0,Owner,Pre-owned,Retail


In [9]:
# utility functions to convert date strings to encoded (comparable) int

def encode_month(date_str):
    date = date_str.split('/')
    return int(date[0]) + int(date[2])*12
def encode_date(date_str):
    date = date_str.split('/')
    return (int(date[0]) - 1)*31 + (int(date[1]) - 1) + int(date[2])*372

In [10]:
# utility functions for calculating customer features

def purchase_freq(customer_history, start, end):
    num_trans = 0
    for date in customer_history['DATE'].values:
        encoded = encode_date(date)
        if encoded >= start and encoded < end:
            num_trans += 1
    return num_trans

def max_purchase(customer_history, start, end):
    max_value = 0
    for index in range(len(customer_history.values)):
        date = customer_history['DATE'].values[index]
        encoded = encode_date(date)
        if encoded >= start and encoded < end:
            max_value = max(customer_history['TOT_MSRP'].values[index], max_value)
    return max_value

def min_purchase(customer_history, start, end):
    min_value = float('inf')
    for index in range(len(customer_history.values)):
        date = customer_history['DATE'].values[index]
        encoded = encode_date(date)
        if encoded >= start and encoded < end:
            min_value = min(customer_history['TOT_MSRP'].values[index], min_value)
    return min_value

def total_revenue(customer_history, start, end):
    total = 0
    for index in range(len(customer_history.values)):
        date = customer_history['DATE'].values[index]
        encoded = encode_date(date)
        if encoded >= start and encoded < end:
            total += customer_history['TOT_MSRP'].values[index]
    return total

# Need to confirm this is correct
def model_purchase_gap(customer_history, start, end):
    release_month = 6
    for index in range(len(customer_history.values)):
        sum_of_diff = 0
        date = customer_history['DATE'].values[index]
        encoded = encode_date(date)
        if encoded >= start and encoded < end:
            purchase_month = encode_month(date)
            model_year = customer_history['MOD_YEAR'].values[index]
            model_month = (model_year - 1) * 12 + release_month
            sum_of_diff += (purchase_month - model_month)
    return sum_of_diff
    
def distinct_classes(customer_history, start, end):
    classes = set()
    for index in range(len(customer_history.values)):
        date = customer_history['DATE'].values[index]
        encoded = encode_date(date)
        if encoded >= start and encoded < end:
            classLetter = customer_history['MOD_CLASS'].values[index].strip()
            classes.add(classLetter)
    return len(classes)

def retail_purchases(customer_history, start, end):
    num_retail = 0
    for index in range(len(customer_history.values)):
        date = customer_history['DATE'].values[index]
        encoded = encode_date(date)
        if encoded >= start and encoded < end:
            contract = customer_history['CONTRACT_TYPE'].values[index]
            if contract == "Retail": num_retail += 1
    return num_retail

def purchase_gaps(customer_history, start, end):
    first_to_last = 0
    for index in range(len(customer_history.values)):
        date = customer_history['DATE'].values[index]
        encoded = encode_date(date)
        if encoded >= start and encoded < end:
            contract = customer_history['CONTRACT_TYPE'].values[index]
            if contract == "Retail": num_retail += 1
    return num_retail

def purchase_indicator(customer_history, start, end):
    for index in range(len(customer_history.values)):
        date = customer_history['DATE'].values[index]
        encoded = encode_date(date)
        if encoded >= start and encoded < end:
            return True
    return False

In [11]:
# same counter as before, except smaller date range to account for lack of 2003, 2004 data

x_1 = encode_date("10/23/2005")
x_2 = encode_date("10/23/2018")
count_x = [0]*39

for customer in sales_dfs:
    freq = purchase_freq(customer, x_1, x_2)
    count_x[freq] += 1
for index in reversed(range(0,len(count)-1)):
    count_x[index] += count_x[index+1]
count_x[0] = "NA"
print(count_x)


['NA', 4874, 1751, 835, 445, 236, 129, 77, 56, 36, 25, 17, 12, 11, 8, 6, 5, 5, 4, 4, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


In [24]:
#Generates list of customers that purchased at least one car in 2 disjoint (or overlapping) date ranges

x_1 = encode_date("10/23/2005")
x_2 = encode_date("10/23/2011")
count_x = [0]*39
subset = []
for customer in sales_dfs:
    freq = purchase_freq(customer, x_1, x_2)
    if freq != 0:
        subset.append(customer['CUST_ID'].values[0])
    count_x[freq] += 1
for index in reversed(range(0,len(count)-1)):
    count_x[index] += count_x[index+1]
count_x[0] = "NA"
print(count_x)
print(len(subset))

x_1 = encode_date("10/23/2005")
x_2 = encode_date("10/23/2011")
count_x = [0]*39
subset2 = []
for customer in sales_dfs:
    freq = purchase_freq(customer, x_1, x_2)
    if freq != 0:
        subset2.append(customer['CUST_ID'].values[0])
    count_x[freq] += 1
for index in reversed(range(0,len(count)-1)):
    count_x[index] += count_x[index+1]
count_x[0] = "NA"
print(count_x)
print(len(subset2))
def intersection(lst1, lst2): 
    lst3 = [value for value in lst1 if value in lst2] 
    return lst3
subset = intersection(subset,subset2)


['NA', 1915, 459, 145, 50, 18, 8, 5, 4, 4, 2, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
1915
['NA', 1915, 459, 145, 50, 18, 8, 5, 4, 4, 2, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
1915


In [25]:
# Collecting customer metrics from start to end date
# Calculating revenue from rev_start to rev_end date

customer_summary = dict()
start    = encode_date("10/23/2008")
end      = encode_date("10/23/2013")
resp_start    = encode_date("10/23/2013")
resp_end      = encode_date("10/23/2018")
interval = 9

customer = sales_dfs[0]
customer = Customer(customer, "NOT IMPLEMENTED", "NOT IMPLEMENTED", 
                   start, end, resp_start, resp_end)
print(customer.summary)

for customer in sales_dfs:
    customer_ID = customer['CUST_ID'].values[0]
    if customer_ID in subset:
        summary = []
        total_purchases = purchase_freq(customer, start, end)
        summary.append(
            total_purchases / interval
        )
        if purchase_freq(customer, start, end) == 0: 
            customer_summary[customer_ID] = 0
            continue
        summary.append(
            max_purchase(customer, start, end)
        )
        summary.append(
            min_purchase(customer, start, end)
        )
        summary.append(
            model_purchase_gap(customer, start, end) / total_purchases
        )
        summary.append(
            distinct_classes(customer, start, end) / total_purchases
        )
        summary.append(
            retail_purchases(customer, start, end) / total_purchases
        )
        revenue = total_revenue(customer, resp_start, resp_end)
        purchase = purchase_indicator(customer, resp_start, resp_end)
        customer_summary[customer_ID] = (summary, purchase)
    

In [26]:
# Generates a data frame consisting of all parameters (independent and dependent)

customer_summary_df = []
for summary in customer_summary.values():
    if summary != 0 and  not math.isnan(summary[1]):
        customer_summary_df.append(summary[0]+[summary[1]])

X = pd.DataFrame(np.array(customer_summary_df), columns = ['freq','max','min','enthu', 'classes','retail',  'purchase'])
X

Unnamed: 0,freq,max,min,enthu,classes,retail,purchase
0,0.111111,34855.0,34855.0,55.000000,1.000000,1.000000,0.0
1,0.111111,51440.0,51440.0,39.000000,1.000000,1.000000,0.0
2,0.111111,93270.0,93270.0,16.000000,1.000000,0.000000,0.0
3,0.111111,57675.0,57675.0,12.000000,1.000000,1.000000,0.0
4,0.222222,105485.0,71295.0,0.000000,1.000000,0.000000,1.0
5,0.222222,64100.0,38010.0,2.000000,1.000000,0.000000,1.0
6,0.111111,56525.0,56525.0,4.000000,1.000000,0.000000,0.0
7,0.111111,56040.0,56040.0,5.000000,1.000000,1.000000,1.0
8,0.111111,65860.0,65860.0,21.000000,1.000000,1.000000,0.0
9,0.111111,53415.0,53415.0,0.000000,1.000000,1.000000,1.0


In [27]:
# runs regression with specified dependent variables

est = sm.OLS(X['purchase'], X[['freq','enthu', 'classes']]).fit()

est.summary()

0,1,2,3
Dep. Variable:,purchase,R-squared:,0.546
Model:,OLS,Adj. R-squared:,0.545
Method:,Least Squares,F-statistic:,513.5
Date:,"Wed, 31 Oct 2018",Prob (F-statistic):,4.71e-219
Time:,18:53:52,Log-Likelihood:,-806.97
No. Observations:,1284,AIC:,1620.0
Df Residuals:,1281,BIC:,1635.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
freq,1.3656,0.098,13.906,0.000,1.173,1.558
enthu,-0.0083,0.001,-13.810,0.000,-0.010,-0.007
classes,0.3514,0.024,14.925,0.000,0.305,0.398

0,1,2,3
Omnibus:,532.935,Durbin-Watson:,1.87
Prob(Omnibus):,0.0,Jarque-Bera (JB):,66.858
Skew:,0.045,Prob(JB):,3.03e-15
Kurtosis:,1.886,Cond. No.,198.0
