#### Importing Library

In [1]:
import pandas as pd
import numpy as np

  from .tslib import iNaT, NaT, Timestamp, Timedelta, OutOfBoundsDatetime
  from pandas._libs import (hashtable as _hashtable,
  from pandas._libs import algos, lib
  from pandas._libs import hashing, tslib
  from pandas._libs import (lib, index as libindex, tslib as libts,
  import pandas._libs.tslibs.offsets as liboffsets
  from pandas._libs import algos as libalgos, ops as libops
  from pandas._libs.interval import (
  from pandas._libs import internals as libinternals
  import pandas._libs.sparse as splib
  import pandas._libs.window as _window
  from pandas._libs import (lib, reduction,
  from pandas._libs import algos as _algos, reshape as _reshape
  import pandas._libs.parsers as parsers
  from pandas._libs import algos, lib, writers as libwriters


#### Reading data set

In [2]:
data = pd.read_csv('data_science_challenge_samp_18.csv',parse_dates=[1],keep_date_col = True)

In [3]:
data.columns

Index([u'cust_id', u'order_date', u'lane_number', u'total_spend',
       u'units_purchased'],
      dtype='object')

In [4]:
print(data.shape)
data = data.drop_duplicates()
print(data.shape)

(323437, 5)
(323021, 5)


In [5]:
data.dtypes

cust_id                     int64
order_date         datetime64[ns]
lane_number                 int64
total_spend                object
units_purchased             int64
dtype: object

#### Customers total number of visits to the store

In [6]:
cust_count = data.cust_id.value_counts().reset_index()
cust_count.columns = ['cust_id','cust_count']

In [7]:
cust_count.head(2)

Unnamed: 0,cust_id,cust_count
0,5183161534,543
1,4302370736,535


In [8]:
data.order_date.min(),data.order_date.max()

(Timestamp('2015-03-20 00:00:00'), Timestamp('2016-03-27 00:00:00'))

In [9]:
data.head()

Unnamed: 0,cust_id,order_date,lane_number,total_spend,units_purchased
0,4239597436,2015-03-20,3,19.84,5
1,4913827536,2015-03-20,3,11.1,5
2,4913827536,2015-03-20,1,10.0,1
3,4271866537,2015-03-20,5,13.68,2
4,5316845735,2015-03-20,4,5.29,1


#### Generating date range between 03-20-2015 to (2016-03-27 + 8 days = 2016-04-03)

In [10]:
from datetime import date, timedelta
d2 = date(2016, 3, 27)  # start date
d1 = date(2015, 3, 20)  # end date
delta = d2 - d1         # timedelta
all_dates = []
for i in range(delta.days + 8):
    all_dates.append(d1 + timedelta(i))

In [11]:
all_cust_id = data.cust_id.unique()

#### Generating all dates entry for each customer

In [12]:
data_cus_id = []
data_dates = []
for c_id in all_cust_id:
    for d in all_dates:
        data_cus_id.append(c_id)
        data_dates.append(d)

#### checking customer id with each date entry through an year and length should be same.

In [13]:
len(data_cus_id), len(data_dates)

(4758309, 4758309)

#### Creating dataframe for new training set

In [14]:
full_data = pd.DataFrame({'cust_id':data_cus_id,'order_date':data_dates})

##### Sanity Check

In [15]:
full_data.cust_id.value_counts().unique()

array([381])

In [16]:
full_data.dtypes

cust_id        int64
order_date    object
dtype: object

#### Converting full_data dataframe to datetime format for order_date column

In [17]:
full_data['order_date'] = pd.to_datetime(full_data.order_date,format= '%Y-%m-%d')

In [18]:
full_data.order_date.min(),full_data.order_date.max()

(Timestamp('2015-03-20 00:00:00'), Timestamp('2016-04-03 00:00:00'))

#### Generating training and test data set

In [19]:
train = full_data.loc[full_data.order_date <= pd.to_datetime('2016-03-27',format ='%Y-%m-%d' )]
test = full_data.loc[full_data.order_date > pd.to_datetime('2016-03-27',format ='%Y-%m-%d' )]

In [20]:
train.cust_id.value_counts().unique(),test.cust_id.value_counts().unique()

(array([374]), array([7]))

In [21]:
test.shape,train.shape,full_data.shape

((87423, 2), (4670886, 2), (4758309, 2))

In [22]:
import gc
del full_data
gc.collect()

56

#### took only first order_date of a customer

In [23]:
print(data.shape)
data = data.groupby(['cust_id','order_date']).head(1)
print (data.shape)

(323021, 5)
(283838, 5)


In [24]:
train = train.merge(data, on=['cust_id','order_date'],how='left')

In [25]:
train['target']=0
train.loc[pd.notnull(train.units_purchased ),'target']=1
train = train.drop(['units_purchased','lane_number','total_spend'],axis=1)

In [26]:
# number of one should be same as the number of data observations and it can be verified as below
train.target.sum() == data.shape[0]

True

In [27]:
train.cust_id.value_counts().unique()

array([374])

In [28]:
# create some features based on date
train['month'] = train.order_date.dt.month
train['weekofyear'] = train.order_date.dt.weekofyear
train['is_month_start'] = train.order_date.dt.is_month_start
train['is_month_end'] = train.order_date.dt.is_month_end
train['day'] = train.order_date.dt.day
train['dayofweek'] = train.order_date.dt.dayofweek

test['month'] = test.order_date.dt.month
test['weekofyear'] = test.order_date.dt.weekofyear
test['is_month_start'] = test.order_date.dt.is_month_start
test['is_month_end'] = test.order_date.dt.is_month_end
test['day'] = test.order_date.dt.day
test['dayofweek'] = test.order_date.dt.dayofweek

customers visited the store more than once in a day but I took only the first visit of the day(we could use the aggregrated values)

In [29]:
train.columns

Index([u'cust_id', u'order_date', u'target', u'month', u'weekofyear',
       u'is_month_start', u'is_month_end', u'day', u'dayofweek'],
      dtype='object')

In [30]:
data['month'] = data.order_date.dt.month
data['weekofyear'] = data.order_date.dt.weekofyear
data['is_month_start'] = data.order_date.dt.is_month_start
data['is_month_end'] = data.order_date.dt.is_month_end
data['day'] = data.order_date.dt.day
data['dayofweek'] = data.order_date.dt.day


In [31]:
data['total_spend'] = data.total_spend.str.replace(',', '')
data['total_spend']=data.total_spend.astype(float)

In [32]:
# in next few llines stats based features are created

In [33]:
k1 = data[['cust_id','total_spend']].groupby('cust_id').agg('mean').reset_index()
k1.columns = ['cust_id','cust_id_total_send_mean']
train = train.merge(k1,on='cust_id',how='left')
test = test.merge(k1,on='cust_id',how='left')

In [34]:
k1.head()

Unnamed: 0,cust_id,cust_id_total_send_mean
0,4005940437,10.096692
1,4005940737,9.021429
2,4005941333,29.45
3,4005941434,7.773415
4,4005941533,28.092174


In [35]:
import matplotlib.pyplot as plt
k1.plot(x='cust_id', y='cust_id_total_send_mean', kind='bar') 
plt.show()

<matplotlib.figure.Figure at 0x7f9876557d90>

In [36]:
#k1 = data[['cust_id','total_spend']].groupby('cust_id').agg('mean').reset_index()
#k1.columns = ['cust_id','cust_id_total_send_mean']
#train = train.merge(k1,on='cust_id',how='left')
#test = test.merge(k1,on='cust_id',how='left')

#k2 = data[['cust_id','total_spend']].groupby('cust_id').agg('median').reset_index()
#k2.columns = ['cust_id','cust_id_total_send_median']
#train = train.merge(k2,on='cust_id',how='left')
#test = test.merge(k2,on='cust_id',how='left')

#k3 = data[['cust_id','total_spend']].groupby('cust_id').agg('std').reset_index()
#k3.columns = ['cust_id','cust_id_total_send_std']
#train = train.merge(k3,on='cust_id',how='left')
#test = test.merge(k3,on='cust_id',how='left')

In [37]:
#k1 = data[['cust_id','units_purchased']].groupby('cust_id').agg('mean').reset_index()
#k1.columns = ['cust_id','cust_id_units_purchased_mean']
#train = train.merge(k1,on='cust_id',how='left')
#test = test.merge(k1,on='cust_id',how='left')

#k2 = data[['cust_id','units_purchased']].groupby('cust_id').agg('median').reset_index()
#k2.columns = ['cust_id','cust_id_units_purchased_median']
#train = train.merge(k2,on='cust_id',how='left')
#test = test.merge(k2,on='cust_id',how='left')

#k3 = data[['cust_id','units_purchased']].groupby('cust_id').agg('std').reset_index()
#k3.columns = ['cust_id','cust_id_units_purchased_std']
#train = train.merge(k3,on='cust_id',how='left')
#test = test.merge(k3,on='cust_id',how='left')

#### Feature Engineering
Selecting features considering units_purchased

In [38]:
k1 = data[['cust_id','month','units_purchased']].groupby(['cust_id','month']).agg('mean').reset_index()
k1.columns = ['cust_id','month','cust_id_month_units_purchased_mean']
train = train.merge(k1,on=['cust_id','month'],how='left')
test = test.merge(k1,on=['cust_id','month'],how='left')

k2 = data[['cust_id','month','units_purchased']].groupby(['cust_id','month']).agg('median').reset_index()
k2.columns = ['cust_id','month','cust_id_month_units_purchased_median']
train = train.merge(k2,on=['cust_id','month'],how='left')
test = test.merge(k2,on=['cust_id','month'],how='left')

k3 = data[['cust_id','month','units_purchased']].groupby(['cust_id','month']).agg('std').reset_index()
k3.columns = ['cust_id','month','cust_id_month_units_purchased_std']
train = train.merge(k3,on=['cust_id','month'],how='left')
test = test.merge(k3,on=['cust_id','month'],how='left')

### Customers with their monthly units purchased expenditure mean 

In [39]:
k1.head()

Unnamed: 0,cust_id,month,cust_id_month_units_purchased_mean
0,4005940437,1,2.85
1,4005940437,2,2.772727
2,4005940437,3,4.321429
3,4005940437,4,2.181818
4,4005940437,10,2.333333


Selecting features considering total_spent

In [40]:
k1 = data[['cust_id','month','total_spend']].groupby(['cust_id','month']).agg('mean').reset_index()
k1.columns = ['cust_id','month','cust_id_month_total_spend_mean']
train = train.merge(k1,on=['cust_id','month'],how='left')
test = test.merge(k1,on=['cust_id','month'],how='left')

k2 = data[['cust_id','month','total_spend']].groupby(['cust_id','month']).agg('median').reset_index()
k2.columns = ['cust_id','month','cust_id_month_total_spend_median']
train = train.merge(k2,on=['cust_id','month'],how='left')
test = test.merge(k2,on=['cust_id','month'],how='left')

k3 = data[['cust_id','dayofweek','total_spend']].groupby(['cust_id','dayofweek']).agg('std').reset_index()
k3.columns = ['cust_id','dayofweek','cust_id_dayofweek_total_spend_std']
train = train.merge(k3,on=['cust_id','dayofweek'],how='left')
test = test.merge(k3,on=['cust_id','dayofweek'],how='left')

### Customers with their monthly total spend mean 

In [41]:
k1.head()

Unnamed: 0,cust_id,month,cust_id_month_total_spend_mean
0,4005940437,1,8.8835
1,4005940437,2,16.484091
2,4005940437,3,8.367143
3,4005940437,4,7.41
4,4005940437,10,6.876667


Extracting customers going to different lanes and spending on units purchases. Test and training data set created for the same

In [42]:
k1 = data[['cust_id','lane_number','total_spend']].groupby(['cust_id','lane_number']).agg('mean').reset_index()
k1 = k1.pivot(index='cust_id', columns='lane_number', values='total_spend').reset_index().replace(np.nan,0)
train = train.merge(k1,on=['cust_id'],how='left')
test = test.merge(k1,on=['cust_id'],how='left')

#### Customers visit to different lanes. Store owner can target which customers go to which lane very frequently so that they can offer some deal or promotional offers on the products which are in that lane

In [43]:
k1.head()

lane_number,cust_id,1,2,3,4,5,6,7,8,9,10,11,15,17
0,4005940437,26.138571,8.970088,11.441667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,4005940737,0.0,9.53,5.97,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,4005941333,47.821667,25.25625,20.362,38.1,0.0,0.0,0.0,76.15,0.0,0.0,0.0,0.0,0.0
3,4005941434,3.0,6.005,15.872667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4005941533,33.64,21.349444,14.127,0.0,0.0,0.0,59.119,0.0,0.0,0.0,0.0,0.0,0.0


Extracting customers' total_spend on monthly basis. This analysis shows nature of customer's shopping on monthly basis. Test and training data set created for the same

In [44]:
data['year'] = data.order_date.dt.year

In [45]:
k1 = data[['cust_id','month','total_spend']].groupby(['cust_id','month']).agg('mean').reset_index()
k1 = k1.pivot(index='cust_id', columns='month', values='total_spend').reset_index().replace(np.nan,0)
k1 = k1[['cust_id', 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]]
k1.columns = ['cust_id', 'm0', 'm1', 'm2', 'm3', 'm4', 'm5', 'm6','m7', 'm8', 'm9', 'm10', 'm11']
train = train.merge(k1,on=['cust_id'],how='left')
test = test.merge(k1,on=['cust_id'],how='left')

Customers total spend mean on monthly basis. It is required to know which customers shop the most on which month

In [46]:
k1.head()

Unnamed: 0,cust_id,m0,m1,m2,m3,m4,m5,m6,m7,m8,m9,m10,m11
0,4005940437,8.8835,16.484091,8.367143,7.41,0.0,0.0,0.0,0.0,0.0,6.876667,11.532632,8.814762
1,4005940737,0.0,8.87,2.89,0.0,0.0,6.99,27.67,4.98,0.0,6.77,0.0,0.0
2,4005941333,30.038,5.832,15.65125,35.025,62.855,30.698333,36.56,37.786667,25.0475,40.795,61.336667,25.955
3,4005941434,3.851429,23.1325,9.701111,5.164,3.3,7.835714,5.8125,8.608,3.911429,6.381,5.85,6.406667
4,4005941533,76.6675,32.33,24.978182,7.375,36.091667,17.352,14.347778,32.46,24.063333,58.685,24.152857,27.966


#### Customer's total spend on yearly basis. ( Note - We have 3 month of data in 2016 year)

In [47]:
data[['cust_id','year','total_spend']].groupby(['cust_id','year']).agg('mean').reset_index()

Unnamed: 0,cust_id,year,total_spend
0,4005940437,2015,9.363662
1,4005940437,2016,10.936129
2,4005940737,2015,10.278000
3,4005940737,2016,5.880000
4,4005941333,2015,37.803333
5,4005941333,2016,16.920000
6,4005941434,2015,5.722167
7,4005941434,2016,13.367727
8,4005941533,2015,24.554545
9,4005941533,2016,41.990000


In [48]:
k1 = data[['cust_id','month','total_spend']].groupby(['cust_id','month']).agg('mean').reset_index()
k1 = k1.pivot(index='cust_id', columns='month', values='total_spend').reset_index().replace(np.nan,0)
k1 = k1[['cust_id', 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]]
k1.columns = ['cust_id', 'm0', 'm1', 'm2', 'm3', 'm4', 'm5', 'm6','m7', 'm8', 'm9', 'm10', 'm11']
train = train.merge(k1,on=['cust_id'],how='left')
test = test.merge(k1,on=['cust_id'],how='left')

In [49]:
k1 = data[['cust_id','month','total_spend']].groupby(['cust_id','month']).agg('count').reset_index()
k1.columns = ['cust_id','month','cust_id_month_count']
train = train.merge(k1,on=['cust_id','month'],how='left')
test = test.merge(k1,on=['cust_id','month'],how='left')

#### Customer's monthly visit count to the store.

In [51]:
k1.head(10)

Unnamed: 0,cust_id,month,cust_id_month_count
0,4005940437,1,20
1,4005940437,2,22
2,4005940437,3,28
3,4005940437,4,11
4,4005940437,10,12
5,4005940437,11,19
6,4005940437,12,21
7,4005940737,2,1
8,4005940737,3,1
9,4005940737,6,1


In [52]:
k1 = data[['cust_id','total_spend']].groupby(['cust_id']).agg('count').reset_index()
k1.columns = ['cust_id','cust_id_count']
train = train.merge(k1,on=['cust_id'],how='left')
test = test.merge(k1,on=['cust_id'],how='left')

#### Each customer's total spend sum throughout the tenure

In [56]:
k1.head()

Unnamed: 0,cust_id,cust_id_count
0,4005940437,133
1,4005940737,7
2,4005941333,45
3,4005941434,82
4,4005941533,69


Below Analysis shows how active the customers were during the first week of month. People get paid on first week so they should shop more too

In [57]:
k1 = data[['cust_id','is_month_start','total_spend']].groupby(['cust_id','is_month_start']).agg('mean').reset_index()
k1 = k1.pivot(index='cust_id', columns='is_month_start', values='total_spend').reset_index().replace(np.nan,0)
k1.columns = ['cust_id', 'is_month_start0', 'is_month_start1']
train = train.merge(k1,on=['cust_id'],how='left')
test = test.merge(k1,on=['cust_id'],how='left')

#### Customer's total spend summary history  if month is started i.e. is_month_start1 and when the month is not started i.e. is_month_start0

In [61]:
k1.head()

Unnamed: 0,cust_id,is_month_start0,is_month_start1
0,4005940437,10.144331,9.088333
1,4005940737,9.021429,0.0
2,4005941333,29.044419,38.17
3,4005941434,7.838642,2.49
4,4005941533,27.858382,43.99


In [62]:
k1 = data[['cust_id','is_month_end','total_spend']].groupby(['cust_id','is_month_end']).agg('mean').reset_index()
k1 = k1.pivot(index='cust_id', columns='is_month_end', values='total_spend').reset_index().replace(np.nan,0)
k1.columns = ['cust_id', 'is_month_end0', 'is_month_end1']
train = train.replace(np.nan,-1)
test = test.replace(np.nan,-1)
train = train.merge(k1,on=['cust_id'],how='left')
test = test.merge(k1,on=['cust_id'],how='left')

#### Customer's total spend summary history  if month is ended i.e. is_month_start1 and when the month is not ended i.e. is_month_start0

In [64]:
k1.head()

Unnamed: 0,cust_id,is_month_end0,is_month_end1
0,4005940437,10.151395,8.3325
1,4005940737,9.021429,0.0
2,4005941333,27.783571,52.78
3,4005941434,7.781975,7.08
4,4005941533,28.488971,1.11


#### Above two cells analysis show that how much the customer active during month start, month end, month month not started, month not ended

In [65]:
# columns common 
[i for i in train.columns if i not in test.columns],[i for i in test.columns if i not in train.columns]

(['target'], [])

#### Applying Randome Forest Classifier

In [66]:
from sklearn.ensemble import RandomForestClassifier

  from .murmurhash import murmurhash3_32
  from .murmurhash import murmurhash3_32
  from ._logistic_sigmoid import _log_logistic_sigmoid
  from ._logistic_sigmoid import _log_logistic_sigmoid
  from .sparsefuncs_fast import csr_row_norms
  from .sparsefuncs_fast import csr_row_norms
  from ._random import sample_without_replacement
  from ._random import sample_without_replacement
  from .expected_mutual_info_fast import expected_mutual_information
  from .expected_mutual_info_fast import expected_mutual_information
  from .pairwise_fast import _chi2_kernel_fast, _sparse_manhattan
  from .pairwise_fast import _chi2_kernel_fast, _sparse_manhattan
  from .ball_tree import BallTree
  from .ball_tree import BallTree
  from .kd_tree import KDTree
  from .kd_tree import KDTree
  from ._criterion import Criterion
  from ._criterion import Criterion
  from ._splitter import Splitter
  from ._splitter import Splitter
  from ._tree import DepthFirstTreeBuilder
  from ._tree import DepthFirstTree

In [74]:
train.head()

Unnamed: 0,cust_id,order_date,target,month,weekofyear,is_month_start,is_month_end,day,dayofweek,cust_id_total_send_mean,...,m8_y,m9_y,m10_y,m11_y,cust_id_month_count,cust_id_count,is_month_start0,is_month_start1,is_month_end0,is_month_end1
0,4239597436,2015-03-20,1,3,12,False,False,20,4,10.897333,...,19.556,12.776667,12.01,12.124,5.0,60,11.029649,8.383333,10.881579,11.196667
1,4239597436,2015-03-21,1,3,12,False,False,21,5,10.897333,...,19.556,12.776667,12.01,12.124,5.0,60,11.029649,8.383333,10.881579,11.196667
2,4239597436,2015-03-22,0,3,12,False,False,22,6,10.897333,...,19.556,12.776667,12.01,12.124,5.0,60,11.029649,8.383333,10.881579,11.196667
3,4239597436,2015-03-23,0,3,13,False,False,23,0,10.897333,...,19.556,12.776667,12.01,12.124,5.0,60,11.029649,8.383333,10.881579,11.196667
4,4239597436,2015-03-24,0,3,13,False,False,24,1,10.897333,...,19.556,12.776667,12.01,12.124,5.0,60,11.029649,8.383333,10.881579,11.196667


In [79]:
# script for validation
from sklearn.metrics import roc_auc_score
def validation(train,test):
    y = train.target.values
    train = train.drop(['target','order_date'],axis=1)
    test = test.drop(['order_date'],axis=1)
    clf = RandomForestClassifier(max_depth=5,criterion='gini',n_estimators=100,n_jobs=-1)
    clf.fit(train,y)
    return clf.predict_proba(test)[:,1]

In [80]:
#0.5836903334386723
k = train.loc[train.order_date > pd.to_datetime('2016-03-20',format= '%Y-%m-%d'),'order_date'].values
t = len(np.unique(k))
print (t)
met =[]
for i in range(0,t):
    print(i)
    o = pd.to_datetime('2016-03-20',format= '%Y-%m-%d') + pd.DateOffset(i)
    ind1 = train.loc[train.order_date <= o].index
    ind2 = train.loc[train.order_date > o].index
    X_train = train.iloc[ind1]
    X_test = train.iloc[ind2]
    test_y = X_test.target.values
    X_test = X_test.drop('target',axis=1)
    print (X_train.shape,X_test.shape)
    pred_test_y = validation(X_train.copy(),X_test.copy())
    e = roc_auc_score(test_y, pred_test_y)
    print (e)
    met.append(e)
print ("mean",np.mean(met))
test.to_csv('7_day_op.csv',index = False)

7
0
((4583463, 59), (87423, 58))
0.5828653901416068
1
((4595952, 59), (74934, 58))
0.5810778253176773
2
((4608441, 59), (62445, 58))
0.581128019746006
3
((4620930, 59), (49956, 58))
0.5821004092992454
4
((4633419, 59), (37467, 58))
0.577798605207125
5
((4645908, 59), (24978, 58))
0.5735976564918094
6
((4658397, 59), (12489, 58))
0.5823846313647605
('mean', 0.5801360767954614)


### ROC score is 0.58

In [81]:
met

[0.5828653901416068,
 0.5810778253176773,
 0.581128019746006,
 0.5821004092992454,
 0.577798605207125,
 0.5735976564918094,
 0.5823846313647605]

In [86]:
clf = RandomForestClassifier(max_depth=4,criterion='gini',n_estimators=350,n_jobs=-1)
y = train.target.values
train1 = train.drop('target', axis = 1)
train1 = train1.drop('order_date',axis=1)
#test = test.drop('order_date',axis=1)
clf.fit(train1,y)
test1 = test.drop(['order_date'],axis=1)
predictions = clf.predict(test1)

In [101]:
probs = clf.predict_proba(test1)[:,1]

In [106]:
probs

array([0.14194216, 0.15401904, 0.15401904, ..., 0.00074873, 0.00074873,
       0.00074873])

### Apply Logistic Regression

In [133]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
logreg = LogisticRegression()
y = train.target.values
logreg.fit(train1, y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [134]:
from sklearn.linear_model import LogisticRegression

In [135]:
from sklearn import model_selection
from sklearn.model_selection import cross_val_score
kfold = model_selection.KFold(n_splits=5, random_state=7)
modelCV = LogisticRegression()
scoring = 'accuracy'
results = model_selection.cross_val_score(modelCV, train1, y, cv=kfold, scoring=scoring)
print("10-fold cross validation average accuracy: %.3f" % (results.mean()))

10-fold cross validation average accuracy: 0.939


### Logistic Regression achieved 93% accuracy