In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pprint import pprint
from sklearn import model_selection
from sklearn.linear_model import LinearRegression

In [None]:
def _payer_stats(dataset):
    """Aggregate per-customer invoice statistics (one row per customerID)."""
    grouped = dataset.groupby('customerID', as_index=False)
    # Only invoices that were paid late (DaysLate > 0) feed the "delayed" stats.
    late = dataset[dataset.DaysLate > 0].groupby('customerID', as_index=False)

    def summarise(groups, column, how, name):
        # Single aggregate per customer, returned as a (customerID, <name>) frame.
        result = groups.agg({column: how})
        result.columns = ['customerID', name]
        return result

    invoice_count = summarise(grouped, 'invoiceNumber', 'count', 'total')
    delayed_invoice_count = summarise(late, 'invoiceNumber', 'count', 'delayed')
    delayed_days_avg = summarise(late, 'DaysLate', 'mean', 'avgDaysDelayed')
    settled_days_avg = summarise(grouped, 'DaysToSettle', 'mean', 'avgDaysToSettle')

    # Customers without any late invoice are missing from the "delayed" frames;
    # the left merge keeps them and fillna(0) turns the gaps into zero counts.
    count_stats = pd.merge(invoice_count, delayed_invoice_count,
                           on='customerID', how='left').fillna(0)
    count_stats = count_stats.sort_values('customerID')
    count_stats['paid'] = count_stats['total'] - count_stats['delayed']
    count_stats['delayRatio'] = count_stats['delayed'] / count_stats['total']

    paid_tot = summarise(grouped, 'InvoiceAmount', 'sum', 'totalAmt')
    delayed_tot = summarise(late, 'InvoiceAmount', 'sum', 'delayedAmt')

    amt_stats = pd.merge(paid_tot, delayed_tot, on='customerID', how='left').fillna(0)
    amt_stats['paidAmt'] = amt_stats['totalAmt'] - amt_stats['delayedAmt']
    amt_stats['delayAmtRatio'] = amt_stats['delayedAmt'] / amt_stats['totalAmt']

    stats = pd.merge(count_stats, amt_stats, on='customerID', how='left')
    stats = pd.merge(stats, delayed_days_avg, on='customerID', how='left').fillna(0)
    stats = pd.merge(stats, settled_days_avg, on='customerID', how='left').fillna(0)
    return stats


def extract_features(dataset):
    """Build the per-invoice feature table used for model fitting.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Invoice-level data. The columns ``customerID``, ``invoiceNumber``,
        ``InvoiceAmount``, ``DaysLate`` and ``DaysToSettle`` are read.
        The frame itself is not modified.

    Returns
    -------
    pandas.DataFrame
        One row per input row, in input order, with the columns
        ``cust`` (integer code of the customer, assigned in the order the
        customers appear in the sorted per-customer statistics),
        ``InvoiceAmount``, ``total`` (invoice count of the customer),
        ``totalAmt`` (summed invoice amount of the customer),
        ``avgDaysToSettle`` (customer mean of ``DaysToSettle``) and
        ``DaysToSettle``. All columns are passed through ``pd.to_numeric``.
    """
    feature_columns = ['cust', 'InvoiceAmount', 'total', 'totalAmt',
                       'avgDaysToSettle', 'DaysToSettle']

    payer_stats = _payer_stats(dataset)

    # Attach the customer-level statistics to every invoice row.
    dataset_new = pd.merge(dataset, payer_stats, on='customerID', how='left').fillna(0)

    # customerID values are unique here (they come from a groupby), so
    # enumerate yields the same codes as list.index without the quadratic scan.
    cust_codes = {cust_id: code
                  for code, cust_id in enumerate(payer_stats['customerID'].tolist())}
    dataset_new['cust'] = dataset_new['customerID'].map(cust_codes)

    # apply() returns a new frame, so no values are written into a slice of
    # dataset_new (which would trigger pandas' chained-assignment warning).
    return dataset_new[feature_columns].apply(pd.to_numeric)



In [None]:
def train(filename, validation_size=0.20, seed=7):
    """Fit a linear regression of ``DaysToSettle`` on the extracted features.

    Parameters
    ----------
    filename : str or path-like
        CSV file handed to ``pd.read_csv``; its contents are passed to
        ``extract_features``.
    validation_size : float, optional
        Forwarded as ``test_size`` to ``train_test_split``. Defaults to 0.20.
    seed : int, optional
        Forwarded as ``random_state`` to ``train_test_split``. Defaults to 7.

    Returns
    -------
    sklearn.linear_model.LinearRegression
        Model fitted on the training partition only. The held-out partition
        is produced by the split but not used or returned here.
    """
    feature_cols = ['cust', 'InvoiceAmount', 'total', 'totalAmt', 'avgDaysToSettle']
    target_col = 'DaysToSettle'

    dataset = pd.read_csv(filename)
    dataset_new = extract_features(dataset)

    # Select by column name so the X/Y split does not silently depend on the
    # column order returned by extract_features.
    array = dataset_new[feature_cols + [target_col]].values
    X = array[:, :-1]
    Y = array[:, -1]

    # The split is kept so the model sees the same training rows as before;
    # the validation partition is intentionally discarded.
    X_train, _, Y_train, _ = model_selection.train_test_split(
        X, Y, test_size=validation_size, random_state=seed)

    lm = LinearRegression()
    lm.fit(X_train, Y_train)
    return lm

In [None]:
def validate():
    