In [15]:
! pip install psycopg2-binary



In [1]:
! pip install imbalanced-learn



In [14]:
! pip install sklearn



In [1]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
from sqlalchemy import create_engine
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from imblearn.ensemble import BalancedRandomForestClassifier

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics

from sklearn.linear_model import Lasso, LassoCV
from sklearn.metrics import mean_squared_error
from datetime import timedelta, datetime
import pickle

In [2]:
engine = create_engine('postgresql+psycopg2://dev_db_user:dev_db_pass@postgres_db:5432/dev_db')

In [3]:
df = pd.read_sql('SELECT * FROM machine_learning.cleaned_aggregated_data WHERE AD_Client_ID = 1000298', engine)

In [5]:
df.head(5)

Unnamed: 0,id,ad_client_id,ad_org_id,dateinvoiced,c_bpartner_id,c_invoice_id,c_bpartner_location_id,paymentrule,grandtotal,duedate,...,avg_late_paid,std_late_paid,max_unpaid,min_unpaid,avg_unpaid,std_unpaid,max_late_unpaid,min_late_unpaid,avg_late_unpaid,std_late_unpaid
0,519690,1000298,1002402,2013-09-09,1044431,1172123,1042886,P,105.61,2013-09-09,...,105.61,,0.0,0.0,0.0,,0.0,0.0,0.0,
1,519691,1000298,1002402,2013-09-09,1044423,1172126,1042878,P,53.8,2013-09-09,...,53.8,,0.0,0.0,0.0,,0.0,0.0,0.0,
2,536479,1000298,1002402,2015-04-09,1135360,1341749,1133009,P,44.89,2015-04-09,...,44.89,,0.0,0.0,0.0,,0.0,0.0,0.0,
3,519692,1000298,1002402,2013-09-09,1044322,1172129,1042773,P,51.85,2013-09-09,...,90.85,55.154329,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,624391,1000298,1015531,2015-05-20,1142434,1358235,1140423,P,174.0,2015-05-20,...,,,174.0,174.0,174.0,,174.0,174.0,174.0,


In [4]:
derived_features =['closed_late_invoices_no',
                  'paid_late_percent',
                  'paid_late_total',
                  'paid_late_raport_percent',
                  'avg_days_paid_late',
                  'late_unpaid_invoices_no',
                  'late_unpaid_invoices_percent',
                  'unpaid_invoices_late_sum',
                  'late_unpaid_invoices_sum_percent'
               ] + [x + '_paid' for x in ['max', 'min', 'avg', 'std']] \
                 + [x + '_late_paid' for x in ['max', 'min', 'avg', 'std']] \
                 + [x + '_unpaid' for x in ['max', 'min', 'avg', 'std']] \
                 + [x + '_late_unpaid' for x in ['max', 'min', 'avg', 'std']]
# numeric_features = [c for c in features if df[c].dtype!=object ]
# categorical_features = [c for c in features if df[c].dtype==object]
features = derived_features + ['late','dayslate','totalopenamt', 'paymentrule', 'tendertype']
target = 'paid'

numeric_features = [c for c in features if df[c].dtype!=object ]
categorical_features = [c for c in features if df[c].dtype==object]

target_reg = 'daystosettle'
features_reg =[f for f in features if f not in [target, target_reg]]

numeric_features_reg = [c for c in features_reg if df[c].dtype!=object]
categorical_features_reg = [c for c in features_reg if df[c].dtype==object]
cols = numeric_features_reg + categorical_features_reg +[target_reg, target, 'dateinvoiced']
df[ ['c_invoice_id', 'ad_org_id', 'dateinvoiced', 'c_bpartner_id',
       'c_bpartner_location_id'] + features + [target, target_reg]].drop_duplicates().to_csv('atribute_cbpartner.csv', index = False)

In [5]:
# ML Pipeline
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value=0)),
    ('scaler', StandardScaler())])
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='Missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore',sparse = False))])

In [6]:
def split_clasificare(n_years, df, features, target):
# pentru clasificare
    X_train = df[df['dateinvoiced'] <= df['dateinvoiced'].max() + timedelta(days = -365*n_years)][features]
    y_train = df[df['dateinvoiced'] <= df['dateinvoiced'].max() + timedelta(days = -365*n_years)][target]

    X_test = df[(df['dateinvoiced'] > df['dateinvoiced'].max() + timedelta(days = -365*n_years)) &
               (df['dateinvoiced'] <= df['dateinvoiced'].max() + timedelta(days = -365*(n_years-1)))][features]
    y_test = df[(df['dateinvoiced'] > df['dateinvoiced'].max() + timedelta(days = -365*n_years)) &
               (df['dateinvoiced'] <= df['dateinvoiced'].max() + timedelta(days = -365*(n_years-1)))][target]
    return X_train, y_train, X_test, y_test

In [7]:
def split_regresie(n_years, df, features_reg, target_reg):


    X_train_reg = df[(df.paid==1) & 
                     (df['dateinvoiced'] <= df['dateinvoiced'].max() + timedelta(days = -365*n_years))][features_reg]
    Y_train_reg = df[(df.paid==1) & 
                     (df['dateinvoiced'] <= df['dateinvoiced'].max() + timedelta(days = -365*n_years))][target_reg]

    X_test_reg = df[(df.paid==1) & (df['dateinvoiced'] > df['dateinvoiced'].max() + timedelta(days = -365*n_years)) &
               (df['dateinvoiced'] <= df['dateinvoiced'].max() + timedelta(days = -365*(n_years-1)))][features_reg]

    Y_test_reg = df[(df.paid==1) & (df['dateinvoiced'] > df['dateinvoiced'].max() + timedelta(days = -365*n_years)) &
               (df['dateinvoiced'] <= df['dateinvoiced'].max() + timedelta(days = -365*(n_years-1)))][target_reg]
    X_train_reg.fillna(value=X_train_reg.mean(), inplace=True)
    Y_train_reg.fillna(value=Y_train_reg.mean(), inplace=True)
    X_test_reg.fillna(value=X_test_reg.mean(), inplace=True)
    Y_test_reg.fillna(value=Y_test_reg.mean(), inplace=True)
    return X_train_reg, Y_train_reg, X_test_reg, Y_test_reg

In [8]:
def antrenare_clasificare(df, numeric_features, categorical_features, target, n_years):
 
    X_train, y_train, X_test, y_test = split_clasificare(n_years, df, numeric_features + categorical_features, target)
    # la stringul 'classifier' adaugam '__' si sufixul reprezentat de parametrul
    param_grid = [

       {'classifier' : [BalancedRandomForestClassifier()],
        'classifier__n_estimators' : [10,100,500],
        'classifier__max_samples' : [0.1, 0.2, 0.3]},
       # mai multe modele de incercat aici: logistic regression, XGBoost, SVM
    ]

    # bucla pentru tunarea si evaluarea clasificatorilor. returnam metrici pe care le putem colecta intr-un dataframe
    lista_rezultate = []
    lista_obiecte_grid_search = []

    for clf_dict in param_grid:

        # pasului de preprocesare ii adaugam clasificatorul curent; nu putem adauga decat un singur evaluator in
        # pipeline, la sfarsit; de aceea am recurs la bucla
        clsf = Pipeline(steps=[        
                               ( 'column_transformer', ColumnTransformer(
                            transformers=[
                                ('num', numeric_transformer, 
                                 [list(X_train.columns.values).index(e) for e in numeric_features]),
                                ('cat', categorical_transformer, 
                                  [list(X_train.columns.values).index(e) for e in categorical_features])],
                               remainder='passthrough')                                         

                           ),
                              ('classifier', clf_dict['classifier'][0])])

        grid = GridSearchCV(clsf, [clf_dict], cv=5, scoring='accuracy', verbose=0, n_jobs=-1)

        # antrenare si fit
        grid.fit(X_train, y_train)   

        # predictie cu cea mai buna configurare in functie de scoringul ales - in cazul de mai sus accuracy
        y_pred = grid.predict(X_test)

        # curba roc
        fpr, tpr, thresholds = metrics.roc_curve(y_test, y_pred, pos_label=2)

        # matricea de confuzie
        cfmtrx = metrics.confusion_matrix(y_test,y_pred)

        # culegem rezultatele in dictionare cu aceleasi chei - cel mai usor de transformat
        lista_rezultate.append({
            'classifierName': type(clf_dict['classifier'][0]),
            'best_score':grid.best_score_,
            'best_params':grid.best_params_,
            'auc':metrics.roc_auc_score(y_test,y_pred),
            'precision':metrics.precision_score(y_test,y_pred),
            'recall':metrics.recall_score(y_test,y_pred),
            'f1':metrics.f1_score(y_test,y_pred),
            'FP':cfmtrx[0,1],
            'FN':cfmtrx[1,0],
            'TP':cfmtrx[1,1],
            'TN':cfmtrx[0,0],
            'grid_obj':grid

        })
        print('Done with {0}'.format(type(clf_dict['classifier'][0])))

    df_result = pd.DataFrame(lista_rezultate)
    df_result = df_result.sort_values(by ='best_score', ascending = False)
    print(df_result.head())
    # cel mai bun model
    clsf = df_result.sort_values(by ='best_score', ascending = False).grid_obj[0].best_estimator_
    # testarea: 
    clsf.fit(X_train,y_train)
    y_pred = clsf.predict(X_test)
    print("model score: %.3f" % clsf.score(X_test, y_test))
    c = confusion_matrix(y_test,y_pred)
    
    return clsf, df_result

In [9]:
def antrenare_regresie(df, numeric_features_reg,categorical_features_reg, target_reg, n_years ):
    
    cols = [c for c in df if c in numeric_features_reg + categorical_features_reg +[target_reg]]
    features_reg = numeric_features_reg + categorical_features_reg
    
    X_train_reg, y_train_reg, X_test_reg, y_test_reg = split_regresie(n_years, df, features_reg, target_reg)

    # lasso are un CV propriu. fapt pentru care standardizarea se va face acolo si o scoatem din preprocesare
    numeric_transformer2 = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value=0))])
    
    categorical_transformer2 = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='Missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore',sparse = False))])
    
    datatrsfm = Pipeline(steps=[        
                           ( 'column_transformer', ColumnTransformer(
                        transformers=[
                            ('num', numeric_transformer2, 
                             [list(df[cols].columns.values).index(e) for e in numeric_features_reg]),
                            ('cat', categorical_transformer2, 
                              [list(df[cols].columns.values).index(e) for e in categorical_features_reg])],
                           remainder='passthrough'
                           )                                         

                       )])

    lambda_values = 10**np.linspace(10,-3,100)*0.5
    X_train_reg_trf = datatrsfm.fit_transform(X_train_reg)

    lassocv = LassoCV(alphas = lambda_values, cv = 10, max_iter = 100000, normalize = True)

    # antrenare si fit
    lassocv.fit(X_train_reg_trf, y_train_reg)   

    #pas 5
    lambda_optim = lassocv.alpha_

    # pas 6: reantrenare pe toate datele de antrenament
    lasso = Lasso(max_iter = 100000, normalize = True)
    lasso.set_params(alpha=lambda_optim)
    lasso.fit(X_train_reg_trf, y_train_reg)

    # pas 7 - raportatea erorii

    pipeline_reg = Pipeline(steps=[        
                       ( 'column_transformer', ColumnTransformer(
                    transformers=[
                        ('num', numeric_transformer, 
                         [list(df[cols].columns.values).index(e) for e in numeric_features_reg]),
                        ('cat', categorical_transformer, 
                          [list(df[cols].columns.values).index(e) for e in categorical_features_reg])],
                       remainder='passthrough') ),
                     ('lasso',lasso)   

                   ])

    pipeline_reg.fit(X_train_reg, y_train_reg)
    y_hat_train_reg = pipeline_reg.predict(X_train_reg)
    y_hat_test_reg = pipeline_reg.predict(X_test_reg)


    cv_alpha_test_error = mean_squared_error(y_train_reg, 
                                             y_hat_train_reg)

    print("MSE date test :", cv_alpha_test_error)

    # score(X, y[, sample_weight]) - coeficientul de determinare R^2 all predictiei.
    print("R^2 pe date de anternament: {0}".format(pipeline_reg.score(X_train_reg,y_train_reg)))
    print("R^2 pe date de test: {0}".format(pipeline_reg.score(X_test_reg,y_test_reg)))
    return pipeline_reg

In [10]:
for n_years in [1,2,3,4]:

    print('n_years: ', n_years)
    model_clasificare = antrenare_clasificare(df, numeric_features, categorical_features, target, n_years)
    model_regresie = antrenare_regresie(df[cols].reset_index(), numeric_features_reg, categorical_features_reg, target_reg, n_years)

    filename_cslf = 'clasificator_facturi_n_years' + str(n_years) + '.sav'
    pickle.dump(model_clasificare, open(filename_cslf, 'wb'))
    filename_reg = 'regresor_DaysToSettle_n_years' + str(n_years) + '.sav'
    pickle.dump(model_regresie, open(filename_reg, 'wb'))

n_years:  1
Fitting 5 folds for each of 9 candidates, totalling 45 fits




Done with <class 'imblearn.ensemble._forest.BalancedRandomForestClassifier'>
                                      classifierName  best_score  \
0  <class 'imblearn.ensemble._forest.BalancedRand...    0.991545   

                                         best_params       auc  precision  \
0  {'classifier': BalancedRandomForestClassifier(...  0.809524    0.99782   

   recall        f1  FP  FN     TP  TN  \
0     1.0  0.998909  24   0  10986  39   

                                            grid_obj  
0  GridSearchCV(cv=5,\n             estimator=Pip...  
model score: 0.998


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return self._update_inplace(result)


MSE date test : 70.10388673761412
R^2 pe date de anternament: 0.9226173117758341
R^2 pe date de test: 0.8565673022856045
n_years:  2
Fitting 5 folds for each of 9 candidates, totalling 45 fits




Done with <class 'imblearn.ensemble._forest.BalancedRandomForestClassifier'>
                                      classifierName  best_score  \
0  <class 'imblearn.ensemble._forest.BalancedRand...    0.998786   

                                         best_params       auc  precision  \
0  {'classifier': BalancedRandomForestClassifier(...  0.998934   0.997492   

   recall        f1  FP  FN    TP     TN  \
0     1.0  0.998744  23   0  9148  10765   

                                            grid_obj  
0  GridSearchCV(cv=5,\n             estimator=Pip...  
model score: 0.999


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return self._update_inplace(result)


MSE date test : 69.51459951918129
R^2 pe date de anternament: 0.9277066389158298
R^2 pe date de test: 0.7756836646406186
n_years:  3
Fitting 5 folds for each of 9 candidates, totalling 45 fits




Done with <class 'imblearn.ensemble._forest.BalancedRandomForestClassifier'>
                                      classifierName  best_score  \
0  <class 'imblearn.ensemble._forest.BalancedRand...    0.989742   

                                         best_params       auc  precision  \
0  {'classifier': BalancedRandomForestClassifier(...  0.999967   0.999801   

   recall        f1  FP  FN     TP     TN  \
0     1.0  0.999901   3   0  15075  45325   

                                            grid_obj  
0  GridSearchCV(cv=5,\n             estimator=Pip...  
model score: 0.999


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return self._update_inplace(result)


MSE date test : 59.65126506000202
R^2 pe date de anternament: 0.9333707620348911
R^2 pe date de test: 0.9006904242789626
n_years:  4
Fitting 5 folds for each of 9 candidates, totalling 45 fits




Done with <class 'imblearn.ensemble._forest.BalancedRandomForestClassifier'>
                                      classifierName  best_score  \
0  <class 'imblearn.ensemble._forest.BalancedRand...    0.997402   

                                         best_params  auc  precision  recall  \
0  {'classifier': BalancedRandomForestClassifier(...  1.0        1.0     1.0   

    f1  FP  FN     TP     TN  \
0  1.0   0   0  19367  24499   

                                            grid_obj  
0  GridSearchCV(cv=5,\n             estimator=Pip...  
model score: 1.000


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return self._update_inplace(result)


MSE date test : 74.50165594275656
R^2 pe date de anternament: 0.9172637765547187
R^2 pe date de test: 0.9694114693569503


In [23]:
df_test = df[['c_invoice_id', 'ad_org_id', 'dateinvoiced', 'c_bpartner_id',
       'c_bpartner_location_id', 'paymentrule', 'grandtotal', 'duedate','totalopenamt',
        'dayslate','late']]
df_atribute_atribute_cbpartner = df[['c_invoice_id', 'ad_org_id', 'dateinvoiced', 'c_bpartner_id',
       'c_bpartner_location_id'] + features + [target, target_reg]].drop_duplicates()

In [38]:
# features = derived_features + ['late','dayslate','totalopenamt', 'paymentrule', 'tendertype']

In [11]:
df_test_profilat = df_test.merge( df_atribute_atribute_cbpartner, 
                        on = ['c_invoice_id', 'ad_org_id',  'c_bpartner_id', 'c_bpartner_location_id'],
                       how = 'left',
                                suffixes = [None,"_y"])

# construire set atribute necesare rularii
X = df_test_profilat [ features]

In [20]:
filename_cslf = 'clasificator_facturi_n_years1.sav'

#pipeline_clsf = Pipeline(steps = [])
pipeline_clsf = pickle.load(open(filename_cslf, 'rb'))
y_hat_clsf = pipeline_clsf[0].predict(X)
df_test['predictie_paid'] = y_hat_clsf

filename_reg = 'regresor_DaysToSettle_n_years4.sav'

#pipeline_reg = Pipeline(steps = [])
pipeline_reg = pickle.load(open(filename_reg, 'rb'))

y_hat_reg = pipeline_reg.predict(X)

# salvare prognoze
df_test['predictie_paid'] = y_hat_clsf
df_test['predictie_DaysToSettle'] = y_hat_reg
df_test['predictie_DaysToSettle'] = df_test[['predictie_DaysToSettle','predictie_paid']].apply(lambda x : None
                                                                                               if x[1] == 0 else round(x[0],0),
                                                                                              axis = 1)
df_test[df_test.predictie_paid == 0]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test['predictie_paid'] = y_hat_clsf
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test['predictie_paid'] = y_hat_clsf
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test['predictie_DaysToSettle'] = y_hat_reg
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc

Unnamed: 0,c_invoice_id,ad_org_id,dateinvoiced,c_bpartner_id,c_bpartner_location_id,paymentrule,grandtotal,duedate,totalopenamt,dayslate,late,predictie_paid,predictie_DaysToSettle
4,1358235,1015531,2015-05-20,1142434,1140423,P,174.00,2015-05-20,174.00,2084.0,1.0,0.0,
8,1378367,1015531,2015-07-13,1148845,1147010,P,210.00,2015-07-13,210.00,2030.0,1.0,0.0,
10,1378369,1015531,2015-07-13,1148850,1147015,P,210.00,2015-07-13,210.00,2030.0,1.0,0.0,
11,1378371,1015531,2015-07-13,1148855,1147020,P,210.00,2015-07-13,210.00,2030.0,1.0,0.0,
14,1377336,1012210,2015-07-08,1148727,1146737,P,94.00,2015-07-08,94.00,2035.0,1.0,0.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
217634,3357757_0,1002402,2019-10-07,1043629,2517736,P,9.00,2019-10-07,9.00,483.0,1.0,0.0,
217658,3411216_0,1002402,2019-10-30,2276882,1633247,P,27.80,2019-10-30,27.80,460.0,1.0,0.0,
217662,3415973_0,1002402,2019-11-04,3957900,3279771,P,0.10,2019-11-04,0.10,455.0,1.0,0.0,
217672,3420538_0,1002402,2019-11-06,2276882,1633247,P,121.88,2019-11-06,121.88,453.0,1.0,0.0,
