## Getting Started

In [2]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression

In [3]:
# Load the raw dataset; the relative path is resolved against the kernel's working directory.
data = pd.read_csv('investing_program_prediction_data.csv')

In [9]:
# Display the raw frame to eyeball the columns (the notebook shows a truncated view of the rows).
data

Unnamed: 0,SE1,SE2,BA1,BA2,BA3,BA4,BA5,BA6,BA7,PE1,PE2,PE3,PE4,PE5,PE6,PE7,PE8,PE9,PE10,PE11,PE12,PE13,PE14,PE15,IA1,IA2,IA3,InvType
0,45,G29,12,0.0,5934,0,0,0,0,I0,I0,I0,I0,I0,I0,I0,I0,I0,I0,I0,I0,I0,I0,I0,0,0,1,C1
1,41,G0,0,0.0,21514,316374,5142,0,5141,I0,I0,I0,I0,I0,I0,I0,I0,I0,I0,I0,I1,I1,I1,I0,0,0,1,C1
2,43,G0,1,0.0,14324,22344,22344,0,22344,I0,I0,I0,I0,I0,I0,I0,I0,I0,I0,I0,I0,I0,I0,I0,0,0,0,C1
3,36,G52,16,0.0,41114,2406838,1156219,692875,41425,I0,I0,I0,I0,I1,I1,I1,I0,I0,I1,I1,I1,I1,I0,I0,0,0,0,C1
4,2,G10,0,0.0,2394,621,621,0,621,I0,I0,I0,I0,I0,I0,I0,I0,I0,I0,I0,I0,I0,I0,I0,0,0,0,C1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4729,57,G23,1,0.0,12514,14274,14274,0,14274,I0,I0,I0,I0,I0,I0,I0,I0,I0,I0,I0,I0,I0,I0,I0,0,0,0,C0
4730,34,G66,0,0.0,5994,1036,1036,0,1036,I0,I0,I0,I0,I0,I0,I0,I0,I0,I0,I0,I0,I0,I0,I0,0,0,0,C1
4731,44,G51,19,0.0,69627,4805913,3345950,1899126,596537,I0,I0,I0,I1,I1,I0,I1,I0,I0,I1,I1,I1,I1,I1,I0,0,0,1,C1
4732,22,G0,0,0.0,5999,0,0,0,0,I0,I0,I0,I0,I0,I0,I0,I0,I0,I0,I0,I0,I0,I0,I0,0,0,0,C1


In [12]:
# Summarize each column's non-null count and dtype to see which columns are still strings.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4734 entries, 0 to 4733
Data columns (total 28 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   SE1      4734 non-null   int64  
 1   SE2      4734 non-null   object 
 2   BA1      4734 non-null   int64  
 3   BA2      4734 non-null   float64
 4   BA3      4734 non-null   int64  
 5   BA4      4734 non-null   int64  
 6   BA5      4734 non-null   int64  
 7   BA6      4734 non-null   int64  
 8   BA7      4734 non-null   int64  
 9   PE1      4734 non-null   object 
 10  PE2      4734 non-null   object 
 11  PE3      4734 non-null   object 
 12  PE4      4734 non-null   object 
 13  PE5      4734 non-null   object 
 14  PE6      4734 non-null   object 
 15  PE7      4734 non-null   object 
 16  PE8      4734 non-null   object 
 17  PE9      4734 non-null   object 
 18  PE10     4734 non-null   object 
 19  PE11     4734 non-null   object 
 20  PE12     4734 non-null   object 
 21  PE13     4734 

In [14]:
# Total number of missing values over the whole frame (per-column counts, then summed).
data.isna().sum().sum()

0

## Data Preprocessing

In [30]:
def preprocess_inputs(df):
    """Turn the raw investing-program table into scaled train/test splits.

    Steps, in order:
      1. Convert the 'PE1'..'PE15' string codes (e.g. 'I0', 'I1') to integers.
      2. One-hot encode the 'SE2' column and drop the original.
      3. Add 'Child' (SE1 < 18) and 'Senior' (SE1 >= 65) indicator columns.
      4. Add a 'Low_<col>' indicator for each of 'BA1'..'BA7', set when the
         value is at or below that column's 25th percentile.
      5. Split 70/30 into train and test sets (shuffled, random_state=1).
      6. Standardize the features with a scaler fitted on the training set only.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data containing the columns 'SE1', 'SE2', 'BA1'..'BA7',
        'PE1'..'PE15' and the target 'InvType'. Every other column is passed
        to the scaler as-is and therefore has to be numeric. The input frame
        is not modified.

    Returns
    -------
    X_train, X_test : pandas.DataFrame
        Scaled feature matrices with a fresh 0..n-1 index.
    y_train, y_test : pandas.Series
        Matching targets. They keep the row labels of the input frame, so
        they line up with the feature frames by position, not by index label.
    """
    df = df.copy()

    # 'PE' codes are a one-letter prefix followed by digits ('I0', 'I1'):
    # drop the prefix and keep every remaining digit.
    # The builtin `int` replaces the `np.int` alias, which was deprecated in
    # NumPy 1.20 and removed in NumPy 1.24.
    for column in ['PE' + str(i) for i in range(1, 16)]:
        df[column] = df[column].str[1:].astype(int)

    # onehot encode 'SE2' column (dummies are appended after the existing columns)
    geo_dummies = pd.get_dummies(df['SE2'])
    df = pd.concat([df.drop('SE2', axis=1), geo_dummies], axis=1)

    # engineer 'SE1'(age) column
    df['Child'] = (df['SE1'] < 18).astype(int)
    df['Senior'] = (df['SE1'] >= 65).astype(int)

    # engineer 'BA'(banking activities) columns
    # The quartile is computed once per column instead of once per row.
    # NOTE(review): the threshold is taken over all rows, i.e. before the
    # train/test split, so test rows influence it. Kept as-is to preserve the
    # reported results; consider deriving it from the training rows only.
    for column in ['BA' + str(i) for i in range(1, 8)]:
        threshold = df[column].quantile(0.25)
        df['Low_' + column] = (df[column] <= threshold).astype(int)

    # split into 'X' & 'y'
    X = df.drop('InvType', axis=1)
    y = df['InvType']

    # train-test-split
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, shuffle=True, random_state=1)

    # scaling X: fit on the training rows only so test statistics do not leak in
    scaler = StandardScaler()
    scaler.fit(X_train)

    # transform() returns plain arrays; re-wrap them to keep the column names
    X_train = pd.DataFrame(scaler.transform(X_train), columns=X.columns)
    X_test = pd.DataFrame(scaler.transform(X_test), columns=X.columns)

    return X_train, X_test, y_train, y_test

In [31]:
# Run the preprocessing pipeline on the raw frame and unpack the train/test features and targets.
X_train, X_test, y_train, y_test = preprocess_inputs(data)

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  df[column] = df[column].apply(lambda x: x[1]).astype(np.int)


In [33]:
# Inspect the training features returned by preprocess_inputs (scaled values plus the engineered columns).
X_train

Unnamed: 0,SE1,BA1,BA2,BA3,BA4,BA5,BA6,BA7,PE1,PE2,PE3,PE4,PE5,PE6,PE7,PE8,PE9,PE10,PE11,PE12,PE13,PE14,PE15,IA1,IA2,IA3,G0,G10,G11,G12,G13,G14,G15,G16,G17,G18,G19,G2,G20,G21,G22,G23,G24,G25,G26,G27,G28,G29,G30,G31,G32,G33,G34,G35,G36,G37,G38,G39,G40,G42,G43,G44,G45,G46,G47,G48,G49,G50,G51,G52,G53,G54,G55,G56,G58,G59,G60,G61,G62,G63,G64,G65,G66,G67,G68,G69,G70,G71,G72,G73,G74,G75,G76,G77,G78,G79,G80,G81,G82,G83,G84,G85,G86,G87,G88,G89,G90,G91,G92,G93,G94,G95,G96,G97,G98,G99,Child,Senior,Low_BA1,Low_BA2,Low_BA3,Low_BA4,Low_BA5,Low_BA6,Low_BA7
0,-0.853711,-0.507149,-0.0665,-0.429064,-0.029207,0.343929,-0.281583,-0.391727,-0.235439,-0.057718,-0.087198,-0.077933,1.480444,-0.376333,-0.262782,-0.294255,-0.205309,1.506812,-0.526009,-0.506783,-0.602806,-0.298473,-0.258157,-0.052455,0.0,0.693348,-0.569334,-0.095593,-0.052192,-0.073911,-0.046015,-0.069663,-0.065144,-0.073911,-0.073911,-0.073911,-0.075948,0.0,-0.128723,-0.093972,9.409598,-0.069663,-0.092323,-0.100304,-0.11866,-0.075948,-0.057718,-0.060293,-0.138098,-0.034768,-0.103331,-0.06744,-0.154256,-0.081761,-0.104812,-0.083611,-0.083611,-0.030106,-0.095593,-0.139229,-0.103331,-0.176419,-0.090646,-0.134653,-0.085423,-0.071818,-0.144760,-0.024577,-0.095593,-0.162272,-0.092323,-0.038878,-0.097188,-0.055023,-0.156294,-0.100304,-0.071818,-0.052192,-0.085423,-0.081761,-0.100304,-0.057718,-0.055023,-0.098758,-0.034768,-0.138098,-0.079869,-0.095593,-0.071818,-0.06744,-0.073911,-0.155278,-0.129929,-0.097188,-0.092323,-0.087198,-0.024577,-0.057718,-0.046015,-0.017376,-0.104812,-0.092323,-0.038878,-0.030106,-0.060293,0.0,-0.193861,-0.069663,-0.062765,-0.141465,-0.060293,-0.079869,-0.106274,-0.117345,-0.077933,-0.092323,-0.252118,-0.309142,-0.876824,0.113314,1.743630,-0.571657,-0.571657,0.644366,1.726838
1,-1.359483,-0.638075,-0.0665,-0.014772,-0.417519,-0.309869,-0.281583,-0.192492,-0.235439,-0.057718,-0.087198,-0.077933,-0.675473,-0.376333,-0.262782,-0.294255,-0.205309,-0.663653,-0.526009,-0.506783,-0.602806,-0.298473,-0.258157,-0.052455,0.0,-0.525899,-0.569334,-0.095593,-0.052192,-0.073911,-0.046015,-0.069663,-0.065144,-0.073911,-0.073911,-0.073911,-0.075948,0.0,-0.128723,-0.093972,-0.106274,-0.069663,-0.092323,-0.100304,-0.11866,-0.075948,-0.057718,-0.060293,-0.138098,-0.034768,-0.103331,-0.06744,-0.154256,-0.081761,-0.104812,-0.083611,-0.083611,-0.030106,-0.095593,-0.139229,-0.103331,-0.176419,-0.090646,-0.134653,-0.085423,-0.071818,-0.144760,-0.024577,-0.095593,-0.162272,-0.092323,-0.038878,-0.097188,-0.055023,-0.156294,-0.100304,-0.071818,-0.052192,-0.085423,-0.081761,-0.100304,-0.057718,-0.055023,-0.098758,-0.034768,-0.138098,-0.079869,10.461039,-0.071818,-0.06744,-0.073911,-0.155278,-0.129929,-0.097188,-0.092323,-0.087198,-0.024577,-0.057718,-0.046015,-0.017376,-0.104812,-0.092323,-0.038878,-0.030106,-0.060293,0.0,-0.193861,-0.069663,-0.062765,-0.141465,-0.060293,-0.079869,-0.106274,-0.117345,-0.077933,-0.092323,-0.252118,-0.309142,1.140480,0.113314,-0.573516,-0.571657,-0.571657,0.644366,-0.579093
2,0.347499,-0.638075,-0.0665,-0.391779,-0.272048,-0.064940,-0.281583,0.523943,-0.235439,-0.057718,-0.087198,-0.077933,-0.675473,-0.376333,-0.262782,-0.294255,-0.205309,-0.663653,-0.526009,-0.506783,-0.602806,-0.298473,-0.258157,-0.052455,0.0,-0.525899,-0.569334,-0.095593,-0.052192,-0.073911,-0.046015,-0.069663,-0.065144,-0.073911,-0.073911,-0.073911,-0.075948,0.0,-0.128723,-0.093972,-0.106274,-0.069663,-0.092323,-0.100304,-0.11866,-0.075948,-0.057718,-0.060293,-0.138098,-0.034768,-0.103331,-0.06744,-0.154256,-0.081761,-0.104812,-0.083611,-0.083611,-0.030106,-0.095593,-0.139229,-0.103331,-0.176419,-0.090646,-0.134653,-0.085423,-0.071818,-0.144760,-0.024577,-0.095593,-0.162272,-0.092323,-0.038878,-0.097188,-0.055023,-0.156294,-0.100304,-0.071818,-0.052192,-0.085423,-0.081761,-0.100304,-0.057718,-0.055023,-0.098758,-0.034768,-0.138098,-0.079869,-0.095593,-0.071818,-0.06744,-0.073911,-0.155278,-0.129929,-0.097188,-0.092323,-0.087198,-0.024577,-0.057718,-0.046015,-0.017376,-0.104812,-0.092323,-0.038878,-0.030106,-0.060293,0.0,5.158327,-0.069663,-0.062765,-0.141465,-0.060293,-0.079869,-0.106274,-0.117345,-0.077933,-0.092323,-0.252118,-0.309142,1.140480,0.113314,-0.573516,-0.571657,-0.571657,0.644366,-0.579093
3,0.537164,-0.638075,-0.0665,-0.194794,-0.413902,-0.303780,-0.281583,-0.174679,-0.235439,-0.057718,-0.087198,-0.077933,-0.675473,-0.376333,-0.262782,-0.294255,-0.205309,-0.663653,-0.526009,-0.506783,-0.602806,-0.298473,-0.258157,-0.052455,0.0,-0.525899,1.756440,-0.095593,-0.052192,-0.073911,-0.046015,-0.069663,-0.065144,-0.073911,-0.073911,-0.073911,-0.075948,0.0,-0.128723,-0.093972,-0.106274,-0.069663,-0.092323,-0.100304,-0.11866,-0.075948,-0.057718,-0.060293,-0.138098,-0.034768,-0.103331,-0.06744,-0.154256,-0.081761,-0.104812,-0.083611,-0.083611,-0.030106,-0.095593,-0.139229,-0.103331,-0.176419,-0.090646,-0.134653,-0.085423,-0.071818,-0.144760,-0.024577,-0.095593,-0.162272,-0.092323,-0.038878,-0.097188,-0.055023,-0.156294,-0.100304,-0.071818,-0.052192,-0.085423,-0.081761,-0.100304,-0.057718,-0.055023,-0.098758,-0.034768,-0.138098,-0.079869,-0.095593,-0.071818,-0.06744,-0.073911,-0.155278,-0.129929,-0.097188,-0.092323,-0.087198,-0.024577,-0.057718,-0.046015,-0.017376,-0.104812,-0.092323,-0.038878,-0.030106,-0.060293,0.0,-0.193861,-0.069663,-0.062765,-0.141465,-0.060293,-0.079869,-0.106274,-0.117345,-0.077933,-0.092323,-0.252118,-0.309142,1.140480,0.113314,-0.573516,-0.571657,-0.571657,0.644366,-0.579093
4,0.284278,0.409339,-0.0665,-0.209624,1.256704,1.374897,1.642699,0.777796,-0.235439,-0.057718,-0.087198,-0.077933,1.480444,-0.376333,3.805431,-0.294255,-0.205309,1.506812,1.901107,-0.506783,1.658910,-0.298473,-0.258157,17.325995,0.0,0.693348,1.756440,-0.095593,-0.052192,-0.073911,-0.046015,-0.069663,-0.065144,-0.073911,-0.073911,-0.073911,-0.075948,0.0,-0.128723,-0.093972,-0.106274,-0.069663,-0.092323,-0.100304,-0.11866,-0.075948,-0.057718,-0.060293,-0.138098,-0.034768,-0.103331,-0.06744,-0.154256,-0.081761,-0.104812,-0.083611,-0.083611,-0.030106,-0.095593,-0.139229,-0.103331,-0.176419,-0.090646,-0.134653,-0.085423,-0.071818,-0.144760,-0.024577,-0.095593,-0.162272,-0.092323,-0.038878,-0.097188,-0.055023,-0.156294,-0.100304,-0.071818,-0.052192,-0.085423,-0.081761,-0.100304,-0.057718,-0.055023,-0.098758,-0.034768,-0.138098,-0.079869,-0.095593,-0.071818,-0.06744,-0.073911,-0.155278,-0.129929,-0.097188,-0.092323,-0.087198,-0.024577,-0.057718,-0.046015,-0.017376,-0.104812,-0.092323,-0.038878,-0.030106,-0.060293,0.0,-0.193861,-0.069663,-0.062765,-0.141465,-0.060293,-0.079869,-0.106274,-0.117345,-0.077933,-0.092323,-0.252118,-0.309142,-0.876824,0.113314,-0.573516,-0.571657,-0.571657,-1.551914,-0.579093
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3308,0.410721,1.325826,-0.0665,-0.344302,-0.204370,0.048992,1.566391,0.857197,-0.235439,-0.057718,-0.087198,-0.077933,-0.675473,-0.376333,-0.262782,3.398418,-0.205309,1.506812,-0.526009,-0.506783,1.658910,-0.298473,-0.258157,-0.052455,0.0,5.570339,-0.569334,-0.095593,-0.052192,-0.073911,-0.046015,-0.069663,-0.065144,-0.073911,-0.073911,-0.073911,-0.075948,0.0,-0.128723,-0.093972,-0.106274,-0.069663,-0.092323,-0.100304,-0.11866,-0.075948,-0.057718,-0.060293,-0.138098,-0.034768,-0.103331,-0.06744,-0.154256,-0.081761,-0.104812,-0.083611,-0.083611,-0.030106,-0.095593,-0.139229,-0.103331,-0.176419,-0.090646,-0.134653,-0.085423,-0.071818,-0.144760,-0.024577,-0.095593,6.162505,-0.092323,-0.038878,-0.097188,-0.055023,-0.156294,-0.100304,-0.071818,-0.052192,-0.085423,-0.081761,-0.100304,-0.057718,-0.055023,-0.098758,-0.034768,-0.138098,-0.079869,-0.095593,-0.071818,-0.06744,-0.073911,-0.155278,-0.129929,-0.097188,-0.092323,-0.087198,-0.024577,-0.057718,-0.046015,-0.017376,-0.104812,-0.092323,-0.038878,-0.030106,-0.060293,0.0,-0.193861,-0.069663,-0.062765,-0.141465,-0.060293,-0.079869,-0.106274,-0.117345,-0.077933,-0.092323,-0.252118,-0.309142,-0.876824,0.113314,-0.573516,-0.571657,-0.571657,-1.551914,-0.579093
3309,-0.537603,-0.114368,-0.0665,-0.290602,1.215995,1.039791,-0.281583,-0.317906,-0.235439,-0.057718,-0.087198,-0.077933,1.480444,2.657222,-0.262782,-0.294255,-0.205309,1.506812,1.901107,-0.506783,-0.602806,-0.298473,-0.258157,-0.052455,0.0,-0.525899,1.756440,-0.095593,-0.052192,-0.073911,-0.046015,-0.069663,-0.065144,-0.073911,-0.073911,-0.073911,-0.075948,0.0,-0.128723,-0.093972,-0.106274,-0.069663,-0.092323,-0.100304,-0.11866,-0.075948,-0.057718,-0.060293,-0.138098,-0.034768,-0.103331,-0.06744,-0.154256,-0.081761,-0.104812,-0.083611,-0.083611,-0.030106,-0.095593,-0.139229,-0.103331,-0.176419,-0.090646,-0.134653,-0.085423,-0.071818,-0.144760,-0.024577,-0.095593,-0.162272,-0.092323,-0.038878,-0.097188,-0.055023,-0.156294,-0.100304,-0.071818,-0.052192,-0.085423,-0.081761,-0.100304,-0.057718,-0.055023,-0.098758,-0.034768,-0.138098,-0.079869,-0.095593,-0.071818,-0.06744,-0.073911,-0.155278,-0.129929,-0.097188,-0.092323,-0.087198,-0.024577,-0.057718,-0.046015,-0.017376,-0.104812,-0.092323,-0.038878,-0.030106,-0.060293,0.0,-0.193861,-0.069663,-0.062765,-0.141465,-0.060293,-0.079869,-0.106274,-0.117345,-0.077933,-0.092323,-0.252118,-0.309142,-0.876824,0.113314,-0.573516,-0.571657,-0.571657,0.644366,-0.579093
3310,0.853272,-0.638075,-0.0665,0.059803,-0.369843,-0.229597,-0.281583,0.042311,-0.235439,-0.057718,-0.087198,-0.077933,-0.675473,-0.376333,-0.262782,-0.294255,-0.205309,-0.663653,-0.526009,-0.506783,-0.602806,-0.298473,-0.258157,-0.052455,0.0,-0.525899,-0.569334,-0.095593,-0.052192,-0.073911,-0.046015,-0.069663,-0.065144,-0.073911,-0.073911,-0.073911,-0.075948,0.0,-0.128723,-0.093972,-0.106274,-0.069663,-0.092323,-0.100304,-0.11866,-0.075948,-0.057718,-0.060293,-0.138098,-0.034768,-0.103331,-0.06744,-0.154256,-0.081761,-0.104812,-0.083611,-0.083611,-0.030106,-0.095593,-0.139229,-0.103331,-0.176419,-0.090646,-0.134653,-0.085423,-0.071818,6.908009,-0.024577,-0.095593,-0.162272,-0.092323,-0.038878,-0.097188,-0.055023,-0.156294,-0.100304,-0.071818,-0.052192,-0.085423,-0.081761,-0.100304,-0.057718,-0.055023,-0.098758,-0.034768,-0.138098,-0.079869,-0.095593,-0.071818,-0.06744,-0.073911,-0.155278,-0.129929,-0.097188,-0.092323,-0.087198,-0.024577,-0.057718,-0.046015,-0.017376,-0.104812,-0.092323,-0.038878,-0.030106,-0.060293,0.0,-0.193861,-0.069663,-0.062765,-0.141465,-0.060293,-0.079869,-0.106274,-0.117345,-0.077933,-0.092323,-0.252118,-0.309142,1.140480,0.113314,-0.573516,-0.571657,-0.571657,0.644366,-0.579093
3311,-1.296262,0.147485,-0.0665,-0.350868,-0.459933,-0.381282,-0.281583,-0.401382,-0.235439,-0.057718,-0.087198,-0.077933,-0.675473,-0.376333,-0.262782,-0.294255,-0.205309,-0.663653,-0.526009,-0.506783,1.658910,-0.298473,-0.258157,-0.052455,0.0,0.693348,-0.569334,-0.095593,-0.052192,-0.073911,-0.046015,-0.069663,-0.065144,-0.073911,-0.073911,-0.073911,-0.075948,0.0,-0.128723,-0.093972,-0.106274,-0.069663,10.831502,-0.100304,-0.11866,-0.075948,-0.057718,-0.060293,-0.138098,-0.034768,-0.103331,-0.06744,-0.154256,-0.081761,-0.104812,-0.083611,-0.083611,-0.030106,-0.095593,-0.139229,-0.103331,-0.176419,-0.090646,-0.134653,-0.085423,-0.071818,-0.144760,-0.024577,-0.095593,-0.162272,-0.092323,-0.038878,-0.097188,-0.055023,-0.156294,-0.100304,-0.071818,-0.052192,-0.085423,-0.081761,-0.100304,-0.057718,-0.055023,-0.098758,-0.034768,-0.138098,-0.079869,-0.095593,-0.071818,-0.06744,-0.073911,-0.155278,-0.129929,-0.097188,-0.092323,-0.087198,-0.024577,-0.057718,-0.046015,-0.017376,-0.104812,-0.092323,-0.038878,-0.030106,-0.060293,0.0,-0.193861,-0.069663,-0.062765,-0.141465,-0.060293,-0.079869,-0.106274,-0.117345,-0.077933,-0.092323,-0.252118,-0.309142,-0.876824,0.113314,-0.573516,1.749300,1.749300,0.644366,1.726838


## Model Training

In [32]:
# Train a logistic-regression classifier with default settings on the
# training split; fit() hands back the fitted estimator, so it can be chained.
model = LogisticRegression().fit(X_train, y_train)

# Evaluate on the held-out split and report the score as a percentage.
acc = model.score(X_test, y_test)
print('Test Accuracy: {:.2f}%'.format(100 * acc))

Test Accuracy: 71.15%


### _Model Accuracy:_
- Without feature engineering: __67.98%__

- With feature engineering: __71.15%__