In [31]:
import pandas as pd
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel, SelectKBest, RFE, RFECV, VarianceThreshold
from sklearn.compose import ColumnTransformer
from sklearn.metrics import precision_score, recall_score, f1_score, make_scorer
from sklearn.pipeline import Pipeline

from warnings import simplefilter
simplefilter(action='ignore', category=FutureWarning)
from sklearn.exceptions import ConvergenceWarning
simplefilter("always", ConvergenceWarning)

In [2]:
data = pd.read_csv('Train_nyOWmfK.csv', encoding='latin1')

In [3]:
data.shape

(87020, 26)

In [4]:
data.head(10)

Unnamed: 0,ID,Gender,City,Monthly_Income,DOB,Lead_Creation_Date,Loan_Amount_Applied,Loan_Tenure_Applied,Existing_EMI,Employer_Name,...,Interest_Rate,Processing_Fee,EMI_Loan_Submitted,Filled_Form,Device_Type,Var2,Source,Var4,LoggedIn,Disbursed
0,ID000002C20,Female,Delhi,20000,23-May-78,15-May-15,300000.0,5.0,0.0,CYBOSOL,...,,,,N,Web-browser,G,S122,1,0,0
1,ID000004E40,Male,Mumbai,35000,07-Oct-85,04-May-15,200000.0,2.0,0.0,TATA CONSULTANCY SERVICES LTD (TCS),...,13.25,,6762.9,N,Web-browser,G,S122,3,0,0
2,ID000007H20,Male,Panchkula,22500,10-Oct-81,19-May-15,600000.0,4.0,0.0,ALCHEMIST HOSPITALS LTD,...,,,,N,Web-browser,B,S143,1,0,0
3,ID000008I30,Male,Saharsa,35000,30-Nov-87,09-May-15,1000000.0,5.0,0.0,BIHAR GOVERNMENT,...,,,,N,Web-browser,B,S143,3,0,0
4,ID000009J40,Male,Bengaluru,100000,17-Feb-84,20-May-15,500000.0,2.0,25000.0,GLOBAL EDGE SOFTWARE,...,,,,N,Web-browser,B,S134,3,1,0
5,ID000010K00,Male,Bengaluru,45000,21-Apr-82,20-May-15,300000.0,5.0,15000.0,COGNIZANT TECHNOLOGY SOLUTIONS INDIA PVT LTD,...,13.99,1500.0,6978.92,N,Web-browser,B,S143,3,1,0
6,ID000011L10,Female,Sindhudurg,70000,23-Oct-87,01-May-15,6.0,5.0,0.0,CARNIVAL CRUISE LINE,...,,,,N,Web-browser,B,S133,1,0,0
7,ID000012M20,Male,Bengaluru,20000,25-Jul-75,20-May-15,200000.0,5.0,2597.0,GOLDEN TULIP FLORITECH PVT. LTD,...,,,,N,Web-browser,B,S159,3,0,0
8,ID000013N30,Male,Kochi,75000,26-Jan-72,02-May-15,0.0,0.0,0.0,SIIS PVT LTD,...,14.85,26000.0,30824.65,Y,Mobile,C,S122,5,0,0
9,ID000014O40,Female,Mumbai,30000,12-Sep-89,03-May-15,300000.0,3.0,0.0,SOUNDCLOUD.COM,...,18.25,1500.0,10883.38,N,Web-browser,B,S133,1,0,0


In [5]:
data.dtypes

ID                        object
Gender                    object
City                      object
Monthly_Income             int64
DOB                       object
Lead_Creation_Date        object
Loan_Amount_Applied      float64
Loan_Tenure_Applied      float64
Existing_EMI             float64
Employer_Name             object
Salary_Account            object
Mobile_Verified           object
Var5                       int64
Var1                      object
Loan_Amount_Submitted    float64
Loan_Tenure_Submitted    float64
Interest_Rate            float64
Processing_Fee           float64
EMI_Loan_Submitted       float64
Filled_Form               object
Device_Type               object
Var2                      object
Source                    object
Var4                       int64
LoggedIn                   int64
Disbursed                  int64
dtype: object

In [6]:
data.describe(percentiles=[0.2, 0.4, 0.6, 0.8])

Unnamed: 0,Monthly_Income,Loan_Amount_Applied,Loan_Tenure_Applied,Existing_EMI,Var5,Loan_Amount_Submitted,Loan_Tenure_Submitted,Interest_Rate,Processing_Fee,EMI_Loan_Submitted,Var4,LoggedIn,Disbursed
count,87020.0,86949.0,86949.0,86949.0,87020.0,52407.0,52407.0,27726.0,27420.0,27726.0,87020.0,87020.0,87020.0
mean,58849.97,230250.7,2.131399,3696.228,4.961503,395010.6,3.891369,19.197474,5131.150839,10999.528377,2.949805,0.02935,0.014629
std,2177511.0,354206.8,2.014193,39810.21,5.670385,308248.1,1.165359,5.834213,4725.837644,7512.32305,1.69772,0.168785,0.120062
min,0.0,0.0,0.0,0.0,0.0,50000.0,1.0,11.99,200.0,1176.41,0.0,0.0,0.0
20%,15000.0,0.0,0.0,0.0,0.0,190000.0,3.0,14.85,2000.0,5422.58,1.0,0.0,0.0
40%,20609.2,100000.0,1.0,0.0,1.0,290000.0,4.0,16.35,3000.0,8342.71,3.0,0.0,0.0
50%,25000.0,100000.0,2.0,0.0,2.0,300000.0,4.0,18.0,4000.0,9392.97,3.0,0.0,0.0
60%,28000.0,200000.0,3.0,0.0,3.0,370000.0,4.0,18.4,4400.0,10597.55,3.0,0.0,0.0
80%,45000.0,300000.0,5.0,5000.0,11.0,500000.0,5.0,20.0,7500.0,14362.93,5.0,0.0,0.0
max,444554400.0,10000000.0,10.0,10000000.0,18.0,3000000.0,6.0,37.0,50000.0,144748.28,7.0,1.0,1.0


In [13]:
X_train, X_test, y_train, y_test = train_test_split(data.drop(["LoggedIn", "Disbursed"], axis=1),
                                          data.Disbursed,
                                          test_size=10000,
                                          random_state=123)

In [14]:
def  Preproces(data):
    data = data.drop(["ID", "Lead_Creation_Date", "Employer_Name", "City", "Salary_Account"], axis=1)
    data["Gender"].replace({'Female': 0, 'Male':1}, inplace = True)
    data["Mobile_Verified"].replace({'Y': 0, 'N':1}, inplace = True)
    data["Filled_Form"].replace({'Y': 0, 'N':1}, inplace = True)
    data["Device_Type"].replace({'Web-browser': 0, 'Mobile':1}, inplace = True)
    data["DOB"] = 2015 -('19' + data["DOB"].str[-2:]).astype('int')
    
    return data

def Proces():
    num_features = ["Monthly_Income", "DOB", "Loan_Amount_Applied", "Loan_Tenure_Applied", "Existing_EMI",
                   "Loan_Amount_Submitted", "Loan_Tenure_Submitted", "Interest_Rate", "Processing_Fee",
                   "EMI_Loan_Submitted", "Var5"]
    num_pipeline = make_pipeline(SimpleImputer(strategy = "constant", fill_value = 0), 
                                 StandardScaler())
    cat_feauters = ["Var1", "Var2", "Var4", "Source"]
    cat_pipeline = make_pipeline(SimpleImputer(strategy="most_frequent"),
                                OneHotEncoder(handle_unknown="ignore"))
    other_feauters = ["Gender", "Mobile_Verified", "Filled_Form", "Device_Type"]
    other_pipeline = make_pipeline(SimpleImputer(strategy="most_frequent"))

    processor = ColumnTransformer(transformers=[
        ("num", num_pipeline, num_features),
        ("cat", cat_pipeline, cat_feauters),
        ("cabin", other_pipeline, other_feauters)
    ])
    return processor

In [15]:
X_test = Preproces(X_test)
X_train = Preproces(X_train)

In [17]:
classification_process = make_pipeline(Proces(),
                                       SelectFromModel(RandomForestClassifier(15), 0.0001),
                                       LogisticRegression(penalty ="l2", class_weight="balanced"))
classification_process.fit(X_train, y_train)
classification_process.predict(X_test)
print(precision_score(y_test, classification_process.predict_proba(X_test)[:,1]>0.7))
print(recall_score(y_test, classification_process.predict_proba(X_test)[:,1]>0.7))
print(f1_score(y_test, classification_process.predict_proba(X_test)[:,1]>0.7))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


0.061514195583596214
0.5098039215686274
0.10978184377199156


In [50]:
num_splits = 10

#ftwo_scorer = make_scorer(fbeta_score, beta=2)

#param_grid_rfc =  [ {
#    "rfc__n_estimators": [10, 50, 100, 150, 200, 300, 400], # number of estimators
#    "rfc__criterion": ["gini", "entropy"],   # Splitting criterion
#    "rfc__max_depth": [None, 5, 10, 15], # Max depth of the trees
#    "rfc__min_samples_split": [2, 5, 10, 15, 20, 25], # mininal samples in leafs
#    "rfc__class_weight":[None , "balanced", "balanced_subsample"], # maximum features used at each split
#    }]
#param_grid = [
#    {'classifier' : [LogisticRegression()],
#     'classifier__penalty' : ['l1', 'l2'],
#    'classifier__C' : np.logspace(-4, 4, 20),
#    'classifier__solver' : ['liblinear']},
#    {'classifier' : [RandomForestClassifier()],
#    'classifier__n_estimators' : list(range(10,101,10)),
#    'classifier__max_features' : list(range(6,32,5))}
#]
pipe = Pipeline([
    ('data_proc', Proces()),
    ('select', SelectFromModel(estimator=RandomForestClassifier())),
    ('log_reg', LogisticRegression())
    ])

param_grid =  [{
    "select__estimator__n_estimators": [#"RandomForestClassifier(5)", 
                          #"RandomForestClassifier(10)",
                          #"RandomForestClassifier(15)",
                          20], 
    "select__threshold": [0.0000001,]# 0.000001, 0.00001, 0.0001],   
    }]
kf = KFold(n_splits=num_splits)
grid = GridSearchCV(pipe, param_grid=param_grid, cv=kf, verbose=10, n_jobs=-1)

In [46]:
grid.fit(X_train, y_train)
grid.predict(X_test)
print(grid.best_params_)
print(precision_score(y_test, classification_process.predict_proba(X_test)[:,1]>0.7))
print(recall_score(y_test, classification_process.predict_proba(X_test)[:,1]>0.7))
print(f1_score(y_test, classification_process.predict_proba(X_test)[:,1]>0.7))

Fitting 10 folds for each of 1 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   4 out of   9 | elapsed:    5.6s remaining:    7.0s
[Parallel(n_jobs=-1)]: Done   5 out of   9 | elapsed:    5.6s remaining:    4.5s
[Parallel(n_jobs=-1)]: Done   6 out of   9 | elapsed:    5.6s remaining:    2.7s
[Parallel(n_jobs=-1)]: Done   7 out of   9 | elapsed:    5.6s remaining:    1.5s
[Parallel(n_jobs=-1)]: Done   9 out of   9 | elapsed:    5.6s remaining:    0.0s


ValueError: Invalid parameter n_estimators for estimator SelectFromModel(estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                                 class_weight=None,
                                                 criterion='gini',
                                                 max_depth=None,
                                                 max_features='auto',
                                                 max_leaf_nodes=None,
                                                 max_samples=None,
                                                 min_impurity_decrease=0.0,
                                                 min_impurity_split=None,
                                                 min_samples_leaf=1,
                                                 min_samples_split=2,
                                                 min_weight_fraction_leaf=0.0,
                                                 n_estimators=100, n_jobs=None,
                                                 oob_score=False,
                                                 random_state=None, verbose=0,
                                                 warm_start=False),
                max_features=None, norm_order=1, prefit=False, threshold=None). Check the list of available parameters with `estimator.get_params().keys()`.

In [51]:
estimator.get_params().keys()

NameError: name 'estimator' is not defined