In [None]:
import pandas as pd
from sklearn import linear_model
from sklearn import metrics
from sklearn import preprocessing
from sklearn import model_selection

import warnings
# Silence every UserWarning for the rest of the session.
# NOTE(review): this is a blanket filter, not limited to one library or message;
# presumably it targets warnings emitted while fitting the model below -- confirm,
# and consider narrowing it so unrelated warnings stay visible.
warnings.simplefilter("ignore", UserWarning)


In [None]:
# Location of the training CSV. Kept in one named constant so the notebook can
# be pointed at another copy of the data without hunting through the cells.
TRAIN_CSV_PATH = '/content/drive/MyDrive/Datasets/train.csv'

# Raw training data. Later cells read its `target` column and treat every
# column other than `id`, `target` and `kfold` as a feature.
df = pd.read_csv(TRAIN_CSV_PATH)

### CREATE CROSS-VALIDATION FOLDS:

In [None]:
def create_folds(df, split, random_state=None):
    """Shuffle ``df`` and label every row with a stratified validation fold.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data. Must contain a ``target`` column, which is used as the
        stratification label. The caller's frame is not modified.
    split : int
        Number of folds, forwarded to ``StratifiedKFold(n_splits=...)``.
    random_state : int or None, optional
        Seed for the row shuffle. ``None`` (the default) keeps the shuffle
        non-deterministic, as before.

    Returns
    -------
    pandas.DataFrame
        A shuffled copy of ``df`` with a fresh ``0..n-1`` index and an integer
        ``kfold`` column holding the fold in which the row is used for
        validation.
    """
    # Shuffle first: ``sample`` returns a new frame, so the column added below
    # never leaks into the frame the caller passed in.
    shuffled = df.sample(frac=1, random_state=random_state).reset_index(drop=True)
    shuffled["kfold"] = -1

    # Stratification labels.
    y = shuffled["target"].to_numpy()

    kf = model_selection.StratifiedKFold(n_splits=split)

    # split() yields positional indices; after reset_index(drop=True) they are
    # also the row labels, so label-based .loc assignment hits the right rows.
    for fold, (_, valid_idx) in enumerate(kf.split(X=shuffled, y=y)):
        shuffled.loc[valid_idx, "kfold"] = fold

    return shuffled



In [None]:
def _stringify_features(frame, features):
    """Return ``frame[features]`` as strings, with missing values shown as "NONE".

    A new frame is built; ``frame`` itself is left unchanged.
    """
    raw = frame[features]
    # Mask on the *original* values: once ``astype(str)`` has run, a missing
    # value is already the literal text "nan" and ``fillna`` cannot see it.
    return raw.astype(str).where(raw.notna(), "NONE")


def lr_model(fold):
    """Train and score a one-hot + logistic-regression model on one fold.

    Reads the module-level ``df`` (which must carry ``target`` and ``kfold``
    columns) without modifying it. Every column other than ``id``, ``target``
    and ``kfold`` is treated as a categorical feature.

    Parameters
    ----------
    fold : int
        Rows with ``kfold == fold`` form the validation set; all other rows
        form the training set.

    Returns
    -------
    float
        ROC AUC of the predicted positive-class probabilities on the
        validation rows. The same value is also printed.

    Raises
    ------
    ValueError
        If ``fold`` leaves either the training or the validation set empty.
    """
    # define features
    features = [f for f in df.columns if f not in ("id", "target", "kfold")]

    # String-typed copy of the feature columns, NaN -> "NONE". Working on a
    # copy keeps the shared ``df`` intact between calls.
    categorical = _stringify_features(df, features)

    # Split rows into training and validation sets by fold id.
    is_valid = (df["kfold"] == fold).to_numpy()
    if not is_valid.any() or is_valid.all():
        raise ValueError(
            f"fold {fold!r} leaves the training or validation set empty"
        )
    y_train = df.loc[~is_valid, "target"].to_numpy()
    y_valid = df.loc[is_valid, "target"].to_numpy()

    # Fit the encoder on every row (train + validation) so both splits share
    # the same columns and no category is unseen at transform time.
    # NOTE(review): fitting on train only with handle_unknown="ignore" is the
    # stricter alternative -- confirm which is wanted.
    ohe = preprocessing.OneHotEncoder()
    ohe.fit(categorical)

    # transform training and validation data
    x_train = ohe.transform(categorical[~is_valid])
    x_valid = ohe.transform(categorical[is_valid])

    # fit Logistic Regression on the one-hot encoded training data
    model = linear_model.LogisticRegression()
    model.fit(x_train, y_train)

    # probability of the positive class on the validation set
    y_pred = model.predict_proba(x_valid)[:, 1]

    auc = metrics.roc_auc_score(y_valid, y_pred)
    print(f"for fold = {fold}, AUC is {auc}")
    return auc





In [None]:
# Rebind `df` to the shuffled frame returned by create_folds, which carries
# the `kfold` column used by lr_model.
# NOTE: this call does not fix a random seed, so re-running the cell shuffles
# again and yields a different fold assignment.
df = create_folds (df,split = 5)

In [None]:
# Score every fold that is actually present in `df` instead of repeating the
# fold count as a literal, so this cell stays in sync with the `split` value
# used when the folds were created.
for fold_id in sorted(int(k) for k in df["kfold"].unique()):
    lr_model(fold_id)

for fold = 0, AUC is 0.7849424469023578
for fold = 1, AUC is 0.7869124848080056
for fold = 2, AUC is 0.7877524956343462
for fold = 3, AUC is 0.7841918743271361
for fold = 4, AUC is 0.7877242160817683
