### Purpose
Build a model with high generalization performance.

In [2]:
# problem 1
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score

# Load the dataset and keep only the numeric columns
df = pd.read_csv('application_train.csv')
df = df.select_dtypes('number')

# Clean the dataset by replacing the empty (null) values with 0
cleaned_df = df.fillna(0)

# Separate the target column from the feature columns
y = cleaned_df['TARGET']
X = cleaned_df.drop(['TARGET'], axis=1)

# The features are converted to a NumPy array; the target stays a pandas Series
X = X.to_numpy()

# Two folds, no shuffling
kf = KFold(n_splits=2)

# NOTE: X_train/X_test/y_train/y_test are overwritten on every iteration, so
# only the split produced by the last fold is available after the loop.
for train_index, test_index in kf.split(X):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = X[train_index], X[test_index]
    # kf.split yields positional row numbers, so the Series must be indexed by
    # position (.iloc). Plain y[...] looks the numbers up as index labels and
    # is only correct while the index happens to be the default 0..n-1 range.
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

TRAIN: [153756 153757 153758 ... 307508 307509 307510] TEST: [     0      1      2 ... 153753 153754 153755]
TRAIN: [     0      1      2 ... 153753 153754 153755] TEST: [153756 153757 153758 ... 307508 307509 307510]


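I was able to split the dataset into training and testing subsets using `KFold`, which, when every fold is evaluated, gives a more reliable estimate than a single `train_test_split`. Note that the loop above keeps only the split from the last fold for the following steps.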

In [6]:
# Standardize the features: the scaler is fitted on the training split only,
# and the same fitted transformation is then applied to the test split.
scaler = StandardScaler()
X_train_trans = scaler.fit_transform(X_train)
X_test_trans = scaler.transform(X_test)

# problem 2, using gridsearch

from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

# Fixed seed so that the random forest, and therefore the grid-search scores
# and the selected parameters, are the same on every run.
RANDOM_STATE = 0

# Candidate models and the hyper-parameter grid to search for each of them
model_params = {
    'random_forest':{
        'model': RandomForestClassifier(random_state=RANDOM_STATE),
        'params': {
            'n_estimators': [1,5,10]
        }
    },
    'logic_regression':{
        # multi_class is not passed: "auto" is the default value and the
        # parameter is deprecated in recent scikit-learn releases.
        'model': LogisticRegression(solver="liblinear"),
        'params': {
            'C': [1,5,10]
        }
    }
}

# One record per model: its name, best cross-validated score and best parameters
scores = []

# No `scoring` argument is given, so GridSearchCV falls back to the estimator's
# own score method (accuracy for these classifiers).
# NOTE(review): if TARGET is imbalanced, accuracy may mostly reflect the
# majority class -- confirm the class balance and consider scoring="roc_auc".
for model_name,mp in model_params.items():
    clf = GridSearchCV(mp['model'],mp['params'], return_train_score=False)
    clf.fit(X_train_trans,y_train)
    scores.append({
        'model': model_name,
        'best_score': clf.best_score_,
        'best_params': clf.best_params_
    })

# Summary table, displayed as the cell output
best_model_params = pd.DataFrame(scores,columns=['model','best_score','best_params'])
best_model_params

Unnamed: 0,model,best_score,best_params
0,random_forest,0.91586,{'n_estimators': 10}
1,logic_regression,0.918364,{'C': 1}


After running the grid search, I can see that the scores are almost the same for both models, but logistic regression (`logic_regression`) has the best score.

In [None]:
# problem 3

