# Creating ML Algorithms Utilizing Logistic, KNN-means, and Random-Forest Regression Methods
## Arbaz Khan

In [110]:
import pandas as pd
import numpy as np

from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import MinMaxScaler

df = pd.read_csv("Loan_Train.csv")



# Get object
col = df.columns[df.dtypes=='object']

col = col.delete(7)

df2 = pd.get_dummies(df, columns = col) 

# Split into train and test data on SalePrice

from sklearn.model_selection import train_test_split

y = df2['Loan_Status']
x = df2.drop('Loan_Status', axis=1)
x = x.fillna(0)

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.5)

In [111]:
col

Index(['Loan_ID', 'Gender', 'Married', 'Dependents', 'Education',
       'Self_Employed', 'Property_Area'],
      dtype='object')

In [112]:
knn = KNeighborsClassifier(n_neighbors = 5)

scaler = MinMaxScaler()

x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.fit_transform(x_test)

In [113]:
# Fit using knn neighbors classifier with n = 5
# Use ravel to fit dimensions of y
knn.fit(x_train_scaled, y_train.values.ravel())

accuracy_score(y_test, knn.predict(x_test_scaled))

0.7133550488599348

In [114]:
from sklearn import datasets
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler

standardizer = StandardScaler()

# Create a pipeline using StandardScaler and our knn classifier
pipe = Pipeline([("standardizer", standardizer), ("knn", knn)])

# Search for best value of  n between 1-10
search_space = [{"knn__n_neighbors": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]}]

target = y_train.values.ravel()
features = x_train_scaled

classifier = GridSearchCV(pipe, search_space, cv = 5, verbose = 0).fit(features, target)

classifier.best_estimator_.get_params()["knn__n_neighbors"]

8

In [115]:
# Test accuracy of knn with n = 8 
knn8 = KNeighborsClassifier(n_neighbors = 8)
knn8.fit(features, target)
accuracy_score(y_test, knn8.predict(x_test_scaled))

0.7133550488599348

In [107]:
from sklearn import datasets
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

knn = KNeighborsClassifier(n_neighbors = 8)

scaler = StandardScaler()

x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.fit_transform(x_test)

np.random.seed(0)

pipe = Pipeline([("classifier", RandomForestClassifier())])

# Create a search space for knn, logistic regression, and randomforest
# classifiers, with the use of hyperparameters
search_space = [{"classifier":[knn]},
               {"classifier":[LogisticRegression(max_iter = 500, solver = 'lbfgs')],
                "classifier__penalty": ['l2'],
                "classifier__C": np.logspace(0, 6, 10)},
                {"classifier": [RandomForestClassifier()],
                 "classifier__n_estimators": [8],
                 "classifier__max_features":[1, 2, 3]}]

target = y_train.values.ravel()
features = x_train_scaled

# Perform grid search over these regression methods and fit 
classifier = GridSearchCV(pipe, search_space, cv = 5, verbose = 0).fit(features, target)



In [108]:
print(classifier.best_estimator_)

Pipeline(steps=[('classifier',
                 LogisticRegression(C=21.544346900318832, max_iter=500))])


In [109]:
# Logistic regression with an l1 penalty was found to be our best method
logreg = LogisticRegression(C=21.544346900318832, max_iter = 500, penalty = 'l2', solver = 'lbfgs')

logreg.fit(features, target)
accuracy_score(y_test, logreg.predict(x_test_scaled))

0.7068403908794788

From the results, we can see that utilizing logistic regression with C = 21.544 was our most accurate method, and the accuracy found was around 70.7%. With that said, earlier using the KNN means classifier with 8 neighbors resulted in the highest accuracy seen throughout, which was ~ 71.3%. 