In [65]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier

In [20]:
df = pd.read_csv('cleaned_dataset.csv')
df.head()

Unnamed: 0,Age,Gender,Years at Address,Employment Status,Country,Current Debt,Postcode,Income,Housing,CCJs,Loan Amount,Outcome
0,19,M,2,Unemployed,UK,0.0,TA3 7SH,45500.0,Rent,1,13234.0,Paid
1,66,F,13,Unemployed,UK,1080.0,WV6 8SU,21000.0,Own,0,5561.0,Paid
2,48,F,4,Self Employed,UK,3690.0,BT15 5HG,47500.0,Rent,1,28288.0,Paid
3,67,F,42,Self Employed,UK,6560.0,GU10 3NH,36000.0,Mortgage,0,30199.0,Paid
4,70,M,8,Self Employed,UK,9100.0,GL6 6UB,50500.0,Own,0,35078.0,Paid


In [49]:
X = df.drop(['Outcome', 'Postcode'], axis=1)
y = df['Outcome']

In [79]:
y = [1 if x == 'Paid' else 0 for x in y]

In [80]:
categorical_features = []
numerical_features = []

for feature in X.columns:
    if df[feature].dtype in ['object', 'category']:
        categorical_features.append(feature)
    else:
        numerical_features.append(feature)

AttributeError: 'numpy.ndarray' object has no attribute 'columns'

In [51]:
numerical_features

['Age', 'Years at Address', 'Current Debt', 'Income', 'CCJs', 'Loan Amount']

In [52]:
categorical_features

['Gender', 'Employment Status', 'Country', 'Housing']

In [31]:
scaler = StandardScaler().fit(X[numerical_features])

In [39]:
X_scaled = scaler.transform(X[numerical_features])
X_scaled

array([[-1.61593616e+00, -1.10240875e+00, -1.11055511e+00,
         6.61928397e-01, -4.64940344e-04, -4.43240351e-01],
       [ 6.23654530e-01, -2.92328504e-01, -7.48124850e-01,
        -1.67653530e+00, -9.30345629e-01, -1.04036213e+00],
       [-2.34061052e-01, -9.55121434e-01,  1.27748277e-01,
         8.52823393e-01, -4.64940344e-04,  7.28279423e-01],
       ...,
       [-9.01173172e-01, -8.81477775e-01,  1.08080563e+00,
         2.80138406e-01,  9.29415748e-01, -9.38182942e-01],
       [-4.34575895e-02, -1.10240875e+00, -7.98462386e-01,
        -5.78889074e-01, -9.30345629e-01, -1.05195748e+00],
       [-5.19966246e-01, -1.02876509e+00, -1.10719927e+00,
         9.95994640e-01, -9.30345629e-01, -1.17468145e+00]])

In [40]:
X_scaled.mean(axis=0)

array([3.55271368e-17, 7.10542736e-18, 6.75015599e-17, 1.77635684e-17,
       3.55271368e-17, 1.03028697e-16])

In [41]:
X_scaled.std(axis=0)

array([1., 1., 1., 1., 1., 1.])

In [42]:
X_scaled.shape

(2000, 6)

In [46]:
y = np.array(y)

In [78]:
y.shape

AttributeError: 'list' object has no attribute 'shape'

In [53]:
enc = OneHotEncoder().fit(X[categorical_features])

In [58]:
X_encoded = enc.transform(X[categorical_features]).toarray()

In [59]:
X_encoded

array([[0., 1., 0., ..., 0., 0., 1.],
       [1., 0., 0., ..., 0., 1., 0.],
       [1., 0., 0., ..., 0., 0., 1.],
       ...,
       [0., 1., 0., ..., 0., 1., 0.],
       [1., 0., 1., ..., 0., 1., 0.],
       [0., 1., 0., ..., 0., 1., 0.]])

In [60]:
X_encoded.shape

(2000, 13)

In [66]:
X_transformed = np.concatenate((X_scaled, X_encoded), axis=1)
X.shape

(2000, 19)

In [64]:
X_train, X_test, y_train, y_test = train_test_split(X_transformed, y, test_size=0.2, random_state=1234)

In [67]:
cfl = RandomForestClassifier(n_estimators=20, max_depth=None, min_samples_split=2, random_state=0)

In [68]:
cfl.fit(X_train, y_train)

RandomForestClassifier(n_estimators=20, random_state=0)

In [69]:
pred = cfl.predict(X_test)