<a href="https://colab.research.google.com/github/alleyex/nodejs-course-notes-app/blob/master/variable_magnitude_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd

# import several machine learning algorithms
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier

# to scale the features
from sklearn.preprocessing import MinMaxScaler

# to evaluate performance and separate into
# train and test set
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

In [3]:
import os
from urllib.request import urlretrieve

# Fetch the demo datasets into the working directory, skipping any file that
# is already present so the cell is cheap to re-run.

# Location of the raw CSV files in the course repository.
base_url = "https://raw.githubusercontent.com/solegalli/feature-engineering-for-machine-learning/main/Datasets"

files = ["titanic.csv","loan.csv"]
for file in files:
  if not os.path.exists(file):
    # Download under a temporary name and rename only on success: a failed or
    # interrupted transfer raises here, so it is never reported as
    # "Downloaded" and never leaves a truncated CSV that the existence check
    # above would later mistake for a complete file. A leftover ".part" file
    # is simply overwritten on the next run.
    partial = file + ".part"
    urlretrieve(f"{base_url}/{file}", partial)
    os.replace(partial, file)
    print(f"Downloaded {file}")
  else:
    print(f"{file} already exists")

--2024-06-11 07:20:40--  https://raw.githubusercontent.com/solegalli/feature-engineering-for-machine-learning/main/Datasets/titanic.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 104076 (102K) [text/plain]
Saving to: ‘titanic.csv’


2024-06-11 07:20:40 (4.32 MB/s) - ‘titanic.csv’ saved [104076/104076]

Downloaded titanic.csv
--2024-06-11 07:20:40--  https://raw.githubusercontent.com/solegalli/feature-engineering-for-machine-learning/main/Datasets/loan.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1059576 (1.0M) [text/pl

In [4]:
# Read only the numerical Titanic columns used in this demo
# (three predictors plus the `survived` target).

numeric_columns = ['pclass', 'age', 'fare', 'survived']
data = pd.read_csv('titanic.csv', usecols=numeric_columns)

# Preview the first rows.
data.head()

Unnamed: 0,pclass,survived,age,fare
0,1,1,29.0,211.3375
1,1,1,0.9167,151.55
2,1,0,2.0,151.55
3,1,0,30.0,151.55
4,1,0,25.0,151.55


In [5]:
# Summarise every loaded column (count, mean, std, min, quartiles, max)
# so the magnitudes of the features can be compared side by side.
# The differing `count` values in the output also reveal which columns
# contain missing entries.

data.describe()

Unnamed: 0,pclass,survived,age,fare
count,1309.0,1309.0,1046.0,1308.0
mean,2.294882,0.381971,29.881135,33.295479
std,0.837836,0.486055,14.4135,51.758668
min,1.0,0.0,0.1667,0.0
25%,2.0,0.0,21.0,7.8958
50%,3.0,0.0,28.0,14.4542
75%,3.0,1.0,39.0,31.275
max,3.0,1.0,80.0,512.3292


In [6]:
# Print the spread (max - min) of each predictor to make the
# difference in scale explicit.

for feature in ('pclass', 'age', 'fare'):
    column = data[feature]
    print(feature, 'range: ', column.max() - column.min())

pclass range:  2
age range:  79.8333
fare range:  512.3292


In [7]:
# Split the data into a training partition (70%) and a test partition (30%).

# Some values are missing in the Titanic data; purely for this
# demonstration they are replaced with 0 before splitting.
predictors = data[['pclass', 'age', 'fare']].fillna(0)

X_train, X_test, y_train, y_test = train_test_split(
    predictors,
    data['survived'],
    test_size=0.3,
    random_state=0,  # fixed seed so the split is reproducible
)

# Show the size of each partition.
(X_train.shape, X_test.shape)

((916, 3), (393, 3))

In [9]:
# Bring every feature into the 0-1 interval.

# Create the scaler and fit it on the training partition only
# (`fit` returns the scaler itself, so the two steps chain).
scaler = MinMaxScaler().fit(X_train)

# Apply the fitted scaler to both partitions.
X_train_scaled, X_test_scaled = (
    scaler.transform(partition) for partition in (X_train, X_test)
)

In [10]:
# Inspect per-feature statistics of the scaled training set
# (one value per column, hence axis=0).

summaries = (
    ('Mean: ', X_train_scaled.mean(axis=0)),
    ('Standard Deviation: ', X_train_scaled.std(axis=0)),
    ('Minimum value: ', X_train_scaled.min(axis=0)),
    ('Maximum value: ', X_train_scaled.max(axis=0)),
)
for label, values in summaries:
    print(label, values)

Mean:  [0.64628821 0.33048359 0.06349833]
Standard Deviation:  [0.42105785 0.23332045 0.09250036]
Minimum value:  [0. 0. 0.]
Maximum value:  [1. 1. 1.]


In [11]:
# Logistic regression fitted on the original (unscaled) features.

# Build the model; a large C keeps regularization negligible.
logit = LogisticRegression(C=1000, solver='lbfgs', random_state=44)

# Fit on the raw training data.
logit.fit(X_train, y_train)

# Report ROC-AUC on each partition, using the probability of the
# positive class (second column of predict_proba).
for split_name, X_part, y_part in (
    ('Train set', X_train, y_train),
    ('Test set', X_test, y_test),
):
    print(split_name)
    pred = logit.predict_proba(X_part)
    print(f'Logistic Regression roc-auc: {roc_auc_score(y_part, pred[:, 1])}')

Train set
Logistic Regression roc-auc: 0.6793181006244372
Test set
Logistic Regression roc-auc: 0.7175488081411426


In [13]:
# Let's look at the coefficients (one per feature: pclass, age, fare).
# `logit` is whichever model was fitted most recently; the name is re-bound
# later by the cell that trains on the scaled data, so run this cell right
# after the unscaled fit.
# NOTE(review): the saved output below is identical to the one shown for the
# scaled model further down, which suggests it was produced after the scaled
# fit - re-run the notebook top to bottom to confirm.
logit.coef_

array([[-1.42875872, -0.68293349,  2.17646757]])

In [14]:
# Logistic regression fitted on the min-max scaled features.

# Build the model; a large C keeps regularization negligible.
logit = LogisticRegression(C=1000, solver='lbfgs', random_state=44)

# Fit on the re-scaled training data.
logit.fit(X_train_scaled, y_train)

# Report ROC-AUC on each partition, using the probability of the
# positive class (second column of predict_proba).
for split_name, X_part, y_part in (
    ('Train set', X_train_scaled, y_train),
    ('Test set', X_test_scaled, y_test),
):
    print(split_name)
    pred = logit.predict_proba(X_part)
    print(f'Logistic Regression roc-auc: {roc_auc_score(y_part, pred[:, 1])}')

Train set
Logistic Regression roc-auc: 0.6793281640744896
Test set
Logistic Regression roc-auc: 0.7175488081411426


In [15]:
# Let's look at the coefficients (one per feature: pclass, age, fare).
# `logit` is whichever model was fitted most recently; in notebook order
# that is the logistic regression trained on X_train_scaled.

logit.coef_

array([[-1.42875872, -0.68293349,  2.17646757]])

In [17]:
# Support vector machine fitted on the original (unscaled) features.

# Build the classifier; probability=True enables predict_proba.
SVM_model = SVC(gamma='auto', probability=True, random_state=44)

# Fit on the raw training data.
SVM_model.fit(X_train, y_train)

# Report ROC-AUC on each partition from the positive-class probability.
for split_name, X_part, y_part in (
    ('Train set', X_train, y_train),
    ('Test set', X_test, y_test),
):
    print(split_name)
    pred = SVM_model.predict_proba(X_part)
    print(f'SVM roc-auc: {roc_auc_score(y_part, pred[:, 1])}')

Train set
SVM roc-auc: 0.882393490960506
Test set
SVM roc-auc: 0.6617581992146452


In [18]:
# Support vector machine fitted on the min-max scaled features.

# Build the classifier; probability=True enables predict_proba.
SVM_model = SVC(gamma='auto', probability=True, random_state=44)

# Fit on the re-scaled training data.
SVM_model.fit(X_train_scaled, y_train)

# Report ROC-AUC on each partition from the positive-class probability.
for split_name, X_part, y_part in (
    ('Train set', X_train_scaled, y_train),
    ('Test set', X_test_scaled, y_test),
):
    print(split_name)
    pred = SVM_model.predict_proba(X_part)
    print(f'SVM roc-auc: {roc_auc_score(y_part, pred[:, 1])}')

Train set
SVM roc-auc: 0.6780802962679695
Test set
SVM roc-auc: 0.6841435761296388


In [19]:
# K-nearest neighbours fitted on the original (unscaled) features.

# Build the classifier with 5 neighbours.
KNN = KNeighborsClassifier(n_neighbors=5)

# Fit on the raw training data.
KNN.fit(X_train, y_train)

# Report ROC-AUC on each partition from the positive-class probability.
for split_name, X_part, y_part in (
    ('Train set', X_train, y_train),
    ('Test set', X_test, y_test),
):
    print(split_name)
    pred = KNN.predict_proba(X_part)
    print(f'KNN roc-auc: {roc_auc_score(y_part, pred[:, 1])}')

Train set
KNN roc-auc: 0.8087718062383327
Test set
KNN roc-auc: 0.6764697749018307


In [20]:
# K-nearest neighbours fitted on the min-max scaled features.

# Build the classifier with 5 neighbours.
KNN = KNeighborsClassifier(n_neighbors=5)

# Fit on the re-scaled training data.
KNN.fit(X_train_scaled, y_train)

# Report ROC-AUC on each partition from the positive-class probability.
for split_name, X_part, y_part in (
    ('Train set', X_train_scaled, y_train),
    ('Test set', X_test_scaled, y_test),
):
    print(split_name)
    pred = KNN.predict_proba(X_part)
    print(f'KNN roc-auc: {roc_auc_score(y_part, pred[:, 1])}')

Train set
KNN roc-auc: 0.8276281957743572
Test set
KNN roc-auc: 0.721033128698634


In [21]:
# Random forest fitted on the original (unscaled) features.

# Build the ensemble of 200 trees with a fixed seed.
rf = RandomForestClassifier(random_state=39, n_estimators=200)

# Fit on the raw training data.
rf.fit(X_train, y_train)

# Report ROC-AUC on each partition from the positive-class probability.
for split_name, X_part, y_part in (
    ('Train set', X_train, y_train),
    ('Test set', X_test, y_test),
):
    print(split_name)
    pred = rf.predict_proba(X_part)
    print(f'Random Forests roc-auc: {roc_auc_score(y_part, pred[:, 1])}')

Train set
Random Forests roc-auc: 0.9866810238554083
Test set
Random Forests roc-auc: 0.7326751838946961


In [22]:
# Random forest fitted on the min-max scaled features.

# Build the ensemble of 200 trees with a fixed seed.
rf = RandomForestClassifier(random_state=39, n_estimators=200)

# Fit on the re-scaled training data.
rf.fit(X_train_scaled, y_train)

# Report ROC-AUC on each partition from the positive-class probability.
for split_name, X_part, y_part in (
    ('Train set', X_train_scaled, y_train),
    ('Test set', X_test_scaled, y_test),
):
    print(split_name)
    pred = rf.predict_proba(X_part)
    print(f'Random Forests roc-auc: {roc_auc_score(y_part, pred[:, 1])}')

Train set
Random Forests roc-auc: 0.9867917218059866
Test set
Random Forests roc-auc: 0.7312510370001659


In [23]:
# AdaBoost fitted on the original (unscaled) features.

# Build the booster with 200 estimators and a fixed seed.
ada = AdaBoostClassifier(random_state=44, n_estimators=200)

# Fit on the raw training data.
ada.fit(X_train, y_train)

# Report ROC-AUC on each partition from the positive-class probability.
for split_name, X_part, y_part in (
    ('Train set', X_train, y_train),
    ('Test set', X_test, y_test),
):
    print(split_name)
    pred = ada.predict_proba(X_part)
    print(f'AdaBoost roc-auc: {roc_auc_score(y_part, pred[:, 1])}')

Train set
AdaBoost roc-auc: 0.7970629821021541
Test set
AdaBoost roc-auc: 0.7473867595818815


In [24]:
# AdaBoost fitted on the min-max scaled features.

# Build the booster with 200 estimators and a fixed seed.
ada = AdaBoostClassifier(random_state=44, n_estimators=200)

# Fit on the re-scaled training data.
ada.fit(X_train_scaled, y_train)

# Report ROC-AUC on each partition from the positive-class probability.
for split_name, X_part, y_part in (
    ('Train set', X_train_scaled, y_train),
    ('Test set', X_test_scaled, y_test),
):
    print(split_name)
    pred = ada.predict_proba(X_part)
    print(f'AdaBoost roc-auc: {roc_auc_score(y_part, pred[:, 1])}')

Train set
AdaBoost roc-auc: 0.7970629821021541
Test set
AdaBoost roc-auc: 0.7475250262706707
