In [1]:
# Importing required Library
import numpy as np
import pandas as pd
import random
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt

from math import e

In [2]:
def create_dataset(n_sample=1000):
    ''' 
    Create a unevenly distributed sample data set multilabel  
    classification using make_classification function
    
    args
    nsample: int, Number of sample to be created
    
    return
    X: pandas.DataFrame, feature vector dataframe with 10 features 
    y: pandas.DataFrame, target vector dataframe with 5 labels
    '''
    X, y = make_classification(n_classes=5, 
                               class_sep=2, 
                               weights=[0.1,0.025, 0.205, 0.008, 0.9], 
                               n_informative=3, 
                               n_redundant=1, 
                               flip_y=0,
                               n_features=10, 
                               n_clusters_per_class=1, 
                               n_samples=1000, 
                               random_state=10)

    y = pd.get_dummies(y, prefix='class')
    
    return pd.DataFrame(X), y

In [3]:
# Creating dfs

X, y = create_dataset()

In [4]:
X.shape

(1000, 10)

In [5]:
y.shape

(1000, 5)

In [6]:
y.head(3)

Unnamed: 0,class_0,class_1,class_2,class_3,class_4
0,0,0,0,0,1
1,0,0,1,0,0
2,0,0,0,0,1


In [7]:
y[y.sum(axis=1)==1].shape[0]

# means that this is NOT multlabel ! It is multiclass


1000

In [8]:
X.loc[[46,]]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
46,1.497756,-0.35873,-1.942519,0.10701,-0.319555,-1.1641,-0.19217,-1.506867,2.739026,-2.675439


In [9]:
y.loc[:,'class_2']

0      0
1      1
2      0
3      1
4      0
      ..
995    0
996    0
997    0
998    0
999    0
Name: class_2, Length: 1000, dtype: uint8

In [10]:
lr = LogisticRegression()
lr.fit(X,y.loc[:,'class_2'])
lr.predict(X.loc[[46,]])


array([0], dtype=uint8)

In [11]:
lr.coef_.shape

(1, 10)

In [12]:
import numpy as np
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

X = np.array([
    [10, 10],
    [8, 10],
    [-5, 5.5],
    [-5.4, 5.5],
    [-20, -20],
    [-15, -20]
])

y = np.array([0, 0, 1, 1, 2, 2])

clf = OneVsRestClassifier(SVC()).fit(X, y)
clf = OneVsRestClassifier(LogisticRegression()).fit(X, y)

clf.predict([
    [-19, -20], 
#     [9, 9], 
    [-5, 5]
])

# Output:
# array([2, 0, 1])

array([2, 1])

In [13]:
np.round(clf.coef_,2)



array([[ 0.51,  0.17],
       [-0.61,  0.59],
       [-0.15, -0.33]])

---

# Statsmodel API

In [14]:
# import statsmodels.api as sm

# X_sm = sm.add_constant(X)
# model = sm.Logit(y, X_sm)

# result = model.fit()

# print(result.summary2())

# preds20 = result.predict(X_test)

---

# Trial

## Data

In [15]:
import pandas as pd
df = pd.DataFrame({'age':[10,20,30,40,46,51], 
                   'sex':[1,1,0,1,1,0], 
                   'issue':[0,0,1,2,0,1]})
df

Unnamed: 0,age,sex,issue
0,10,1,0
1,20,1,0
2,30,0,1
3,40,1,2
4,46,1,0
5,51,0,1


In [16]:
# 2 features

X = df.drop(columns='issue')
X

Unnamed: 0,age,sex
0,10,1
1,20,1
2,30,0
3,40,1
4,46,1
5,51,0


In [17]:
# 3 labels => Multi-class (NOT multi-label)

y = df['issue']
y

0    0
1    0
2    1
3    2
4    0
5    1
Name: issue, dtype: int64

In [18]:
# X_test

X_test = [[15,1]
         ,[32,0]
           ]
X_test

[[15, 1], [32, 0]]

## Helpers

In [19]:
def get_model_attributes(model):

    model_attr = {}
    model_attr['intercept']=np.round(model.intercept_,2)
    model_attr['coefficients']=np.round(model.coef_,2)
    model_attr['classes']=model.classes_
    model_attr['predictions']=model.predict(X_test)    
    model_attr['prediction probabilities']=np.round(model.predict_proba(X_test),2)

    return model_attr

## Trial 1

Logistic Regression

In [62]:
# Logistic Regression - 1
lr1 = LogisticRegression()

# Train model
lr1.fit(X,y)

# Predict on test data
lr1.predict(X_test)

array([0, 1], dtype=int64)

In [63]:
lr1.get_params()

{'C': 1.0,
 'class_weight': None,
 'dual': False,
 'fit_intercept': True,
 'intercept_scaling': 1,
 'l1_ratio': None,
 'max_iter': 100,
 'multi_class': 'auto',
 'n_jobs': None,
 'penalty': 'l2',
 'random_state': None,
 'solver': 'lbfgs',
 'tol': 0.0001,
 'verbose': 0,
 'warm_start': False}

In [64]:
# Weights: Intercept & Coefficients

lr1_attrs = get_model_attributes(lr1)
lr1_attrs

{'intercept': array([ 1.98, -0.2 , -1.78]),
 'coefficients': array([[-0.05,  0.52],
        [ 0.02, -0.83],
        [ 0.03,  0.32]]),
 'classes': array([0, 1, 2], dtype=int64),
 'predictions': array([0, 1], dtype=int64),
 'prediction probabilities': array([[0.86, 0.08, 0.06],
        [0.37, 0.51, 0.13]])}

In [65]:
# Manual check to verify predicted probability 

# Weights and features vectors : 
    # 1. 1st test sample with 1st model
w = np.array([1.98, -0.05, 0.52]) 
x = np.array([1, 15, 1]) 

np.dot(w.T,x)

# Predicted probability via formula
1 / (1+ pow(e, -1*np.dot(w.T,x)))

0.8519528019683106

In [66]:
# Manual check to verify predicted probability 

# Weights and features vectors : 
    # 1. 2nd test sample with 1st model
w = np.array([1.98, -0.05, 0.52]) 
x = np.array([1, 32, 0]) 

np.dot(w.T,x)

# Predicted probability via formula
1 / (1+ pow(e, -1*np.dot(w.T,x)))

0.5938731029341427

- `Since there are 3 classes, 3 binary classifiers are created and trained. `
    - Each model will consider 1 class as the positive class while treating all the other classes as negative. 
    - Since there are 3 classes, there are 3 models (binary classifiers), and therefore 3 intercepts (1 intercept for each model) and 3 sets of coefficients (1 set of coefficients or weights) for each of the models created.
    - **NOTE**: The coefficients are different for each binary classifier. This implies DIFFERENT WEIGHTS FOR DIFFERENT MODELS. See especially 2nd and 3rd model.

- `There will be 3 predicted probabilities for each test sample. The one with the highest prediction probability will be the prediction (class with the highest predicted probability) for that test sample.`

---

## Trial 2

Logistic Regression (specify multi-class via ovr for multiple binary classifiers)

In [67]:
# Logistic Regression - 2
lr2 = LogisticRegression(multi_class='ovr')

# Train model
lr2.fit(X,y)

# Predict on test data
lr2.predict(X_test)

array([0, 0], dtype=int64)

In [68]:
lr2.get_params()

{'C': 1.0,
 'class_weight': None,
 'dual': False,
 'fit_intercept': True,
 'intercept_scaling': 1,
 'l1_ratio': None,
 'max_iter': 100,
 'multi_class': 'ovr',
 'n_jobs': None,
 'penalty': 'l2',
 'random_state': None,
 'solver': 'lbfgs',
 'tol': 0.0001,
 'verbose': 0,
 'warm_start': False}

In [69]:
# Weights: Intercept & Coefficients

lr2_attrs = get_model_attributes(lr2)
lr2_attrs

{'intercept': array([ 2.35, -2.11, -3.7 ]),
 'coefficients': array([[-0.08,  0.6 ],
        [ 0.06, -0.92],
        [ 0.05,  0.36]]),
 'classes': array([0, 1, 2], dtype=int64),
 'predictions': array([0, 0], dtype=int64),
 'prediction probabilities': array([[0.83, 0.1 , 0.07],
        [0.44, 0.44, 0.12]])}

In [70]:
# Manual check to verify predicted probability 

# Weights and features vectors : 
    # 1st test sample with 1st model
w = np.array([2.35, -0.08, 0.6]) 
x = np.array([1, 15, 1]) 

np.dot(w.T,x)

# Predicted probability via formula
1 / (1+ pow(e, -1*np.dot(w.T,x)))

0.8519528019683106

In [71]:
# Manual check to verify predicted probability 

# Weights and features vectors : 
    # 2nd test sample with 1st model
w = np.array([2.35, -0.08, 0.6]) 
x = np.array([1, 32, 0]) 

np.dot(w.T,x)

# Predicted probability via formula
1 / (1+ pow(e, -1*np.dot(w.T,x)))

0.4476920904256748

In [73]:
# Manual check to verify predicted probability 

# Weights and features vectors : 
    # 2nd test sample with 1st model
w = np.array([-2.11, 0.06, -0.92]) 
x = np.array([1, 15, 1]) 

np.dot(w.T,x)

# Predicted probability via formula
1 / (1+ pow(e, -1*np.dot(w.T,x)))

0.10621499167517558

---

## Trial 3
Logistic Regression (Multilabel)

In [106]:
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import jaccard_score

# define dataset
X2, y2 = make_classification(n_samples=10, 
                           n_features=4, 
                           n_informative=3, 
                           n_redundant=1, 
                           n_classes=3, 
                           random_state=1)

In [107]:
X2.shape

(10, 4)

In [108]:
X2

array([[ 1.36230407,  2.01380248, -2.17993154,  1.22262599],
       [ 0.94323132,  0.24632251, -0.51593688, -0.45540049],
       [ 1.40656582, -1.11177466,  0.3208689 , -2.30163505],
       [ 1.45631093, -1.37733165, -0.78834929, -2.25868711],
       [ 0.33878658,  1.04378255,  0.27634196,  0.54701949],
       [-0.79154978, -0.97970964, -1.062138  ,  0.13174171],
       [-1.3613071 ,  1.93294701, -1.38934184,  3.29701803],
       [ 1.63913742,  1.1133852 ,  0.02325242, -0.45196643],
       [ 0.53668903,  0.73213908,  2.67528111, -0.59511928],
       [-0.20697496,  1.67010609, -0.79824429,  1.88727881]])

In [109]:
y2.shape

(10,)

In [110]:
y2

array([1, 1, 0, 2, 0, 1, 2, 1, 0, 2])

In [111]:
# define the multinomial logistic regression model
model = LogisticRegression(multi_class='multinomial', 
                           solver='lbfgs')

In [114]:
X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2, test_size=0.3, random_state=42)

In [115]:
model.fit(X2_train, y2_train)

LogisticRegression(multi_class='multinomial')

In [116]:
y_pred = model.predict(X2_test)
y_pred_proba = model.predict_proba(X2_test)

In [117]:
jaccard_score(y2_test, y_pred, average='weighted')

0.3333333333333333

---

In [88]:
model_ovr = LogisticRegression(multi_class='ovr', 
                               solver='lbfgs')

In [89]:
model_ovr.fit(X2_train, y2_train)

LogisticRegression(multi_class='ovr')

In [90]:
y_pred_ovr = model_ovr.predict(X2_test)
y_pred_proba_ovr = model_ovr.predict_proba(X2_test)

In [91]:
jaccard_score(y2_test, y_pred_ovr, average='weighted')

0.5582175841351034

---

# TO DO
- See if you can use formula of getting a prediction probability by using the coefficients for 1 model
- Try different combinations of X (age and sex) to see how the weights change
- Can you plot and see the decision boundary? Refer to that german video where he added a polynomial feature and then only a linear plane is able to separate the classes. Refer min:  ~16 minutes
- See whats happening with XPS wrt to above and the coefficients generated
- Once you understand all do with true multilabel


In [49]:
# Sum of predicted probabilities

sum=0
for x in lr1.predict_proba(X_test)[0]:
    sum += x
print(sum)

0.9999999999999999


In [26]:
# Recalling features again to analyze with the above coefficients

df

# When label is 0, based on coeff value, "sex" has more "weight" than age.
# When lalel is 1, based on coeff value, "sex" has more "weight" than age, though negatively correlated (is that primarily driven by the numerical value of age being 0 and target being 1 ??)
# When lalel is 2, based on coeff value, "sex" has more "weight" than age.


Unnamed: 0,age,sex,issue
0,10,1,0
1,20,1,0
2,30,0,1
3,40,1,2
4,46,1,0
5,51,0,1


In [44]:
# Binning coefficients



---

# MultiOutput Classifier - 1

- sklearn documentation example
- Random Forest


In [1]:
# Example from sklearn documentation

from sklearn.datasets import make_classification
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils import shuffle
import numpy as np

X, y1 = make_classification(n_samples=10, 
                            n_features=100, 
                            n_informative=30, 
                            n_classes=3, 
                            random_state=1)

y2 = shuffle(y1, random_state=1)
y3 = shuffle(y1, random_state=2)

# Forming a multi-label target from individual ys
Y = np.vstack((y1, y2, y3)).T

n_samples, n_features = X.shape # 10,100

n_outputs = Y.shape[1] # 3

n_classes = 3

forest = RandomForestClassifier(random_state=1)

multi_target_forest = MultiOutputClassifier(forest, n_jobs=-1)

multi_target_forest.fit(X, Y).predict(X)

array([[2, 2, 0],
       [1, 2, 1],
       [2, 1, 0],
       [0, 0, 2],
       [0, 2, 1],
       [0, 0, 2],
       [1, 1, 0],
       [1, 1, 1],
       [0, 0, 2],
       [2, 0, 0]])

In [3]:
X.shape  # 10 samples with having 100 features

(10, 100)

In [7]:
print(f"y1 shape: {y1.shape}\n")

y1

y1 shape: (10,)



array([2, 1, 2, 0, 0, 0, 1, 1, 0, 2])

In [8]:
y2

array([2, 2, 1, 0, 2, 0, 1, 1, 0, 0])

In [9]:
y3

array([0, 1, 0, 2, 1, 2, 0, 1, 2, 0])

In [10]:
Y

array([[2, 2, 0],
       [1, 2, 1],
       [2, 1, 0],
       [0, 0, 2],
       [0, 2, 1],
       [0, 0, 2],
       [1, 1, 0],
       [1, 1, 1],
       [0, 0, 2],
       [2, 0, 0]])

In [16]:
multi_target_forest.get_params();

In [18]:
# 3 models because 3 targets

multi_target_forest.estimators_

[RandomForestClassifier(random_state=1),
 RandomForestClassifier(random_state=1),
 RandomForestClassifier(random_state=1)]

In [23]:
# Probability predictions

print(f"Type of structure to store probability predictions   : {type(multi_target_forest.predict_proba(X))}\n")

# 3 classes therefore 3 arrays of predicted probabilities
print(f"Length of structure to store probability predictions : {len(multi_target_forest.predict_proba(X))}\n")

# Probability Predictions
y_pred_proba = multi_target_forest.predict_proba(X)

# 1st array of predicted probabilities
y_pred_proba[0]


Type of structure to store probability predictions   : <class 'list'>

Length of structure to store probability predictions : 3



array([[0.15, 0.12, 0.73],
       [0.21, 0.67, 0.12],
       [0.18, 0.09, 0.73],
       [0.81, 0.09, 0.1 ],
       [0.74, 0.13, 0.13],
       [0.8 , 0.13, 0.07],
       [0.12, 0.72, 0.16],
       [0.22, 0.68, 0.1 ],
       [0.82, 0.08, 0.1 ],
       [0.15, 0.11, 0.74]])

In [26]:
# For the probabilities of the 1st model, which column has the highest probabilities

np.argmax(y_pred_proba[0], axis=1)

array([2, 1, 2, 0, 0, 0, 1, 1, 0, 2], dtype=int64)

In [22]:
y_pred = multi_target_forest.fit(X, Y).predict(X)
y_pred

array([[2, 2, 0],
       [1, 2, 1],
       [2, 1, 0],
       [0, 0, 2],
       [0, 2, 1],
       [0, 0, 2],
       [1, 1, 0],
       [1, 1, 1],
       [0, 0, 2],
       [2, 0, 0]])

In [28]:
# 1st column of predictions relates to the 1st model's probability predictions.
# and therefore these numbers match up.

y_pred[:,0]

array([2, 1, 2, 0, 0, 0, 1, 1, 0, 2])

---

# MultiOutput Classifier - 2

- Adapting from sklearn documentation example
- Logistic Regression (default settings)


In [30]:
from sklearn.linear_model import LogisticRegression

lr1 = LogisticRegression()

# MultiOutput with Lr-1
mo_lr1 = MultiOutputClassifier(estimator=lr1)

# Fit and predict
y_pred_lr1 = mo_lr1.fit(X, Y).predict(X)

# Probability Predictions
y_pred_proba_lr1 = mo_lr1.predict_proba(X)


In [31]:
# Probability predictions

print(f"Type of structure to store probability predictions   : {type(y_pred_proba_lr1)}\n")

# 3 classes therefore 3 arrays of predicted probabilities
print(f"Length of structure to store probability predictions : {len(y_pred_proba_lr1)}\n")

# 1st array of predicted probabilities
y_pred_proba_lr1[0]

Type of structure to store probability predictions   : <class 'list'>

Length of structure to store probability predictions : 3



array([[2.42747247e-03, 3.17854037e-03, 9.94393987e-01],
       [6.98858593e-03, 9.87321277e-01, 5.69013676e-03],
       [9.31437941e-03, 2.63085939e-03, 9.88054761e-01],
       [9.90612437e-01, 4.25344039e-03, 5.13412277e-03],
       [9.88007896e-01, 5.55111691e-03, 6.44098673e-03],
       [9.95577877e-01, 3.47211296e-03, 9.50010001e-04],
       [3.50574185e-03, 9.94162309e-01, 2.33194914e-03],
       [6.54924927e-03, 9.91682384e-01, 1.76836662e-03],
       [9.95753674e-01, 3.54851849e-03, 6.97807021e-04],
       [1.35258228e-03, 4.10259679e-03, 9.94544821e-01]])

In [33]:
# For the probabilities of the 1st model, which column has the highest probabilities

np.argmax(y_pred_proba_lr1[0], axis=1)

array([2, 1, 2, 0, 0, 0, 1, 1, 0, 2], dtype=int64)

In [32]:
y_pred_lr1

array([[2, 2, 0],
       [1, 2, 1],
       [2, 1, 0],
       [0, 0, 2],
       [0, 2, 1],
       [0, 0, 2],
       [1, 1, 0],
       [1, 1, 1],
       [0, 0, 2],
       [2, 0, 0]])

---

# MultiOutput Classifier - 3

- Adapting from sklearn documentation example
- Logistic Regression (setting multi_class="ovr)

In [35]:
from sklearn.linear_model import LogisticRegression

lr2 = LogisticRegression()

# MultiOutput with Lr-2
mo_lr2 = MultiOutputClassifier(estimator=lr2)

# Fit and predict
y_pred_lr2 = mo_lr2.fit(X, Y).predict(X)

# Probability Predictions
y_pred_proba_lr2 = mo_lr2.predict_proba(X)


In [36]:
# Probability predictions

print(f"Type of structure to store probability predictions   : {type(y_pred_proba_lr2)}\n")

# 3 classes therefore 3 arrays of predicted probabilities
print(f"Length of structure to store probability predictions : {len(y_pred_proba_lr2)}\n")

# 1st array of predicted probabilities
y_pred_proba_lr2[0]

Type of structure to store probability predictions   : <class 'list'>

Length of structure to store probability predictions : 3



array([[2.42747247e-03, 3.17854037e-03, 9.94393987e-01],
       [6.98858593e-03, 9.87321277e-01, 5.69013676e-03],
       [9.31437941e-03, 2.63085939e-03, 9.88054761e-01],
       [9.90612437e-01, 4.25344039e-03, 5.13412277e-03],
       [9.88007896e-01, 5.55111691e-03, 6.44098673e-03],
       [9.95577877e-01, 3.47211296e-03, 9.50010001e-04],
       [3.50574185e-03, 9.94162309e-01, 2.33194914e-03],
       [6.54924927e-03, 9.91682384e-01, 1.76836662e-03],
       [9.95753674e-01, 3.54851849e-03, 6.97807021e-04],
       [1.35258228e-03, 4.10259679e-03, 9.94544821e-01]])

In [37]:
# For the probabilities of the 1st model, which column has the highest probabilities

np.argmax(y_pred_proba_lr2[0], axis=1)

array([2, 1, 2, 0, 0, 0, 1, 1, 0, 2], dtype=int64)

---

# OneVsRest Classifier - 4

- OneVsRest Classifier
- Logistic Regression (default setting)

In [39]:
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier

lr3 = LogisticRegression()

# Ovr with Lr-3
ovr_lr3 = OneVsRestClassifier(estimator=lr3)

# Fit and predict
y_pred_lr3 = ovr_lr3.fit(X, Y).predict(X)

# Probability Predictions
y_pred_proba_lr3 = ovr_lr3.predict_proba(X)


ValueError: Multioutput target data is not supported with label binarization

In [40]:
Y

array([[2, 2, 0],
       [1, 2, 1],
       [2, 1, 0],
       [0, 0, 2],
       [0, 2, 1],
       [0, 0, 2],
       [1, 1, 0],
       [1, 1, 1],
       [0, 0, 2],
       [2, 0, 0]])

In [47]:
Y_tmp = np.where(Y==2, 1, Y)
Y_tmp

array([[1, 1, 0],
       [1, 1, 1],
       [1, 1, 0],
       [0, 0, 1],
       [0, 1, 1],
       [0, 0, 1],
       [1, 1, 0],
       [1, 1, 1],
       [0, 0, 1],
       [1, 0, 0]])

In [48]:
##### Replacing Y with Y_tmp

from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier

lr3 = LogisticRegression()

# Ovr with Lr-3
ovr_lr3 = OneVsRestClassifier(estimator=lr3)

# Fit and predict
y_pred_lr3 = ovr_lr3.fit(X, Y_tmp).predict(X)

# Probability Predictions
y_pred_proba_lr3 = ovr_lr3.predict_proba(X)

---

# OneVsRest Classifier - 5

- OneVsRest Classifier
- Logistic Regression (setting multi_class="ovr)

---

# OneVsRest Classifier within Logistic Regression - 6

- Logistic Regression (setting multi_class="ovr)