# Let's take on this challenge of new beginnings!

## Importing the Libraries

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import utils
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, hamming_loss
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler
from skmultilearn.problem_transform import BinaryRelevance
from skmultilearn.problem_transform import ClassifierChain
from skmultilearn.problem_transform import LabelPowerset

## Now we add the dataset

In [2]:
# CSV file name; a bare file name is resolved against the kernel's working directory.
path = "DataMLC.csv"

In [3]:
# Read the CSV into a DataFrame and preview its first rows as the cell output.
dataset = pd.read_csv(path)
dataset.head()

Unnamed: 0,Test_age_T1,Body_height_cm_T1,Sitting_height_cm_T1,Arm_span_cm_T1,Arm_length_cm_T1,Leg_length_cm_T1,Chest_girth_cm_T1,Shoulder_width_cm_T1,Crista_width_cm_T1,Calf_girth_cm_T1,...,Eye_hand_reaction_time,Heart_rate_rest_bpm_T1,Vital_capacity_ml_T1,Sport,basketball,fencing,judo,swim,table tennis,volleyball
0,172,181.8,91.8,184.0,78.4,97.3,79.2,36.2,27.7,33.0,...,202,69,4315,basketball,1,0,0,0,0,0
1,172,170.7,91.3,169.8,70.2,88.2,81.1,37.1,28.3,35.5,...,234,62,3555,basketball,1,0,0,0,0,0
2,178,183.0,94.1,181.4,76.8,95.8,80.0,39.4,28.2,34.8,...,233,76,4450,basketball,1,0,0,0,0,0
3,178,184.2,93.0,188.8,82.4,99.6,78.3,36.4,27.9,33.1,...,201,75,4045,basketball,1,0,0,0,0,0
4,184,184.7,96.6,183.5,78.7,97.0,91.8,37.7,26.8,34.6,...,190,60,4145,basketball,1,0,0,0,0,0


In [4]:
# Show each column's dtype; "Sport" is the only object (string) column in the listing.
dataset.dtypes

Test_age_T1                    int64
Body_height_cm_T1            float64
Sitting_height_cm_T1         float64
Arm_span_cm_T1               float64
Arm_length_cm_T1             float64
Leg_length_cm_T1             float64
Chest_girth_cm_T1            float64
Shoulder_width_cm_T1         float64
Crista_width_cm_T1           float64
Calf_girth_cm_T1             float64
Thigh_circumference_cm_T1    float64
Waist_girth_cm_T1            float64
LowerLeg_length_cm_T1        float64
Ankle_circumference_cm_T1    float64
Subscapular_angle_mm_T1      float64
Abdomen_mm_T1                float64
Upper_arm_mm_T1              float64
Tendon_length_cm_T1          float64
Body_weight_kg_T1            float64
Eye_hand_reaction_time         int64
Heart_rate_rest_bpm_T1         int64
Vital_capacity_ml_T1           int64
Sport                         object
basketball                     int64
fencing                        int64
judo                           int64
swim                           int64
t

## Data Preprocessing

In [5]:
# Replace the string-valued "Sport" column with integer class codes so that
# every column in the frame is numeric.
# NOTE: this overwrites dataset["Sport"] in place, so the original sport names
# are only recoverable through `le.inverse_transform` / `le.classes_`.
le = LabelEncoder()
dataset["Sport"] = le.fit_transform(dataset["Sport"])
dataset.dtypes

Test_age_T1                    int64
Body_height_cm_T1            float64
Sitting_height_cm_T1         float64
Arm_span_cm_T1               float64
Arm_length_cm_T1             float64
Leg_length_cm_T1             float64
Chest_girth_cm_T1            float64
Shoulder_width_cm_T1         float64
Crista_width_cm_T1           float64
Calf_girth_cm_T1             float64
Thigh_circumference_cm_T1    float64
Waist_girth_cm_T1            float64
LowerLeg_length_cm_T1        float64
Ankle_circumference_cm_T1    float64
Subscapular_angle_mm_T1      float64
Abdomen_mm_T1                float64
Upper_arm_mm_T1              float64
Tendon_length_cm_T1          float64
Body_weight_kg_T1            float64
Eye_hand_reaction_time         int64
Heart_rate_rest_bpm_T1         int64
Vital_capacity_ml_T1           int64
Sport                          int32
basketball                     int64
fencing                        int64
judo                           int64
swim                           int64
t

In [6]:
# Positional column selection. Names below are read off the dtypes listing
# printed earlier in this notebook (0-based positions) — re-check them if the
# CSV layout ever changes.
# z: column 18 (Body_weight_kg_T1) on its own; not referenced again in the cells shown here.
z = dataset.iloc[:, 18].values
# X: feature matrix — Body_height_cm_T1 (1), Body_weight_kg_T1 (18),
# Arm_length_cm_T1 (4), Leg_length_cm_T1 (5), Chest_girth_cm_T1 (6).
X = dataset.iloc[:, [1,18,4,5,6]].values
# y: label matrix — columns 23-28, the six per-sport 0/1 indicator columns that
# follow "Sport" (basketball, fencing, judo, swim, table tennis, volleyball).
y = dataset.iloc[:, [23,24,25,26,27,28]].values

## Split the data into training (70%) and test (30%) sets

In [7]:
# Hold out 30% of the rows for evaluation; the fixed random_state makes the
# split reproducible across kernel restarts.
# (train_test_split is already imported in the notebook's import cell.)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=17
)

# Multi-label classification Model

### Binary Relevance Technique

#### Naive Bayes 

In [8]:
# Binary-relevance multi-label classifier with a Gaussian naive Bayes base
# classifier.
classifier = BinaryRelevance(GaussianNB())
# Train on the training split.
classifier.fit(X_train, y_train)
# Predict the label matrix for the held-out rows.
predictions = classifier.predict(X_test)
# For multi-label targets accuracy_score is subset accuracy: a row counts as
# correct only when all of its labels match.
print("Accuracy = ",accuracy_score(y_test,predictions))

Accuracy =  0.32142857142857145


#### Logistic Regression

In [9]:
# Binary relevance with a logistic-regression base classifier.
# NOTE(review): LogisticRegression() runs with default settings on the
# unscaled features — check the cell output for convergence warnings.
binary_rel_clf = BinaryRelevance(LogisticRegression())
binary_rel_clf.fit(X_train,y_train)
# Predict on the held-out rows; the final expression displays subset accuracy.
br_prediction = binary_rel_clf.predict(X_test)
accuracy_score(y_test,br_prediction)

0.35714285714285715

#### KNN 71%

In [10]:

# Binary relevance with a 1-nearest-neighbour base classifier.
# NOTE: this rebinds `binary_rel_clf` and `br_prediction`, replacing the
# logistic-regression model and predictions from the previous cell.
binary_rel_clf = BinaryRelevance(KNeighborsClassifier(n_neighbors = 1))
binary_rel_clf.fit(X_train,y_train)
# Predict on the held-out rows; the final expression displays subset accuracy.
br_prediction = binary_rel_clf.predict(X_test)
accuracy_score(y_test,br_prediction)


0.7142857142857143

### Classifier Chains Technique 25%

In [11]:
def build_model(model,mlb_estimator,xtrain,ytrain,xtest,ytest):
    """Wrap ``model`` in a multi-label strategy, fit it, and score it.

    Parameters
    ----------
    model
        Base estimator instance handed to ``mlb_estimator``.
    mlb_estimator
        Callable that takes the base estimator and returns an object with
        ``fit`` and ``predict`` (e.g. ``ClassifierChain``, ``LabelPowerset``).
    xtrain, ytrain
        Training features and label matrix passed to ``fit``.
    xtest, ytest
        Held-out features passed to ``predict`` and the labels scored against.

    Returns
    -------
    dict
        ``"accuracy:"`` holds the ``accuracy_score`` result (the key includes
        the trailing colon), ``"hamming_score"`` holds the ``hamming_loss``
        result (a loss despite the key name), and ``"predictions"`` holds the
        predicted label matrix converted with ``.toarray()``.
    """
    wrapped = mlb_estimator(model)
    wrapped.fit(xtrain, ytrain)
    predicted = wrapped.predict(xtest)
    return {
        "accuracy:": accuracy_score(ytest, predicted),
        "hamming_score": hamming_loss(ytest, predicted),
        "predictions": predicted.toarray(),
    }

In [12]:
# Classifier chain over a 6-nearest-neighbour base model, scored on the
# held-out split; the final expression displays the result dict.
clf_chain_model = build_model(
    model=KNeighborsClassifier(n_neighbors=6),
    mlb_estimator=ClassifierChain,
    xtrain=X_train,
    ytrain=y_train,
    xtest=X_test,
    ytest=y_test,
)
clf_chain_model


{'accuracy:': 0.25,
 'hamming_score': 0.14285714285714285,
 'predictions': array([[0., 0., 0., 0., 1., 0.],
        [0., 1., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 1.],
        [1., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.],
        [0., 0., 1., 0., 0., 0.],
        [0., 0., 0., 0., 0., 1.],
        [0., 0., 0., 0., 0., 1.],
        [0., 1., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 1.],
        [0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0.],
        

### Label Powerset Technique 50%

In [13]:
# Label powerset with a multinomial naive Bayes base model, scored on the
# held-out split.
clf_labelP_model = build_model(
    model=MultinomialNB(),
    mlb_estimator=LabelPowerset,
    xtrain=X_train,
    ytrain=y_train,
    xtest=X_test,
    ytest=y_test,
)


In [14]:
# Display the label-powerset result dict (accuracy, Hamming loss, predictions).
clf_labelP_model


{'accuracy:': 0.5,
 'hamming_score': 0.16666666666666666,
 'predictions': array([[0, 1, 0, 0, 0, 0],
        [0, 1, 0, 0, 0, 0],
        [0, 1, 0, 0, 0, 0],
        [0, 1, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 1],
        [0, 1, 0, 0, 0, 0],
        [0, 0, 1, 0, 0, 0],
        [0, 0, 1, 0, 0, 0],
        [0, 1, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 1],
        [0, 0, 1, 0, 0, 0],
        [0, 1, 0, 0, 0, 0],
        [0, 1, 0, 0, 0, 0],
        [0, 0, 1, 0, 0, 0],
        [0, 1, 0, 0, 0, 0],
        [0, 1, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 1],
        [0, 1, 0, 0, 0, 0],
        [0, 0, 1, 0, 0, 0],
        [0, 0, 0, 0, 0, 1],
        [0, 1, 0, 0, 0, 0],
        [0, 1, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 1],
        [0, 1, 0, 0, 0, 0],
        [0, 1, 0, 0, 0, 0],
        [0, 1, 0, 0, 0, 0],
        [0, 1, 0, 0, 0, 0],
        [0, 1, 0, 0, 0, 0]], dtype=int64)}

In [15]:
# Standardise the features: the scaler learns its statistics from the training
# split only and then applies the same transform to the test split.
# NOTE(review): this cell rebinds X_train / X_test and comes after every model
# cell above, so those models were fit on unscaled features. Re-running an
# earlier model cell after this one (without re-running the split) would train
# it on scaled data instead.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)