In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for input_path in (os.path.join(dirname, name) for name in filenames):
        print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the competition training set into a DataFrame.
train = pd.read_csv('/kaggle/input/playground-series-s3e13/train.csv')

In [None]:
# Preview the first rows of the training data.
train.head()

In [None]:
# Print the column names, non-null counts and dtypes of the training data.
train.info()

In [None]:
# Number of distinct target classes in the training data.
train['prognosis'].nunique()

In [None]:
# Summary statistics of the training data.
train.describe()

### Observation
1. There are 707 rows and 66 columns.
2. No null values are present in the data.
3. Except for the id and prognosis columns, every column has a float datatype.
4. The target is multiclass, with 11 vector-borne diseases.
5. All float column values are binary: 0.0 or 1.0.

## EDA

In [None]:
from matplotlib import pyplot as plt
import seaborn as sns

In [None]:
# Correlation heatmap of the numeric columns, showing only the lower triangle.
# `prognosis` holds strings, so the numeric columns are selected explicitly:
# pandas >= 2.0 raises on non-numeric data in DataFrame.corr() instead of
# silently dropping it. The matrix is computed once and reused.
corr = train.select_dtypes(include='number').corr()

fig, axes = plt.subplots(figsize=(20, 15))
# Hide the upper triangle and the diagonal; they only mirror the lower triangle.
mask = np.triu(np.ones(corr.shape, dtype=bool))
sns.heatmap(corr, mask=mask, linewidths=.5, cmap='Reds', annot=False, ax=axes);

### Observation
1. The heatmap shows that most of the columns are not correlated with each other.

In [None]:
# Donut chart of the class distribution.
# Labels are taken from the same grouped result as the sizes: groupby sorts the
# classes, whereas unique() returns them in order of first appearance, so
# pairing the two would attach the wrong name to each wedge.
sizes = train.groupby('prognosis').count()['id']
labels = sizes.index
explode = [0.1] * len(sizes)  # one offset per class instead of a hard-coded 11
plt.pie(sizes, labels=labels,
        autopct='%1.1f%%', pctdistance=0.85, explode = explode)
# A white disc drawn over the centre turns the pie into a donut.
circle = plt.Circle( (0,0), 0.7, color='white')
p=plt.gcf()
p.gca().add_artist(circle);

### Observation
1. The dataset is roughly balanced across the classes.

As there are 66 columns, it would be lengthy and time-consuming to analyse each feature individually. 
So before diving into modelling, let's do dimensionality reduction. 

## Dimensionality Reduction

Here I use PCA for dimensionality reduction. Instead of fixing the number of PCA components, I specify the fraction of variance to be preserved, 0.80 (a rule of thumb is to keep at least 70-80% of the explained variance).

In [None]:
# Separate the target from the features; `id` is removed along with the target.
y = train['prognosis']
X = train.drop(columns=['id', 'prognosis'])

In [None]:
from sklearn.decomposition import PCA
# A fractional n_components asks PCA to keep as many components as are needed
# to reach that share of explained variance (0.80 here).
pca = PCA(n_components=0.80).fit(X)
data_pca = pca.transform(X)
data_pca.shape

This result shows that the columns are largely uncorrelated: 32 principal components are needed to preserve 0.80 of the variance of the 64 feature columns. 

In [None]:
# Wrap the PCA output in a DataFrame with columns pca1..pcaN.
# N is read from the array itself: a variance-based n_components lets PCA pick
# the component count, so a hard-coded count would break whenever it changes.
X = pd.DataFrame(data_pca, columns=['pca' + str(i) for i in range(1, data_pca.shape[1] + 1)])
X

In [None]:
# Lower-triangle correlation heatmap of the PCA components.
pca_corr = X.corr()
fig, axes = plt.subplots(figsize=(20, 15))
# Mask the upper triangle (diagonal included) so each pair is drawn once.
mask = np.triu(np.ones(pca_corr.shape))
sns.heatmap(pca_corr, mask=mask, linewidths=.5, cmap='Reds', annot=False)

## Data Preparation

In [None]:
from sklearn.preprocessing import OrdinalEncoder

# Encode the class labels as ordinal codes, kept as an (n, 1) column.
# The labels are read from `train` rather than from `y`, so re-running this
# cell does not fail on (or re-encode) an already-encoded `y`.
encoder = OrdinalEncoder()
y = encoder.fit_transform(train['prognosis'].to_numpy().reshape(-1, 1))

## Modelling

Since we are not sure which model to use, let's try all the models we are familiar with and choose the best one. No optimization techniques are applied here; each model is used in its basic form.

In [None]:
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

In [None]:
# Shared cross-validation setup: the seed fixes the shuffled fold assignment.
seed = 0
splits = 5
k = StratifiedKFold(
    n_splits=splits,
    shuffle=True,
    random_state=seed,
)

In [None]:
def Model_selection(model, X, y, cv=k, label=''):
    """Cross-validate `model` and report ROC AUC and MAP@3 on each split.

    For every (train, validation) index pair produced by ``cv.split(X, y)``
    the model is refitted on the training rows and class probabilities are
    predicted for both partitions. Each partition is scored with one-vs-rest
    ROC AUC and with MAP@3 over its three highest-probability columns.
    The mean scores are printed together with `label`.

    Parameters
    ----------
    model : estimator providing ``fit`` and ``predict_proba``
        Refitted in place on every fold.
    X : object supporting ``.iloc`` row selection
    y : array indexable by integer index arrays
    cv : splitter providing ``split(X, y)``
    label : str
        Tag appended to the printed lines.

    Returns
    -------
    tuple of (list, list)
        Per-fold validation ROC AUC scores and per-fold validation MAP@3
        scores.
    """
    train_auc, val_auc = [], []
    train_map, val_map = [], []

    for train_idx, val_idx in cv.split(X, y):
        X_fit, y_fit = X.iloc[train_idx], y[train_idx]
        X_val, y_val = X.iloc[val_idx], y[val_idx]

        model.fit(X_fit, y_fit)

        fit_proba = model.predict_proba(X_fit)
        val_proba = model.predict_proba(X_val)

        train_auc.append(roc_auc_score(y_fit, fit_proba, multi_class='ovr'))
        val_auc.append(roc_auc_score(y_val, val_proba, multi_class='ovr'))

        # Column positions of the three largest probabilities per row.
        # NOTE(review): these positions are compared directly with the values
        # in y, which assumes the labels are encoded as 0..n_classes-1 in
        # column order; confirm against the encoder.
        fit_top3 = np.argsort(-fit_proba)[:, :3]
        val_top3 = np.argsort(-val_proba)[:, :3]

        train_map.append(mapk(y_fit.reshape(-1, 1), fit_top3, 3))
        val_map.append(mapk(y_val.reshape(-1, 1), val_top3, 3))

    print(f'Val roc_auc_score   : {np.mean(val_auc):.5f} | Train roc_auc_score   : {np.mean(train_auc):.5f} | {label}')
    print(f'Val MAP@3 Score: {np.mean(val_map):.5f} | Train MAP@3 Score: {np.mean(train_map):.5f} | {label}\n')

    return val_auc, val_map

In [None]:
# Sourced from the ml_metrics package at https://github.com/benhamner/Metrics/blob/master/Python/ml_metrics/average_precision.py
import numpy as np

def apk(actual, predicted, k=10):
    """
    Computes the average precision at k.
    This function computes the average precision at k between two sequences
    of items.
    Parameters
    ----------
    actual : list or array
             The elements that are to be predicted (order doesn't matter)
    predicted : list or array
                The predicted elements (order does matter)
    k : int, optional
        The maximum number of predicted elements
    Returns
    -------
    score : double
            The average precision at k over the input lists; 0.0 when
            `actual` is None or empty
    """
    # Emptiness is tested with len() instead of truthiness: `not actual` is
    # True for a one-element NumPy array holding 0 (which would score every
    # sample whose true label is encoded as 0 as a miss) and raises for
    # arrays with more than one element.
    if actual is None or len(actual) == 0:
        return 0.0

    if len(predicted) > k:
        predicted = predicted[:k]

    score = 0.0
    num_hits = 0.0

    for i, p in enumerate(predicted):
        # Count a hit only the first time a correct item appears.
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i + 1.0)

    return score / min(len(actual), k)

def mapk(actual, predicted, k=10):
    """
    Computes the mean average precision at k.
    Each entry of `actual` is paired with the entry of `predicted` at the
    same position, scored with `apk`, and the scores are averaged.
    Parameters
    ----------
    actual : list
             A list of lists of elements that are to be predicted
             (order doesn't matter in the lists)
    predicted : list
                A list of lists of predicted elements
                (order matters in the lists)
    k : int, optional
        The maximum number of predicted elements
    Returns
    -------
    score : double
            The mean average precision at k over the input lists
    """
    per_row_scores = [
        apk(truth, ranked, k)
        for truth, ranked in zip(actual, predicted)
    ]
    return np.mean(per_row_scores)

In [None]:
# Per-fold validation scores, one column per model.
# NOTE: despite its name, `logloss_list` receives the first value returned by
# Model_selection, i.e. the per-fold validation ROC AUC scores.
logloss_list = pd.DataFrame()
map3_list = pd.DataFrame()

# Baseline candidates, evaluated in this order.
models = [
    ('log', LogisticRegression(random_state=seed, max_iter=1000)),
    ('svc', SVC(random_state=seed, probability=True)),
    ('gauss', GaussianProcessClassifier(random_state=seed)),
    ('rf', RandomForestClassifier(random_state=seed)),
    ('xgb', XGBClassifier(random_state=seed, objective='multi:softprob', eval_metric='map@3')),
    ('lgb', LGBMClassifier(random_state=seed, objective='softmax', metric='softmax')),
    ('gb', GradientBoostingClassifier(random_state=seed)),
    ('ada', AdaBoostClassifier(random_state=seed)),
    ('knn', KNeighborsClassifier()),
]

for label, model in models:
    fold_auc, fold_map3 = Model_selection(model, X=X, y=y.ravel(), label=label)
    logloss_list[label] = fold_auc
    map3_list[label] = fold_map3

Choosing SVC model.

In [None]:
from sklearn.model_selection import train_test_split

# Hold out the last 20% of the rows; shuffle=False keeps the original row order.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    shuffle=False,
    test_size=0.2,
)

## Tuning the model

In [None]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameter search for the SVC.
# The search is fitted on the training split only, so the held-out rows that
# are later used to report accuracy and MAP@3 never influence the parameters
# that get selected.
param_grid = {'C': [0.1,1, 10, 100], 'gamma': [1,0.1,0.01,0.001],'kernel': ['rbf', 'poly', 'sigmoid']}
grid = GridSearchCV(SVC(random_state = seed, probability = True),param_grid,refit=True,verbose=False)
# y_train is an (n, 1) column; ravel() supplies the 1-D target the estimator expects.
grid.fit(X_train, y_train.ravel())

print(grid.best_estimator_)

In [None]:
# Fit the selected SVC on the training split and report accuracy on both splits.
# C and gamma are hard-coded, presumably copied from the grid search output;
# confirm against the printed best estimator.
svc = SVC(random_state = seed, probability = True,C=1, gamma = 0.1)
# The encoded target is an (n, 1) column; ravel() passes scikit-learn the 1-D
# array it expects and avoids its column-vector DataConversionWarning.
svc.fit(X_train, y_train.ravel())
print("Training Accuracy", svc.score(X_train, y_train.ravel()))
print("Testing Accuracy" , svc.score(X_test, y_test.ravel()))

In [None]:
def top3(X_test, flag = False):
    """Return the three most probable classes for every row of `X_test`.

    Probabilities come from the module-level fitted classifier `svc`.

    Parameters
    ----------
    X_test : features accepted by ``svc.predict_proba``
    flag : bool, optional
        When true, the labels are passed through the module-level
        ``encoder.inverse_transform`` before being returned.

    Returns
    -------
    numpy.ndarray
        One row per input row, most probable class first. Entries are taken
        from ``svc.classes_`` when `flag` is false, and are the encoder's
        inverse-transformed values when `flag` is true.
    """
    predictions = svc.predict_proba(X_test)
    # predict_proba columns are ordered like svc.classes_, so the ranked column
    # positions are translated into the actual class labels instead of being
    # returned as if the labels were exactly 0..n_classes-1.
    ranked_columns = np.argsort(-predictions, axis=1)[:, :3]
    top_3_pred = np.asarray(svc.classes_)[ranked_columns]
    if flag:
        original_shape = top_3_pred.shape
        # The encoder works on a single column: flatten, decode, restore shape.
        decoded = encoder.inverse_transform(top_3_pred.reshape(-1, 1))
        top_3_pred = decoded.reshape(original_shape)
    return top_3_pred

In [None]:
# Top-3 predictions for the held-out rows, left in encoded form.
top_3 = top3(X_test)
# MAP@3 on the hold-out split; each true label is wrapped as a one-element row.
mapk(y_test.reshape(-1, 1), top_3, k=3)

## Submission

In [None]:
# Load the competition test set and preview its first rows.
test_df = pd.read_csv('/kaggle/input/playground-series-s3e13/test.csv')
test_df.head()

In [None]:
# Project the test features (without `id`) with the already-fitted PCA.
test = pca.transform(test_df.drop(columns=['id']))

In [None]:
# Name the PCA-transformed test columns pca1..pcaN and predict the top-3 labels.
# N is read from the data instead of being hard-coded, so this cell keeps
# working if the PCA step retains a different number of components.
test = pd.DataFrame(test, columns=['pca' + str(i) for i in range(1, test.shape[1] + 1)])
# flag=True returns the labels decoded by the encoder rather than encoded values.
top3_pred_test = top3(test, True)

In [None]:
# One string per row: the three predicted labels separated by single spaces.
test_df['prognosis'] = [' '.join(row_labels) for row_labels in top3_pred_test]

In [None]:
# Write only the id and prognosis columns, without the index, as the submission file.
test_df.to_csv('submission.csv', columns=['id', 'prognosis'], index=False)