# Importing libraries

In [20]:
import sys
# Put the project checkout on the module search path so that the
# `import src.functions as func` further down in this cell resolves.
# NOTE(review): absolute, machine-specific path — confirm it matches your checkout.
sys.path.insert(1, '/gh/kaggle-pg-3x26')
import zipfile
import pandas as pd

from pandas.plotting import scatter_matrix
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.ensemble import RandomForestClassifier
#from sklearn.metrics import roc_auc_score
from sklearn.metrics import log_loss
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import OneHotEncoder
import pickle
import src.functions as func
import requests


# Getting data

In [4]:
# Read the three competition CSVs directly from the zip archive, in the
# same order as before: sample submission, test set, training set.
frames = {}
with zipfile.ZipFile("/gh/kaggle-pg-3x26/data/playground-series-s3e26.zip") as archive:
    for member in ("sample_submission.csv", "test.csv", "train.csv"):
        # Each archive member is opened as a file-like object and closed
        # again as soon as pandas has parsed it.
        with archive.open(member) as handle:
            frames[member] = pd.read_csv(handle)

sample_submission = frames["sample_submission.csv"]
test = frames["test.csv"]
train = frames["train.csv"]

# Exploring data

In [5]:
# Column names, dtypes and non-null counts of the sample submission file.
sample_submission.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5271 entries, 0 to 5270
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   id         5271 non-null   int64  
 1   Status_C   5271 non-null   float64
 2   Status_CL  5271 non-null   float64
 3   Status_D   5271 non-null   float64
dtypes: float64(3), int64(1)
memory usage: 164.8 KB


In [6]:
# Column names, dtypes and non-null counts of the training set.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7905 entries, 0 to 7904
Data columns (total 20 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   id             7905 non-null   int64  
 1   N_Days         7905 non-null   int64  
 2   Drug           7905 non-null   object 
 3   Age            7905 non-null   int64  
 4   Sex            7905 non-null   object 
 5   Ascites        7905 non-null   object 
 6   Hepatomegaly   7905 non-null   object 
 7   Spiders        7905 non-null   object 
 8   Edema          7905 non-null   object 
 9   Bilirubin      7905 non-null   float64
 10  Cholesterol    7905 non-null   float64
 11  Albumin        7905 non-null   float64
 12  Copper         7905 non-null   float64
 13  Alk_Phos       7905 non-null   float64
 14  SGOT           7905 non-null   float64
 15  Tryglicerides  7905 non-null   float64
 16  Platelets      7905 non-null   float64
 17  Prothrombin    7905 non-null   float64
 18  Stage   

In [7]:
# Column names, dtypes and non-null counts of the test set.
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5271 entries, 0 to 5270
Data columns (total 19 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   id             5271 non-null   int64  
 1   N_Days         5271 non-null   int64  
 2   Drug           5271 non-null   object 
 3   Age            5271 non-null   int64  
 4   Sex            5271 non-null   object 
 5   Ascites        5271 non-null   object 
 6   Hepatomegaly   5271 non-null   object 
 7   Spiders        5271 non-null   object 
 8   Edema          5271 non-null   object 
 9   Bilirubin      5271 non-null   float64
 10  Cholesterol    5271 non-null   float64
 11  Albumin        5271 non-null   float64
 12  Copper         5271 non-null   float64
 13  Alk_Phos       5271 non-null   float64
 14  SGOT           5271 non-null   float64
 15  Tryglicerides  5271 non-null   float64
 16  Platelets      5271 non-null   float64
 17  Prothrombin    5271 non-null   float64
 18  Stage   

In [8]:
# Lowest id in the test set.
# The training set has 7905 rows, so test ids presumably continue right
# after the training ids — verify against train["id"].max().
test["id"].min()

7905

In [17]:
# Transpose the first three rows so every column of this wide frame is
# listed vertically and nothing is cut off horizontally.
train.head(3).T

Unnamed: 0,0,1,2
id,0,1,2
N_Days,999,2574,3428
Drug,D-penicillamine,Placebo,Placebo
Age,21532,19237,13727
Sex,M,F,F
Ascites,N,N,N
Hepatomegaly,N,N,Y
Spiders,N,N,Y
Edema,N,N,Y
Bilirubin,2.3,0.9,3.3


In [9]:
# First rows of the sample submission: one probability column per Status class.
sample_submission.head(3)

Unnamed: 0,id,Status_C,Status_CL,Status_D
0,7905,0.628084,0.034788,0.337128
1,7906,0.628084,0.034788,0.337128
2,7907,0.628084,0.034788,0.337128


In [10]:
# Lowest id in the sample submission; compare with the test-set minimum
# to check that both files start at the same id.
sample_submission["id"].min()

7905

In [11]:
# Column-wise mean of the three class-probability columns.
sample_submission[["Status_C","Status_CL","Status_D"]].mean()

Status_C     0.628084
Status_CL    0.034788
Status_D     0.337128
dtype: float64

In [12]:
# Per-column minimum and maximum of the class-probability columns.
# If both agree for a column, that column holds one constant value.
status_probabilities = sample_submission[["Status_C","Status_CL","Status_D"]]
status_probabilities.min(), status_probabilities.max()

(Status_C     0.628084
 Status_CL    0.034788
 Status_D     0.337128
 dtype: float64,
 Status_C     0.628084
 Status_CL    0.034788
 Status_D     0.337128
 dtype: float64)

In [18]:
# Class balance of the target column.
train["Status"].value_counts()

Status
C     4965
D     2665
CL     275
Name: count, dtype: int64

# One-hot encoding the output (Status)

In [21]:
# sparse_output=False makes the encoder return a dense NumPy array rather
# than a SciPy sparse matrix, so the result can be assigned to DataFrame columns.
cat_encoder = OneHotEncoder(sparse_output=False)


In [23]:
# Learn the Status categories and encode them in a single step.
# Double brackets keep the selection a 2-D DataFrame, which is the shape
# the encoder works on.
status_column = train[["Status"]]
cat_1hot = cat_encoder.fit_transform(status_column)
cat_1hot

array([[0., 0., 1.],
       [1., 0., 0.],
       [0., 0., 1.],
       ...,
       [0., 0., 1.],
       [0., 0., 1.],
       [1., 0., 0.]])

In [25]:
# Names of the generated one-hot columns, in the same order as the
# columns of the encoded array.
cat_encoder.get_feature_names_out()

array(['Status_C', 'Status_CL', 'Status_D'], dtype=object)

In [26]:
# Wrap the encoded target in a DataFrame that carries the encoder's column
# names and shares its row index with the training set.
status_frame = train[["Status"]]
df_output = pd.DataFrame(
    data=cat_encoder.transform(status_frame),
    index=status_frame.index,
    columns=cat_encoder.get_feature_names_out(),
)

In [27]:
# Sanity check: show the first and last rows of the encoded frame next to
# the raw Status values so the one-hot mapping can be compared by eye.
df_output.head(3), df_output.tail(3),train[["Status"]].head(3),train[["Status"]].tail(3)

(   Status_C  Status_CL  Status_D
 0       0.0        0.0       1.0
 1       1.0        0.0       0.0
 2       0.0        0.0       1.0,
       Status_C  Status_CL  Status_D
 7902       0.0        0.0       1.0
 7903       0.0        0.0       1.0
 7904       1.0        0.0       0.0,
   Status
 0      D
 1      C
 2      D,
      Status
 7902      D
 7903      D
 7904      C)

In [32]:
# Attach the one-hot encoded target to the training frame as new columns.
# NOTE: this mutates `train` in place and relies on `cat_1hot` from the
# earlier fit_transform cell having been run first.
train[cat_encoder.get_feature_names_out()] = cat_1hot

In [33]:
# Re-inspect the first rows after the one-hot Status columns were added.
train.head(3).T

Unnamed: 0,0,1,2
id,0,1,2
N_Days,999,2574,3428
Drug,D-penicillamine,Placebo,Placebo
Age,21532,19237,13727
Sex,M,F,F
Ascites,N,N,N
Hepatomegaly,N,N,Y
Spiders,N,N,Y
Edema,N,N,Y
Bilirubin,2.3,0.9,3.3


# Baseline model

In [13]:
# Baseline predictions for the training rows.
# func.base_line comes from src/functions.py (not shown in this notebook);
# judging by the displayed output it returns one row per id with the same
# constant class probabilities — TODO confirm against its source.
prediction = func.base_line(train)
prediction

Unnamed: 0,id,Status_C,Status_CL,Status_D
0,0,0.628084,0.034788,0.337128
1,1,0.628084,0.034788,0.337128
2,2,0.628084,0.034788,0.337128
3,3,0.628084,0.034788,0.337128
4,4,0.628084,0.034788,0.337128
...,...,...,...,...
7900,7900,0.628084,0.034788,0.337128
7901,7901,0.628084,0.034788,0.337128
7902,7902,0.628084,0.034788,0.337128
7903,7903,0.628084,0.034788,0.337128


In [34]:
# Show only the probability columns of the first prediction row.
prediction[["Status_C","Status_CL","Status_D"]].head(1)

Unnamed: 0,Status_C,Status_CL,Status_D
0,0.628084,0.034788,0.337128


In [35]:

# Multi-class log loss of the baseline on the training set: the one-hot
# Status columns are the ground truth, the prediction columns are the
# class probabilities, selected in the same column order on both sides.
status_cols = ["Status_C","Status_CL","Status_D"]
y_true = train[status_cols]
y_prob = prediction[status_cols]
error = log_loss(y_true, y_prob)

In [36]:
# Display the baseline log loss computed in the previous cell.
error

0.7755024047843813

In [37]:
# Smoke-test the prediction service listening on localhost:9695
# (presumably the baseline model served locally — it must already be
# running before this cell is executed).
url = 'http://localhost:9695/predict'
customer = {
    "id": 532
}

# Without a timeout, requests waits indefinitely and can hang the kernel
# when the service is down or unresponsive.
http_response = requests.post(url, json=customer, timeout=10)
# Raise on 4xx/5xx responses instead of trying to parse an error page as JSON.
http_response.raise_for_status()
response = http_response.json()
print(response)

{'Status_C': 0.628084, 'Status_CL': 0.034788, 'Status_D': 0.337128, 'id': 532}


In [38]:
# Apply the same baseline helper to the test set.
# NOTE: this rebinds `prediction`, which previously held the training-set
# predictions used for the log-loss computation above.
prediction = func.base_line(test)

In [39]:
# Write the test-set predictions as a CSV without the index column.
# NOTE(review): this is a relative path, whereas the data-loading cell uses
# an absolute one, so the output location depends on the kernel's working
# directory.
prediction.to_csv("../data/base-line-model-answer.csv",index=False)

In [None]:
# kaggle competitions submit -c playground-series-s3e26 -f ../data/base-line-model-answer.csv -m "Message"