In [7]:
import numpy as np 
import pandas as pd 
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier
import os
# Print the full path of every file found anywhere below /kaggle/input.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(root, fname))

In [8]:
# Load the training, test and sample-submission tables from parquet files
# located in the current working directory.
train = pd.read_parquet("train_final.parquet")
test = pd.read_parquet("test_final.parquet")
submit = pd.read_parquet("submission_sample_final.parquet")

# How many of the most frequent categories keep their own code; every other
# value (including missing values) is bucketed into "Other".
top_n = 15              # carriers
top_n_devicebrand = 20  # device brands


def _encode_top_categories(train_values, test_values, n_top):
    """Encode a categorical column consistently for train and test.

    The ``n_top`` most frequent values are taken from ``train_values`` only;
    every other value in either series is replaced by ``"Other"``. A single
    ``LabelEncoder`` is then fitted on the bucketed values of both series so
    that a given category receives the same integer code in train and test.
    (Fitting separate encoders, or deriving the top list from the test set,
    would make the codes mean different things in the two tables.)

    Parameters
    ----------
    train_values, test_values : pandas.Series
        Raw category columns of the training and test tables.
    n_top : int
        Number of most frequent training categories to keep.

    Returns
    -------
    tuple
        ``(top_values, encoder, train_codes, test_codes)`` where ``top_values``
        is the list of kept categories, ``encoder`` is the fitted
        ``LabelEncoder`` and the two code arrays are aligned with the inputs.
    """
    top_values = train_values.value_counts().head(n_top).index.tolist()

    # astype(object) so that "Other" can be inserted regardless of the
    # column's original dtype (e.g. categorical).
    train_obj = train_values.astype(object)
    test_obj = test_values.astype(object)
    train_bucketed = train_obj.where(train_obj.isin(top_values), "Other")
    test_bucketed = test_obj.where(test_obj.isin(top_values), "Other")

    # Fit on both bucketed series: the vocabulary is still limited to the
    # training top list plus "Other", but "Other" is guaranteed to be known
    # even if it only occurs in the test set.
    encoder = LabelEncoder()
    encoder.fit(pd.concat([train_bucketed, test_bucketed], ignore_index=True))
    return (
        top_values,
        encoder,
        encoder.transform(train_bucketed),
        encoder.transform(test_bucketed),
    )


# Carrier: shared top list and shared encoder for train and test.
top_carriers, carrier_encoder, train_carrier_codes, test_carrier_codes = (
    _encode_top_categories(train["carrier"], test["carrier"], top_n)
)
train["carrier_encoded"] = train_carrier_codes

# Device brand: same treatment with its own top-N size.
top_devicebrand, devicebrand_encoder, train_brand_codes, test_brand_codes = (
    _encode_top_categories(train["devicebrand"], test["devicebrand"], top_n_devicebrand)
)
train["devicebrand_encoded"] = train_brand_codes

# Kept for backward compatibility with code that refers to `label_encoder`
# (the original script left the last fitted encoder under this name).
label_encoder = devicebrand_encoder

# Drop the identifier and the raw categorical columns now that encoded
# versions exist.
train = train.drop(columns=['id', 'carrier', 'devicebrand'])

# Split the comma-separated target string into three separate columns.
train[['Ilk_menu', 'Ikinci_menu', 'Ucuncu_menu']] = train['target'].str.split(',', expand=True)
train = train.drop(columns=['target'])

# Apply the codes produced by the shared encoders to the test table.
test["carrier_encoded"] = test_carrier_codes
test["devicebrand_encoded"] = test_brand_codes
test = test.drop(columns=['id', 'carrier', 'devicebrand'])

# Map the textual menu labels 'menu1' .. 'menu9' to the integers 1 .. 9.
menu_mapping = {f"menu{i}": i for i in range(1, 10)}

# Same mapping keyed with a leading space (' menu1', ...). It is no longer
# needed for the conversion below but is kept so that any later code that
# refers to `menu_mapping1` keeps working.
menu_mapping1 = {f" {label}": code for label, code in menu_mapping.items()}

# The target columns are produced by splitting on ',' so all but the first
# may carry surrounding whitespace. Stripping first lets one mapping serve
# every column and does not depend on the exact spacing in the raw target.
# `.map` yields a numeric column directly instead of relying on `.replace`
# to downcast an object column; labels missing from the mapping become NaN
# rather than silently staying as strings.
for _menu_col in ('Ilk_menu', 'Ikinci_menu', 'Ucuncu_menu'):
    train[_menu_col] = train[_menu_col].str.strip().map(menu_mapping)

In [9]:
df = train

# The three menu columns are the prediction targets; everything else is a feature.
target_columns = ['Ilk_menu', 'Ikinci_menu', 'Ucuncu_menu']
y = df[target_columns]
X = df.drop(columns=target_columns)

# Hold out 20% of the rows for evaluation, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Candidate base estimators, keyed by the display name used when reporting.
classifiers = {}
classifiers['Random Forest'] = RandomForestClassifier(n_estimators=100, random_state=42)

def jaccard_similarity_score(y_true, y_pred, empty_score=1.0):
    """Return the mean per-row Jaccard index of two binary indicator tables.

    For each row ``i`` the score is ``|true AND pred| / |true OR pred|``,
    where an element counts as set when it is truthy.

    Parameters
    ----------
    y_true, y_pred : sequence of rows
        Indexable by integer position (e.g. 2-D NumPy arrays or lists of
        lists). ``y_pred`` is read at the same positions as ``y_true``.
    empty_score : float, default 1.0
        Score assigned to a row whose union is empty (nothing set in either
        row). Without this the row would evaluate 0/0 and turn the whole
        mean into NaN.

    Returns
    -------
    float
        Mean of the per-row scores, or NaN when ``y_true`` has no rows.
    """
    n_samples = len(y_true)
    if n_samples == 0:
        # No rows to average; return NaN explicitly instead of letting
        # np.mean([]) emit a RuntimeWarning.
        return np.nan

    jaccard_scores = np.empty(n_samples, dtype=float)
    for i in range(n_samples):
        intersection = np.sum(np.logical_and(y_true[i], y_pred[i]))
        union = np.sum(np.logical_or(y_true[i], y_pred[i]))
        # Guard the 0/0 case: two empty rows are treated as `empty_score`.
        jaccard_scores[i] = intersection / union if union else empty_score
    return np.mean(jaccard_scores)

import numpy as np

def process_predictions(y_pred, y_test):
    """Convert predicted and true menu rows into 9-column 0/1 indicator arrays.

    Each row is rendered as the string of its values joined by ', '. Column
    ``k`` of the result (for labels '1' .. '9') is 1 when that label occurs
    as a substring of the rendered row, otherwise 0.

    Note the ordering: the arguments are ``(y_pred, y_test)`` but the return
    value is ``(y_test_binary, y_pred_binary)``.
    """
    menu_labels = [str(digit) for digit in range(1, 10)]

    def _encode(rows):
        # One text entry per row, e.g. "1, 4, 7".
        rendered = np.array([', '.join(str(value) for value in row) for row in rows])
        flags = []
        for text in rendered:
            flags.append([int(label in text) for label in menu_labels])
        return np.array(flags)

    truth = np.array(y_test)
    pred_flags = _encode(y_pred)
    truth_flags = _encode(truth)
    return truth_flags, pred_flags

# Fit each candidate as a multi-output model and report its mean Jaccard score
# on the held-out split.
for name, clf in classifiers.items():
    print(f"Evaluating {name}...")

    # One copy of the base estimator is trained per target column.
    model = MultiOutputClassifier(clf).fit(X_train, y_train)

    # Turn raw predictions and ground truth into binary indicator tables.
    y_true, y_pred = process_predictions(model.predict(X_test), y_test.values)

    score = jaccard_similarity_score(y_true, y_pred)
    print(f"Jaccard Score for {name}: {score}")

Evaluating Random Forest...
Jaccard Score for Random Forest: 0.5416764132553606


In [31]:
# Sanity check: number of rows in the binary prediction table left over from
# the evaluation loop.
len(y_pred)

18810

In [33]:
# Sanity check: number of rows in the binary ground-truth table left over from
# the evaluation loop.
len(y_true)

18810