In [1]:
import os
import numpy as np
import pandas as pd
import sys
import joblib
from tqdm import tqdm
from timeit import default_timer as timer
from matplotlib import pyplot as plt
from scipy.sparse import csr_matrix, save_npz, load_npz
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, classification_report, make_scorer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import VotingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.multioutput import MultiOutputClassifier
from time import sleep
import subprocess

In [5]:
# Every artefact used by this notebook lives under a single project root;
# deriving the paths from it keeps the directory name in one place.
_challenge_root = "/home/23m1521/ashish/Kaggle/_3_IE506_2024_Programming_Challenge"
dataset_path = f"{_challenge_root}/dataset"

# Competition text files (train / test splits, going by the file names).
train_txt_path = f"{dataset_path}/IE506_2024_progchallenge_train.txt"
test_txt_path = f"{dataset_path}/IE506_2024_progchallenge_test.txt"

# Pre-built .npz matrices and .joblib column-name lists loaded further down.
load_features_path = f"{dataset_path}/csr_feature.npz"
load_labels_path = f"{dataset_path}/csr_label.npz"
load_cols_F_path = f"{dataset_path}/cols_F.joblib"
load_cols_C_path = f"{dataset_path}/cols_C.joblib"
load_features_sub_path = f"{dataset_path}/csr_feature_sub.npz"

# Directory holding the serialized models; the trailing slash is part of the value.
model_save_path = f"{_challenge_root}/models/models/"

In [6]:
# Load the cached sparse feature / label matrices and report their shapes.
features_arr = load_npz(load_features_path)
print("features_arr: ", features_arr.shape)

labels_arr = load_npz(load_labels_path)
print("labels_arr: ", labels_arr.shape)

# Column-name lists for the feature (F) and class (C) matrices.
# NOTE(review): joblib.load unpickles the file, so only load trusted artefacts.
cols_F = joblib.load(load_cols_F_path)
print("cols_F: ", len(cols_F))

cols_C = joblib.load(load_cols_C_path)
print("cols_C: ", len(cols_C))

# Feature matrix that predictions are generated for in the submission cell.
X_sub = load_npz(load_features_sub_path)
print("X_sub: ", X_sub.shape)

print("loaded..........................")

features_arr:  (200000, 47236)
labels_arr:  (200000, 41)
cols_F:  47236
cols_C:  41
X_sub:  (150000, 47236)
loaded..........................


\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\

In [7]:
# Name (without extension) of the serialized classifier to evaluate.
MODEL_NAME = "MOCCGS_L2LR"

print(f"-----------------------Loading Model {MODEL_NAME}----------------------------------------------")
# model_save_path already ends with "/", so the joined path contains "//";
# the string is kept exactly as before.
model_file = model_save_path + "/" + MODEL_NAME + ".joblib"
clf = joblib.load(model_file)

-----------------------Loading Model MOCCGS_L2LR----------------------------------------------


In [None]:
print("-------------------------Doing Prediction--------------------------------------------")
# Training inputs stay sparse; the label matrix is densified for the
# element-wise comparison done in the accuracy cell.
X_train, Y_train = features_arr, np.squeeze(labels_arr.toarray())

# Predict on the training matrix first, then on the submission matrix.
train_pred, sub_pred = (clf.predict(matrix) for matrix in (X_train, X_sub))

In [None]:
print("------------------------------Calculating Accuracy---------------------------------------")
# Overall score: per-row fraction of matching label columns, averaged over rows.
n_rows, n_labels = train_pred.shape
row_match_fraction = np.sum((train_pred == Y_train), axis=1)/n_labels
train_acc = 100*np.sum(row_match_fraction)/n_rows
print(f"Train acc: {train_acc} %")


print("--------------------------Calculating Acc and F1-------------------------------------------")
print("train_acc".ljust(20), "train_f1".ljust(20))
# One line per label column. train_acc is rebound here, so after this cell it
# holds the last column's accuracy rather than the overall score above.
# NOTE(review): with average='micro' on a single column the F1 value looks
# identical to accuracy - confirm whether average='binary' was intended.
for col in range(n_labels):
    y_true_col, y_pred_col = Y_train[:, col], train_pred[:, col]
    train_acc = 100*accuracy_score(y_true_col, y_pred_col)
    train_f1 = 100*f1_score(y_true_col, y_pred_col, average='micro')

    # Right-align the 1-based column number to width 2 so single digits line up.
    print(f"{col + 1:>2}", str(train_acc).ljust(20), str(train_f1).ljust(20))

In [18]:
print("--------------------------Making Submission File-------------------------------------------")
def make_submission_csv(pred_matrix, Class_arr, savename, break_=True):
    """Write ``{dataset_path}/{savename}.csv`` with one ``ID,M,S`` row per sample.

    Parameters
    ----------
    pred_matrix : 2-D array
        Row ``i`` holds one prediction per entry of ``Class_arr``; any value
        ``!= 0`` counts as "label predicted".
    Class_arr : sequence of str
        Label names of the form ``"<Class>_<idx>"`` (exactly one underscore),
        where ``<Class>`` is ``M`` or ``S`` and ``<idx>`` parses as an int.
        Labels of any other class are ignored.
    savename : str
        File name without the ``.csv`` extension.
    break_ : bool, default True
        When true, stop after the row with ``i == 4`` (a five-row preview).

    Notes
    -----
    A predicted S index ``s`` is kept only if ``m + 1 <= s <= m + 99`` for at
    least one predicted M index ``m`` of the same sample; all other S indices
    are reported, counted as mispredictions and dropped from the row.

    The file is written through one context-managed handle. An S index that
    lies in the window of several M indices is handled (the previous
    ``list.remove`` bookkeeping raised ``ValueError`` in that case).

    Raises
    ------
    ValueError
        If a label does not split into exactly two parts on ``"_"`` or its
        index part is not an integer.
    """
    csv_path = f"{dataset_path}/{savename}.csv"
    mispred_count = 0
    with open(csv_path, 'w') as f:
        f.write("ID,M,S\n")
        for i in range(pred_matrix.shape[0]):
            M_list, S_list = [], []
            sample_pred = pred_matrix[i,:]
            for label, pred in zip(Class_arr, sample_pred):
                Class, idx = label.split("_")
                if (Class == 'M') and (pred != 0):
                    M_list.append(int(idx))
                elif (Class == 'S') and (pred != 0):
                    S_list.append(int(idx))

            # Positions in S_list whose value is not covered by any predicted M.
            invalid_idx = [
                sid for sid, s in enumerate(S_list)
                if not any(m + 1 <= s <= m + 99 for m in M_list)
            ]
            invalid_lookup = set(invalid_idx)
            mispred_count += len(invalid_idx)

            # The numpy array is kept so the diagnostic lines below render
            # exactly as before (array repr for S, list repr for M).
            S_arr = np.array(S_list)
            kept_S = [S_arr[k] for k in range(len(S_arr)) if k not in invalid_lookup]
            if invalid_idx:
                print(f"{i}, {M_list}, {S_arr}, Invalid S: {S_arr[invalid_idx]}---------------------------------------------------------------------------")
                print(kept_S)

            M_list = [f"{m}" for m in M_list]
            S_list = [f"{s}" for s in kept_S]
            # A multi-element list repr contains commas, so both cells are
            # quoted to keep the row at three CSV fields.
            if (len(M_list) > 1) or (len(S_list) > 1):
                M_list = f"\"{M_list}\""
                S_list = f"\"{S_list}\""
            file_entry = f"{i},{M_list},{S_list}"
            f.write(f"{file_entry}\n")
            if (i == 4) and break_:
                break
    print("No of mispred: ", mispred_count)
    print(pd.read_csv(csv_path))

# voting_preds = np.load(f"{dataset_path}/voting_preds.npy")[0]
# make_submission_csv(voting_preds, cols_C, f"submission_voting_MOCCGS_L2LR_MOclfGB_MOclfL2LR_MOCCGS_L1LR_liblinear", break_=False)
# Rebinds clf: from here on it refers to the CCXGB_102 model, not the one
# loaded via MODEL_NAME earlier in the notebook.
clf = joblib.load(
    "/home/23m1521/ashish/Kaggle/_3_IE506_2024_Programming_Challenge"
    "/models/models/CCXGB_102.joblib"
)
pred_maxtri = clf.predict(X_sub)

# break_=False writes every row instead of the five-row preview.
make_submission_csv(
    pred_matrix=pred_maxtri,
    Class_arr=cols_C,
    savename="submission_CCXGB_102",
    break_=False,
)

--------------------------Making Submission File-------------------------------------------




No of mispred:  0
            ID        M   S
0            0  ['100']  []
1            1  ['100']  []
2            2  ['100']  []
3            3       []  []
4            4       []  []
...        ...      ...  ..
149995  149995       []  []
149996  149996       []  []
149997  149997  ['100']  []
149998  149998  ['100']  []
149999  149999       []  []

[150000 rows x 3 columns]


In [19]:
# Kaggle CLI command lines executed in the next cell.
# cmd1 lists the competition leaderboard; cmd2 uploads the submission CSV,
# using the file's base name as the submission message.
cmd1 = "kaggle competitions leaderboard ie-506-2024-programming-challenge -s"
cmd2 = (
    "kaggle competitions submit -c ie-506-2024-programming-challenge"
    ' -f "/home/23m1521/ashish/Kaggle/_3_IE506_2024_Programming_Challenge/dataset/submission_CCXGB_102.csv"'
    " -m submission_CCXGB_102"
)
print(cmd2)

kaggle competitions submit -c ie-506-2024-programming-challenge -f "/home/23m1521/ashish/Kaggle/_3_IE506_2024_Programming_Challenge/dataset/submission_CCXGB_102.csv" -m submission_CCXGB_102


In [20]:
# Run the two Kaggle CLI command strings built in the previous cell through the
# shell: first the leaderboard listing (cmd1), then the submission upload (cmd2).
# Both are fixed strings, so shell=True is not fed external input here.
# Return codes are not checked; a failed command does not raise.
subprocess.run(cmd1, shell=True)
# Kept as the cell's last bare expression so the notebook displays the
# resulting CompletedProcess (including its returncode).
subprocess.run(cmd2, shell=True)

  teamId  teamName                     submissionDate       score      
--------  ---------------------------  -------------------  ---------  
11918923  NeuroChem                    2024-04-23 03:51:34  0.8810956  
11921125  SaurabhMehra2001             2024-05-05 04:18:28  0.8726533  
11918975  Yalla Chandra Sri Veerendra  2024-04-28 08:48:41  0.8712938  
11981387  rupesh33                     2024-05-04 06:23:31  0.8708639  
12019972  SJ                           2024-05-04 14:14:33  0.8634600  
11918927  Mayur09                      2024-05-05 15:21:51  0.8626137  
11918872  Aashish                      2024-05-05 21:03:50  0.8618084  
11985911  Sachin Yadav                 2024-05-04 19:42:05  0.8596516  
12028801  Sangram123                   2024-05-04 18:41:28  0.8596409  
12061126  JAYANT KUMAR JHA             2024-05-04 15:49:51  0.8595774  
11919489  Harsh Bundeliya              2024-05-04 10:41:47  0.8576460  
11919256  Mohan Krishna katta          2024-05-05 19:41:32  0.85

100%|██████████| 2.03M/2.03M [00:03<00:00, 690kB/s]


Successfully submitted to IE 506 (2024) Programming Challenge

CompletedProcess(args='kaggle competitions submit -c ie-506-2024-programming-challenge -f "/home/23m1521/ashish/Kaggle/_3_IE506_2024_Programming_Challenge/dataset/submission_CCXGB_102.csv" -m submission_CCXGB_102', returncode=0)