In [1]:
import numpy as np
import joblib
from sklearn.preprocessing import MultiLabelBinarizer as mlb
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, recall_score, f1_score, precision_score

# from google.colab import drive
# drive.mount('/content/drive')

file_path = 'C:\\Users\\abhis\\Desktop\\IE506\\Challenge\\ie-506-2024-programming-challenge\\IE506_2024_progchallenge_train.txt'
with open(file_path, 'r') as file:
  train_file_contents = file.read()

In [2]:
lines = train_file_contents.split('\n')
mlb_main_labels = mlb(classes = [i for i in range(100, 901, 100)])
mlb_sub_labels = mlb(classes = [i for i in range(100, 1000) if i % 100 != 0])
X_dict = []
M_labels = []
S_labels = []
for line in lines:
    parts = line.split()
    M = parts[0][2:] # extracting M labels
    S = parts[1][2:] # extracting S labels
    M = M.split(',') # split each M label
    S = S.split(',') # split each S label

    # convert M and S labels from strings to integers
    M_int = []
    for _ in M:
        M_int.append(int(_))
    M_labels.append(M_int)

    S_int = []
    for _ in S:
        S_int.append(int(_))
    S_labels.append(S_int)

    feature_values = parts[2:] # extract all features
    features = {} # dictionary to store individual parameter and its value
    for fv in feature_values:
        feature_index, feature_val = fv.split(':') # separating feature index and corresponding value
        features[int(feature_index)] = float(feature_val) # convert feature index to integer and value to float from strings
    X_dict.append(features)
M_labels_encoded = mlb_main_labels.fit_transform(M_labels)
S_labels_encoded = mlb_sub_labels.fit_transform(S_labels)
print('M_enc:', len(M_labels_encoded))
print('S_enc:', len(S_labels_encoded))
print('X_dict:', len(X_dict))
# print(len(S_labels_encoded[102356]))

M_enc: 200000
S_enc: 200000
X_dict: 200000


In [3]:
# creating the full data matrices
# join the M and S labels into Y full
Y_full = np.hstack([M_labels_encoded, S_labels_encoded])
# convert the X_dict to sparse matrix of data features
vectorizer = DictVectorizer(sparse=True)
X_full = vectorizer.fit_transform(X_dict)
print(Y_full.shape)
print(X_full.shape)
feature_mapping = vectorizer.get_feature_names_out()

(200000, 900)
(200000, 47210)


In [4]:
# split the train and holdout data
X_train, X_holdout, Y_train, Y_holdout = train_test_split(X_full, Y_full, test_size = 0.3, random_state = 64)
""" we'll split our X_train into two equal sets, X_train_1 and X_train_2; we'll train each of them separately
and later use stacking to combine the results of both the models to form the final meta_model"""
X_train_1, X_train_2, Y_train_1, Y_train_2 = train_test_split(X_train, Y_train, test_size = 0.5, random_state = 128)

In [5]:
# loding the numpy arrays
pred_1_path = 'C:\\Users\\abhis\\Desktop\\IE506\\Challenge\\ie-506-2024-programming-challenge\\\\outputs\\model_1\\1_holdout.csv'
pred_2_path = 'C:\\Users\\abhis\\Desktop\\IE506\\Challenge\\ie-506-2024-programming-challenge\\\\outputs\\model_2\\2_holdout.csv'
y_pred_1_holdout = np.loadtxt(pred_1_path, delimiter = ',', dtype = int)
y_pred_2_holdout = np.loadtxt(pred_2_path, delimiter = ',', dtype = int)
y_meta_train = np.hstack([y_pred_1_holdout, y_pred_2_holdout])
print(y_meta_train.shape)

(60000, 1800)


In [6]:
meta_model_rf = RandomForestClassifier(n_estimators = 60)
meta_model_rf.fit(y_meta_train, Y_holdout)

In [7]:
y_pred_meta_holdout = meta_model_rf.predict(y_meta_train)
accuracy = accuracy_score(Y_holdout, y_pred_meta_holdout)
error_rate = 1 - accuracy
precision = precision_score(Y_holdout, y_pred_meta_holdout, average='micro')
recall = recall_score(Y_holdout, y_pred_meta_holdout, average='micro')
f1score = f1_score(Y_holdout, y_pred_meta_holdout, average='micro')
print("Accuracy:", accuracy)
print("Error rate:", error_rate)
print("Precision, micro:", precision)
print("recall, micro:", recall)
print("f1score, micro:", f1score)

Accuracy: 0.6054
Error rate: 0.39459999999999995
Precision, micro: 0.9518238621592304
recall, micro: 0.7059695736588739
f1score, micro: 0.8106663612590429


In [8]:
pred_1_full_path = 'C:\\Users\\abhis\\Desktop\\IE506\\Challenge\\ie-506-2024-programming-challenge\\\\outputs\\model_1\\1_full.csv'
pred_2_full_path = 'C:\\Users\\abhis\\Desktop\\IE506\\Challenge\\ie-506-2024-programming-challenge\\\\outputs\\model_2\\2_full.csv'
y_pred_1_full = np.loadtxt(pred_1_full_path, delimiter = ',', dtype = int)
y_pred_2_full = np.loadtxt(pred_2_full_path, delimiter = ',', dtype = int)
y_meta_test = np.hstack([y_pred_1_full, y_pred_2_full])
print(y_meta_test.shape)

(200000, 1800)


In [9]:
y_pred_meta_full = meta_model_rf.predict(y_meta_test)
accuracy = accuracy_score(Y_full, y_pred_meta_full)
error_rate = 1 - accuracy
precision = precision_score(Y_full, y_pred_meta_full, average='micro')
recall = recall_score(Y_full, y_pred_meta_full, average='micro')
f1score = f1_score(Y_full, y_pred_meta_full, average='micro')
print("Accuracy:", accuracy)
print("Error rate:", error_rate)
print("Precision, micro:", precision)
print("recall, micro:", recall)
print("f1score, micro:", f1score)

Accuracy: 0.770755
Error rate: 0.22924500000000003
Precision, micro: 0.9674017871426286
recall, micro: 0.8442009620523784
f1score, micro: 0.9016121440088264


In [7]:
import os
folder_path = 'C:\\Users\\abhis\\Desktop\\IE506\\Challenge\\ie-506-2024-programming-challenge\\\\outputs\\meta_model'
if not os.path.exists(folder_path):
    os.makedirs(folder_path)

In [9]:
# np.savetxt(folder_path+'\\meta_holdout.csv', y_pred_meta_holdout, delimiter = ',', fmt='%d')
# np.savetxt(folder_path+'\\meta_full.csv', y_pred_meta_full, delimiter = ',', fmt='%d')
pred_1_actual_test_path = 'C:\\Users\\abhis\\Desktop\\IE506\\Challenge\\ie-506-2024-programming-challenge\\\\outputs\\model_1\\1_actual_test.csv'
pred_2_actual_test_path = 'C:\\Users\\abhis\\Desktop\\IE506\\Challenge\\ie-506-2024-programming-challenge\\\\outputs\\model_2\\2_actual_test.csv'
y_pred_1_actual_test = np.loadtxt(pred_1_actual_test_path, delimiter = ',', dtype = int)
y_pred_2_actual_test = np.loadtxt(pred_2_actual_test_path, delimiter = ',', dtype = int)
y_meta_test = np.hstack([y_pred_1_actual_test, y_pred_1_actual_test])
print(y_meta_test.shape)
y_pred_meta_actual_test = meta_model_rf.predict(y_meta_test)
np.savetxt(folder_path+'\\meta_actual_test.csv', y_pred_meta_actual_test, delimiter = ',', fmt='%d')

(150000, 1800)
