In [1]:
import numpy as np
import joblib
from sklearn.preprocessing import MultiLabelBinarizer as mlb
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, recall_score, f1_score, precision_score
from scipy.sparse import load_npz

file_path = 'C:\\Users\\abhis\\Desktop\\IE506\\Challenge\\ie-506-2024-programming-challenge\\IE506_2024_progchallenge_train.txt'
with open(file_path, 'r') as file:
  train_file_contents = file.read()

In [2]:
lines = train_file_contents.split('\n')
mlb_main_labels = mlb(classes = [i for i in range(100, 901, 100)])
mlb_sub_labels = mlb(classes = [i for i in range(100, 1000) if i % 100 != 0])
X_dict = []
M_labels = []
S_labels = []
for line in lines:
    parts = line.split()
    M = parts[0][2:] # extracting M labels
    S = parts[1][2:] # extracting S labels
    M = M.split(',') # split each M label
    S = S.split(',') # split each S label

    # convert M and S labels from strings to integers
    M_int = []
    for _ in M:
        M_int.append(int(_))
    M_labels.append(M_int)

    S_int = []
    for _ in S:
        S_int.append(int(_))
    S_labels.append(S_int)

    feature_values = parts[2:] # extract all features
    features = {} # dictionary to store individual parameter and its value
    for fv in feature_values:
        feature_index, feature_val = fv.split(':') # separating feature index and corresponding value
        features[int(feature_index)] = float(feature_val) # convert feature index to integer and value to float from strings
    X_dict.append(features)
M_labels_encoded = mlb_main_labels.fit_transform(M_labels)
S_labels_encoded = mlb_sub_labels.fit_transform(S_labels)
print('M_enc:', len(M_labels_encoded))
print('S_enc:', len(S_labels_encoded))
print('X_dict:', len(X_dict))
# print(len(S_labels_encoded[102356]))

M_enc: 200000
S_enc: 200000
X_dict: 200000


In [3]:
# creating the full data matrices
# join the M and S labels into Y full
Y_full = np.hstack([M_labels_encoded, S_labels_encoded])
# convert the X_dict to sparse matrix of data features
vectorizer = DictVectorizer(sparse=True)
X_full = vectorizer.fit_transform(X_dict)
print(Y_full.shape)
print(X_full.shape)
feature_mapping = vectorizer.get_feature_names_out()

(200000, 900)
(200000, 47210)


In [4]:
# split the train and holdout data
X_train, X_holdout, Y_train, Y_holdout = train_test_split(X_full, Y_full, test_size = 0.3, random_state = 64)
""" we'll split our X_train into two equal sets, X_train_1 and X_train_2; we'll train each of them separately
and later use stacking to combine the results of both the models to form the final meta_model"""
X_train_1, X_train_2, Y_train_1, Y_train_2 = train_test_split(X_train, Y_train, test_size = 0.5, random_state = 128)

In [8]:
# Load the sparse matrix
X_test = load_npz("C:\\Users\\abhis\\Desktop\\IE506\\Challenge\\ie-506-2024-programming-challenge\\X_test.npz")
# print(X_test)

In [6]:
model_1_rf = RandomForestClassifier(n_estimators = 60)
model_1_rf.fit(X_train_1, Y_train_1)

In [7]:
y_pred_1_1 = model_1_rf.predict(X_train_1)
accuracy = accuracy_score(Y_train_1, y_pred_1_1)
error_rate = 1 - accuracy
precision = precision_score(Y_train_1, y_pred_1_1, average='micro')
recall = recall_score(Y_train_1, y_pred_1_1, average='micro')
f1score = f1_score(Y_train_1, y_pred_1_1, average='micro')
print("Accuracy:", accuracy)
print("Error rate:", error_rate)
print("Precision, micro:", precision)
print("recall, micro:", recall)
print("f1score, micro:", f1score)

Accuracy: 0.9940857142857142
Error rate: 0.005914285714285761
Precision, micro: 0.998781764211321
recall, micro: 0.9969343540735413
f1score, micro: 0.9978572040798834


In [8]:
y_pred_1_2 = model_1_rf.predict(X_train_2)
accuracy = accuracy_score(Y_train_2, y_pred_1_2)
error_rate = 1 - accuracy
precision = precision_score(Y_train_2, y_pred_1_2, average='micro')
recall = recall_score(Y_train_2, y_pred_1_2, average='micro')
f1score = f1_score(Y_train_2, y_pred_1_2, average='micro')
print("Accuracy:", accuracy)
print("Error rate:", error_rate)
print("Precision, micro:", precision)
print("recall, micro:", recall)
print("f1score, micro:", f1score)

Accuracy: 0.5546857142857143
Error rate: 0.44531428571428566
Precision, micro: 0.9597300166843622
recall, micro: 0.6476274415034259
f1score, micro: 0.7733780585932593


In [9]:
y_pred_1_holdout = model_1_rf.predict(X_holdout)
accuracy = accuracy_score(Y_holdout, y_pred_1_holdout)
error_rate = 1 - accuracy
precision = precision_score(Y_holdout, y_pred_1_holdout, average='micro')
recall = recall_score(Y_holdout, y_pred_1_holdout, average='micro')
f1score = f1_score(Y_holdout, y_pred_1_holdout, average='micro')
print("Accuracy:", accuracy)
print("Error rate:", error_rate)
print("Precision, micro:", precision)
print("recall, micro:", recall)
print("f1score, micro:", f1score)

Accuracy: 0.55645
Error rate: 0.44355
Precision, micro: 0.9592055138828889
recall, micro: 0.6495983935742972
f1score, micro: 0.7746107005803812


In [18]:
import os
folder_path = 'C:\\Users\\abhis\\Desktop\\IE506\\Challenge\\ie-506-2024-programming-challenge\\\\outputs\\model_1'
if not os.path.exists(folder_path):
    os.makedirs(folder_path)
# np.savetxt(folder_path+'\\1_1.csv', y_pred_1_1, delimiter = ',', fmt='%d')
# np.savetxt(folder_path+'\\1_2.csv', y_pred_1_2, delimiter = ',', fmt='%d')
np.savetxt(folder_path+'\\1_holdout.csv', y_pred_1_holdout, delimiter = ',', fmt='%d')


In [27]:
# del y_pred_1_holdout

In [28]:
y_pred_1_full = model_1_rf.predict(X_full)
accuracy = accuracy_score(Y_full, y_pred_1_full)
error_rate = 1 - accuracy
precision = precision_score(Y_full, y_pred_1_full, average='micro')
recall = recall_score(Y_full, y_pred_1_full, average='micro')
f1score = f1_score(Y_full, y_pred_1_full, average='micro')
print("Accuracy:", accuracy)
print("Error rate:", error_rate)
print("Precision, micro:", precision)
print("recall, micro:", recall)
print("f1score, micro:", f1score)

Accuracy: 0.709005
Error rate: 0.290995
Precision, micro: 0.9768646504910603
recall, micro: 0.7702441826144532
f1score, micro: 0.8613365120534524


In [29]:
np.savetxt(folder_path+'\\1_full.csv', y_pred_1_full, delimiter = ',', fmt='%d')

In [30]:
# del y_pred_1_full

In [5]:
folder_path = 'C:\\Users\\abhis\\Desktop\\IE506\\Challenge\\ie-506-2024-programming-challenge\\\\outputs\\model_1'
feature_mapping = vectorizer.get_feature_names_out()
print(feature_mapping[376])

377


In [6]:
import random
random.seed(512)
random_numbers = [random.randint(0, 47209) for _ in range(100)]

keys_1_obj = feature_mapping[random_numbers]
print(random_numbers)
keys_1 = []
for _ in range(len(keys_1_obj)):
    keys_1.append(int(keys_1_obj[_]))
print(keys_1)
keys_1_with_features = np.vstack([random_numbers, keys_1])
np.savetxt(folder_path+'\\keys_1_1.csv', keys_1_with_features, delimiter = ',')

[1244, 28787, 9299, 21657, 27000, 23713, 40868, 26577, 11115, 14428, 15814, 15781, 298, 6510, 32705, 36775, 26904, 8104, 34143, 23437, 46667, 1253, 17261, 6238, 40908, 39348, 43645, 17805, 1381, 15908, 4460, 44871, 13642, 23773, 46873, 11546, 29485, 46399, 32328, 45777, 39368, 35331, 1461, 11627, 668, 9463, 38832, 29347, 20052, 12816, 34160, 36361, 27690, 9104, 45491, 14363, 39475, 10841, 45516, 30968, 5107, 36575, 4942, 12067, 23007, 41903, 37560, 42685, 39050, 10429, 24728, 8164, 44121, 5995, 35471, 14784, 42800, 6313, 3833, 36518, 415, 1981, 2651, 33509, 39566, 1358, 24323, 46548, 11607, 12736, 5239, 19036, 21164, 36807, 29497, 18443, 15624, 15371, 33098, 44802]
[1245, 28803, 9305, 21669, 27015, 23727, 40890, 26592, 11122, 14436, 15824, 15791, 299, 6515, 32724, 36795, 26919, 8110, 34163, 23451, 46693, 1254, 17272, 6243, 40931, 39370, 43670, 17816, 1382, 15918, 4463, 44896, 13650, 23787, 46899, 11553, 29501, 46425, 32347, 45802, 39390, 35351, 1462, 11634, 669, 9469, 38854, 29363, 200

In [9]:
random.seed(1024)
random_numbers = [random.randint(0, 47209) for _ in range(100)]

keys_1_obj = feature_mapping[random_numbers]
print(random_numbers)
keys_1 = []
for _ in range(len(keys_1_obj)):
    keys_1.append(int(keys_1_obj[_]))
print(keys_1)
keys_1_with_features = np.vstack([random_numbers, keys_1])
np.savetxt(folder_path+'\\keys_1_2.csv', keys_1_with_features, delimiter = ',')

[1253, 31686, 25521, 21182, 34064, 6582, 29171, 33612, 23979, 46921, 24171, 25454, 6292, 46624, 9185, 25527, 6703, 26845, 10162, 28587, 31060, 29243, 34402, 22751, 40405, 39838, 5785, 2304, 5474, 41709, 5379, 6710, 23727, 45170, 21911, 222, 24300, 19715, 11958, 8836, 10461, 25503, 18732, 10556, 5954, 15008, 38181, 7929, 30113, 44682, 36811, 1553, 45726, 26899, 1411, 1405, 16187, 10150, 21867, 22546, 30235, 40003, 11413, 36002, 45325, 27932, 3989, 4395, 25136, 6447, 39086, 25493, 12286, 14045, 46246, 2252, 12278, 40497, 43075, 21640, 19726, 26892, 30463, 33804, 5381, 40739, 23757, 6217, 16818, 2503, 4564, 39124, 11948, 8226, 34695, 31818, 21219, 27803, 30215, 23176]
[1254, 31704, 25536, 21194, 34084, 6587, 29187, 33632, 23993, 46947, 24186, 25469, 6297, 46650, 9191, 25542, 6708, 26860, 10169, 28602, 31078, 29259, 34422, 22764, 40427, 39860, 5790, 2305, 5478, 41732, 5383, 6715, 23741, 45195, 21924, 223, 24315, 19727, 11965, 8842, 10468, 25518, 18743, 10563, 5959, 15016, 38202, 7935, 3013

In [9]:
folder_path = 'C:\\Users\\abhis\\Desktop\\IE506\\Challenge\\ie-506-2024-programming-challenge\\\\outputs\\model_1'
y_pred_1_actual_test = model_1_rf.predict(X_test)
np.savetxt(folder_path+'\\1_actual_test.csv', y_pred_1_actual_test, delimiter = ',', fmt='%d')