### Importing all the required libraries

In [31]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest

In [32]:
# Relative locations of the training and test CSV files
train_data_path, test_data_path = 'datasets/train.csv', 'datasets/test.csv'

In [33]:
# Load each CSV file into its own pandas DataFrame
dataset = pd.read_csv(filepath_or_buffer=train_data_path)
test_data = pd.read_csv(filepath_or_buffer=test_data_path)

In [34]:
# Show the (rows, columns) dimensions of the training DataFrame
train_shape = dataset.shape
print("Shape of the traning data is ", train_shape)

Shape of the traning data is  (252, 319)


In [35]:
# Show the (rows, columns) dimensions of the test DataFrame
print("Shape of the testing data is ", test_data.shape)

Shape of the testing data is  (104, 319)


In [36]:
# Preview the first five rows of the training DataFrame
dataset.head(n=5)

Unnamed: 0,Labels,ACAN,AGER,ALPK1,ANKRD17,APOB,APPL1,APPL2,ARRB2,ASGR1,...,UBE2N,UBE2V1,UBQLN1,UFD1,UNC93B1,USP17L2,VCAN,WDFY1,XIAP,ZCCHC3
0,0,13.6584,88.0205,402.542,1424.2625,3.794,1001.233,508.0148,567.201,13.6584,...,2437.6363,3413.2107,4135.4453,950.0104,1434.1269,0.0,31.8695,923.0769,1046.0021,819.1217
1,0,22.0844,86.3807,690.4861,2226.3285,20.4071,486.9744,1448.064,651.9083,11.182,...,2153.9253,1817.3762,3426.4325,1380.1196,815.7241,0.0,1651.0166,1473.7825,1618.8684,615.8465
2,0,32.7422,53.082,128.4882,1073.0497,0.0,654.8431,492.6206,512.9604,13.8906,...,3122.411,3720.6995,3000.3721,2681.3791,1429.2447,0.4961,693.5384,477.2417,1064.6161,804.1672
3,0,22.7975,42.1175,634.0804,3285.5487,0.0,574.9614,513.9104,1033.2303,31.6847,...,2663.4467,3217.9212,4131.3756,1178.864,1018.5471,0.0,675.0386,1596.5997,1666.1515,402.6275
4,0,2.5,134.0625,532.1875,936.5625,0.0,1933.4375,679.375,1175.625,6.5625,...,1898.4375,3009.1375,2549.375,1293.1156,744.6875,0.0,69.0625,1651.5625,1250.3125,982.5


In [49]:
# Preview the first five rows of the test DataFrame
test_data.head(n=5)

Unnamed: 0,ID,ACAN,AGER,ALPK1,ANKRD17,APOB,APPL1,APPL2,ARRB2,ASGR1,...,UBE2N,UBE2V1,UBQLN1,UFD1,UNC93B1,USP17L2,VCAN,WDFY1,XIAP,ZCCHC3
0,1001,91.3265,132.5226,331.5548,1658.7666,0.0,1068.6189,571.2868,699.8387,9.4305,...,1525.2513,3320.8537,2817.7193,1397.1957,367.7876,0.4963,699.3424,995.657,1219.0098,840.7991
1,1002,17.5971,183.2524,558.8592,2321.6019,0.6068,929.0049,783.9806,1039.4417,27.3058,...,2489.6845,2102.5485,3799.1505,906.5534,1187.5,0.0,1967.233,1529.733,1083.1311,510.3155
2,1003,120.9309,221.8022,331.8437,891.794,0.0,309.4915,403.4852,1430.5385,49.8625,...,1760.6628,4822.9092,3718.4831,1907.9521,1732.0061,0.0,1190.9692,1156.5812,710.6842,689.4783
3,1004,12.093,41.8605,886.5116,2216.7442,0.0,792.5581,1012.093,637.2093,3.7209,...,1872.5581,2342.0093,3280.9302,1115.3116,773.0233,0.0,63.2558,1065.1163,1385.1163,563.7209
4,1005,84.0622,34.4995,145.7726,1266.2779,0.4859,1362.4879,830.9038,1064.1399,85.5199,...,1932.9446,3220.9184,3153.5471,1168.6054,965.5005,0.0,1168.6103,581.6327,878.5228,607.8717


### Extracting features and target variables from the dataset

In [37]:
# Column 0 becomes the target vector; every remaining column goes into the feature matrix.
# NOTE(review): presumably column 0 is the `Labels` column — confirm the CSV carries no
# extra leading index column, since the selection here is purely positional.
y, X = dataset.iloc[:, 0], dataset.iloc[:, 1:]

### Feature Selection

In [38]:
# Keep only the k highest-scoring features using scikit-learn's SelectKBest (default scoring
# function). k = 28 was chosen by tuning k for the highest AUC-ROC score.
# NOTE(review): the selector is fitted on every labelled row before the train/validation
# split performed further down, so the validation rows influence which features are kept
# and the reported validation score is likely optimistic — consider fitting the selector
# on the training split only.
selector = SelectKBest(k=28)
selector.fit(X, y)
X = selector.transform(X)

### Training the model

In [39]:
# Fixed seed so the split, the forest, and therefore the validation score are reproducible
RANDOM_STATE = 42
# Hold out 20% of the rows for validation; stratify on y so both splits keep its class balance
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=RANDOM_STATE
)
# Random Forest Classifier with 100 decision trees, each limited to a maximum depth of 10
clf = RandomForestClassifier(max_depth=10, n_estimators=100, random_state=RANDOM_STATE)
# Train (fit) the classifier on the training split only
clf.fit(X_train, y_train)

### Generating probabilities for the validation set

In [40]:
# Class-probability predictions for the held-out validation split.
# predict_proba returns one column per class, ordered as in clf.classes_.
val_proba = clf.predict_proba(X_test)

# Take the second column directly as a 1-D NumPy array instead of looping over rows;
# this is the positive class (class 1) when the labels are 0/1.
y_pred = val_proba[:, 1]

### Getting the ROC-AUC score

In [41]:
# Area under the ROC curve for the validation split: true labels vs. predicted probabilities
roc_auc = roc_auc_score(y_true=y_test, y_score=y_pred)
print(f'Validation ROC AUC: {roc_auc}')

Validation ROC AUC: 0.823076923076923


In [42]:
# Final model: retrain the same classifier on every labelled row (training + validation splits)
clf.fit(X=X, y=y)

### Generating probabilities for the test dataset

In [46]:
# Drop the first column of the test data and reduce the remaining columns with the
# already-fitted feature selector. The DataFrame is passed as-is (not `.values`) so that
# scikit-learn can check the test columns against the column names seen when the selector
# was fitted, instead of silently relying on column position.
test = selector.transform(test_data.iloc[:, 1:])



In [47]:
# Class-probability predictions for the test data from the refitted classifier.
# predict_proba returns one column per class, ordered as in clf.classes_.
test_proba = clf.predict_proba(test)

# Take the second column directly as a 1-D NumPy array instead of looping over rows;
# this is the positive class when the labels are 0/1.
y_pred = test_proba[:, 1]

### Generating the CSV file with predictions

In [48]:
# Write one "ID,probability" row per test sample to predictions.csv.
# IDs are read from the test data's `ID` column rather than a hand-maintained counter,
# so each probability is paired with the ID of the row it was predicted for.
sample_ids = test_data["ID"]

# Guard against stale state (e.g. y_pred still holding validation predictions):
# zip() below would otherwise silently truncate to the shorter sequence.
assert len(sample_ids) == len(y_pred), "y_pred must hold one prediction per test row"

# The context manager closes the file even if writing fails part-way through.
with open('predictions.csv', 'w') as f:
    f.write("ID,Labels\n")
    f.writelines(
        str(sample_id) + "," + str(prob) + "\n"
        for sample_id, prob in zip(sample_ids, y_pred)
    )