In [36]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.impute import SimpleImputer

# Read the competition's training and test sets from the working directory.
train_data = pd.read_csv('iith_foml_2023_train.csv')
test_data = pd.read_csv('iith_foml_2023_test.csv')

# Separate labels from features; the label column is
# 'Target Variable (Discrete)'.
y_train = train_data['Target Variable (Discrete)']
X_train = train_data.drop(columns=['Target Variable (Discrete)'])

# Fill missing values with a default-configured SimpleImputer. It is fitted on
# the training features only and then reused unchanged on the test features.
imputer = SimpleImputer()
X_train_imputed = imputer.fit_transform(X_train)
X_test_imputed = imputer.transform(test_data)

# Candidate classifiers, all on library defaults apart from the arguments shown.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
xgb_model = XGBClassifier(random_state=42)
nb_model = GaussianNB()  # Gaussian variant picked for simplicity
svm_model = SVC()
knn_model = KNeighborsClassifier()

# Every classifier is trained on the same imputed training matrix.
for classifier in (rf_model, xgb_model, nb_model, svm_model, knn_model):
    classifier.fit(X_train_imputed, y_train)

# Predict the test set with each fitted classifier.
rf_predictions = rf_model.predict(X_test_imputed)
xgb_predictions = xgb_model.predict(X_test_imputed)
nb_predictions = nb_model.predict(X_test_imputed)
svm_predictions = svm_model.predict(X_test_imputed)
knn_predictions = knn_model.predict(X_test_imputed)

# Submission layout: a 1-based running 'ID' per test row plus the predicted
# 'Category'. The same ID range is shared by all five frames.
submission_ids = range(1, len(test_data) + 1)
rf_df = pd.DataFrame({'ID': submission_ids, 'Category': rf_predictions})
xgb_df = pd.DataFrame({'ID': submission_ids, 'Category': xgb_predictions})
nb_df = pd.DataFrame({'ID': submission_ids, 'Category': nb_predictions})
svm_df = pd.DataFrame({'ID': submission_ids, 'Category': svm_predictions})
knn_df = pd.DataFrame({'ID': submission_ids, 'Category': knn_predictions})

# Write one submission file per model. The trailing numbers are the scores
# the authors noted for each file.
rf_df.to_csv('rf_predictions.csv', index=False)    # score: 0.5
xgb_df.to_csv('xgb_predictions.csv', index=False)  # score: 0.5
nb_df.to_csv('nb_predictions.csv', index=False)    # score: 0.27
svm_df.to_csv('svm_predictions.csv', index=False)  # score: 0.26
knn_df.to_csv('knn_predictions.csv', index=False)  # score: 0.32


All of these predictions were submitted to Kaggle to measure their accuracy. The Random Forest and XGBoost predictions scored better than the others,
with a score of ~50%, so we will use these two for the rest of the process.

In [31]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.neural_network import MLPClassifier

# Read the raw training and test sets again.
train_data = pd.read_csv('iith_foml_2023_train.csv')
test_data = pd.read_csv('iith_foml_2023_test.csv')
n_train = len(train_data)

# Labels are taken from the training frame only.
y_train = train_data['Target Variable (Discrete)']

# Stack train on top of test (fresh 0..n-1 index) so both go through the same
# preprocessing, then remove the label column from the stacked frame.
combined_data = pd.concat([train_data, test_data], axis=0, ignore_index=True)
combined_data_for_nn = combined_data.drop(columns=['Target Variable (Discrete)'])

# NOTE(review): the imputer and the scaler are both fitted on the train AND
# test rows together, so test-set statistics influence the preprocessing.
# Confirm that this is intended.
imputer = SimpleImputer()
scaler = StandardScaler()
X_combined_imputed = imputer.fit_transform(combined_data_for_nn)
X_combined_standardized = scaler.fit_transform(X_combined_imputed)

# Undo the stacking: the first n_train rows are the training rows, the
# remainder are the test rows.
X_train_standardized = X_combined_standardized[:n_train]
X_test_standardized = X_combined_standardized[n_train:]

# Multi-layer perceptron classifier with two hidden layers (64 and 32 units).
nn_model = MLPClassifier(hidden_layer_sizes=(64, 32), max_iter=1000, alpha=0.001, random_state=42)
nn_model.fit(X_train_standardized, y_train)

# Classify the test rows and store the labels as strings.
test_predictions = nn_model.predict(X_test_standardized)
test_predictions_str = test_predictions.astype(str)

# Submission layout: 1-based running 'ID' plus the predicted 'Category'.
result_df = pd.DataFrame({'ID': range(1, len(test_data) + 1), 'Category': test_predictions_str})

# Write the neural-network submission file.
result_df.to_csv('nn_test_predictions.csv', index=False)


The neural network prediction also performs well, with ~55% accuracy.

We used 6 prediction algorithms, of which 3 consistently performed best, at around ~55%, 52% and 50%: the neural network, Random Forest and XGBoost.

**randomized sampling from 3 input predictions**


Prediction algorithm ≈ randomized sampling + accuracy estimator

Usually the accuracy estimator would be defined as a function, whereas we used Kaggle submissions as the accuracy estimate.

In [32]:
import pandas as pd
import random

def merge_and_update(input_files, output_file, iterations, seed=None):
    """Blend several prediction CSVs by repeated per-row random selection.

    Every input file must contain an 'ID' and a 'Category' column. In each
    iteration the current frames are inner-joined on 'ID', one of the
    candidate categories is drawn at random for every row, and the resulting
    blend replaces one randomly chosen frame in memory (the files on disk are
    never modified). The blend produced by the last iteration is written to
    ``output_file`` without the index.

    Args:
        input_files: Paths of the prediction CSVs to blend (at least two).
        output_file: Path of the CSV that receives the final blend.
        iterations: Number of blend-and-replace rounds; must be >= 1.
        seed: Optional seed. When given, a private ``random.Random(seed)``
            drives every draw so the run is reproducible. When ``None``
            (the default) the module-level ``random`` generator is used,
            as in the original implementation.

    Raises:
        ValueError: If fewer than two input files are given or if
            ``iterations`` is smaller than 1.
    """
    input_files = list(input_files)
    if len(input_files) < 2:
        raise ValueError("merge_and_update needs at least two input files")
    if iterations < 1:
        # Without at least one round there is no blend to write.
        raise ValueError("iterations must be at least 1")

    # The module itself exposes choice()/randint(), so it can stand in for a
    # Random instance when no seed is requested.
    rng = random.Random(seed) if seed is not None else random

    # Read the initial CSV files
    frames = [pd.read_csv(path) for path in input_files]
    candidate_columns = ['Category_%d' % position for position in range(len(frames))]

    merged_df = None
    for _ in range(iterations):
        # Inner-join all current frames on 'ID', giving each frame's category
        # its own column so any number of inputs can be merged without clashes.
        merged_df = None
        for frame, column in zip(frames, candidate_columns):
            candidates = frame[['ID', 'Category']].rename(columns={'Category': column})
            if merged_df is None:
                merged_df = candidates
            else:
                merged_df = pd.merge(merged_df, candidates, on='ID')

        # Randomly select one of the candidate categories for each ID
        merged_df['Category'] = merged_df.apply(
            lambda row: rng.choice([row[column] for column in candidate_columns]),
            axis=1,
        )

        # Keep only the submission columns
        merged_df = merged_df[['ID', 'Category']]

        # Overwrite one randomly chosen in-memory frame with the new blend
        updated_file_index = rng.randint(0, len(frames) - 1)
        frames[updated_file_index] = merged_df.copy()

    # Save the final result to the specified output file
    merged_df.to_csv(output_file, index=False)

# Blend the Random Forest, XGBoost and neural-network submissions over 1000
# rounds (adjust as needed) and write the outcome to result1.csv.
merge_and_update(
    input_files=['rf_predictions.csv', 'xgb_predictions.csv', 'nn_test_predictions.csv'],
    output_file='result1.csv',
    iterations=1000,
)


**intuition:**


Take 3 input CSV files and produce a randomized CSV made up of prediction values from these files, then replace the worst-scoring input with a better-scoring output (we repeated this a number of times to get better scores).

The best results we have obtained so far from this randomized selection algorithm: result2.csv at 68.3%, result4.csv at 69% and result1.csv at 68.8%.

NOTE: this method took a lot of trial and error before it reached a better prediction accuracy, but given a number of iterations it will provide better results (we got ours after the 3rd iteration, after which the score seemed to decrease).


The next step is to replace the CSV file that has the lowest accuracy with a better one, and run the randomized algorithm again to produce a predicted CSV file that might have better accuracy than the others.

In [33]:

# Blend two earlier blends (result2.csv, result1.csv) with the neural-network
# submission and write the outcome to result5.csv.
# NOTE(review): result2.csv is not written by any call visible in this
# notebook, presumably kept from an earlier run; confirm it exists.
merge_and_update(
    input_files=['result2.csv', 'result1.csv', 'nn_test_predictions.csv'],
    output_file='result5.csv',
    iterations=1000,
)


result5.csv- 0.72

In [34]:
# Blend result5.csv and result1.csv with the neural-network submission and
# write the outcome to result6.csv.
merge_and_update(
    input_files=['result5.csv', 'result1.csv', 'nn_test_predictions.csv'],
    output_file='result6.csv',
    iterations=1000,
)

result6.csv-0.63

In [35]:
# Blend three earlier blends (result5.csv, result1.csv, result4.csv) and
# write the outcome to result7.csv.
# NOTE(review): result4.csv is not written by any call visible in this
# notebook, presumably kept from an earlier run; confirm it exists.
merge_and_update(
    input_files=['result5.csv', 'result1.csv', 'result4.csv'],
    output_file='result7.csv',
    iterations=1000,
)

**Why does this randomized algorithm work?**

We initially took 3 CSV files with accuracy > 51%, so randomly picking from these 3 files may give an accuracy that is either higher or lower than that of the inputs.


By iteratively replacing the lowest-scoring input we actually improve the prediction accuracy; if the accuracy of a result is worse than that of the 3 inputs, just ignore that result and iterate once again.


Note: this method tends to reach a maximum score beyond which the prediction accuracy cannot be improved; in our case that is 72.096%. This algorithm may not produce the same output as ours, but it will give a better score depending on the number of submissions (we executed and submitted 55 times to improve the score ˙◠˙).