# In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV, validation_curve
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.feature_selection import mutual_info_classif, SelectKBest
from sklearn.metrics import accuracy_score
from google.colab import drive
import matplotlib.pyplot as plt

# Make Google Drive visible under /content/drive so the CSV files can be read.
drive.mount('/content/drive')

# Read the two training files and the test file from Drive.
train1, train2 = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/content/drive/My Drive/training1.csv',
        '/content/drive/My Drive/training2.csv',
    )
)
test = pd.read_csv('/content/drive/My Drive/test.csv')

# Stack both training frames into one, renumbering the rows from 0.
train = pd.concat((train1, train2), ignore_index=True)

# Pull the target and the per-row confidence out of the frame; every other
# column is kept as a feature.
y = train['label']
confidence = train['confidence']
X = train.drop(columns=['label', 'confidence'])

# Column labels of the two feature groups: the first 3072 columns are the CNN
# features, everything after them the Gist features.
cnn_features, gist_features = X.columns[:3072], X.columns[3072:]

# Hold out 20% of the rows for validation. Features, labels and confidences go
# through the same seeded shuffle, so they stay row-aligned.
(
    X_train, X_val,
    y_train, y_val,
    confidence_train, confidence_val,
) = train_test_split(X, y, confidence, test_size=0.2, random_state=42)

# Mean imputation: the column means are learned from the training rows only
# and then reused for the validation and test data.
# NOTE(review): `test` must expose the same feature columns as `X` for the
# transform to line up - confirm against the test file.
imputer = SimpleImputer(strategy='mean')
imputer.fit(X_train)
X_train, X_val, test = [imputer.transform(part) for part in (X_train, X_val, test)]

# Standardisation: scaling statistics likewise come from the training rows only.
scaler = StandardScaler()
scaler.fit(X_train)
X_train, X_val, test = [scaler.transform(part) for part in (X_train, X_val, test)]

# Split the scaled arrays into their CNN and Gist column groups, keep the
# highest-scoring columns of each group by mutual information with the label,
# and glue the two selections back together.
N_CNN_COLUMNS = 3072   # columns [0, 3072) are CNN features, the rest are Gist
TOP_K_PER_GROUP = 50   # number of columns kept from each group


def _seeded_mutual_info(features, labels):
    """Return mutual-information scores computed with a fixed seed.

    ``mutual_info_classif`` is randomised; without a seed the selected
    columns can differ from run to run even though every split in this
    script is seeded with 42.
    """
    return mutual_info_classif(features, labels, random_state=42)


# NOTE(review): positional slicing assumes the imputer/scaler kept every
# column in its original position - confirm no column was dropped upstream.
X_train_cnn, X_train_gist = X_train[:, :N_CNN_COLUMNS], X_train[:, N_CNN_COLUMNS:]
X_val_cnn, X_val_gist = X_val[:, :N_CNN_COLUMNS], X_val[:, N_CNN_COLUMNS:]
test_cnn, test_gist = test[:, :N_CNN_COLUMNS], test[:, N_CNN_COLUMNS:]

# Selectors are fitted on the training rows only, then applied unchanged to
# the validation and test rows.
mi_selector_cnn = SelectKBest(_seeded_mutual_info, k=TOP_K_PER_GROUP).fit(X_train_cnn, y_train)
X_train_cnn_selected = mi_selector_cnn.transform(X_train_cnn)
X_val_cnn_selected = mi_selector_cnn.transform(X_val_cnn)
test_cnn_selected = mi_selector_cnn.transform(test_cnn)

mi_selector_gist = SelectKBest(_seeded_mutual_info, k=TOP_K_PER_GROUP).fit(X_train_gist, y_train)
X_train_gist_selected = mi_selector_gist.transform(X_train_gist)
X_val_gist_selected = mi_selector_gist.transform(X_val_gist)
test_gist_selected = mi_selector_gist.transform(test_gist)

# Final design matrices: selected CNN columns first, selected Gist columns after.
X_train_selected = np.hstack((X_train_cnn_selected, X_train_gist_selected))
X_val_selected = np.hstack((X_val_cnn_selected, X_val_gist_selected))
test_selected = np.hstack((test_cnn_selected, test_gist_selected))

# Hyperparameter search runs on a seeded 20% subsample of the training split.
X_train_tune, _, y_train_tune, _, confidence_train_tune, _ = train_test_split(
    X_train_selected, y_train, confidence_train, train_size=0.2, random_state=42
)

# Gradient Boosting: 5 randomly drawn candidates from the grid below, scored
# by 3-fold cross-validated accuracy.
param_dist_gb = {
    'n_estimators': [100, 150],
    'learning_rate': [0.05, 0.1],
    'max_depth': [3, 4],
    'subsample': [0.8, 0.9],
    'min_samples_split': [2, 4],
    'min_samples_leaf': [1, 2],
}
# The search's random_state only fixes which candidates are drawn; the
# estimator needs its own seed for the CV scores (and therefore the chosen
# model) to be repeatable.
gb = GradientBoostingClassifier(random_state=42)
gb_cv = RandomizedSearchCV(
    gb, param_dist_gb, n_iter=5, cv=3, scoring='accuracy', n_jobs=-1, random_state=42
)
gb_cv.fit(X_train_tune, y_train_tune)
best_gb = gb_cv.best_estimator_

# Random Forest: same search protocol over its own grid.
param_dist_rf = {
    'n_estimators': [100, 150],
    'max_features': ['sqrt'],
    'max_depth': [10, 15],
    'min_samples_split': [2, 4],
    'min_samples_leaf': [1, 2],
}
# Seeded for the same reason as the Gradient Boosting estimator above.
rf = RandomForestClassifier(random_state=42)
rf_cv = RandomizedSearchCV(
    rf, param_dist_rf, n_iter=5, cv=3, scoring='accuracy', n_jobs=-1, random_state=42
)
rf_cv.fit(X_train_tune, y_train_tune)
best_rf = rf_cv.best_estimator_

# Validation curve for Gradient Boosting: sweep n_estimators over
# 10, 30, 50, 70, 90 with 3-fold cross-validation on the selected training
# features, leaving every other setting at its default.
param_range = np.arange(10, 101, 20)
train_scores, test_scores = validation_curve(
    GradientBoostingClassifier(),
    X_train_selected,
    y_train,
    param_name="n_estimators",
    param_range=param_range,
    scoring="accuracy",
    cv=3,
    n_jobs=-1,
)

# Average across the second axis to get one score per n_estimators value.
train_scores_mean = train_scores.mean(axis=1)
test_scores_mean = test_scores.mean(axis=1)

# Training accuracy in red, cross-validated accuracy in green.
fig, ax = plt.subplots()
ax.plot(param_range, train_scores_mean, color="r", label="Training score")
ax.plot(param_range, test_scores_mean, color="g", label="Cross-validation score")
ax.set_title("Validation Curve with Gradient Boosting")
ax.set_xlabel("Number of Estimators")
ax.set_ylabel("Accuracy")
fig.tight_layout()
ax.legend(loc="best")
plt.show()

# Training-set-size experiment: refit both tuned models on growing fractions
# of the training split and score each refit on the fixed validation set.
training_sizes = [0.1, 0.2, 0.5, 0.8]
rf_accuracies, gb_accuracies = [], []

for size in training_sizes:
    # The seed is the same for every fraction; only the subset size changes.
    subset = train_test_split(
        X_train_selected, y_train, confidence_train,
        train_size=size, random_state=42,
    )
    # train_test_split returns (train, rest) pairs; keep the train halves.
    X_train_sub, y_train_sub, confidence_sub = subset[0], subset[2], subset[4]

    # Random Forest first, then Gradient Boosting; each model is refitted in
    # place and its validation accuracy appended to its own list.
    for model, scores in ((best_rf, rf_accuracies), (best_gb, gb_accuracies)):
        model.fit(X_train_sub, y_train_sub)
        scores.append(accuracy_score(y_val, model.predict(X_val_selected)))

# One line per model: validation accuracy against training fraction.
fig, ax = plt.subplots()
ax.plot(training_sizes, rf_accuracies, marker='o', label="Random Forest")
ax.plot(training_sizes, gb_accuracies, marker='o', label="Gradient Boosting")
ax.set_xlabel('Training Set Size (fraction)')
ax.set_ylabel('Validation Accuracy')
ax.set_title('Validation Accuracy vs Training Set Size')
ax.legend(loc='best')
fig.tight_layout()
plt.show()

# Effect of label confidence
# For each model family, fit one copy without and one copy with the label
# confidences as sample weights, then compare accuracy on the validation set.
# Both copies of a pair use the same hyperparameters and the same seed, so a
# difference in accuracy reflects the weights rather than different random
# draws inside the two fits.
COMPARISON_SEED = 42

best_rf.set_params(random_state=COMPARISON_SEED)
best_rf.fit(X_train_selected, y_train)
best_rf_weighted = RandomForestClassifier(**rf_cv.best_params_, random_state=COMPARISON_SEED)
best_rf_weighted.fit(X_train_selected, y_train, sample_weight=confidence_train)

best_gb.set_params(random_state=COMPARISON_SEED)
best_gb.fit(X_train_selected, y_train)
best_gb_weighted = GradientBoostingClassifier(**gb_cv.best_params_, random_state=COMPARISON_SEED)
best_gb_weighted.fit(X_train_selected, y_train, sample_weight=confidence_train)

rf_pred = best_rf.predict(X_val_selected)
rf_weighted_pred = best_rf_weighted.predict(X_val_selected)
gb_pred = best_gb.predict(X_val_selected)
gb_weighted_pred = best_gb_weighted.predict(X_val_selected)

# Score every model once and reuse the numbers for the printout and the table.
rf_accuracy = accuracy_score(y_val, rf_pred)
rf_weighted_accuracy = accuracy_score(y_val, rf_weighted_pred)
gb_accuracy = accuracy_score(y_val, gb_pred)
gb_weighted_accuracy = accuracy_score(y_val, gb_weighted_pred)

print(f"Random Forest Accuracy (without confidence): {rf_accuracy}")
print(f"Random Forest Accuracy (with confidence): {rf_weighted_accuracy}")
print(f"Gradient Boosting Accuracy (without confidence): {gb_accuracy}")
print(f"Gradient Boosting Accuracy (with confidence): {gb_weighted_accuracy}")

# Collect the four accuracies in a table, one row per model variant.
results = pd.DataFrame({
    'Model': ['Random Forest', 'Random Forest Weighted', 'Gradient Boosting', 'Gradient Boosting Weighted'],
    'Accuracy': [
        rf_accuracy,
        rf_weighted_accuracy,
        gb_accuracy,
        gb_weighted_accuracy,
    ],
})

# Predict labels for the selected test features with best_rf.
test_predictions = best_rf.predict(test_selected)

# Write the predictions to Drive as a single-column CSV ('prediction'),
# without the row index.
predictions_df = pd.DataFrame({'prediction': test_predictions})
predictions_df.to_csv('/content/drive/My Drive/predictions.csv', index=False)
