# Indepndent Component Analysis on the NCAA dataset

scitkit learn random projection: https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.FastICA.html

## Load libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import time
from sklearn.decomposition import FastICA
from sklearn.preprocessing import normalize
from sklearn.neural_network import MLPClassifier
from scipy.stats import kurtosis

## Set directories

In [2]:
directory_hw1 = "/Users/mikepecorino/Documents/machine_learning/HW1/"
directory_hw3 = "/Users/mikepecorino/Documents/machine_learning/HW3/"

## Load inputs

In [3]:
data_all = pd.read_csv(directory_hw3 + "ncaa_all.csv")

## Train/Test split

In [4]:
train = data_all[data_all["tag"].isin(["train", "valid"])]
test = data_all[data_all["tag"] == "test"]

## Define features and response variable

### Features list

In [5]:
features = ["game_win_perc_prop", 
            "game_starters_total_minutes_prop", "game_starters_prop_minutes_prop", "game_player_pts_10plus_prop",
            "game_player_pts_15plus_prop", "game_player_pts_16plus_prop", "game_player_pts_17plus_prop",
            "game_player_pts_18plus_prop", "game_player_pts_19plus_prop", "game_player_pts_20plus_prop",
            "game_player_pts_21plus_prop", "game_player_pts_22plus_prop", "game_player_ast_3plus_prop",
            "game_player_ast_5plus_prop", "game_player_ast_7plus_prop",
            "game_player_orb_1plus_prop", "game_player_orb_2plus_prop", "game_player_orb_3plus_prop",
            "game_player_drb_5plus_prop", "game_player_drb_7plus_prop", "game_player_drb_10plus_prop",
            "game_gs_mean_prop", "game_gs_max_prop", "game_pos_prop", "game_pts_prop", "game_efficiency_prop",
            "game_fg_attempted_prop", "game_ft_attempted_prop", "game_ft_made_prop", "game_stl_prop",
            "game_tov_prop","game_stl_tov_ratio_diff", "game_stl_tov_ratio_prop", "game_blk_prop",
            "game_orb_prop", "game_drb_prop", "game_trb_prop", "game_ast_prop", "game_pf_diff", "game_pf_prop",
            "home_indicator.x", "neutral_indicator"]

### Features data

In [6]:
data_all_features = data_all[features]
train_features = train[features]
test_features = test[features]

### Response variable

In [7]:
response = "win_indicator"

### Response data

In [8]:
data_all_response = data_all[response]
train_response = train[response]
test_response = test[response]

## Normalize data

In [9]:
data_all_features_normalized = normalize(data_all_features)
train_features_normalized = normalize(train_features)
test_features_normalized = normalize(test_features)

## Get best neural net hyperparameters from HW1

In [10]:
neural_network_results = pd.read_csv(directory_hw1 + "sensor_neural_net_grid_search.csv")
out_fold_cv_summary = neural_network_results.groupby(["hidden_layer_size", "solver", "activation", "alpha"])["out_fold_accuracy"].agg(["mean"]).sort_values(by = ["mean"], ascending = False)
optimal_results = out_fold_cv_summary.index[np.argmax(out_fold_cv_summary["mean"])]
hidden_layer_size_opt = optimal_results[0]
solver_opt = optimal_results[1]
activation_opt = optimal_results[2]
alpha_opt = optimal_results[3]

## Get ICA transformation

In [11]:
transformer = FastICA(n_components = len(features), max_iter = 2000, random_state = 28)
train_features_normalized_ica = transformer.fit_transform(train_features_normalized)
test_features_normalized_ica = transformer.fit_transform(test_features_normalized)

## Sort transformed variables by kurtosis

In [12]:
ica_kurtosis_train = pd.DataFrame({"kurtosis":kurtosis(train_features_normalized_ica)})
ica_kurtosis_train["var_index"] = ica_kurtosis_train.index
ica_kurtosis_train = ica_kurtosis_train.sort_values(by = "kurtosis", ascending = False)
ica_variables = list(ica_kurtosis_train["var_index"])

## Optimizing the number of components

In [13]:
#Initialize an empty data frame for recording results
neural_network_results = pd.DataFrame(columns = ["iter_counter",
                                                 "hidden_layer_size",
                                                 "solver",
                                                 "activation",
                                                 "alpha",
                                                 "train_accuracy",
                                                 "test_accuracy",
                                                 "time"
                                                 ])

#Set hyperparameter space
n_components = range(1, 43, 1)

#Start an iteration counter
iter_counter = 0

#For each alpha, hidden layer size, and fold...
#(using fixed solver and activiation function for now)
for n_component in n_components:
    
    #Increment the counter
    iter_counter = iter_counter + 1
    
    #Output message
    print("Iter:", iter_counter,
            "| number of components:", n_component)

    #Get the random projection
    start_time = time.time()
    
    #transformer = FastICA(n_components = n_component, max_iter = 2000, random_state = 28)
    #train_features_normalized_ica = transformer.fit_transform(train_features_normalized)
    #test_features_normalized_ica = transformer.fit_transform(test_features_normalized)
    
    train_features_normalized_ica_model = train_features_normalized_ica[:, ica_variables[0:n_component]]
    test_features_normalized_ica_model = test_features_normalized_ica[:, ica_variables[0:n_component]]

    #Get the neural net object
    neural_network = MLPClassifier(solver = solver_opt, #lbfgs, adam, sgd
                                   activation = activation_opt, #identity, logistic, tanh, relu
                                   alpha = alpha_opt,
                                   hidden_layer_sizes = (hidden_layer_size_opt,),
                                   batch_size = "auto",
                                   learning_rate = "constant",
                                   learning_rate_init = 0.001,
                                   power_t = 0.5,
                                   max_iter = 200,
                                   shuffle = True,
                                   random_state = 28,
                                   tol = 0.0001,
                                   verbose = False,
                                   warm_start = False,
                                   momentum = 0.9,
                                   nesterovs_momentum = True,
                                   early_stopping = True,
                                   validation_fraction = 0.1,
                                   beta_1 = 0.9,
                                   beta_2 = 0.999,
                                   epsilon = 1e-08,
                                   n_iter_no_change = 10,
                                   max_fun = 15000)
    
    #Fit the model
    neural_network.fit(train_features_normalized_ica_model, train_response) 
    
    #Evaluate the model
    #In fold
    train_accuracy = neural_network.score(train_features_normalized_ica_model, train_response)
    #Out of fold
    test_accuracy = neural_network.score(test_features_normalized_ica_model, test_response)

    end_time = time.time()
    total_time = end_time - start_time

    #Add to results list
    neural_network_results = neural_network_results.append({"iter_counter": iter_counter,
                                                            "n_components": n_component,
                                                            "hidden_layer_size": hidden_layer_size_opt,
                                                            "solver": solver_opt,
                                                            "activation": activation_opt,
                                                            "alpha": alpha_opt,
                                                            "train_accuracy": train_accuracy,
                                                            "test_accuracy": test_accuracy,
                                                            "time": total_time},
                                                            ignore_index = True)
    print(neural_network_results, "\n")
    neural_network_results.to_csv(directory_hw3 + "sensor_ica_neural_net_optimization.csv", index = False)

Iter: 1 | number of components: 1
  iter_counter hidden_layer_size solver activation  alpha  train_accuracy  \
0            1              1000   adam       relu   0.02        0.507798   

   test_accuracy      time  n_components  
0       0.497382  5.700343           1.0   

Iter: 2 | number of components: 2
  iter_counter hidden_layer_size solver activation  alpha  train_accuracy  \
0            1              1000   adam       relu   0.02        0.507798   
1            2              1000   adam       relu   0.02        0.503309   

   test_accuracy      time  n_components  
0       0.497382  5.700343           1.0  
1       0.494379  3.931698           2.0   

Iter: 3 | number of components: 3
  iter_counter hidden_layer_size solver activation  alpha  train_accuracy  \
0            1              1000   adam       relu   0.02        0.507798   
1            2              1000   adam       relu   0.02        0.503309   
2            3              1000   adam       relu   0.02    

   iter_counter hidden_layer_size solver activation  alpha  train_accuracy  \
0             1              1000   adam       relu   0.02        0.507798   
1             2              1000   adam       relu   0.02        0.503309   
2             3              1000   adam       relu   0.02        0.509699   
3             4              1000   adam       relu   0.02        0.515105   
4             5              1000   adam       relu   0.02        0.502818   
5             6              1000   adam       relu   0.02        0.502261   
6             7              1000   adam       relu   0.02        0.519037   
7             8              1000   adam       relu   0.02        0.546166   
8             9              1000   adam       relu   0.02        0.544397   
9            10              1000   adam       relu   0.02        0.544528   
10           11              1000   adam       relu   0.02        0.548231   

    test_accuracy      time  n_components  
0        0.497382  

   iter_counter hidden_layer_size solver activation  alpha  train_accuracy  \
0             1              1000   adam       relu   0.02        0.507798   
1             2              1000   adam       relu   0.02        0.503309   
2             3              1000   adam       relu   0.02        0.509699   
3             4              1000   adam       relu   0.02        0.515105   
4             5              1000   adam       relu   0.02        0.502818   
5             6              1000   adam       relu   0.02        0.502261   
6             7              1000   adam       relu   0.02        0.519037   
7             8              1000   adam       relu   0.02        0.546166   
8             9              1000   adam       relu   0.02        0.544397   
9            10              1000   adam       relu   0.02        0.544528   
10           11              1000   adam       relu   0.02        0.548231   
11           12              1000   adam       relu   0.02      

   iter_counter hidden_layer_size solver activation  alpha  train_accuracy  \
0             1              1000   adam       relu   0.02        0.507798   
1             2              1000   adam       relu   0.02        0.503309   
2             3              1000   adam       relu   0.02        0.509699   
3             4              1000   adam       relu   0.02        0.515105   
4             5              1000   adam       relu   0.02        0.502818   
5             6              1000   adam       relu   0.02        0.502261   
6             7              1000   adam       relu   0.02        0.519037   
7             8              1000   adam       relu   0.02        0.546166   
8             9              1000   adam       relu   0.02        0.544397   
9            10              1000   adam       relu   0.02        0.544528   
10           11              1000   adam       relu   0.02        0.548231   
11           12              1000   adam       relu   0.02      

   iter_counter hidden_layer_size solver activation  alpha  train_accuracy  \
0             1              1000   adam       relu   0.02        0.507798   
1             2              1000   adam       relu   0.02        0.503309   
2             3              1000   adam       relu   0.02        0.509699   
3             4              1000   adam       relu   0.02        0.515105   
4             5              1000   adam       relu   0.02        0.502818   
5             6              1000   adam       relu   0.02        0.502261   
6             7              1000   adam       relu   0.02        0.519037   
7             8              1000   adam       relu   0.02        0.546166   
8             9              1000   adam       relu   0.02        0.544397   
9            10              1000   adam       relu   0.02        0.544528   
10           11              1000   adam       relu   0.02        0.548231   
11           12              1000   adam       relu   0.02      

   iter_counter hidden_layer_size solver activation  alpha  train_accuracy  \
0             1              1000   adam       relu   0.02        0.507798   
1             2              1000   adam       relu   0.02        0.503309   
2             3              1000   adam       relu   0.02        0.509699   
3             4              1000   adam       relu   0.02        0.515105   
4             5              1000   adam       relu   0.02        0.502818   
5             6              1000   adam       relu   0.02        0.502261   
6             7              1000   adam       relu   0.02        0.519037   
7             8              1000   adam       relu   0.02        0.546166   
8             9              1000   adam       relu   0.02        0.544397   
9            10              1000   adam       relu   0.02        0.544528   
10           11              1000   adam       relu   0.02        0.548231   
11           12              1000   adam       relu   0.02      

   iter_counter hidden_layer_size solver activation  alpha  train_accuracy  \
0             1              1000   adam       relu   0.02        0.507798   
1             2              1000   adam       relu   0.02        0.503309   
2             3              1000   adam       relu   0.02        0.509699   
3             4              1000   adam       relu   0.02        0.515105   
4             5              1000   adam       relu   0.02        0.502818   
5             6              1000   adam       relu   0.02        0.502261   
6             7              1000   adam       relu   0.02        0.519037   
7             8              1000   adam       relu   0.02        0.546166   
8             9              1000   adam       relu   0.02        0.544397   
9            10              1000   adam       relu   0.02        0.544528   
10           11              1000   adam       relu   0.02        0.548231   
11           12              1000   adam       relu   0.02      

   iter_counter hidden_layer_size solver activation  alpha  train_accuracy  \
0             1              1000   adam       relu   0.02        0.507798   
1             2              1000   adam       relu   0.02        0.503309   
2             3              1000   adam       relu   0.02        0.509699   
3             4              1000   adam       relu   0.02        0.515105   
4             5              1000   adam       relu   0.02        0.502818   
5             6              1000   adam       relu   0.02        0.502261   
6             7              1000   adam       relu   0.02        0.519037   
7             8              1000   adam       relu   0.02        0.546166   
8             9              1000   adam       relu   0.02        0.544397   
9            10              1000   adam       relu   0.02        0.544528   
10           11              1000   adam       relu   0.02        0.548231   
11           12              1000   adam       relu   0.02      

   iter_counter hidden_layer_size solver activation  alpha  train_accuracy  \
0             1              1000   adam       relu   0.02        0.507798   
1             2              1000   adam       relu   0.02        0.503309   
2             3              1000   adam       relu   0.02        0.509699   
3             4              1000   adam       relu   0.02        0.515105   
4             5              1000   adam       relu   0.02        0.502818   
5             6              1000   adam       relu   0.02        0.502261   
6             7              1000   adam       relu   0.02        0.519037   
7             8              1000   adam       relu   0.02        0.546166   
8             9              1000   adam       relu   0.02        0.544397   
9            10              1000   adam       relu   0.02        0.544528   
10           11              1000   adam       relu   0.02        0.548231   
11           12              1000   adam       relu   0.02      

   iter_counter hidden_layer_size solver activation  alpha  train_accuracy  \
0             1              1000   adam       relu   0.02        0.507798   
1             2              1000   adam       relu   0.02        0.503309   
2             3              1000   adam       relu   0.02        0.509699   
3             4              1000   adam       relu   0.02        0.515105   
4             5              1000   adam       relu   0.02        0.502818   
5             6              1000   adam       relu   0.02        0.502261   
6             7              1000   adam       relu   0.02        0.519037   
7             8              1000   adam       relu   0.02        0.546166   
8             9              1000   adam       relu   0.02        0.544397   
9            10              1000   adam       relu   0.02        0.544528   
10           11              1000   adam       relu   0.02        0.548231   
11           12              1000   adam       relu   0.02      

   iter_counter hidden_layer_size solver activation  alpha  train_accuracy  \
0             1              1000   adam       relu   0.02        0.507798   
1             2              1000   adam       relu   0.02        0.503309   
2             3              1000   adam       relu   0.02        0.509699   
3             4              1000   adam       relu   0.02        0.515105   
4             5              1000   adam       relu   0.02        0.502818   
5             6              1000   adam       relu   0.02        0.502261   
6             7              1000   adam       relu   0.02        0.519037   
7             8              1000   adam       relu   0.02        0.546166   
8             9              1000   adam       relu   0.02        0.544397   
9            10              1000   adam       relu   0.02        0.544528   
10           11              1000   adam       relu   0.02        0.548231   
11           12              1000   adam       relu   0.02      

   iter_counter hidden_layer_size solver activation  alpha  train_accuracy  \
0             1              1000   adam       relu   0.02        0.507798   
1             2              1000   adam       relu   0.02        0.503309   
2             3              1000   adam       relu   0.02        0.509699   
3             4              1000   adam       relu   0.02        0.515105   
4             5              1000   adam       relu   0.02        0.502818   
5             6              1000   adam       relu   0.02        0.502261   
6             7              1000   adam       relu   0.02        0.519037   
7             8              1000   adam       relu   0.02        0.546166   
8             9              1000   adam       relu   0.02        0.544397   
9            10              1000   adam       relu   0.02        0.544528   
10           11              1000   adam       relu   0.02        0.548231   
11           12              1000   adam       relu   0.02      

   iter_counter hidden_layer_size solver activation  alpha  train_accuracy  \
0             1              1000   adam       relu   0.02        0.507798   
1             2              1000   adam       relu   0.02        0.503309   
2             3              1000   adam       relu   0.02        0.509699   
3             4              1000   adam       relu   0.02        0.515105   
4             5              1000   adam       relu   0.02        0.502818   
5             6              1000   adam       relu   0.02        0.502261   
6             7              1000   adam       relu   0.02        0.519037   
7             8              1000   adam       relu   0.02        0.546166   
8             9              1000   adam       relu   0.02        0.544397   
9            10              1000   adam       relu   0.02        0.544528   
10           11              1000   adam       relu   0.02        0.548231   
11           12              1000   adam       relu   0.02      