In [28]:
import numpy as np
import pandas as pd

import sklearn
from sklearn.metrics import r2_score
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import KNNImputer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVR

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

from sklearn.feature_selection import RFECV
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

from sklearn.decomposition import PCA

In [29]:
X_train_with_id = pd.read_csv('X_train.csv').values
Y_train_with_id = pd.read_csv('y_train.csv').values

In [30]:
X_train_1, X_test_1, y_train_1, y_test_1 = train_test_split(X_train_with_id, Y_train_with_id, test_size = 0.2)

In [31]:
X_train_missing_values = X_train_1[:,1:]
Y_train = y_train_1[:,1]
Y_test = y_test_1[:,1]

In [5]:
imr = SimpleImputer(missing_values = np.nan, strategy = 'median')
imr = imr.fit(X_train_missing_values)
X_train_temp_imputed = imr.transform(X_train_missing_values)

In [6]:
#HYPERPARAMETERS

OUTLIER_PERCENTILE = 3

# Removing Outliers

In [7]:
maximum_values = np.percentile(X_train_temp_imputed, 100 - OUTLIER_PERCENTILE, axis=0)
minimum_values = np.percentile(X_train_temp_imputed, OUTLIER_PERCENTILE, axis=0)

In [8]:
X_train_outliers_removed = np.ndarray(X_train_temp_imputed.shape)
for i in range(X_train_temp_imputed.shape[1]):
    indices = np.nonzero(X_train_temp_imputed[:,i] > maximum_values[i])
    X_train_outliers_removed[indices,i] = np.nan 
    indices = np.nonzero(X_train_temp_imputed[:,i] < minimum_values[i])
    X_train_outliers_removed[indices,i] = np.nan 
    #X_train_outliers_removed[:,i] = np.clip(X_train_temp_imputed[:,i],minimum_values[i],maximum_values[i])

In [34]:
print(X_train_outliers_removed[:5,:5])

[[3.705e-321 3.542e-321 3.508e-321 1.976e-323 1.734e-321]
 [5.583e-322 4.456e-321 2.544e-321 4.343e-321 4.150e-321]
 [1.877e-322 3.948e-321 1.808e-321 1.927e-321 4.274e-321]
 [1.784e-321 2.579e-321 3.187e-321 3.607e-322 3.562e-321]
 [3.078e-321 3.617e-321 2.930e-321 2.861e-321 2.955e-321]]


In [9]:
print(np.count_nonzero(np.isnan(X_train_outliers_removed)))
print(X_train_outliers_removed.shape[0]*X_train_outliers_removed.shape[1])

49664
806208


In [10]:
X_train_outliers_removed = np.ndarray(X_train_temp_imputed.shape)
for i in range(X_train_missing_values.shape[0]):
    for j in range(X_train_missing_values.shape[1]):
        if np.isnan(X_train_missing_values[i][j]):
            X_train_outliers_removed[i][j] = np.nan

# Imputing missing values

In [11]:
imputer = KNNImputer(n_neighbors=2)
imputer.fit(X_train_outliers_removed)
X_train = imputer.transform(X_train_outliers_removed)

In [27]:
X_train_outliers_removed[:5,:5]

array([[3.705e-321, 3.542e-321, 3.508e-321, 1.976e-323, 1.734e-321],
       [5.583e-322, 4.456e-321, 2.544e-321, 4.343e-321, 4.150e-321],
       [1.877e-322, 3.948e-321, 1.808e-321, 1.927e-321, 4.274e-321],
       [1.784e-321, 2.579e-321, 3.187e-321, 3.607e-322, 3.562e-321],
       [3.078e-321, 3.617e-321, 2.930e-321, 2.861e-321, 2.955e-321]])

# Feature Selection

In [13]:
pca = PCA()
sc = StandardScaler()
X_train_std = sc.fit_transform(X_train)
X_train_pca = pca.fit_transform(X_train_std)

  explained_variance_ratio_ = explained_variance_ / total_var


In [17]:
X_train_clean = X_train_pca[:,:350]

In [18]:
print(X_train_clean.shape)
print(Y_train.shape)

(969, 350)
(969,)


In [19]:
estimator = RandomForestRegressor(n_estimators=250, max_depth = 15,random_state = 1)
selector = RFECV(estimator, cv=3, min_features_to_select = 150, step = 0.15, n_jobs = 4, verbose = 1)
selector = selector.fit(X_train_clean, Y_train)

Fitting estimator with 350 features.
Fitting estimator with 298 features.
Fitting estimator with 246 features.
Fitting estimator with 194 features.


In [20]:
pred = selector.predict(X_train_clean)

In [21]:
print(r2_score(Y_train, pred))

-3.363630883601587e-06


In [22]:
id = X_test_1[:,0]
X_test_missing_values = X_test_1[:,1:]
X_test_temp_imputed = imr.transform(X_test_missing_values)
X_test_outliers_removed = np.ndarray(X_test_temp_imputed.shape)
for i in range(X_train_temp_imputed.shape[1]):
    X_test_outliers_removed[:,i] = np.clip(X_test_temp_imputed[:,i],minimum_values[i],maximum_values[i])
X_test = imputer.transform(X_test_outliers_removed)
X_test_std = sc.transform(X_test)
X_test_pca = pca.transform(X_test_std)
X_test_clean = X_test_pca[:,:350]
test_pred = selector.predict(X_test_clean)
print(r2_score(Y_test,test_pred))

-9.769473156828745e-05


# Model Training

In [17]:
X_test_with_id = pd.read_csv('X_test.csv').values

In [18]:
id = X_test_with_id[:,0]
X_test_missing_values = X_test_with_id[:,1:]
X_test_temp_imputed = imr.transform(X_test_missing_values)
X_test_outliers_removed = np.ndarray(X_test_temp_imputed.shape)
for i in range(X_train_temp_imputed.shape[1]):
    X_test_outliers_removed[:,i] = np.clip(X_test_temp_imputed[:,i],minimum_values[i],maximum_values[i])
X_test = imputer.transform(X_test_outliers_removed)

test_pred = selector.predict(X_test)

In [19]:
output = np.ndarray([776,2])

In [20]:
output = pd.DataFrame({'id': id, 'y':test_pred})

In [21]:
print(output.shape)

(776, 2)


In [22]:
output.to_csv("RandomForestTry5.csv")