In [31]:
import random
import sklearn
import warnings
import os
import numpy as np
import pandas as pd 
from sklearn.decomposition import PCA
from sklearn.model_selection import KFold, cross_val_score, GridSearchCV
from sklearn import linear_model
from sklearn.metrics import make_scorer
from sklearn.ensemble import BaggingRegressor, RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor
from sklearn import svm
from sklearn.metrics import mean_squared_error as mse
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import IsolationForest
import matplotlib.pyplot as plt
from sklearn.multioutput import MultiOutputRegressor
from sklearn.linear_model import ElasticNet, SGDRegressor
from sklearn.feature_selection import SelectKBest, chi2, f_regression, mutual_info_regression, RFE
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from scipy import stats
warnings.filterwarnings('ignore')

In [4]:
def make_submission(prediction, filename):
    assert type(prediction) == np.ndarray, "Pass a numpy.ndarray"
    assert prediction.shape[0] == 80, f"Missing data points Expected 80 Got {prediction.shape[0]}"
    assert prediction.shape[1] == 595, f"Number of features predicted is not correct Excpected 595 Got {prediction.shape[1]}"
    assert type(filename) == str, "Filename must be a string"
    assert '.csv' in filename, "Add extension '.csv'"
    
    df = pd.DataFrame(prediction.flatten())
    df.index.name = "ID"
    df = df.rename(columns={0:"predicted"})
    df.to_csv(filename)
    print(f"{filename} saved at {os.getcwd()}")
    return df

In [5]:
def remove_id_column(dataframe):
    assert type(dataframe) == pd.core.frame.DataFrame, f"{type(dataframe)} != pd.core.frame.DataFrame"
    return dataframe.drop("ID", 1)


In [20]:
def try_models(train,train_labels,test,test_labels, results):

    def test_model(clf):

        clf.fit(train, train_labels)
        test_predict = clf.predict(test)
        actual = test_labels.flatten()
        predicted = test_predict.flatten()
        scores = mse(predicted, actual)

        return scores

    clf = linear_model.LinearRegression()
    test_result = test_model(clf)
    model_name = 'LinearRegression'
    if model_name in results.keys():
        results[model_name].append(test_result)
    else:
        results.update({model_name:[test_result]})
    
    clf = linear_model.Ridge()
    test_result = test_model(clf)
    model_name = 'Ridge'
    if model_name in results.keys():
        results[model_name].append(test_result)
    else:
        results.update({model_name:[test_result]})

    clf = MultiOutputRegressor(SGDRegressor())
    test_result = test_model(clf)
    model_name = 'SGDRegressor'
    if model_name in results.keys():
        results[model_name].append(test_result)
    else:
        results.update({model_name:[test_result]})

    clf = MultiOutputRegressor(linear_model.BayesianRidge())
    test_result = test_model(clf)
    model_name = 'BayesianRidge'
    if model_name in results.keys():
        results[model_name].append(test_result)
    else:
        results.update({model_name:[test_result]})
    
    clf = MultiOutputRegressor(linear_model.HuberRegressor())
    test_result = test_model(clf)
    model_name = 'HuberRegressor'
    if model_name in results.keys():
        results[model_name].append(test_result)
    else:
        results.update({model_name:[test_result]})
    
    clf = linear_model.Lasso(alpha=1e-4)
    test_result = test_model(clf)
    model_name = 'Lasso'
    if model_name in results.keys():
        results[model_name].append(test_result)
    else:
        results.update({model_name:[test_result]})
        
    clf = BaggingRegressor()
    test_result = test_model(clf)
    model_name = 'BaggingRegressor'
    if model_name in results.keys():
        results[model_name].append(test_result)
    else:
        results.update({model_name:[test_result]})

    clf = ElasticNet()
    test_result = test_model(clf)
    model_name = 'ElasticNet'
    if model_name in results.keys():
        results[model_name].append(test_result)
    else:
        results.update({model_name:[test_result]})
    
    clf = RandomForestRegressor()
    test_result = test_model(clf)
    model_name = 'RandomForestRegressor'
    if model_name in results.keys():
        results[model_name].append(test_result)
    else:
        results.update({model_name:[test_result]})

    clf = MultiOutputRegressor(AdaBoostRegressor())
    test_result = test_model(clf)
    model_name = 'AdaBoostRegressor'
    if model_name in results.keys():
        results[model_name].append(test_result)
    else:
        results.update({model_name:[test_result]})
    
    clf = MultiOutputRegressor(svm.SVR(kernel="linear"))
    test_result = test_model(clf)
    model_name = 'SVR(kernel="linear")'
    if model_name in results.keys():
        results[model_name].append(test_result)
    else:
        results.update({model_name:[test_result]})
       
    clf = MultiOutputRegressor(svm.SVR(kernel="rbf"))
    test_result = test_model(clf)
    model_name = 'SVR(kernel="rbf")'
    if model_name in results.keys():
        results[model_name].append(test_result)
    else:
        results.update({model_name:[test_result]})
    
    return results

In [7]:
random.seed(1)
def cv(data, labels, n_fold, model=None, shuffle=False):
    kf = KFold(n_splits=n_fold, shuffle=shuffle)
    results = {}
    if model is not None:
        results['YourModel'] = []
    for train_index, test_index in kf.split(data):
        train = data[train_index, :]
        train_labels = labels[train_index, :]
        test = data[test_index, :]
        test_labels = labels[test_index, :]

        scaler = MinMaxScaler()
        # Fit on training set only.
        scaler.fit(train)
        # Apply transform to both the training set and the test set.
        train = scaler.transform(train)
        test = scaler.transform(test)

        pca = PCA(0.70)
        pca.fit(train)
        train = pca.transform(train)
        test = pca.transform(test)

        if model == None:
            results = try_models(train,train_labels,test,test_labels, results)
        else:
            model.fit(train, train_labels)
            test_predict = model.predict(test)
            actual = test_labels.flatten()
            predicted = test_predict.flatten()
            scores = mean_squared_error(predicted, actual)
            results['YourModel'].append(scores)

    # Add Variance and Mean Values as columns  
    # to the results dataframe
    results_df = pd.DataFrame(results).T
    variance = results_df.var(axis=1)
    mean = results_df.mean(axis=1)
    results_df.insert(5, "Var", variance)
    results_df.insert(6, "Mean", mean)  

    return results_df

In [21]:
def train_model(model, data, labels):
    scaler = MinMaxScaler()
    # Fit on training set only.
    scaler.fit(data)
    # Apply transform to both the training set and the test set.
    data = scaler.transform(data)

    pca = PCA(0.70)
    pca.fit(data)
    train = pca.transform(data)
    print(train.shape)
    model.fit(data, labels)
    return model

In [22]:
X = pd.read_csv('train_t0.csv')
Y = pd.read_csv('train_t1.csv')
test = pd.read_csv('test_t0.csv')
X = remove_id_column(X)
Y = remove_id_column(Y)
test = remove_id_column(test)

In [23]:
data = X.to_numpy()
labels = Y.to_numpy()

diff = labels - data
diff_norm = sklearn.preprocessing.normalize(diff, axis=0)

Cleaning with z score

In [24]:
z = np.abs(stats.zscore(diff, 1))
thresh = 4
data_clean = data[(z<thresh).all(axis=1)]
data_clean.shape
labels_clean = data[(z<thresh).all(axis=1)]
labels_clean.shape
z_results = cv(data_clean, labels_clean, 5, shuffle=True)
z_results.head(20)

Unnamed: 0,0,1,2,3,4,Var,Mean
LinearRegression,0.001109,0.001039,0.002377,0.000904,0.0012,3.568823e-07,0.001326
Ridge,0.001108,0.001039,0.002376,0.000907,0.001202,3.55822e-07,0.001326
SGDRegressor,0.001511,0.001548,0.002905,0.001383,0.001616,3.937807e-07,0.001793
BayesianRidge,0.001109,0.001038,0.002384,0.000912,0.001203,3.589423e-07,0.001329
HuberRegressor,0.001111,0.001043,0.002418,0.000914,0.001218,3.749088e-07,0.001341
Lasso,0.001108,0.001038,0.002377,0.000905,0.0012,3.573083e-07,0.001326
BaggingRegressor,0.001791,0.00171,0.004134,0.001707,0.00209,1.09178e-06,0.002287
ElasticNet,0.002195,0.002025,0.004866,0.002206,0.002551,1.411328e-06,0.002768
RandomForestRegressor,0.001644,0.001513,0.004024,0.0015,0.00197,1.156781e-06,0.00213
AdaBoostRegressor,0.001506,0.001418,0.00355,0.001347,0.001755,8.590994e-07,0.001915


In [38]:
z_results.to_latex()

'\\begin{tabular}{lrrrrrrr}\n\\toprule\n{} &         0 &         1 &         2 &         3 &         4 &           Var &      Mean \\\\\n\\midrule\nLinearRegression      &  0.001109 &  0.001039 &  0.002377 &  0.000904 &  0.001200 &  3.568823e-07 &  0.001326 \\\\\nRidge                 &  0.001108 &  0.001039 &  0.002376 &  0.000907 &  0.001202 &  3.558220e-07 &  0.001326 \\\\\nSGDRegressor          &  0.001511 &  0.001548 &  0.002905 &  0.001383 &  0.001616 &  3.937807e-07 &  0.001793 \\\\\nBayesianRidge         &  0.001109 &  0.001038 &  0.002384 &  0.000912 &  0.001203 &  3.589423e-07 &  0.001329 \\\\\nHuberRegressor        &  0.001111 &  0.001043 &  0.002418 &  0.000914 &  0.001218 &  3.749088e-07 &  0.001341 \\\\\nLasso                 &  0.001108 &  0.001038 &  0.002377 &  0.000905 &  0.001200 &  3.573083e-07 &  0.001326 \\\\\nBaggingRegressor      &  0.001791 &  0.001710 &  0.004134 &  0.001707 &  0.002090 &  1.091780e-06 &  0.002287 \\\\\nElasticNet            &  0.002195 &  0.0

In [29]:
z = np.abs(stats.zscore(diff, 1))
thresh = 4
data_clean = data[(z<thresh).all(axis=1)]
data_clean.shape
labels_clean = data[(z<thresh).all(axis=1)]
labels_clean.shape

clf = linear_model.Ridge()
clf.fit(data_clean, labels_clean)
test = pd.read_csv('test_t0.csv')
test = remove_id_column(test)
predict = clf.predict(test)

In [30]:
predict = make_submission(predict, 'ridge.csv')

ridge.csv saved at /content


In [34]:
clf = IsolationForest(random_state=0).fit(diff)
k1 = clf.predict(diff)
data_clean = data[np.where(k1==1)]
clf2 = IsolationForest(random_state=0).fit(data)
k2 = clf2.predict(data)
clf3 = IsolationForest(random_state=0).fit(labels)
k3 = clf3.predict(labels)
anomalies = np.logical_and(k3==1, np.logical_and(k2==1, k1==1))
locations = np.where(anomalies)

In [35]:
data_clean = data[locations]
labels_clean = labels[locations]
print(data_clean.shape)
print(labels_clean.shape)

(139, 595)
(139, 595)


In [36]:
results = cv(data_clean, labels_clean, 5, shuffle=True)

In [37]:
results.head(20)

Unnamed: 0,0,1,2,3,4,Var,Mean
LinearRegression,0.002066,0.002234,0.00247,0.002061,0.002375,3.341324e-08,0.002241
Ridge,0.002062,0.002227,0.002465,0.002056,0.002367,3.31714e-08,0.002235
SGDRegressor,0.002109,0.002262,0.002614,0.002159,0.002371,4.029907e-08,0.002303
BayesianRidge,0.002021,0.002105,0.002444,0.001979,0.002226,3.499028e-08,0.002155
HuberRegressor,0.002096,0.002215,0.002489,0.002085,0.002342,2.940294e-08,0.002245
Lasso,0.00206,0.002226,0.002464,0.002055,0.002367,3.32624e-08,0.002235
BaggingRegressor,0.002358,0.00231,0.002808,0.002327,0.002567,4.549957e-08,0.002474
ElasticNet,0.002545,0.002466,0.002967,0.002401,0.002479,5.139055e-08,0.002572
RandomForestRegressor,0.00217,0.002188,0.002637,0.002185,0.002282,3.910649e-08,0.002293
AdaBoostRegressor,0.002237,0.002255,0.002664,0.002225,0.002431,3.540504e-08,0.002362


In [40]:
results.to_latex()

'\\begin{tabular}{lrrrrrrr}\n\\toprule\n{} &         0 &         1 &         2 &         3 &         4 &           Var &      Mean \\\\\n\\midrule\nLinearRegression      &  0.002066 &  0.002234 &  0.002470 &  0.002061 &  0.002375 &  3.341324e-08 &  0.002241 \\\\\nRidge                 &  0.002062 &  0.002227 &  0.002465 &  0.002056 &  0.002367 &  3.317140e-08 &  0.002235 \\\\\nSGDRegressor          &  0.002109 &  0.002262 &  0.002614 &  0.002159 &  0.002371 &  4.029907e-08 &  0.002303 \\\\\nBayesianRidge         &  0.002021 &  0.002105 &  0.002444 &  0.001979 &  0.002226 &  3.499028e-08 &  0.002155 \\\\\nHuberRegressor        &  0.002096 &  0.002215 &  0.002489 &  0.002085 &  0.002342 &  2.940294e-08 &  0.002245 \\\\\nLasso                 &  0.002060 &  0.002226 &  0.002464 &  0.002055 &  0.002367 &  3.326240e-08 &  0.002235 \\\\\nBaggingRegressor      &  0.002358 &  0.002310 &  0.002808 &  0.002327 &  0.002567 &  4.549957e-08 &  0.002474 \\\\\nElasticNet            &  0.002545 &  0.0