In [243]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

In [244]:
# Load the training features/targets and the test features from CSV.
# The first column of each training file is dropped (presumably an id/index
# column - TODO confirm); for the test file it is kept aside as `id_test`.
df_X_train = pd.read_csv('X_train.csv').iloc[:, 1:]
df_y_train = pd.read_csv('y_train.csv').iloc[:, 1:]
df_X_test = pd.read_csv('X_test.csv')

# Work on plain ndarrays from here on.
X_train, y_train = df_X_train.to_numpy(), df_y_train.to_numpy()

# Split the test matrix into its first column (ids) and the feature columns.
X_test = df_X_test.to_numpy()
id_test, X_test = X_test[:, 0], X_test[:, 1:]

print(X_train.shape, y_train.shape, X_test.shape)

(1212, 832) (1212, 1) (776, 832)


In [245]:

# Impute missing values with a k-nearest-neighbours imputer
# (10 neighbours, contributions weighted by inverse distance).
from sklearn.impute import KNNImputer

# The imputer is fitted on the training data only and then applied to both
# the training and the test matrices.
imputer = KNNImputer(n_neighbors=10, weights='distance').fit(X_train)
X_train, X_test = imputer.transform(X_train), imputer.transform(X_test)

# Sanity check: report the remaining NaN counts and the array shapes.
print(np.isnan(X_train).sum(), np.isnan(X_test).sum())
print(X_train.shape, y_train.shape, X_test.shape)


0 0
(1212, 832) (1212, 1) (776, 832)


In [246]:
# Remove low-variance features: the selector is fitted on the training data
# with a variance threshold of 0.1 and the same column mask is applied to the
# test data.
from sklearn.feature_selection import VarianceThreshold

sel = VarianceThreshold(threshold=0.1).fit(X_train)
X_train, X_test = sel.transform(X_train), sel.transform(X_test)
print(X_train.shape, y_train.shape, X_test.shape)

(1212, 666) (1212, 1) (776, 666)


In [247]:
### Drop highly correlated features
# For every pair of features whose absolute Pearson correlation (computed on
# the training set) exceeds CORR_THRESHOLD, the feature with the higher column
# index is dropped and the lower-indexed one is kept.
CORR_THRESHOLD = 0.8

# Assuming that X_train is an ndarray that only contains feature columns
df = pd.DataFrame(X_train)
correlation_matrix = df.corr()

# Strictly-lower triangle: entry (i, j) survives only for j < i, so each pair
# is examined once and a column is never compared with itself. This replaces
# a nested Python loop of per-element `.iloc` lookups with one array operation.
abs_corr_lower = np.tril(correlation_matrix.abs().to_numpy(), k=-1)

# Column i is redundant if it is strongly correlated with any earlier column.
# NaN correlations compare as False and therefore never trigger a drop.
is_redundant = (abs_corr_lower > CORR_THRESHOLD).any(axis=1)

# Positional indices of the dropped / kept columns (the DataFrame uses the
# default integer column labels, so labels and positions coincide).
corr_columns = set(np.flatnonzero(is_redundant).tolist())
relevant_features = np.flatnonzero(~is_redundant).tolist()

# Apply the same column selection to the training and the test matrices.
X_train = X_train[:, relevant_features]
X_test = X_test[:, relevant_features]


In [248]:
from sklearn.feature_selection import (
    SelectKBest,
    chi2,
    f_classif,
    f_regression,
    mutual_info_regression,
)

# Univariate feature selection: keep the N_SELECTED features with the best
# `f_regression` score against the target.
N_SELECTED = 100

# Cap k at the number of available columns so the cell still runs when the
# earlier filters leave fewer than N_SELECTED features.
selector = SelectKBest(f_regression, k=min(N_SELECTED, X_train.shape[1]))

# y_train is a column vector of shape (n_samples, 1); flatten it so that
# scikit-learn receives a 1-D target instead of emitting a conversion warning.
selector.fit(X_train, np.ravel(y_train))

# Apply the selection learned on the training data to both matrices.
X_train = selector.transform(X_train)
X_test = selector.transform(X_test)

  y = column_or_1d(y, warn=True)


In [249]:
from sklearn.ensemble import IsolationForest

# Outlier removal: fit an isolation forest on the training features and drop
# every training row it labels as an anomaly (prediction == -1).
clf = IsolationForest(max_samples=100, random_state=42, contamination='auto').fit(X_train)

# Per-row labels from the fitted forest, and the positions of the flagged rows.
outliers = clf.predict(X_train)
outlier_index = np.where(outliers == -1)

# Keep only the rows that were not flagged, in features and targets alike.
inlier_mask = outliers != -1
X_train, y_train = X_train[inlier_mask], y_train[inlier_mask]

# Shapes after outlier removal
print(X_train.shape, y_train.shape)


(1125, 100) (1125, 1)


In [250]:
# Hold out 20% of the training rows as a validation set.
# NOTE: `y_test` holds the validation targets that pair with `X_val`; it is
# not related to `X_test`, which is passed through this cell untouched.
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_test = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=42,
)
print(X_train.shape, y_train.shape, X_val.shape, y_test.shape, X_test.shape)

(900, 100) (900, 1) (225, 100) (225, 1) (776, 100)


In [251]:
from sklearn.linear_model import ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

# Alternative models (commented out):
# model = ElasticNet(alpha=5, l1_ratio=1, random_state=0)
# model = DecisionTreeRegressor(max_depth=3, min_samples_split=2, min_samples_leaf=1, random_state=0)
model = RandomForestRegressor(max_depth=10, min_samples_split=2, min_samples_leaf=1, n_estimators=500, n_jobs=-1, random_state=42)

# Fit the model to the training data.
# y_train is a column vector of shape (n_samples, 1); flatten it so that
# scikit-learn receives the 1-D target it expects instead of emitting a
# conversion warning.
model.fit(X_train, np.ravel(y_train))

# Predictions on the competition test set
y_pred = model.predict(X_test)

# Predictions on the held-out validation rows and on the training rows
pred_test = model.predict(X_val)
training_test = model.predict(X_train)

# R^2 on the training split and on the validation split
# (`y_test` holds the validation targets that pair with `X_val`).
train_sc = r2_score(y_train, training_test)
val_sc = r2_score(y_test, pred_test)
train_sc, val_sc

  model.fit(X_train, y_train)


(0.9288495188487036, 0.49874162367804675)

In [252]:
# Predict the test set and write the (id, prediction) pairs to "out.csv".
y_out = model.predict(X_test)

# Two columns side by side: test ids first, flattened predictions second.
output = np.column_stack((id_test, np.ravel(y_out)))
df_out = pd.DataFrame(output, columns=["id", "y"])

df_out.to_csv("out.csv", index=False)