In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy as sp
import seaborn as sns
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, FunctionTransformer
from sklearn.compose import make_column_transformer
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.metrics import mean_squared_error
import random
import gc

In [12]:

#train_data = pd.read_csv(r'C:\Users\lscon\Desktop\AA\projeto\the-three-body-problem\mlNOVA\X_train.csv')
train_data = pd.read_csv(r'C:/Users/duart/OneDrive/Ambiente_de_Trabalho/Master_Analysis_Engineering_Big_Data/23-24/1st_semester/AA_ML/Kaggle_challenges/3_body_problem/3_body_problem/X_train.csv')


print(train_data.shape)


(1285000, 14)


In [3]:
# Identify faulty rows based on the criterion (all values = 0.0 except for Id)
zero_rows = train_data[(train_data.drop('Id', axis=1) == 0).all(axis=1)]

# Remove the faulty rows from the DataFrame
train_data_preprocessed = train_data[~train_data.index.isin(zero_rows.index)]
train_data_preprocessed.reset_index(drop=True, inplace=True)
#train_data_preprocessed.to_csv('train_preprocessed.csv', index=False)

np.linalg.matrix_rank(train_data_preprocessed)

10

In [None]:
#gives the stats for the preprocessed data (without the rows with zeros)
summary_stats_filtered = train_data_preprocessed.describe(include='all')

#gives the stats for the nonprocessed data
summary_stats = train_data.describe(include="all")

# Test...

In [None]:
# Calculate the correlation matrix and plot it as a heatmap
corr_matrix = train_data_preprocessed.drop(train_data_preprocessed.columns[13],
                                       axis=1).corr()
corr_matrix.to_excel('corr_matrix_train_processed.xlsx')

plt.figure(figsize=(20, 16), dpi=800)
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt='.4f',
            linewidths=0.5)
plt.title('Correlation Matrix Heatmap')
plt.savefig('Corr_matrix_heatmap.jpg', dpi=800)
plt.show()

# Create a pairwise scatter plot matrix
scatter_matrix = pd.plotting.scatter_matrix(train_data_preprocessed,
                                            figsize=(20, 20))
plt.show()


In [None]:
#create pairwise plots of correlation between variables

rows = 50000
x1_rows = x1.head(n=rows)
partial_train_data = train_data_preprocessed.drop(columns=['v_x_1','v_x_2','v_y_1', 
                                                           'v_y_2', 'v_x_3', 'v_y_3']).head(n=rows)


_= sns.pairplot(partial_train_data, kind="reg", diag_kind="kde", plot_kws={'line_kws':{'color':'red'}})
plt.title('Pairwise plots t vs velocity components')
plt.savefig('pairwisetv_50000')

In [4]:
#create feature label matrices
#we're not going to use the velocity components as features
train_data_without_velocity = train_data_preprocessed.drop(columns=['Id','v_x_1','v_x_2','v_y_1', 
                                                           'v_y_2', 'v_x_3', 'v_y_3'])
#divide by simulations
list_of_times = [values for values in train_data_without_velocity['t']]
time_index_tuples = list(enumerate(list_of_times))
zeros_indexes = list(filter(lambda value: value[1] == 0, time_index_tuples))
zeros_indexes = [value[0] for value in zeros_indexes] 
list_of_simulations = []
lower_bound = 0
for i in range(len(zeros_indexes)-1):
    simulation = train_data_without_velocity.iloc[lower_bound:zeros_indexes[i+1]]
    list_of_simulations.append(simulation)
    lower_bound = zeros_indexes[i+1]
from tqdm import tqdm

#add label and put the starting position at every row
#x1
for simulation in tqdm(list_of_simulations):
    first_row_values = simulation.head(1)
    simulation.loc[:,'x_1_label'] = simulation.loc[:, 'x_1']
    simulation.loc[:,'y_1_label'] = simulation.loc[:, 'y_1']
    simulation.loc[:,'x_2_label'] = simulation.loc[:, 'x_2']
    simulation.loc[:,'y_2_label'] = simulation.loc[:, 'y_2']
    simulation.loc[:,'x_3_label'] = simulation.loc[:, 'x_3']
    simulation.loc[:,'y_3_label'] = simulation.loc[:, 'y_3']
    for index, row in simulation.iterrows():
        simulation.at[index, 'x_1'] = first_row_values['x_1']
        simulation.at[index, 'y_1'] = first_row_values['y_1']
        simulation.at[index, 'x_2'] = first_row_values['x_2']
        simulation.at[index, 'y_2'] = first_row_values['y_2']
        simulation.at[index, 'x_3'] = first_row_values['x_3']
        simulation.at[index, 'y_3'] = first_row_values['y_3']

list_of_simulations_copy = list_of_simulations.copy()
random.shuffle(list_of_simulations_copy) #we shuffle the data here so we are only shuffling different simulations and not amongst them
all_simulations = pd.concat(list_of_simulations_copy, ignore_index=True)
all_simulations.to_csv('feature_matrix.csv', index=False)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item_labels[indexer[info_axis]]] = value
100%|██████████| 4997/4997 [02:33<00:00, 32.62it/s]


In [2]:
#load the feature matrix file
total_data = np.genfromtxt('feature_matrix.csv', delimiter=',')
total_data = total_data[1:] 


In [11]:
#splitting the dataset into training and validation
sample, lixo = train_test_split(total_data, train_size=0.1, shuffle=False)
data_train, data_temp = train_test_split(sample, train_size=0.5, shuffle=False)
data_vali, data_test = train_test_split(data_temp, test_size=0.5, shuffle=False)


In [12]:
#split the labels from the features
features_train = data_train[:, :7]  
labels_train = data_train[:, 7:] 

features_vali = data_vali[:, :7]  
labels_vali = data_vali[:, 7:] 

features_test = data_test[:, :7]  
labels_test = data_test[:, 7:]

In [14]:
def accel_proxy(X):
    c1 = X[:, [1,2]]
    c2 = X[:, [3,4]]
    c3 = X[:, [5,6]]

    def _acc(c1, c2):
        c = c1-c2
        return c/(np.linalg.norm(c)**3)

    a1 = _acc(c1, c2) + _acc(c1, c3)
    a2 = _acc(c2, c1) + _acc(c2, c3)
    a3 = _acc(c3, c1) + _acc(c3, c2)

    X = np.hstack((X, a1, a2, a3))
    return X

accel_transformer = FunctionTransformer(func=accel_proxy)


In [9]:
# features_test = accel_proxy(features_test)
# features_vali = accel_proxy(features_vali)
# features_train = accel_proxy(features_train)


In [None]:
# trainaccel = pd.DataFrame(data=features_train, columns=['t', 'x1', 'y1', 'x2', 'y2', 'x3', 'y3', 'a1x', 'a1y', 'a2x', 'a2y', 'a3x', 'a3y'])
# trainaccel.to_csv('trainwithaccel.csv', index=False)

In [None]:
# print(features_train)

In [None]:
#plot the points (isto e porque acho que era interessante termos no deck of slides)

In [7]:
# entretanto descobri que o sklearn tem uma crossvalidation feature que e capaz de ser bem util no calculo do MSE (literalmente transformar
# aquilo em duas linhas o que e fixolas)
#https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.cross_val_score.html#sklearn.model_selection.cross_val_score


In [None]:
#depois no final grafico para o melhor claro e calculamos sqrt(mse) para vermos o quao off estamos - test set
#depois seria implementar isto tudo para as outras matrizes que faltam
#sugeria depois transformar isto do modelo numa funcao para ser mais simples e nao repetirmos codigo
# o mesmo poderia ser feito para a criaçao das matrizes

# Model with Ridge Regression

In [182]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, Ridge

# Create a pipeline object
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('polynomial_features', PolynomialFeatures(7)),
    ('regressor', Ridge(0.9))
])

pipeline.fit(features_train,labels_train)

# Make predictions on the validation data using the best model
labels_pred_vali = pipeline.predict(features_vali)

# Evaluate the performance of the best model on the validation data
mse_vali = mean_squared_error(labels_vali, labels_pred_vali, squared=False)
print(f"RMSE Vali:\t{mse_vali}")

# Make predictions on the test data using the best model
labels_pred_test = pipeline.predict(features_test)

# Evaluate the performance of the best model on the test data
mse_test = mean_squared_error(labels_test, labels_pred_test, squared=False)
print(f"RMSE Test:\t{mse_test}")


RMSE Vali:	1.1130009803004723
RMSE Test:	1.7359826264652185


# Learning Feature Engineering - Model w/accelerations in the pipeline and Ridge Regression

In [15]:
model = Pipeline([
    ('acceleration_transformer', accel_transformer),
    ('column_droper', ColumnTransformer([("stand_drop", StandardScaler(), [0]+list(range(3,13)))])),
    ('polynomial_features', PolynomialFeatures(5)),
    ('regressor', LinearRegression())
])

# Fit the pipeline using features_train and labels_train
model.fit(features_train, labels_train)

labels_pred_train = model.predict(features_train)

mse_train = mean_squared_error(labels_train, labels_pred_train, squared=False)
print(f"RMSE Train:\t{mse_train}")

# Make predictions on the validation data using the best model
labels_pred_vali = model.predict(features_vali)

# Evaluate the performance of the best model on the validation data
mse_vali = mean_squared_error(labels_vali, labels_pred_vali, squared=False)
print(f"RMSE Vali:\t{mse_vali}")

# Make predictions on the test data using the best model
labels_pred_test = model.predict(features_test)

# Evaluate the performance of the best model on the test data
mse_test = mean_squared_error(labels_test, labels_pred_test, squared=False)
print(f"RMSE Test:\t{mse_test}")


RMSE Train:	1.2603472192997267
RMSE Vali:	4.314207075635748e+19
RMSE Test:	4.432581527546607e+19


### REAL WORLD DATA PREDICTIONS

In [None]:
X_realworld = pd.read_csv(r'C:/Users/duart/OneDrive/Ambiente_de_Trabalho/Master_Analysis_Engineering_Big_Data/23-24/1st_semester/AA_ML/Kaggle_challenges/3_body_problem/3_body_problem/X_test.csv')
id_column = X_realworld['Id']
print(id_column.head())
X_realworld.drop('Id', axis=1, inplace=True)
X_realworld.to_csv('X_realworld.csv', index=False)

In [None]:
# Make predictions on the preprocessed real-world data
X_realworld = pd.read_csv('X_realworld.csv')
predictions_realworld = model.predict(X_realworld)


# Create a new Pandas DataFrame with the predictions
df_predictions = pd.DataFrame(predictions_realworld)
df_predictions.insert(loc=0, column='Id', value = id_column)
df_predictions.columns=['Id', 'x_1', 'y_1', 'x_2', 'y_2', 'x_3', 'y_3']

# Submit the Pandas DataFrame to the challenge creator
df_predictions.to_csv('predictions.csv', index=False)