In [None]:
import pandas as pd

import warnings
# Suppress every warning for the rest of the session: the filter is installed
# without a category, module or message restriction.
warnings.filterwarnings("ignore")

In [None]:
# Load the four input tables. The reads are independent of each other;
# `structure` is the per-atom table that later cells join onto train/test.
structure = pd.read_csv("../input/structures.csv")
sub = pd.read_csv("../input/sample_submission.csv")
test = pd.read_csv("../input/test.csv")
train = pd.read_csv("../input/train.csv")

In [None]:
# Preview the first three rows of the training table as loaded.
train.head(3)

In [None]:
# Preview the first three rows of the test table as loaded.
test.head(3)

In [None]:
# Preview the first three rows of the per-atom structure table.
structure.head(3)

In [None]:
# Look up the structure row of each pair's first atom.
# how="left" keeps every row of `train`; the match is on
# (molecule_name, atom_index_0) == (molecule_name, atom_index).
train_atom0 = train.merge(
    structure,
    how="left",
    left_on=["molecule_name", "atom_index_0"],
    right_on=["molecule_name", "atom_index"],
)

train_atom0.head(3)

In [None]:
# Drop the join key that came from `structure` and give the joined
# columns a _0 suffix so the second join cannot collide with them.
train_atom0 = train_atom0.drop(columns="atom_index").rename(
    columns={"atom": "atom_0", "x": "x_0", "y": "y_0", "z": "z_0"}
)

train_atom0.head(3)

In [None]:
# Same lookup for the second atom of each pair, this time keyed on
# atom_index_1; all rows of `train_atom0` are kept (left join).
train_atom0_atom1 = train_atom0.merge(
    structure,
    how="left",
    left_on=["molecule_name", "atom_index_1"],
    right_on=["molecule_name", "atom_index"],
)

train_atom0_atom1.head(3)

In [None]:
# Drop the join key that came from `structure` and give the newly joined
# columns a _1 suffix, mirroring the _0 columns of the first atom.
train_atom0_atom1 = train_atom0_atom1.drop(columns="atom_index").rename(
    columns={"atom": "atom_1", "x": "x_1", "y": "y_1", "z": "z_1"}
)

train_atom0_atom1.head(3)

In [None]:
# From here on `train` is the joined frame that carries atom_0/atom_1 and
# the x/y/z columns of both atoms (no copy is made, only the name is rebound).
train = train_atom0_atom1

In [None]:
# Look up the structure row of each test pair's first atom.
# how="left" keeps every row of `test`; the match is on
# (molecule_name, atom_index_0) == (molecule_name, atom_index).
test_atom0 = test.merge(
    structure,
    how="left",
    left_on=["molecule_name", "atom_index_0"],
    right_on=["molecule_name", "atom_index"],
)

test_atom0.head(3)

In [None]:
# Drop the join key that came from `structure` and give the joined
# columns a _0 suffix so the second join cannot collide with them.
test_atom0 = test_atom0.drop(columns="atom_index").rename(
    columns={"atom": "atom_0", "x": "x_0", "y": "y_0", "z": "z_0"}
)

test_atom0.head(3)

In [None]:
# Same lookup for the second atom of each test pair, keyed on
# atom_index_1; all rows of `test_atom0` are kept (left join).
test_atom0_atom1 = test_atom0.merge(
    structure,
    how="left",
    left_on=["molecule_name", "atom_index_1"],
    right_on=["molecule_name", "atom_index"],
)

test_atom0_atom1.head(3)

In [None]:
# Drop the join key that came from `structure` and give the newly joined
# columns a _1 suffix, mirroring the _0 columns of the first atom.
test_atom0_atom1 = test_atom0_atom1.drop(columns="atom_index").rename(
    columns={"atom": "atom_1", "x": "x_1", "y": "y_1", "z": "z_1"}
)

test_atom0_atom1.head(3)

In [None]:
# From here on `test` is the joined frame that carries atom_0/atom_1 and
# the x/y/z columns of both atoms (no copy is made, only the name is rebound).
test = test_atom0_atom1

In [None]:
import numpy as np

# Coordinates of the first and second atom of every pair, one row per pair
# and one column per axis.
train_p_0 = train[["x_0", "y_0", "z_0"]].to_numpy()
train_p_1 = train[["x_1", "y_1", "z_1"]].to_numpy()
test_p_0 = test[["x_0", "y_0", "z_0"]].to_numpy()
test_p_1 = test[["x_1", "y_1", "z_1"]].to_numpy()

# Euclidean distance between the two atoms of each pair.
train["dist"] = np.linalg.norm(train_p_0 - train_p_1, axis=1)
# Per-axis components. Despite the "dist_" prefix these are the *squared*
# coordinate differences, not absolute distances.
train["dist_x"] = np.square(train["x_0"] - train["x_1"])
train["dist_y"] = np.square(train["y_0"] - train["y_1"])
train["dist_z"] = np.square(train["z_0"] - train["z_1"])

# Identical features for the test frame.
test["dist"] = np.linalg.norm(test_p_0 - test_p_1, axis=1)
test["dist_x"] = np.square(test["x_0"] - test["x_1"])
test["dist_y"] = np.square(test["y_0"] - test["y_1"])
test["dist_z"] = np.square(test["z_0"] - test["z_1"])


In [None]:
# Preview the first three rows of `train` after the dist* columns were added.
train.head(3)

In [None]:
# type_0 is the first character of each `type` string. The vectorised .str
# accessor replaces a per-row Python lambda and yields the same values for
# string input (missing values stay missing instead of raising).
train["type_0"] = train["type"].str[0]
test["type_0"] = test["type"].str[0]

In [None]:
from tqdm import tqdm_notebook

def create_features(df):
    """Add per-molecule aggregate features to ``df`` in place and return it.

    Columns written, in this order:

    * ``molecule_couples``: non-null ``id`` count per ``molecule_name``.
    * ``molecule_dist_{mean,min,max,std}``: ``dist`` statistics per molecule.
    * ``atom_0_couples`` / ``atom_1_couples``: non-null ``id`` count per
      (``molecule_name``, ``atom_index_0``) and (``molecule_name``,
      ``atom_index_1``).
    * ``molecule_<cat>_count``: non-null count of each categorical column
      per molecule.
    * ``molecule_<cat>_<num>_<stat>``: ``<stat>`` of numerical column
      ``<num>`` within each (``molecule_name``, ``<cat>``) group, plus
      ``..._diff`` (statistic minus the row's own value) and ``..._div``
      (statistic divided by the row's own value).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``id``, ``molecule_name`` and every column named in
        ``categorical_columns`` and ``numerical_columns`` below.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, with the new columns appended.
    """
    categorical_columns = ["atom_index_0", "atom_index_1", "type", "type_0", "atom_0", "atom_1"]
    numerical_columns = ["x_0", "y_0", "z_0", "x_1", "y_1", "z_1", "dist", "dist_x", "dist_y", "dist_z"]
    stat_values = ["mean", "min", "max", "std"]

    # Build each groupby once and reuse it. Only columns are added below, never
    # rows, so a grouping created up front stays valid for the whole function.
    by_molecule = df.groupby("molecule_name")

    df["molecule_couples"] = by_molecule["id"].transform("count")

    for stat_value in stat_values:
        df[f"molecule_dist_{stat_value}"] = by_molecule["dist"].transform(stat_value)

    df["atom_0_couples"] = df.groupby(["molecule_name", "atom_index_0"])["id"].transform("count")
    df["atom_1_couples"] = df.groupby(["molecule_name", "atom_index_1"])["id"].transform("count")

    for column in categorical_columns:
        df[f"molecule_{column}_count"] = by_molecule[column].transform("count")

    for categorical_column in tqdm_notebook(categorical_columns):
        # One grouping per categorical column, shared by the transforms below.
        by_category = df.groupby(["molecule_name", categorical_column])
        for numerical_column in numerical_columns:
            grouped_values = by_category[numerical_column]
            for stat_value in stat_values:
                feature = f"molecule_{categorical_column}_{numerical_column}_{stat_value}"
                df[feature] = grouped_values.transform(stat_value)
                df[f"{feature}_diff"] = df[feature] - df[numerical_column]
                # A zero in the numerical column yields inf/NaN here; left as-is.
                df[f"{feature}_div"] = df[feature] / df[numerical_column]

    return df
    

In [None]:
# create_features appends its columns to the frame it receives and returns
# that same frame, so `train` is extended in place by this call.
train = create_features(train)

In [None]:
# create_features appends its columns to the frame it receives and returns
# that same frame, so `test` is extended in place by this call.
test = create_features(test)

In [None]:
# Persist both feature frames as CSV, omitting the index column.
# NOTE(review): the files are written into ../input, the same directory the
# raw tables are read from — confirm it is writable in the target environment
# and that storing derived data next to the raw inputs is intended.
train.to_csv("../input/bf_train.csv", index=False)
test.to_csv("../input/bf_test.csv", index=False)

In [None]:
# Feature subset used downstream. Every entry must be a column that exists
# after create_features() has run. The per-atom pair counts are written by
# create_features as "atom_0_couples" / "atom_1_couples" (no "_count" suffix),
# so those are the names listed here.
featColumns = [
    "molecule_atom_index_0_dist_min",
    "molecule_atom_index_0_dist_max",
    "molecule_atom_index_1_dist_min",
    "molecule_atom_index_0_dist_mean",
    "molecule_atom_index_0_dist_std",
    "dist",
    "molecule_atom_index_1_dist_std",
    "molecule_atom_index_1_dist_max",
    "molecule_atom_index_1_dist_mean",
    "molecule_atom_index_0_dist_max_diff",
    "molecule_atom_index_0_dist_max_div",
    "molecule_atom_index_0_dist_std_diff",
    "molecule_atom_index_0_dist_std_div",
    "atom_0_couples",
    "molecule_atom_index_0_dist_min_div",
    "molecule_atom_index_1_dist_std_diff",
    "molecule_atom_index_0_dist_mean_div",
    "atom_1_couples",
    "molecule_atom_index_0_dist_mean_diff",
    "molecule_couples",
    "atom_index_1",
    "molecule_dist_mean",
    "molecule_atom_index_1_dist_max_diff",
    "molecule_atom_index_0_y_1_std",
    "molecule_atom_index_1_dist_mean_diff",
    "molecule_atom_index_1_dist_std_div",
    "molecule_atom_index_1_dist_mean_div",
    "molecule_atom_index_1_dist_min_diff",
    "molecule_atom_index_1_dist_min_div",
    "molecule_atom_index_1_dist_max_div",
    "molecule_atom_index_0_z_1_std",
    "y_0",
    "molecule_type_dist_std_diff",
    "molecule_atom_1_dist_min_diff",
    "molecule_atom_index_0_x_1_std",
    "molecule_dist_min",
    "molecule_atom_index_0_dist_min_diff",
    "molecule_atom_index_0_y_1_mean_diff",
    "molecule_type_dist_min",
    "molecule_atom_1_dist_min_div",
    "atom_index_0",
    "molecule_dist_max",
    "molecule_atom_1_dist_std_diff",
    "molecule_type_dist_max",
    "molecule_atom_index_0_y_1_max_diff",
    "molecule_type_0_dist_std_diff",
    "molecule_type_dist_mean_diff",
    "molecule_atom_1_dist_mean",
    "molecule_atom_index_0_y_1_mean_div",
    "molecule_type_dist_mean_div",
    "type",
]

In [None]:
import seaborn as sns

# Overlay one scatter layer per distinct `type` value: the first feature in
# featColumns on x against the target on y. Rows are filtered once per type,
# and each layer is labelled so the legend tells the types apart.
for val in train["type"].unique():
    type_rows = train.loc[train["type"] == val]
    sns.scatterplot(
        data=type_rows,
        x=featColumns[0],
        y="scalar_coupling_constant",
        label=val,
    )