# Data Importing

In [99]:
import pandas as pd
import numpy as np

dt1 = pd.read_csv("train.csv",index_col='Id')
dt2 = pd.read_csv("test.csv",index_col='Id')
dt_greeks = pd.read_csv("greeks.csv")

In [188]:
#Left join with Greeks file
df_train = pd.merge(dt1, dt_greeks, on='Id', how='left')
df_test = pd.merge(dt2, dt_greeks, on='Id', how='left')

In [3]:
#df_train.info()

In [4]:
#df_train.describe().transpose()

# Feature Selection

In [7]:
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
import numpy as np

# Separate features and target variable
X = df_train.drop(['Class', 'Id'], axis=1)
y = df_train['Class']

num_attributes = X.select_dtypes(include=[np.number]).columns.tolist()
cat_attributes = ['Alpha', 'Beta', 'Gamma', 'Delta']

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Impute missing values with median for numerical attributes
num_imputer = SimpleImputer(strategy='median')
X_train_num_imputed = num_imputer.fit_transform(X_train[num_attributes])
X_test_num_imputed = num_imputer.transform(X_test[num_attributes])

# Scale numerical attributes
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_num_imputed)
X_test_scaled = scaler.transform(X_test_num_imputed)

# Impute missing values with most frequent for categorical attributes
cat_imputer = SimpleImputer(strategy='most_frequent')
X_train_cat_imputed = cat_imputer.fit_transform(X_train[cat_attributes])
X_test_cat_imputed = cat_imputer.transform(X_test[cat_attributes])

# One-hot encode categorical attributes
encoder = OneHotEncoder(handle_unknown='ignore')
X_train_encoded = encoder.fit_transform(X_train_cat_imputed)
X_test_encoded = encoder.transform(X_test_cat_imputed)

# Combine preprocessed numerical and categorical data
X_train_preprocessed = np.hstack((X_train_scaled, X_train_encoded.toarray()))
X_test_preprocessed = np.hstack((X_test_scaled, X_test_encoded.toarray()))

# Create the RandomForestClassifier
rf = RandomForestClassifier()

# Create the RFE selector
rfe = RFE(estimator=rf, n_features_to_select=25)  # Select top 25 features
rfe.fit(X_train_preprocessed, y_train)

selected_features = [feature for feature, support in zip(X.columns, rfe.support_) if support]


print(selected_features)

['AB', 'AF', 'BC', 'BQ', 'CR', 'DA', 'DE', 'DI', 'DU', 'EE', 'EH', 'EJ', 'FC', 'FI', 'FL', 'GI', 'GL', 'Alpha', 'Beta', 'Gamma']


In [189]:
selected_features =['DU','BQ','Beta','AB','DN','DL','CR','CD ','FL','FR','DE','GL','DY','EU','EP','CH','FI','Delta','DA','BC','EE','CS','FE','AF','Epsilon']

In [None]:
import matplotlib.pyplot as plt
df_train[selected_features].hist(bins=50, figsize=(36,36))
plt.show()

In [9]:
#Correlation
selected_features_target = selected_features + ['Class']
corr_matrix = df_train[selected_features_target].corr()
corr_matrix['Class'].sort_values(ascending=False)

Class    1.000000
AF       0.302638
BQ       0.281257
AB       0.280612
DU       0.261000
FL       0.244185
FE       0.216359
CD       0.171304
BC       0.155882
FR       0.104099
DY       0.062734
CH       0.008144
DN      -0.008478
EU      -0.039739
CS      -0.047438
EP      -0.068383
FI      -0.094327
GL      -0.119202
DE      -0.124977
EE      -0.135324
DL      -0.147716
DA      -0.204612
CR      -0.227547
Name: Class, dtype: float64

In [None]:
from pandas.plotting import scatter_matrix

scatter_matrix(df_train[selected_features],figsize=(12,8))
plt.show()

# Split Train / Test Data

In [198]:
def transform_epsilon_column(column):
    return column.apply(lambda x: [date.split('/')[-1] for date in x])
 
df_train['Epsilon'] = cut_last_4_digits(df_train['Epsilon'])
df_test['Epsilon'] = cut_last_4_digits(df_train['Epsilon'])

from sklearn.model_selection import train_test_split

features = [n for n in df_train[selected_features].columns if n != 'Class']

# Split the features and labels into train and test sets
X_train, X_test, y_train, y_test = train_test_split(df_train[features], df_train['Class'], test_size=0.33, random_state=42)

AttributeError: 'list' object has no attribute 'split'

In [172]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 413 entries, 569 to 102
Data columns (total 25 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   DU       412 non-null    float64
 1   BQ       373 non-null    float64
 2   Beta     413 non-null    object 
 3   AB       413 non-null    float64
 4   DN       413 non-null    float64
 5   DL       413 non-null    float64
 6   CR       413 non-null    float64
 7   CD       413 non-null    float64
 8   FL       412 non-null    float64
 9   FR       413 non-null    float64
 10  DE       413 non-null    float64
 11  GL       412 non-null    float64
 12  DY       413 non-null    float64
 13  EU       413 non-null    float64
 14  EP       413 non-null    float64
 15  CH       413 non-null    float64
 16  FI       413 non-null    float64
 17  Delta    413 non-null    object 
 18  DA       413 non-null    float64
 19  BC       413 non-null    float64
 20  EE       413 non-null    float64
 21  CS       413 n

# Clean Data

In [197]:
from sklearn.cluster import KMeans
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import FunctionTransformer, StandardScaler, OrdinalEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer

# Default numeric transformation
num_pipeline = make_pipeline(SimpleImputer(strategy='median'), StandardScaler())

# Default categorical transformation
cat_pipeline = make_pipeline(
    SimpleImputer(strategy='most_frequent'),
    OneHotEncoder(handle_unknown='ignore'))

# Log numeric transformation
log_pipeline = make_pipeline(
    FunctionTransformer(np.log1p, validate=False),
    SimpleImputer(strategy='mean')
)

# Transformation for Epsilon
epsilon_pipeline = make_pipeline(
    #FunctionTransformer(transform_epsilon_column),
    OneHotEncoder(handle_unknown='ignore')
)

num_attributes = X_train.select_dtypes(include=[np.number]).columns.tolist()
cat_attributes = ['Beta', 'Delta','Epsilon']

preprocessing = ColumnTransformer([
    ('num', num_pipeline, num_attributes),
    ('log', log_pipeline, ['BQ']),
    ('cat', cat_pipeline, cat_attributes)
])

df_train_prepared = preprocessing.fit_transform(X_train)

# Get the feature names from each transformer within ColumnTransformer

feature_names = []
for name, transformer, features in preprocessing.transformers_:
    if name == 'num':
        feature_names.extend(num_attributes)
    elif name == 'log':
        feature_names.append('BQ_log')
    elif name == 'cat':
        feature_names.extend(transformer.named_steps['onehotencoder'].get_feature_names_out(cat_attributes))

# Print the feature names
print(feature_names)

TypeError: unhashable type: 'list'

# Model Selection

## XGBoost

In [192]:
import xgboost as xgb
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, f1_score
from sklearn.metrics import make_scorer, log_loss
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_score, StratifiedKFold, RepeatedStratifiedKFold

pipeline = make_pipeline(preprocessing,xgb.XGBClassifier())

y_train_1 = (y_train==1)
y_test_1 = (y_test==1)


# Perform cross-validation
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X_train, y_train, cv=kfold, scoring='neg_log_loss')

# Fit the pipeline on the training data
pipeline.fit(X_train, y_train)

# Make predictions on the test set
y_pred = pipeline.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Print the cross-validation scores
print("Cross-Validation Scores:", scores)
print("Mean Accuracy:", scores.mean())


Accuracy: 0.9705882352941176
Cross-Validation Scores: [-0.21618011 -0.0981476  -0.13262463 -0.0971876  -0.3177311 ]
Mean Accuracy: -0.17237421030989103


In [193]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, f1_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_score, StratifiedKFold, RepeatedStratifiedKFold


# Evaluate the model on test data
y_pred = pipeline.predict(X_test)
y_train_pred = pipeline.predict(X_train)

y_train_1 = (y_train==1)
y_test_1 = (y_test==1)

# Perform cross-validation
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X_test, y_test_1, cv=kfold, scoring='neg_log_loss')
precision = precision_score(y_test_1,y_pred)
recall = recall_score(y_test_1,y_pred)
cm = confusion_matrix(y_test_1,y_pred)
f1 = f1_score(y_test_1,y_pred)

# Print the cross-validation scores
print("Cross-Validation Scores:", scores)
print("Mean Accuracy:", scores.mean())
print("Standard Dev:", scores.std())
print("Precision:", precision)
print("Recall:", recall)
print(cm)
print("F1:",f1)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Cross-Validation Scores: [-0.23762726 -0.17998956 -0.44262443 -0.15286851 -0.09161719]
Mean Accuracy: -0.22094539082651737
Standard Dev: 0.12038065260449374
Precision: 0.9743589743589743
Recall: 0.8837209302325582
[[160   1]
 [  5  38]]
F1: 0.9268292682926831
Accuracy: 0.9705882352941176


In [None]:
y_pred

In [195]:
# Predict the target variable for the test data
final_test = pipeline.predict(df_test[x_features])

# Calculate the probabilities for class 0
p0 = 1 - final_test

submission = pd.DataFrame(df_test["Id"], columns=["Id"])
submission["class_0"] = p0
submission["class_1"] = 1 - p0
submission.to_csv('submission.csv', index=False)

In [202]:
import inspect

source_code = inspect.getsource(cut_last_4_digits)

print(source_code)

def cut_last_4_digits(column):
    column_copy = column.copy()
    for i, date in enumerate(column):
        if date:
            try:
                year = date.split('/')[-1]
                column_copy[i] = year
            except IndexError:
                column_copy[i] = 'UNK'
        else:
            column_copy[i] = 'UNK'
    return column_copy

