In [None]:

%pip install scikit-learn
%pip install pgvector
%pip install graphviz

In [None]:
import json
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost as xgb
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score, fbeta_score
from sklearn.model_selection import train_test_split
from sqlalchemy import create_engine
from sqlalchemy.sql import text

In [None]:
def parse_vector_fast(vec_str, max_dims=50):
    """Convert a stored embedding into a NumPy array truncated to ``max_dims``.

    Parameters
    ----------
    vec_str : str | iterable | None
        Either a JSON-encoded array (for example ``"[0.1, 0.2]"``) or an
        already materialised list/array.
    max_dims : int, default 50
        Number of leading components to keep.

    Returns
    -------
    numpy.ndarray | None
        The first ``max_dims`` components, or ``None`` when the value is
        missing, is not valid JSON, or does not describe an array
        (e.g. a JSON scalar, ``null`` or a JSON object).
    """
    if isinstance(vec_str, str):
        try:
            parsed = json.loads(vec_str)
        except json.JSONDecodeError:
            # The string is not valid JSON.
            return None
    elif hasattr(vec_str, '__iter__'):
        # Already a list/array-like value.
        parsed = vec_str
    else:
        # None, NaN and any other non-iterable value.
        return None

    vector = np.array(parsed)
    if vector.ndim == 0:
        # Scalars, null, dicts and bytes become 0-d arrays, which cannot be
        # sliced; treat them as malformed instead of raising IndexError.
        return None
    return vector[:max_dims]



# Load the whole `matches_values` table in chunks and parse every embedding
# column into NumPy arrays as each chunk arrives.
#
# The connection URL can be supplied through the SYAS_DATABASE_URL environment
# variable so credentials do not have to live in the notebook; the local
# development URL is kept as the fallback so existing setups keep working.
engine = create_engine(
    os.environ.get(
        "SYAS_DATABASE_URL",
        "postgresql+psycopg2://admin:admin@localhost:5432/SYAS",
    )
)
sql_query = text("SELECT * FROM matches_values")
processed_chunks = []
# stream_results=True asks the driver to stream rows rather than buffer the
# full result set, which pairs with the chunked read below.
with engine.connect().execution_options(stream_results=True) as conn:
    df_iterator = pd.read_sql(sql_query, conn, chunksize=10000)
    for i, chunk_df in enumerate(df_iterator):
        print(f"Processing chunk {i+1}...")
        for col in chunk_df:
            # Any column whose name contains "embedding" holds a vector.
            if 'embedding' in col:
                chunk_df[col] = chunk_df[col].apply(parse_vector_fast)
        processed_chunks.append(chunk_df)

df = pd.concat(processed_chunks, ignore_index=True)
# Drop the reference to the per-chunk frames so they can be garbage collected.
processed_chunks = None

In [None]:
# Turn every non-numeric column of `df` into numeric model features:
#   * object (string) columns -> 0/1 indicator columns, one per ';'-separated value
#   * embedding columns       -> one float32 column per vector component
# The expanded frames are collected first and concatenated once at the end.
cols_to_add = []
cols_to_remove = []
for col in df.columns.copy():
    if df[col].dtype == 'object' and 'embedding' not in col:
        # Normalise the raw text: strip spaces and treat empty strings as missing.
        df[col] = df[col].str.replace(' ', '')
        df[col] = df[col].replace('', np.nan) 
        # Columns whose name ends in "list" keep every indicator; for all other
        # columns the first indicator is dropped as redundant.
        should_drop_first = not col.endswith('list')
        dummy_columns = df[col].str.get_dummies(sep=';').astype(int).add_prefix(col + '_')
        if should_drop_first:
            # Remove extra column 
            dummy_columns = dummy_columns.iloc[:, 1:]
        print(col, should_drop_first)
        cols_to_remove.append(col)
        cols_to_add.append(dummy_columns)
    elif 'embedding' in col:
        print('embedding', col)
        # The raw vector column is always replaced (or dropped) below.
        cols_to_remove.append(col)
        first_valid_vector = next((v for v in df[col] if v is not None), None)
        if first_valid_vector is None:
            # No row has a usable vector, so the width is unknown and the
            # column carries no information: drop it instead of failing on
            # len(None).
            print('embedding', col, 'has no valid vectors; dropping column')
            continue
        vec_len = len(first_valid_vector)
        # Rows without a vector become a full-width row of NaN.
        nan_placeholder = [np.nan] * vec_len
        data_for_df = [v if v is not None else nan_placeholder for v in df[col].tolist()]
        vector_df = pd.DataFrame(
            data_for_df,
            index=df.index,
            dtype=np.float32  
        )
        vector_df.columns = [f'{col}_{i}' for i in range(vector_df.shape[1])]
        cols_to_add.append(vector_df)
    
df = pd.concat([df.drop(cols_to_remove, axis=1)] + cols_to_add, axis=1)
# Release the intermediate frames.
cols_to_add = None
cols_to_remove = None

In [None]:
# Attach the distance features from the CSV: rows are matched on
# df.match_id == distances.id, and the duplicate key column `id` is discarded.
distances = pd.read_csv('match_distances.csv')
df = pd.merge(df, distances, left_on='match_id', right_on='id').drop(columns='id')
# `match_id` is kept on purpose at this point; it is removed after the
# train/test split, once the test ids have been written out.

In [None]:
# Features: every column except the "Declined" indicator.
# NOTE(review): if other indicator columns derived from match_status exist,
# they remain in `x` and would leak the target — confirm the status values.
x = df.drop(columns=['match_status_Declined'])
# Target: 1 when the match was not declined, 0 when it was declined.
y = 1 - df['match_status_Declined']

In [None]:
# 80/20 split, stratified on the target so both parts keep the class ratio;
# the fixed seed makes the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [None]:
# Persist the ids of the held-out matches before the id column is removed.
x_test['match_id'].to_csv('test_match_ids.csv', index=False)
# The identifier is not a feature, so strip it from both splits.
x_train = x_train.drop(columns='match_id')
x_test = x_test.drop(columns='match_id')

In [None]:
# Drop the reference to the distances frame; it has already been merged into `df`.
distances = None

In [None]:
# Hyper-parameters for the binary match classifier, grouped by purpose.
xgb_params = {
    # Learning task
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    # NOTE(review): presumably ignored or deprecated on recent XGBoost
    # releases — confirm against the installed version.
    'use_label_encoder': False,
    # Boosting schedule
    'n_estimators': 1000,
    'learning_rate': 0.05,
    'max_depth': 6,
    # Row / column subsampling per tree
    'subsample': 0.8,
    'colsample_bytree': 0.7,
    # Extra weight on the positive class (label 1)
    'scale_pos_weight': 40,
    # Reproducibility
    'random_state': 42,
}
model = xgb.XGBClassifier(**xgb_params)

In [None]:
# Train the classifier on the training split.
model.fit(x_train, y_train)

In [None]:
# Drop the references to the unsplit feature/target data; the cells below
# only use the train/test splits.
x = None
y = None

In [None]:
# Hard class predictions for the held-out split.
y_pred = model.predict(x_test)
# Second column of predict_proba — presumably the probability of label 1
# (the "not declined" class); it is thresholded manually further down.
y_pred_proba = model.predict_proba(x_test)[:, 1]

In [None]:
# F1 on the held-out split, using the model's default class predictions.
f1 = f1_score(y_true=y_test, y_pred=y_pred)
print(f"F1 Score: {f1:.4f}")

In [None]:
# F2 (recall weighted higher than precision) on the training split.
y_train_pred = model.predict(x_train)
f2 = fbeta_score(y_true=y_train, y_pred=y_train_pred, beta=2)
print(f"F2 Score Train: {f2:.4f}")

In [None]:
# F2 (recall weighted higher than precision) on the held-out split.
f2 = fbeta_score(y_true=y_test, y_pred=y_pred, beta=2)
print(f"F2 Score Test: {f2:.4f}")

In [None]:
# Confusion matrix of the default predictions on the held-out split.
# Label 0 is "Declined" and label 1 is "Both Approve" (y = 1 - declined flag).
class_labels = ['Declined', 'Both Approve']
cm = confusion_matrix(y_test, y_pred)

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_labels,
    yticklabels=class_labels,
    ax=ax,
)
ax.set_xlabel('Predicted Label')
ax.set_ylabel('True Label')
ax.set_title('Confusion Matrix')
plt.show()

In [None]:
# Persist the trained model to a JSON file in the working directory.
model.save_model("top_xgb_model_1.json")

In [None]:
# Confusion matrix when a match counts as positive only if its predicted
# probability exceeds a custom threshold.
custom_threshold = 0.55
y_pred_custom = (y_pred_proba > custom_threshold).astype(int)
cm = confusion_matrix(y_test, y_pred_custom)
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=['Declined', 'Both Approve'],
            yticklabels=['Declined', 'Both Approve'])
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
# Build the title from the threshold so the two cannot drift apart when the
# value is tuned (0.55 renders as "55%").
plt.title(f'Confusion Matrix ({custom_threshold:.0%} Threshold)')
plt.show()

In [None]:
# Rank the features by the importance the trained model assigns to them.
feature_importances = (
    pd.DataFrame(
        {
            'feature': model.feature_names_in_,
            'importance': model.feature_importances_,
        }
    )
    .sort_values(by='importance', ascending=False)
)

# Scale the figure height with the number of bars, with a floor of 6.
num_features = len(model.feature_names_in_)
plot_height = max(6, num_features / 2.5)

fig, ax = plt.subplots(figsize=(10, plot_height))
sns.barplot(data=feature_importances, x='importance', y='feature', ax=ax)
ax.set_title('Feature Importance')
ax.set_xlabel('Importance Score')
ax.set_ylabel('Features')
fig.tight_layout()
plt.show()


In [None]:
# Sum the importance of every feature whose name contains "embedding".
total = 0
for feature_name, importance in zip(
    feature_importances['feature'], feature_importances['importance']
):
    if "embedding" in feature_name:
        total += importance
print(total)
