In [None]:
import functools

import optuna
import pandas as pd
import xgboost as xgb
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

def preprocess_data(df):
    """Encode the incident hour and the two outcome columns as numbers.

    The input frame is not modified; a transformed copy is returned.

    Args:
        df: DataFrame with an 'IncidentTime' column of '%H:%M:%S' strings and
            'Outcome: Arrested' / 'Outcome: Hospitalised' columns. Any other
            columns are passed through unchanged.

    Returns:
        A new DataFrame with a fresh 0..n-1 index in which 'IncidentTime' is
        replaced by 'IncidentTime_TFIDF_<i>' columns (numbered by TF-IDF
        output column) and both outcome columns hold 1 where the original
        value was exactly 'Yes' and 0 otherwise.

    Raises:
        ValueError: propagated from pd.to_datetime when a non-missing
            'IncidentTime' value does not match '%H:%M:%S'.
    """
    # Work on a copy with a clean index: the caller's frame (typically a
    # column slice of a larger frame) stays untouched, and the positional
    # index lines up with the TF-IDF frame built below.
    df = df.copy().reset_index(drop=True)

    hours = pd.to_datetime(df['IncidentTime'], format='%H:%M:%S').dt.hour
    # Nullable Int64 keeps hours rendered as '13' instead of '13.0' when a
    # missing timestamp turns the hour column into floats.
    hour_tokens = hours.astype('Int64').astype(str)

    # TfidfVectorizer's default token_pattern only keeps tokens of two or more
    # characters, which would silently discard hours 0-9 (all-zero rows, or an
    # empty-vocabulary error if every hour is single-digit). Accept
    # one-character tokens as well.
    tfidf_vectorizer = TfidfVectorizer(token_pattern=r"(?u)\b\w+\b")
    tfidf_incident_time = tfidf_vectorizer.fit_transform(hour_tokens).toarray()
    tfidf_incident_time_df = pd.DataFrame(
        tfidf_incident_time,
        columns=[
            f"IncidentTime_TFIDF_{i}"
            for i in range(tfidf_incident_time.shape[1])
        ],
    )

    # Swap the raw 'IncidentTime' column for its TF-IDF encoding.
    df = pd.concat(
        [df.drop(columns=['IncidentTime']), tfidf_incident_time_df],
        axis=1,
    )

    # Binary outcomes: exactly 'Yes' -> 1, anything else (including NaN) -> 0.
    for outcome in ('Outcome: Arrested', 'Outcome: Hospitalised'):
        df[outcome] = df[outcome].eq('Yes').astype(int)

    return df

@functools.lru_cache(maxsize=1)
def _load_tuning_split():
    """Load the data once and return the fit/validation split used for tuning.

    Reads 'qwe/test.csv' and 'qwe/use1.csv', builds the merged, one-hot
    encoded feature matrix, holds out 20% of the rows as a test set
    (test_size=0.2, random_state=42) and then splits the remaining training
    rows 80/20 again into a fit part and a validation part.

    The result is cached so the CSVs are read and preprocessed only once
    rather than on every Optuna trial. The held-out test rows are
    deliberately not returned: tuning must not look at them.

    Returns:
        The four pieces returned by train_test_split on the training rows:
        X_fit, X_val, y_fit, y_val.
    """
    df1 = pd.read_csv('qwe/test.csv')
    df2 = pd.read_csv('qwe/use1.csv')

    df1_selected = df1[['Borough', 'Average Score', 'Rank']]
    # Explicit copy: preprocess_data may assign into the frame it receives.
    df2_selected = df2[
        ['Borough', 'IncidentTime', 'Outcome: Arrested', 'Outcome: Hospitalised']
    ].copy()
    df2_selected = preprocess_data(df2_selected)

    merged_df = pd.merge(df1_selected, df2_selected, on='Borough')

    # One-hot encode the 'Borough' column
    merged_df = pd.get_dummies(merged_df, columns=['Borough'], drop_first=True)

    X = merged_df.drop(columns=['Outcome: Hospitalised'])
    y = merged_df['Outcome: Hospitalised']
    X_train, _X_test, y_train, _y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # Carve the validation rows out of the training portion only.
    return train_test_split(X_train, y_train, test_size=0.2, random_state=42)


def objective(trial):
    """Optuna objective: validation MSE of an XGBRegressor for one trial.

    Args:
        trial: the optuna trial object used to sample hyperparameters via
            suggest_int / suggest_float.

    Returns:
        Mean squared error of the fitted model on the validation split
        (lower is better).
    """
    X_fit, X_val, y_fit, y_val = _load_tuning_split()

    param = {
        'objective': 'reg:squarederror',
        'n_estimators': trial.suggest_int('n_estimators', 50, 200),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 0, 1.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 0, 1.0),
    }

    model = xgb.XGBRegressor(**param)
    model.fit(X_fit, y_fit)

    y_pred = model.predict(X_val)
    return mean_squared_error(y_val, y_pred)

# Search the hyperparameter space; every trial returns an MSE to minimise.
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=100)

best_params = study.best_params
print('Best hyperparameters:', best_params)

# Rebuild the feature matrix for the final fit and evaluation.
df1 = pd.read_csv('qwe/test.csv')
df2 = pd.read_csv('qwe/use1.csv')

df1_selected = df1[['Borough', 'Average Score', 'Rank']]
# Explicit copy: preprocess_data may assign into the frame it receives, and a
# bare column slice of df2 would make pandas warn about it.
df2_selected = df2[
    ['Borough', 'IncidentTime', 'Outcome: Arrested', 'Outcome: Hospitalised']
].copy()
df2_selected = preprocess_data(df2_selected)

merged_df = pd.merge(df1_selected, df2_selected, on='Borough')

# One-hot encode the 'Borough' column
merged_df = pd.get_dummies(merged_df, columns=['Borough'], drop_first=True)

X = merged_df.drop(columns=['Outcome: Hospitalised'])
y = merged_df['Outcome: Hospitalised']
# Same split arguments (test_size=0.2, random_state=42) that objective() uses.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# study.best_params holds only the sampled values, so name the loss function
# used during tuning explicitly instead of relying on the library default.
final_model = xgb.XGBRegressor(objective='reg:squarederror', **best_params)
final_model.fit(X_train, y_train)

y_pred = final_model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f'Final Mean Squared Error: {mse}')
