# Load imports

In [53]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tkinter as tk
from tkinter import filedialog, ttk
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, roc_curve, auc

# Train model on train data csv file

In [48]:
# 1. Indlæs CSV-fil
file_path = "data/train.csv"
df = pd.read_csv(file_path)

# 2. Undersøg data
print("Raw data:")
print(df.head())  # Se de første rækker
print(df.info())  # Se datatype og manglende værdier

# 3. Definér features (X) og target (y)
X = df.drop(columns=["Pawpularity", "Id"])  # Brug alle kolonner undtagen target
y = df["Pawpularity"]

# 4. Opdel i trænings- og testdata
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 5. Skalér data (valgfrit, afhængigt af model)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 6. Træn modellerne
# Linear Regression
linear_model = LinearRegression()
linear_model.fit(X_train_scaled, y_train)
y_pred_linear = linear_model.predict(X_test_scaled)

# Decision Tree Regression
decision_tree_model = DecisionTreeClassifier(random_state=42)
decision_tree_model.fit(X_train_scaled, y_train)
y_pred_tree = decision_tree_model.predict(X_test_scaled)

# 7. Evaluér modellerne
print("Model evaluation:")

# Linear Regression
print("\nLinear Regression:")
print("Mean Absolute Error:", mean_absolute_error(y_test, y_pred_linear))
print("Mean Squared Error:", mean_squared_error(y_test, y_pred_linear))
print("R^2 Score:", r2_score(y_test, y_pred_linear))

# Decision Tree Clasiification
print("\nDecision Tree Clasiification:")
print("Mean Absolute Error:", mean_absolute_error(y_test, y_pred_tree))
print("Mean Squared Error:", mean_squared_error(y_test, y_pred_tree))
print("R^2 Score:", r2_score(y_test, y_pred_tree))

Raw data:
                                 Id  Subject Focus  Eyes  Face  Near  Action  \
0  0007de18844b0dbbb5e1f607da0606e0              0     1     1     1       0   
1  0009c66b9439883ba2750fb825e1d7db              0     1     1     0       0   
2  0013fd999caf9a3efe1352ca1b0d937e              0     1     1     1       0   
3  0018df346ac9c1d8413cfcc888ca8246              0     1     1     1       0   
4  001dc955e10590d3ca4673f034feeef2              0     0     0     1       0   

   Accessory  Group  Collage  Human  Occlusion  Info  Blur  Pawpularity  
0          0      1        0      0          0     0     0           63  
1          0      0        0      0          0     0     0           42  
2          0      0        0      1          1     0     0           28  
3          0      0        0      0          0     0     0           15  
4          0      1        0      0          0     0     0           72  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9912 entries, 0

# Test model on Test data csv

In [49]:
# Load test data
test_file_path = "data/test.csv"
df_test = pd.read_csv(test_file_path)

# Check the first few rows
print(df_test.head())
print(df_test.info())  # Check column types

                                 Id  Subject Focus  Eyes  Face  Near  Action  \
0  4128bae22183829d2b5fea10effdb0c3              1     0     1     0       0   
1  43a2262d7738e3d420d453815151079e              0     1     0     0       0   
2  4e429cead1848a298432a0acad014c9d              0     0     0     1       0   
3  80bc3ccafcc51b66303c2c263aa38486              1     0     1     0       0   
4  8f49844c382931444e68dffbe20228f4              1     1     1     0       1   

   Accessory  Group  Collage  Human  Occlusion  Info  Blur  
0          1      1        0      0          1     0     1  
1          0      1        1      0          0     0     0  
2          1      1        1      0          1     1     1  
3          0      0        0      0          0     1     0  
4          1      0        1      0          1     1     0  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8 entries, 0 to 7
Data columns (total 13 columns):
 #   Column         Non-Null Count  Dtype 
---  ----

In [50]:
# Remove non-numeric columns (if applicable)
X_test = df_test.drop(columns=["Id"], errors="ignore")  # Remove "ID" if present

# Scale the test data using the same scaler from training
X_test_scaled = scaler.transform(X_test)  # Reuse the trained scaler

# Make predictions using the trained model
linearpredictions = linear_model.predict(X_test_scaled)
tree_predictions = decision_tree_model.predict(X_test_scaled)

# Display results
df_test["Predicted Pawpularity"] = linearpredictions
print("Linear Regression Predictions:")
print(df_test[["Id", "Predicted Pawpularity"]] if "Id" in df_test.columns else df_test[["Predicted Pawpularity"]])

df_test["Predicted Pawpularity"] = tree_predictions
print("Decision Tree Predictions:")
print(df_test[["Id", "Predicted Pawpularity"]] if "Id" in df_test.columns else df_test[["Predicted Pawpularity"]])


Linear Regression Predictions:
                                 Id  Predicted Pawpularity
0  4128bae22183829d2b5fea10effdb0c3              38.229015
1  43a2262d7738e3d420d453815151079e              36.676497
2  4e429cead1848a298432a0acad014c9d              37.072079
3  80bc3ccafcc51b66303c2c263aa38486              38.057867
4  8f49844c382931444e68dffbe20228f4              38.376631
5  b03f7041962238a7c9d6537e22f9b017              43.750232
6  c978013571258ed6d4637f6e8cc9d6a3              34.806061
7  e0de453c1bffc20c22b072b34b54e50f              35.706968
Decision Tree Predictions:
                                 Id  Predicted Pawpularity
0  4128bae22183829d2b5fea10effdb0c3                     26
1  43a2262d7738e3d420d453815151079e                     25
2  4e429cead1848a298432a0acad014c9d                     18
3  80bc3ccafcc51b66303c2c263aa38486                     24
4  8f49844c382931444e68dffbe20228f4                     28
5  b03f7041962238a7c9d6537e22f9b017                     3

# Gui for models removing pictures predicted to have humans

In [None]:
# Function to update Treeview columns dynamically
def update_treeview_columns(columns):
    # Clear existing columns
    tree["columns"] = columns
    for col in columns:
        tree.heading(col, text=col)
        tree.column(col, anchor="center")

# Function to predict pawpularity and display results in a table
def predict_pawpularity():
    # Open a file dialog to select the input CSV file
    input_file = filedialog.askopenfilename(filetypes=[("CSV files", "*.csv")])
    if not input_file:
        return

    # Load the input data
    input_data = pd.read_csv(input_file)

    # Ensure the input data has the same features as the training data
    if 'Id' in input_data.columns:
        ids = input_data['Id']  # Save the 'Id' column for display
        input_data = input_data.drop(columns=['Id'])  # Drop the 'Id' column
    else:
        ids = None

    # Scale the input data
    input_data_scaled = scaler.transform(input_data)

    # Predict using both models
    linear_predictions = linear_model.predict(input_data_scaled)
    tree_predictions = decision_tree_model.predict(input_data_scaled)

    # Create a DataFrame to hold the results
    results = pd.DataFrame({
        "Id": ids if ids is not None else range(len(linear_predictions)),
        "Linear Regression": linear_predictions,
        "Decision Tree": tree_predictions
    })

    # Update Treeview columns
    update_treeview_columns(["Id", "Linear Regression", "Decision Tree"])


    # Clear the Treeview widget
    for row in tree.get_children():
        tree.delete(row)

    # Insert the results into the Treeview widget
    for index, row in results.iterrows():
        tree.insert("", "end", values=list(row))

# Function to calculate and display metrics
def calculate_metrics():
    # Calculate metrics for Linear Regression
    linear_mae = mean_absolute_error(y_test, y_pred_linear)
    linear_mse = mean_squared_error(y_test, y_pred_linear)
    linear_r2 = r2_score(y_test, y_pred_linear)

    # Calculate metrics for Decision Tree
    tree_mae = mean_absolute_error(y_test, y_pred_tree)
    tree_mse = mean_squared_error(y_test, y_pred_tree)
    tree_r2 = r2_score(y_test, y_pred_tree)

    # Perform cross-validation
    linear_cv = cross_val_score(linear_model, X_train_scaled, y_train, cv=5, scoring='r2').mean()
    tree_cv = cross_val_score(decision_tree_model, X_train_scaled, y_train, cv=5, scoring='r2').mean()

    # Update Treeview columns
    update_treeview_columns(["Model", "Mean Abselut Error", "Mean Squared Error", "R² Score", "Cross-Validation R²"])

    # Clear the Treeview widget
    for row in tree.get_children():
        tree.delete(row)

    # Insert metrics into the Treeview widget
    metrics_data = [
        ["Linear Regression", f"{linear_mae:.2f}", f"{linear_mse:.2f}", f"{linear_r2:.2f}", f"{linear_cv:.2f}"],
        ["Decision Tree", f"{tree_mae:.2f}", f"{tree_mse:.2f}", f"{tree_r2:.2f}", f"{tree_cv:.2f}"]
    ]
    for metric in metrics_data:
        tree.insert("", "end", values=metric)


# # Function to plot ROC curve
# def plot_roc_curve():
#     # Predict probabilities for Decision Tree
#     y_prob_tree = decision_tree_model.predict_proba(X_test_scaled)[:, 1]

#     # Calculate ROC curve and AUC
#     fpr, tpr, _ = roc_curve(y_test, y_prob_tree)
#     roc_auc = auc(fpr, tpr)

#     # Plot the ROC curve
#     plt.figure()
#     plt.plot(fpr, tpr, color='blue', lw=2, label=f"Decision Tree (AUC = {roc_auc:.2f})")
#     plt.plot([0, 1], [0, 1], color='gray', lw=1, linestyle='--')
#     plt.xlabel("False Positive Rate")
#     plt.ylabel("True Positive Rate")
#     plt.title("ROC Curve")
#     plt.legend(loc="lower right")
#     plt.show()

# Function to perform hyperparameter tuning
def tune_hyperparameters():
    # Define hyperparameter grid for Decision Tree
    param_grid = {
        'max_depth': [3, 5, 10, None],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    }

    # Perform GridSearchCV
    grid_search = GridSearchCV(decision_tree_model, param_grid, cv=5, scoring='r2')
    grid_search.fit(X_train_scaled, y_train)

    # Get the best parameters and score
    best_params = grid_search.best_params_
    best_score = grid_search.best_score_

    # Update Treeview columns
    update_treeview_columns(["Parameter", "Value"])

    # Clear the Treeview widget
    for row in tree.get_children():
        tree.delete(row)

    # Insert the best parameters and score into the Treeview widget
    for param, value in best_params.items():
        tree.insert("", "end", values=[param, value])
    tree.insert("", "end", values=["Best R² Score", f"{best_score:.2f}"])

# Create the GUI
root = tk.Tk()
root.title("Pawpularity Predictor")

# Add a button to load data and predict
load_button = tk.Button(root, text="Load Data", command=predict_pawpularity)
load_button.pack()

# Add a button to calculate metrics
metrics_button = tk.Button(root, text="Show Metrics", command=calculate_metrics)
metrics_button.pack()

# Create a Treeview widget to display the data
tree = ttk.Treeview(root, show="headings")
tree.pack(fill=tk.BOTH, expand=True)

root.mainloop()

