In [None]:
import pandas as pd
import numpy as np
import re
import plotly.express as px
import plotly.graph_objs as go
import category_encoders as ce

from sklearn.preprocessing import StandardScaler, LabelEncoder, label_binarize

import xgboost as xgb
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_curve, auc, mean_squared_error
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from keras.wrappers.scikit_learn import KerasRegressor
import tensorflow as tf


In [None]:
# Load the cleaned training set. Default NA parsing is switched off and no
# custom NA markers are given, so strings such as 'NA' are kept verbatim.
data = pd.read_csv(
    './data/train_clean.csv',
    keep_default_na=False,
    na_values=[],
)
data.head()

In [None]:
# Columns kept for the analysis: the target (SalePrice) plus selected predictors.
cols_to_use = [
    'MSZoning', 'LotArea',
    'OverallQual', 'YearBuilt', 'YearRemodAdd',
    'ExterQual', 'FullBath', 'BedroomAbvGr',
    'TotRmsAbvGrd', 'SalePrice', 'Neighborhood',
]

# Select first, then copy: only the needed columns are duplicated and `data`
# itself is never modified by later cells.
df = data[cols_to_use].copy()

# Represent any boolean columns as 0/1 integers.
df = df.apply(lambda col: col.astype(int) if col.dtype == bool else col)

# NOTE(review): storing OverallQual as int32 keeps it out of the later
# select_dtypes(include=['int64']) scaling step and out of the int64/float64
# correlation matrix -- confirm that this exclusion is intended.
df['OverallQual'] = df['OverallQual'].astype('int32')

# Preview as the cell's last expression so that it is actually displayed.
df.head()

### Omvandling av data

In [None]:
# One-hot encode the zoning class; drop_first omits the first level.
df_encoded = pd.get_dummies(df, drop_first=True, columns=['MSZoning'])

# Store boolean columns (e.g. the dummies) as integers.
df_encoded = df_encoded.apply(
    lambda column: column.astype(int) if column.dtype == bool else column
)

# ExterQual is an ordered grade: map it onto 1 (Po) .. 5 (Ex).
ordinal_mapping = [
    {
        'col': 'ExterQual',
        'mapping': {'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5},
    },
]

encoder = ce.OrdinalEncoder(cols=['ExterQual'], mapping=ordinal_mapping)
df_encoded = encoder.fit_transform(df_encoded)

In [None]:
# Standardise every int64 column except the target SalePrice.
# NOTE(review): the scaler is fitted on the full frame, before the train/test
# splits further down -- consider fitting it on the training rows only.
scaler = StandardScaler()

numerical_columns = list(
    df_encoded.drop(columns='SalePrice').select_dtypes(include=['int64']).columns
)
df_encoded[numerical_columns] = scaler.fit_transform(df_encoded[numerical_columns])


In [None]:
# Preserve the original neighbourhood codes in NBHNames, then overwrite
# Neighborhood with the integer class labels produced by the encoder.
label_encoder = LabelEncoder()

df_encoded['NBHNames'] = df_encoded['Neighborhood'].to_numpy()
df_encoded['Neighborhood'] = label_encoder.fit_transform(
    df_encoded['Neighborhood']
)

### Correlation Matrix

In [None]:
import re

def add_spaces(name):
    """Insert spaces at the CamelCase word boundaries of a column name.

    A space is added between a lowercase letter or digit and a following
    capital, and between a run of capitals and the capitalised word after it.
    Runs of capitals therefore stay together: 'MSZoning' becomes 'MS Zoning'
    rather than 'M S Zoning'.

    Args:
        name: Column name such as 'SalePrice'.

    Returns:
        The name with spaces inserted; a name without such boundaries is
        returned unchanged.
    """
    return re.sub(r'(?<=[a-z0-9])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])', ' ', name)

In [None]:
# Pairwise correlations between the int64/float64 columns.
correlation_matrix = df_encoded.select_dtypes(include=['int64', 'float64']).corr()
formatted_columns = list(map(add_spaces, correlation_matrix.columns))

# Diverging colour scale centred on zero.
fig = go.Figure(
    data=go.Heatmap(
        x=formatted_columns,
        y=formatted_columns,
        z=correlation_matrix.values,
        zmid=0,
        colorscale='RdBu',
    )
)
fig.update_layout(xaxis_nticks=36, title='Correlation Matrix')
fig.show()

In [None]:
# Rank every other column by the absolute value of its correlation with SalePrice.
saleprice_corr = correlation_matrix['SalePrice'].sort_values(ascending=False)
features = saleprice_corr.drop('SalePrice').abs().sort_values(ascending=False)

# Two-column frame: original column name and its absolute correlation.
features_df = features.rename_axis('Columns').reset_index(name='Correlation')
formatted_columns = list(map(add_spaces, features_df['Columns'].values))

fig = px.bar(
    features_df,
    y=formatted_columns,
    x='Correlation',
    color='Correlation',
    color_continuous_scale='Viridis',
    orientation='h',
    labels={'Correlation': 'Korrelation med SalePrice', 'Columns': 'Kolumner'},
    title='Korrelation till SalePrice',
)

# Order the bars by their value.
fig.update_layout(yaxis={'categoryorder': 'total ascending'})
fig.show()

In [None]:
# Predictors used by the gradient-boosted baseline.
features = ['FullBath', 'TotRmsAbvGrd', 'YearBuilt', 'YearRemodAdd', 'LotArea', 'BedroomAbvGr']

X = df_encoded[features]
y = df_encoded['SalePrice']

# Hold out 20 % of the rows for evaluation (fixed seed).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

model = xgb.XGBRegressor(random_state=42, learning_rate=0.1, n_estimators=100)
model.fit(X_train, y_train)

# Score the fitted model on the held-out rows.
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = model.score(X_test, y_test)

print(f'Modellens R^2: {r2:.2f}')
print(f'Medelkvadratfel (MSE): {mse:.2f}')


In [None]:
# Short neighbourhood code -> full display name.
neighborhood_mapping = {
    'Blmngtn': 'Bloomington Heights', 'Blueste': 'Bluestem',
    'BrDale': 'Briardale', 'BrkSide': 'Brookside',
    'ClearCr': 'Clear Creek', 'CollgCr': 'College Creek',
    'Crawfor': 'Crawford', 'Edwards': 'Edwards',
    'Gilbert': 'Gilbert', 'IDOTRR': 'Iowa DOT and Rail Road',
    'MeadowV': 'Meadow Village', 'Mitchel': 'Mitchell',
    'NAmes': 'North Ames', 'NoRidge': 'Northridge',
    'NPkVill': 'Northpark Villa', 'NridgHt': 'Northridge Heights',
    'NWAmes': 'Northwest Ames', 'OldTown': 'Old Town',
    'SWISU': 'S&W of Iowa State University', 'Sawyer': 'Sawyer',
    'SawyerW': 'Sawyer West', 'Somerst': 'Somerset',
    'StoneBr': 'Stone Brook', 'Timber': 'Timberland',
    'Veenker': 'Veenker',
}

# Category order for plots: the codes in the order they are listed above.
ordered_neighborhoods = list(neighborhood_mapping)

# Column name -> axis/legend label for the plotly figures.
labels = dict(
    Neighborhood='Neighborhood (Short)',
    SalePrice='Sale Price (USD $)',
    PriceCategory='Price Category (Low, Medium, High)',
    Neighborhood_Full_Name='Neighborhood',
    LotArea='Lot Area (square feet)',
    YearBuilt='Year Built',
    YearRemodAdd='Year Remodeled',
    MSZoning='Zoning Classification',
)

### Residualer 

In [None]:
# Residual = actual value minus predicted value.
residuals_df = pd.DataFrame(
    {
        'Predicerade värden': y_pred,
        'Residualer': y_test - y_pred,
    }
)

fig = px.scatter(
    residuals_df,
    x='Predicerade värden',
    y='Residualer',
    labels={'Predicerade värden': 'Predicerade värden', 'Residualer': 'Residualer'},
    title='Residualer vs Predicerade värden',
)

# Dashed zero-residual reference line across the range of predictions.
fig.add_shape(
    type="line",
    x0=y_pred.min(),
    x1=y_pred.max(),
    y0=0,
    y1=0,
    line={'color': 'Red', 'dash': 'dash'},
)

fig.show()


### Feature Importance

In [None]:
# Pair each entry of `features` with the importance reported by the fitted model.
importance = model.feature_importances_
importance_df = pd.DataFrame({'Feature': features, 'Importance': importance})

fig = px.bar(
    importance_df,
    y='Feature',
    x='Importance',
    orientation='h',
    labels={'Importance': 'Viktighet', 'Feature': 'Funktioner'},
    title='Feature Importance',
)

# Order the bars by their value.
fig.update_layout(yaxis={'categoryorder': 'total ascending'})
fig.show()

In [None]:
# Keep only the features whose importance exceeds the 0.01 threshold.
significant_features = importance_df.loc[
    importance_df['Importance'] > 0.01, 'Feature'
].tolist()

X_train_significant = X_train[significant_features]
X_test_significant = X_test[significant_features]

# Train a fresh estimator with the same hyperparameters instead of refitting
# `model` in place. The baseline stays fitted on all of `features`, so its
# feature_importances_ still line up with `features` if earlier cells are re-run.
model_significant = xgb.XGBRegressor(**model.get_params())
model_significant.fit(X_train_significant, y_train)
y_pred_significant = model_significant.predict(X_test_significant)

# Evaluate the reduced model on the held-out rows.
r2_significant = model_significant.score(X_test_significant, y_test)
mse_significant = mean_squared_error(y_test, y_pred_significant)

print(f'R^2 efter att ha filtrerat funktioner: {r2_significant:.2f}')
print(f'Medelkvadratfel (MSE): {mse_significant:.2f}')

In [None]:
# 3 x 3 x 3 = 27 hyperparameter combinations, each scored by 5-fold CV on R^2.
param_grid = {
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
    'n_estimators': [50, 100, 200],
}

grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='r2',
    cv=5,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

print(f'Bästa parametrar: {grid_search.best_params_}')
print(f'Bästa R^2: {grid_search.best_score_}')

In [None]:
# Predicted vs actual prices; uses whatever `y_pred` / `y_test` currently hold.
results_df = pd.DataFrame(
    {
        'Predicerade': y_pred,
        'Faktiska': y_test,
    }
)

fig = px.scatter(
    results_df,
    x='Faktiska',
    y='Predicerade',
    labels={'Faktiska': 'Faktiska huspriser', 'Predicerade': 'Predicerade huspriser'},
    title='Predicerade vs Faktiska huspriser',
)

# Dashed diagonal (y = x) spanning the range of actual prices.
fig.add_shape(
    type="line",
    x0=results_df['Faktiska'].min(),
    y0=results_df['Faktiska'].min(),
    x1=results_df['Faktiska'].max(),
    y1=results_df['Faktiska'].max(),
    line={'color': 'Red', 'dash': 'dash'},
)

fig.show()

In [None]:
# Sale price distribution per neighbourhood, drawn from the un-encoded frame
# `df` so the x axis carries the original short codes.
fig = px.box(
    df,
    x='Neighborhood',
    y='SalePrice',
    title='Sale Price Distribution by Neighborhood',
    labels=labels,
    category_orders={'Neighborhood': ordered_neighborhoods},
    color='Neighborhood',
    color_discrete_sequence=px.colors.qualitative.Alphabet,
)

# Show full neighbourhood names in the legend. A code that is missing from the
# mapping keeps its short name instead of raising a KeyError.
for trace in fig.data:
    trace.name = neighborhood_mapping.get(trace.name, trace.name)

fig.show()

In [None]:
# Cast any boolean columns that are still present to integers.
df_encoded = df_encoded.astype(
    dict.fromkeys(df_encoded.select_dtypes('bool').columns, 'int')
)


In [None]:
# `Neighborhood` holds integer labels after the LabelEncoder step, so comparing
# it with the string 'Gilbert' matches nothing. Filter on `NBHNames`, which
# preserves the original neighbourhood codes.
gilbert_df = df_encoded.loc[df_encoded['NBHNames'] == 'Gilbert']

In [None]:
import tensorflow as tf
from sklearn.model_selection import train_test_split

# Seed NumPy and TensorFlow so repeated runs start from the same random state;
# without this every run trains a different network.
np.random.seed(42)
tf.random.set_seed(42)

# Use every remaining column as a predictor: drop the target, the
# label-encoded neighbourhood and its string copy.
X = df_encoded.drop(['SalePrice', 'Neighborhood', 'NBHNames'], axis=1)
y = df_encoded['SalePrice']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# NOTE: from here on `model` refers to the Keras network, no longer to the
# XGBoost regressor fitted earlier.
model = tf.keras.models.Sequential()

# Input layer
model.add(tf.keras.layers.Dense(10, activation='relu', input_shape=(X_train.shape[1],)))

# Hidden layer
model.add(tf.keras.layers.Dense(10, activation='relu'))

# Output layer: a single linear unit for the regression target
model.add(tf.keras.layers.Dense(1))

model.compile(
    optimizer='adam',
    loss='mean_squared_error',
    metrics=['mae']
)

model.fit(X_train, y_train, epochs=100, batch_size=10)

# evaluate() returns the loss followed by the compiled metrics (here: MAE).
loss, mae = model.evaluate(X_test, y_test)
print(f'Model Loss (MSE): {loss}')
print(f'Mean Absolute Error (MAE): {mae}')


In [None]:
# Predictions on the held-out rows; flattened to 1-D before going into the frame.
y_pred = model.predict(X_test)

results_df = pd.DataFrame(
    {
        'Faktiska priser': y_test,
        'Predicerade priser': y_pred.flatten(),
    }
)

fig = px.scatter(
    results_df,
    x='Faktiska priser',
    y='Predicerade priser',
    labels={'Faktiska priser': 'Faktiska huspriser', 'Predicerade priser': 'Predicerade huspriser'},
    title='Predicerade vs Faktiska huspriser',
)

# Dashed diagonal (y = x) spanning the range of actual prices.
fig.add_shape(
    type="line",
    x0=results_df['Faktiska priser'].min(),
    y0=results_df['Faktiska priser'].min(),
    x1=results_df['Faktiska priser'].max(),
    y1=results_df['Faktiska priser'].max(),
    line={'color': 'Red', 'dash': 'dash'},
)

fig.show()

In [None]:
# Residual = actual value minus the flattened prediction.
residuals = y_test - y_pred.flatten()

fig = px.scatter(
    x=y_pred.flatten(),
    y=residuals,
    labels={'x': 'Predicerade värden', 'y': 'Residualer'},
    title='Residualer vs Predicerade värden',
)

# Dashed zero-residual reference line across the range of predictions.
fig.add_shape(
    type="line",
    x0=y_pred.min(),
    x1=y_pred.max(),
    y0=0,
    y1=0,
    line={'color': 'Red', 'dash': 'dash'},
)

fig.show()