In [None]:
import pandas as pd
import numpy as np
import plotly.express as px

from sklearn.preprocessing import StandardScaler
from sklearn.impute import KNNImputer

In [None]:
# Read the training CSV once. `data` keeps the frame exactly as loaded;
# `df` is an independent working copy that the cells below modify.
data = pd.read_csv('./data/train.csv')
df = pd.DataFrame(data.copy())

In [None]:
# Count how many columns of the raw frame have each dtype.
data.dtypes.value_counts()

In [None]:
# Show the column labels of the raw frame.
data.columns

### **Fixa NaN-värden**

**Visa alla NaN-värden**

In [None]:
# Missing-value count per column, keeping only columns with at least one NaN.
null_count = df.isnull().sum().loc[lambda counts: counts > 0]

# Dtype of each of those columns, looked up by the same labels.
dtypes_with_null = df.dtypes.loc[null_count.index]

# Side-by-side table: how many values are missing and what type the column has.
null_info = pd.DataFrame({'NullCount': null_count, 'Dtype': dtypes_with_null})

print("\nSammanfattning av nullvärden och datatyper:", null_info, sep="\n")

**Hantera alla NaN-värden**

In [None]:
# --- Presence flags (1 = present, 0 = absent) ---------------------------------

# GarageYrBlt still contains NaN at this point (a later cell fills it with -1),
# so "not null" identifies rows that have a garage year recorded.
# This cell must therefore run before that sentinel fill.
df['HasGarage'] = df['GarageYrBlt'].notnull().astype(int)

# PoolArea, Fireplaces and TotalBsmtSF are not among the columns this notebook
# treats as containing NaN, so a `.notnull()` test would flag every row as 1.
# A positive area/count is used instead; `NaN > 0` evaluates to False, so any
# missing value is flagged 0.
# NOTE(review): assumes absence is recorded as 0 in these columns - confirm
# against the dataset's data description.
df['HasPool'] = df['PoolArea'].gt(0).astype(int)
df['HasFireplace'] = df['Fireplaces'].gt(0).astype(int)
df['HasBasement'] = df['TotalBsmtSF'].gt(0).astype(int)

# --- Price band -----------------------------------------------------------------

# Right-closed intervals: (0, 129975] -> Low, (129975, 214000] -> Medium,
# (214000, inf) -> High. Prices outside (0, inf) would become NaN.
# NOTE(review): the two cut points look like SalePrice quartiles - confirm.
bins = [0, 129975, 214000, float('inf')]
labels = ['Low', 'Medium', 'High']

df['PriceCategory'] = pd.cut(df['SalePrice'], bins=bins, labels=labels, right=True)

In [None]:
# Count how many columns of the working frame have each dtype.
df.dtypes.value_counts()

* <small> Numeriska kolumner: </small>

In [None]:
# Fixed placeholder per numeric column: missing MasVnrArea becomes 0 and
# missing GarageYrBlt becomes -1. Columns not listed here are left unchanged.
numeric_nan_cols = dict(MasVnrArea=0, GarageYrBlt=-1)
df = df.fillna(value=numeric_nan_cols)

* <small> Icke numeriska kolumner: </small>

In [None]:
# Every column that is neither numeric nor categorical.
non_numeric_cols = list(df.select_dtypes(exclude=['number', 'category']).columns)

# LotFrontage is skipped if it ever shows up in that list; it is imputed
# separately with KNNImputer further down.
non_numerics_to_fill = list(filter(lambda name: name != 'LotFrontage', non_numeric_cols))

# Replace missing entries in those columns with the literal string 'NA'.
filled_block = df[non_numerics_to_fill].fillna('NA')
df[non_numerics_to_fill] = filled_block

df = pd.DataFrame(df)

**Omvandla Datatyper**

In [None]:
# Missing values still left per column (plain-text dump), followed by the
# dtype tally of the working frame as the cell's displayed result.
remaining_nulls = df.isna().sum()
print(remaining_nulls)
df.dtypes.value_counts()

**Använder KNNImputer för att fylla i missing values för `LotFrontage`**

<small> 
  Korrelation mellan numeriska kolumner och <code>LotFrontage</code>
</small>

In [None]:
# All int64/float64 columns of the working frame.
# NOTE(review): this may include the target (SalePrice) or identifier columns;
# confirm whether they should take part in the ranking.
numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns.tolist()

# Scratch copy with each column's NaNs replaced by that column's median, so the
# correlation matrix is computed on complete data without touching `df`.
df_corr = df.loc[:, numerical_cols].copy()
df_corr = df_corr.fillna(df_corr.median())

correlation_matrix = df_corr.corr()
lotfrontage_corr = correlation_matrix['LotFrontage'].sort_values(ascending=False)

# Rank the other columns by the magnitude of their correlation with LotFrontage
# and keep the ten strongest. The stored values are absolute, so the sign is lost.
abs_corr_with_others = lotfrontage_corr.drop('LotFrontage').abs()
top_features = abs_corr_with_others.sort_values(ascending=False).head(10)

# Two-column frame (column name, absolute correlation) for plotting and reuse.
top_features_df = top_features.rename_axis('Columns').reset_index(name='Correlation')

# Horizontal bar chart of the ranking, strongest bar on top.
fig = px.bar(
    data_frame=top_features_df,
    x='Correlation',
    y='Columns',
    orientation='h',
    title='Top 10 Funktioner med Högst Korrelation till LotFrontage',
    labels={'Correlation': 'Korrelation med LotFrontage', 'Columns': 'Kolumner'},
    color='Correlation',
    color_continuous_scale='Viridis',
)
fig.update_layout(yaxis=dict(categoryorder='total ascending'))
fig.show()

<small> 
<code>KNNImputer:</code>

 En imputeringsmetod som använder K-Nearest Neighbors-algoritmen för att fylla i saknade värden baserat på närliggande datapunkter.
</small>

In [None]:
# Predictors for the imputation: the ten columns ranked in `top_features_df`
# plus LotFrontage itself (the column whose gaps are to be filled).
features = top_features_df['Columns'].tolist()
features.append('LotFrontage')

# One-hot encode three categorical columns alongside the numeric predictors.
# drop_first=True drops one level per category.
categorical_cols = ['Neighborhood', 'BldgType', 'HouseStyle']
df_encoded = pd.get_dummies(df[features + categorical_cols], drop_first=True)

# Standardise before KNN so that no single column dominates the distance.
scaler = StandardScaler()
df_scaled = scaler.fit_transform(df_encoded)

imputer = KNNImputer(n_neighbors=5)
df_imputed_scaled = imputer.fit_transform(df_scaled)

# Wrap the imputed array in a DataFrame with the same column names AND the same
# row index as df_encoded. Without the index, the frame gets a fresh 0..n-1
# index and the label-based lookups below silently misalign whenever `df` does
# not have a default RangeIndex.
df_imputed = pd.DataFrame(
    df_imputed_scaled,
    columns=df_encoded.columns,
    index=df_encoded.index,
)

# Undo the standardisation to get values back on the original scale.
df_imputed_original_scale = pd.DataFrame(
    scaler.inverse_transform(df_imputed),
    columns=df_encoded.columns,
    index=df_encoded.index,
)

# Row labels where LotFrontage is currently missing.
nan_indices = df.index[df['LotFrontage'].isnull()]
imputed_lotfrontage = df_imputed_original_scale['LotFrontage']

original_nan_values = df.loc[nan_indices, 'LotFrontage']
imputed_values = imputed_lotfrontage.loc[nan_indices]

# Before/after view of the filled rows (the "original" column is NaN by construction).
comparison_df = pd.DataFrame({
    'Original_LotFrontage': original_nan_values,
    'Imputed_LotFrontage': imputed_values
})

# Fill only the rows that were missing. Observed values are left exactly as
# loaded instead of being replaced by their scale/inverse-scale round-trip.
df.loc[nan_indices, 'LotFrontage'] = imputed_values

In [None]:
comparison_df

In [None]:
df.to_csv('./data/train_clean.csv', index=False)