In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from scipy.stats import skew, kurtosis
import warnings
# Silence warnings so they do not clutter the cell outputs.
# NOTE(review): this filter hides every warning category, deprecations included.
warnings.filterwarnings('ignore')

# Plotting defaults applied to every figure drawn after this cell.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")
plt.rcParams.update({
    'figure.figsize': (12, 8),
    'font.size': 10,
})

print("Bibliothèques importées avec succès !")


In [None]:
# Read the train and test CSV files; the relative paths are resolved against
# the kernel's current working directory.
train_df = pd.read_csv('../data/train.csv')
test_df = pd.read_csv('../data/test.csv')

# Report the (rows, columns) shape of each frame and name the target column.
print(f"Données d'entraînement: {train_df.shape}")
print(f"Données de test: {test_df.shape}")
print("Variable cible: SalePrice")

print("\nAperçu des données d'entraînement:")
# Trailing expression: the notebook renders it as the cell's output.
train_df.head(5)


In [None]:
print("Informations sur les données d'entraînement:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line.
train_df.info()

# Number of columns per dtype.
print("\nTypes de données:")
print(train_df.dtypes.value_counts())


In [None]:
# Labels of every column of the training frame that has a numeric dtype.
numeric_features = train_df.select_dtypes(include=np.number).columns
print(f"Variables numériques: {len(numeric_features)}")
# Summary statistics restricted to those columns, shown as the cell's output.
train_df.loc[:, numeric_features].describe()


In [None]:
# Labels of the columns stored with the 'object' dtype, treated here as the
# categorical variables.
# NOTE(review): columns using a dedicated string or category dtype would not
# be selected by this filter - confirm against the loaded data.
categorical_features = train_df.select_dtypes(include='object').columns
print(f"Variables catégorielles: {len(categorical_features)}")

# For the first five such columns, print their five most frequent values.
for feature in categorical_features[:5]:
    print(f"\n{feature}:")
    print(train_df[feature].value_counts().head(5))


In [None]:
def analyze_missing_values(df, name, plot=True):
    """Summarise the missing values of ``df`` and optionally chart the worst columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.
    name : str
        Label used in the printed report and in the figure titles.
    plot : bool, default True
        When true and at least one column has missing values, draw two
        side-by-side bar charts (absolute count and percentage) for the 15
        most affected columns. Pass ``False`` to get the summary without
        opening a figure.

    Returns
    -------
    pandas.DataFrame
        One row per column that has at least one missing value, indexed by
        column name, with a ``Total`` column (missing count) and a
        ``Percent`` column (missing count as a percentage of ``len(df)``),
        sorted by ``Total`` in descending order. Empty when nothing is missing.
    """
    # Count the nulls once and derive the percentage from the same Series
    # instead of scanning the whole frame twice.
    null_counts = df.isnull().sum()
    missing_data = pd.DataFrame({
        'Total': null_counts,
        'Percent': null_counts / len(df) * 100
    })
    # Keep only the columns that actually have gaps, worst first.
    missing_data = missing_data[missing_data['Total'] > 0].sort_values('Total', ascending=False)

    print(f"\nValeurs manquantes - {name}:")
    print(f"Features avec valeurs manquantes: {len(missing_data)}")

    if plot and len(missing_data) > 0:
        # Explicit axes rather than the implicit pyplot "current axes" state.
        fig, (ax_total, ax_percent) = plt.subplots(1, 2, figsize=(12, 6))

        missing_data['Total'].head(15).plot(kind='bar', ax=ax_total, rot=45)
        ax_total.set_title(f'Top 15 - Valeurs manquantes ({name})', fontsize=14)

        missing_data['Percent'].head(15).plot(kind='bar', color='orange', ax=ax_percent, rot=45)
        ax_percent.set_title(f'Top 15 - % Valeurs manquantes ({name})', fontsize=14)

        fig.tight_layout()
        plt.show()

    return missing_data

# Build the missing-value summary for each frame and keep both results.
missing_train = analyze_missing_values(train_df, "Train")
missing_test = analyze_missing_values(test_df, "Test")

# Show the first ten rows of the training summary.
print("\nTop 10 des features avec le plus de valeurs manquantes (Train):")
print(missing_train.iloc[:10])
