
# Загружаем данные

In [1]:
import pandas as pd  
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Read the training and test splits from the working directory.
train_df, test_df = (pd.read_csv(name) for name in ('train.csv', 'test.csv'))

print(train_df.shape, test_df.shape)

(1460, 81) (1459, 80)


In [None]:
# Keep the test-set Id column in its own variable before any columns are dropped.
id_test = test_df.Id

# Show five random training rows.
train_df.sample(5)

# Exploratory data analysis (EDA)

In [None]:
# Compare the Id sets of both splits: train-only, shared and test-only counts.
A1, A2 = set(train_df.Id.values), set(test_df.Id.values)
print(len(A1.difference(A2)), len(A1.intersection(A2)), len(A2.difference(A1)))

Видно, что одинаковых объектов в обучающих и тестовых данных нет: пересечений по Id нет.

In [None]:
# Summary statistics of the training frame.
train_df.describe()

In [None]:
# Target column and the numeric part of the training frame.
y = train_df['SalePrice']
numeric_data = train_df.select_dtypes([np.number])
numeric_data_mean = numeric_data.mean()
# Fill gaps in the numeric columns with the column means (in place).
train_df.fillna(numeric_data_mean, inplace=True)
# Correlation of every numeric column with the target, strongest positive first.
correlations = (
    train_df[numeric_data.columns]
    .corrwith(y)
    .sort_values(ascending=False)
)
print('Корреляция числовых данных по отношению к y(SalePrice)')
correlations

**Визуализация линейной зависимости числовых признаков от y.**

In [None]:
# Horizontal bar chart of the correlation coefficients, one bar per feature.
plot = sns.barplot(x=correlations, y=correlations.index, color='black')
plot.figure.set_size_inches(15, 10)
plot.set_xlabel('CORR COEF')
plot.set_ylabel('FEATURES')

In [None]:
# Columns whose absolute correlation with SalePrice is below 0.2 are removal candidates.
index_to_drop = correlations.loc[correlations.abs() < 0.2]
# These four features are kept despite their weak linear correlation.
# errors='ignore' keeps the cell working when one of them is not a candidate at all
# (its |corr| is already >= 0.2); a plain drop() would raise KeyError in that case.
index_to_drop = index_to_drop.drop(
    ['PoolArea', 'BsmtHalfBath', 'YrSold', 'BedroomAbvGr'], errors='ignore'
)
print("Колонны у которых малая корреляция.")
index_to_drop

In [None]:
# Remove the weakly correlated columns from both frames, in place.
for frame in (train_df, test_df):
    frame.drop(columns=index_to_drop.index, inplace=True)
display(train_df.shape, test_df.shape)

Удаляю столбцы, у которых очень малая корреляция с целевой переменной (по модулю меньше 0.2), а также те, которые не нужны.

**Нахожу выбросы (outliers) и удаляю их**

In [None]:
# Shape of the numeric part of the training frame.
train_df.select_dtypes([np.number]).shape

In [None]:
# Display the numeric columns of the training frame.
train_df.select_dtypes([np.number])

In [None]:
# Dtypes of every column that is not of object dtype.
train_df.dtypes[train_df.dtypes != 'object']

In [None]:
# LotFrontage against SalePrice.
plt.scatter(x='LotFrontage', y='SalePrice', data=train_df)

In [None]:
# Tukey fences for LotFrontage (1.5 * IQR outside the quartiles) plus a box plot.
Q1, Q3 = train_df['LotFrontage'].quantile([0.25, 0.75]).to_numpy()
IQR = Q3 - Q1
lower_, upper_ = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR
print(lower_, upper_)
train_df['LotFrontage'].plot.box()
plt.show()

In [None]:
# Drop rows strictly outside the LotFrontage fences, then show what is still below the lower fence.
outliers = train_df.loc[train_df['LotFrontage'].lt(lower_) | train_df['LotFrontage'].gt(upper_)]
train_df.drop(index=outliers.index, inplace=True)
train_df.loc[train_df['LotFrontage'].lt(lower_)]

In [None]:
# LotArea against SalePrice.
plt.scatter(x='LotArea', y="SalePrice", data=train_df)


In [None]:
# Tukey fences for LotArea (1.5 * IQR outside the quartiles) plus a box plot.
Q1, Q3 = train_df['LotArea'].quantile([0.25, 0.75]).to_numpy()
IQR = Q3 - Q1
lower_, upper_ = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR
print(lower_, upper_)
train_df['LotArea'].plot.box()
plt.show()

In [None]:
# Drop rows on or beyond either LotArea fence, then re-plot the column.
outliers = train_df.loc[train_df['LotArea'].le(lower_) | train_df['LotArea'].ge(upper_)]
train_df.drop(index=outliers.index, inplace=True)
train_df['LotArea'].plot.box()
plt.show()

In [None]:
# YearBuilt against SalePrice.
plt.scatter(x='YearBuilt', y='SalePrice', data=train_df)


In [None]:
# Lower Tukey fence for YearBuilt (inspection only; nothing is dropped in this cell).
Q1, Q3 = train_df['YearBuilt'].quantile([0.25, 0.75]).to_numpy()
IQR = Q3 - Q1
lower_ = Q1 - 1.5 * IQR
train_df['YearBuilt'].plot.box()
plt.show()
print(lower_)

In [None]:
# YearRemodAdd against SalePrice.
plt.scatter(x='YearRemodAdd', y='SalePrice', data=train_df)
plt.show()

In [None]:
# Lower Tukey fence for YearRemodAdd (inspection only; nothing is dropped in this cell).
Q1, Q3 = train_df['YearRemodAdd'].quantile([0.25, 0.75]).to_numpy()
IQR = Q3 - Q1
lower_ = Q1 - 1.5 * IQR
train_df['YearRemodAdd'].plot.box()
plt.show()
print(lower_)

In [None]:
# MasVnrArea against SalePrice.
plt.scatter(x='MasVnrArea', y='SalePrice', data=train_df)
plt.show()

In [None]:
# Fences for MasVnrArea: 1.5 * IQR below Q1, but a wider 3.5 * IQR above Q3.
Q1, Q3 = train_df['MasVnrArea'].quantile([0.25, 0.75]).to_numpy()
IQR = Q3 - Q1
lower_, upper_ = Q1 - 1.5 * IQR, Q3 + 3.5 * IQR
print(lower_, upper_)
train_df['MasVnrArea'].plot.box()
plt.show()

In [None]:
# Drop rows on or beyond either MasVnrArea fence, then re-plot the column.
outliers = train_df.loc[train_df['MasVnrArea'].le(lower_) | train_df['MasVnrArea'].ge(upper_)]
train_df.drop(index=outliers.index, inplace=True)
train_df['MasVnrArea'].plot.box()
plt.show()

In [None]:
# BsmtFinSF1 against SalePrice.
plt.scatter(x='BsmtFinSF1', y='SalePrice', data=train_df)


In [None]:
# Tukey fences for BsmtFinSF1 (1.5 * IQR outside the quartiles) plus a box plot.
Q1, Q3 = train_df['BsmtFinSF1'].quantile([0.25, 0.75]).to_numpy()
IQR = Q3 - Q1
lower_, upper_ = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR
print(lower_, upper_)
train_df['BsmtFinSF1'].plot.box()

In [None]:
# Drop rows on or beyond either BsmtFinSF1 fence, then re-plot the column.
outliers = train_df.loc[train_df['BsmtFinSF1'].le(lower_) | train_df['BsmtFinSF1'].ge(upper_)]
train_df.drop(index=outliers.index, inplace=True)
train_df['BsmtFinSF1'].plot.box()
plt.show()

In [None]:
# TotalBsmtSF against SalePrice.
plt.scatter(x='TotalBsmtSF', y='SalePrice', data=train_df)


In [None]:
# Remove rows with TotalBsmtSF above 2000, then show that none are left.
train_df.drop(index=train_df.index[train_df['TotalBsmtSF'] > 2000], inplace=True)
train_df[train_df['TotalBsmtSF'] > 2000]

In [None]:
# 1stFlrSF against SalePrice.
plt.scatter(x='1stFlrSF', y="SalePrice", data=train_df)

In [None]:
# Tukey fences for 1stFlrSF (1.5 * IQR outside the quartiles) plus a box plot.
Q1, Q3 = train_df['1stFlrSF'].quantile([0.25, 0.75]).to_numpy()
IQR = Q3 - Q1
lower_, upper_ = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR
print(lower_, upper_)
train_df['1stFlrSF'].plot.box()

In [None]:
# Drop rows on or beyond either 1stFlrSF fence, then re-plot the column.
outliers = train_df.loc[train_df['1stFlrSF'].le(lower_) | train_df['1stFlrSF'].ge(upper_)]
train_df.drop(index=outliers.index, inplace=True)
train_df['1stFlrSF'].plot.box()
plt.show()

In [None]:
# 2ndFlrSF against SalePrice.
plt.scatter(x='2ndFlrSF', y='SalePrice', data=train_df)

In [None]:
# Tukey fences for 2ndFlrSF (1.5 * IQR outside the quartiles) plus a box plot.
Q1, Q3 = train_df['2ndFlrSF'].quantile([0.25, 0.75]).to_numpy()
IQR = Q3 - Q1
lower_, upper_ = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR
print(lower_, upper_)
train_df['2ndFlrSF'].plot.box()

In [None]:
# Drop rows on or beyond either 2ndFlrSF fence, then re-plot the column.
outliers = train_df.loc[train_df['2ndFlrSF'].le(lower_) | train_df['2ndFlrSF'].ge(upper_)]
train_df.drop(index=outliers.index, inplace=True)
train_df['2ndFlrSF'].plot.box()
plt.show()

In [None]:
# GrLivArea against SalePrice.
plt.scatter(x='GrLivArea', y='SalePrice', data=train_df)


In [None]:
# Tukey fences for GrLivArea (1.5 * IQR outside the quartiles) plus a box plot.
Q1, Q3 = train_df['GrLivArea'].quantile([0.25, 0.75]).to_numpy()
IQR = Q3 - Q1
lower_, upper_ = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR
print(lower_, upper_)
train_df['GrLivArea'].plot.box()

In [None]:
# Drop rows on or beyond either GrLivArea fence, then re-plot the column.
outliers = train_df.loc[train_df['GrLivArea'].le(lower_) | train_df['GrLivArea'].ge(upper_)]
train_df.drop(index=outliers.index, inplace=True)
train_df['GrLivArea'].plot.box()
plt.show()

In [None]:
# BsmtFullBath against SalePrice.
plt.scatter(x='BsmtFullBath', y='SalePrice', data=train_df)

In [None]:
# BsmtHalfBath against SalePrice.
plt.scatter(x='BsmtHalfBath', y='SalePrice', data=train_df)


In [None]:
# FullBath against SalePrice.
plt.scatter(x='FullBath', y='SalePrice', data=train_df)


In [None]:
# BedroomAbvGr against SalePrice.
plt.scatter(x='BedroomAbvGr', y='SalePrice', data=train_df)

In [None]:
# TotRmsAbvGrd against SalePrice.
plt.scatter(x='TotRmsAbvGrd', y='SalePrice', data=train_df)

In [None]:
# Fireplaces against SalePrice.
plt.scatter(x='Fireplaces', y='SalePrice', data=train_df)

In [None]:
# GarageYrBlt against SalePrice.
plt.scatter(x='GarageYrBlt', y='SalePrice', data =train_df)

In [None]:
# Tukey fences for GarageYrBlt (1.5 * IQR outside the quartiles) plus a box plot.
Q1, Q3 = train_df['GarageYrBlt'].quantile([0.25, 0.75]).to_numpy()
IQR = Q3 - Q1
lower_, upper_ = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR
print(lower_, upper_)
train_df['GarageYrBlt'].plot.box()

In [None]:
# Drop rows on or beyond either GarageYrBlt fence, then re-plot the column.
outliers = train_df.loc[train_df['GarageYrBlt'].le(lower_) | train_df['GarageYrBlt'].ge(upper_)]
train_df.drop(index=outliers.index, inplace=True)
train_df['GarageYrBlt'].plot.box()
plt.show()

In [None]:
# GarageCars against SalePrice.
plt.scatter(x='GarageCars', y='SalePrice', data =train_df)

In [None]:
# Tukey fences for GarageCars (1.5 * IQR outside the quartiles) plus a box plot.
Q1, Q3 = train_df['GarageCars'].quantile([0.25, 0.75]).to_numpy()
IQR = Q3 - Q1
lower_, upper_ = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR
print(lower_, upper_)
train_df['GarageCars'].plot.box()

In [None]:
# Drop rows on or beyond either GarageCars fence, then re-plot the column.
outliers = train_df.loc[train_df['GarageCars'].le(lower_) | train_df['GarageCars'].ge(upper_)]
train_df.drop(index=outliers.index, inplace=True)
train_df['GarageCars'].plot.box()
plt.show()

In [None]:
# GarageArea against SalePrice.
plt.scatter(x='GarageArea', y='SalePrice', data =train_df)

In [None]:
# Tukey fences for GarageArea (1.5 * IQR outside the quartiles) plus a box plot.
Q1, Q3 = train_df['GarageArea'].quantile([0.25, 0.75]).to_numpy()
IQR = Q3 - Q1
lower_, upper_ = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR
print(lower_, upper_)
train_df['GarageArea'].plot.box()

In [None]:
# Drop rows on or beyond either GarageArea fence, then re-plot the column.
outliers = train_df.loc[train_df['GarageArea'].le(lower_) | train_df['GarageArea'].ge(upper_)]
train_df.drop(index=outliers.index, inplace=True)
train_df['GarageArea'].plot.box()
plt.show()

In [None]:
# WoodDeckSF against SalePrice.
plt.scatter(x='WoodDeckSF', y='SalePrice', data =train_df)

In [None]:
# Tukey fences for WoodDeckSF (1.5 * IQR outside the quartiles) plus a box plot.
Q1, Q3 = train_df['WoodDeckSF'].quantile([0.25, 0.75]).to_numpy()
IQR = Q3 - Q1
lower_, upper_ = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR
print(lower_, upper_)
train_df['WoodDeckSF'].plot.box()

In [None]:
# Drop rows on or beyond either WoodDeckSF fence, then re-plot the column.
outliers = train_df.loc[train_df['WoodDeckSF'].le(lower_) | train_df['WoodDeckSF'].ge(upper_)]
train_df.drop(index=outliers.index, inplace=True)
train_df['WoodDeckSF'].plot.box()
plt.show()

In [None]:
# OpenPorchSF against SalePrice.
plt.scatter(x='OpenPorchSF', y='SalePrice', data =train_df)


In [None]:
# Tukey fences for OpenPorchSF (1.5 * IQR outside the quartiles) plus a box plot.
Q1, Q3 = train_df['OpenPorchSF'].quantile([0.25, 0.75]).to_numpy()
IQR = Q3 - Q1
lower_, upper_ = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR
print(lower_, upper_)
train_df['OpenPorchSF'].plot.box()

In [None]:
# Drop rows on or beyond either OpenPorchSF fence, then re-plot the column.
outliers = train_df.loc[train_df['OpenPorchSF'].le(lower_) | train_df['OpenPorchSF'].ge(upper_)]
train_df.drop(index=outliers.index, inplace=True)
train_df['OpenPorchSF'].plot.box()
plt.show()

In [None]:
# PoolArea against SalePrice.
plt.scatter(x='PoolArea', y='SalePrice', data =train_df)


In [None]:
# YrSold against SalePrice.
plt.scatter(x='YrSold', y='SalePrice', data =train_df)

**Ещё раз проверяю корреляцию данных после удаления выбросов**

In [None]:
# Recompute the correlations with the target on the outlier-free training frame.
y = train_df['SalePrice']
numeric_data = train_df.select_dtypes([np.number])
numeric_data_mean = numeric_data.mean()
train_df.fillna(numeric_data_mean, inplace=True)
correlations = (
    train_df[numeric_data.columns]
    .corrwith(y)
    .sort_values(ascending=False)
)
correlations

In [None]:
# Bar chart of the refreshed correlations.
plot = sns.barplot(x=correlations, y=correlations.index, color='black')
plot.figure.set_size_inches(15, 10)
plot.set_xlabel('CORR COEF')
plot.set_ylabel('FEATURES')
# Features whose absolute correlation with the target is still below 0.2.
index_to_drop = correlations[correlations.abs().lt(0.2)]
print('Колонны у которых корелляция меньше чем 0.2')
index_to_drop

In [None]:
# Remove the weakly correlated columns from both frames, in place.
for frame in (train_df, test_df):
    frame.drop(columns=index_to_drop.index, inplace=True)
display(train_df.shape, test_df.shape)

## Меняю категориальные данные на числовые, используя OrdinalEncoder

In [None]:
from sklearn.preprocessing import OrdinalEncoder

categorical = train_df.select_dtypes(include=['object'])
encoder = OrdinalEncoder()
train_df[categorical.columns] = encoder.fit_transform(train_df[categorical.columns])
train_df

In [None]:
categorical_t = test_df.select_dtypes(include=['object'])
encoder = OrdinalEncoder()
test_df[categorical_t.columns] = encoder.fit_transform(test_df[categorical.columns])
test_df

In [None]:
# Number of missing values per training column.
train_df.isna().sum()

In [None]:
# Number of missing values per test column.
test_df.isna().sum()

In [None]:
# Drop every training column in which more than 30% of the values are missing.
# Columns with fewer gaps are kept so they can be imputed instead of discarded
# (a threshold of 0 would remove any column that has even a single NaN).
max_missing_pct = 30
missing_pct = train_df.isna().mean() * 100
train_df.drop(columns=missing_pct[missing_pct > max_missing_pct].index, inplace=True)
train_df.sample(15)

In [None]:
# Drop every test column in which more than 30% of the values are missing.
# Columns with fewer gaps are kept so they can be imputed instead of discarded
# (a threshold of 0 would remove any column that has even a single NaN).
max_missing_pct = 30
missing_pct = test_df.isna().mean() * 100
test_df.drop(columns=missing_pct[missing_pct > max_missing_pct].index, inplace=True)
test_df.sample(15)

Удалил столбцы, в которых больше 30% пустых ячеек.

In [None]:
# Replace the remaining gaps in each training column with that column's median.
train_df = train_df.fillna(train_df.median())
train_df.isna().sum()

In [None]:
# Replace the remaining gaps in each test column with that column's median.
test_df = test_df.fillna(test_df.median())
test_df.isna().sum()

In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV

# Separate the target from the predictors.
y = train_df['SalePrice']
X = train_df.drop(columns='SalePrice')
display(X, y)

In [None]:
# Features present in both the training predictors and the test frame, in X's order.
columns = [name for name in X.columns if name in test_df.columns]
X = train_df[columns]
X

In [None]:
# Remove every test column that is not a training feature.
# Collecting all of them first handles the "several extra columns" case (a loop that
# only remembers the last name drops just one) and the "no extra columns" case
# (an empty list is a no-op instead of a NameError).
extra_columns = [col for col in test_df.columns if col not in X.columns]
test_df.drop(columns=extra_columns, inplace=True)
test_df.shape

In [None]:
# Confirm that the training features and the test frame have the same set of columns.
same_columns = set(test_df.columns) == set(X.columns)
if same_columns:
    print(True)

**Делю данные на обучающую и тестовую выборки.**

In [None]:
# Hold out 30% of the rows for evaluation; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=11)

## Первая модель: применяю Random Forest

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Fixed seed so the forest, its score and its predictions are reproducible across runs
# (same seed value as the train/test split).
ranmodel = RandomForestRegressor(random_state=11)
ranmodel.fit(X_train, y_train)
y_pred = ranmodel.predict(X_test)
score_forest = ranmodel.score(X_test, y_test)
print(f'Скоринг модели Random Forest: {score_forest:.4f}')

## Модель по XGBoost

In [None]:
from xgboost import XGBRegressor

# Gradient boosting with default parameters, scored on the hold-out part.
xgb_model = XGBRegressor().fit(X_train, y_train)
y_pred_xgb = xgb_model.predict(X_test)
score_xgb = xgb_model.score(X_test, y_test)
print(f'Скоринг XGBoost модели: {score_xgb:.4f} ')

## Перевожу данные в один диапазон и применяю линейную регрессию

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Scale the features to [0, 1] for the linear models; the scaler is fitted on the
# training part only and then applied to the hold-out part.
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
# Store the scaled test features under a separate name: test_df itself must stay
# unscaled, because the random forest and XGBoost models were fitted on unscaled
# features and are later asked to predict on test_df.
test_df_scaled = scaler.transform(test_df)

display(X_train, X_test, test_df_scaled)

In [None]:
from sklearn.linear_model import LinearRegression

# Ordinary least squares on the scaled features, scored on the hold-out part.
regressor = LinearRegression().fit(X_train, y_train)
y_predr = regressor.predict(X_test)
linear_score = regressor.score(X_test, y_test)
print(f'Оценка линейной регрессиии: {linear_score}')

**Метрики MSE, MAE, RMSE, R2**

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Hold-out metrics for the linear regression: every metric is computed from y_predr
# (y_pred holds the random-forest predictions and must not be mixed in here).
mse = mean_squared_error(y_test, y_predr)
mae = mean_absolute_error(y_test, y_predr)
# RMSE as sqrt(MSE): the `squared=False` keyword is deprecated and removed in
# recent scikit-learn releases, while this form works on every version.
rmse = np.sqrt(mse)
r2r = r2_score(y_test, y_predr)
print(f' MSE: {mse}\n MAE: {mae}\n RMSE: {rmse}\n R2: {r2r}')

**Кросс валидация по RMSE**

In [None]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation of the linear regression; the scorer returns negated RMSE.
cv_scores = cross_val_score(
    estimator=regressor, X=X, y=y, scoring='neg_root_mean_squared_error', cv=5
)
fold_lines = "\n\t".join(f"{x:.4f}" for x in cv_scores)
print("Cross validation scores:\n\t", fold_lines)
print(f"Mean CV RMSE = {np.mean(-cv_scores):.4f}")

**Кросс валидация по R2**

In [None]:
# 5-fold cross-validated R2 of the linear regression.
cv_scoresr2 = cross_val_score(estimator=regressor, X=X, y=y, scoring="r2", cv=5)
fold_lines = "\n\t".join(f"{x:.4f}" for x in cv_scoresr2)
print("Оценки кросс валидации:\n\t", fold_lines)
print(f"Средняя кросс валидации R2 = {np.mean(cv_scoresr2):.4f}")

In [None]:
# Heat map of the pairwise correlations between the features in X.
sns.heatmap(X.corr())

Здесь зависимость между признаками сильная (мультиколлинеарность).
Можно применить регуляризацию.

**Регуляризация Ridge**

In [None]:
from sklearn.linear_model import Ridge, Lasso
from sklearn.model_selection import GridSearchCV

# Log-spaced grid of 20 regularisation strengths between 1e-2 and 1e3.
alpha = np.logspace(-2, 3, 20)
modelR = Ridge()
searcher = GridSearchCV(
    estimator=modelR,
    param_grid={'alpha': alpha},
    cv=5,
    scoring='neg_root_mean_squared_error',
).fit(X_train, y_train)

print('Лучшие параметры для Ridge: ', searcher.best_params_['alpha'])

In [None]:
# Refit Ridge with the selected alpha and predict the hold-out part.
best_model = Ridge(alpha=searcher.best_params_['alpha']).fit(X_train, y_train)
best_y = best_model.predict(X_test)

In [None]:
# Hold-out metrics for the tuned Ridge model.
mse = mean_squared_error(y_test, best_y)
mae = mean_absolute_error(y_test, best_y)
# RMSE as sqrt(MSE): the `squared=False` keyword is deprecated and removed in
# recent scikit-learn releases, while this form works on every version.
rmse = np.sqrt(mse)
r2 = r2_score(y_test, best_y)
print(f' MSE: {mse}\n MAE: {mae}\n RMSE: {rmse}\n R2: {r2}')

In [None]:
# Same alpha grid search as for Ridge, this time for Lasso.
modelL = Lasso()
searcher = GridSearchCV(
    estimator=modelL,
    param_grid={'alpha': alpha},
    scoring='neg_root_mean_squared_error',
    cv=5,
).fit(X_train, y_train)
best_alphaL = searcher.best_params_['alpha']
print(f'Лучший параметр для Lasso: {best_alphaL}')

In [None]:
# Show the estimator selected by the most recent grid search.
print(searcher.best_estimator_)

In [None]:
# Refit Lasso with the selected alpha and predict the hold-out part.
modelL = Lasso(alpha=best_alphaL).fit(X_train, y_train)
best_yL = modelL.predict(X_test)

In [None]:
# Hold-out metrics for the tuned Lasso model.
mse = mean_squared_error(y_test, best_yL)
mae = mean_absolute_error(y_test, best_yL)
# RMSE as sqrt(MSE): the `squared=False` keyword is deprecated and removed in
# recent scikit-learn releases, while this form works on every version.
rmse = np.sqrt(mse)
r2 = r2_score(y_test, best_yL)
print("Метрики lasso")
print(f' MSE: {mse}\n MAE: {mae}\n RMSE: {rmse}\n R2: {r2}')

**Кросс валидация для Ridge**

In [None]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler

# The Ridge alpha was tuned on MinMax-scaled features, while X is unscaled. Scaling
# inside a pipeline evaluates the model on the feature scale its alpha was chosen
# for, and refits the scaler on each training fold so no fold leaks into another.
# cross_val_score clones the pipeline, so best_model itself is not refitted.
ridge_cv_model = make_pipeline(MinMaxScaler(), best_model)
cv_scoresR = cross_val_score(ridge_cv_model, X, y, cv=5, scoring='r2')
print("Оценки кросс валидации Ridge:\n\t", "\n\t".join("%.4f" % x for x in cv_scoresR))
print("Средняя кросс валидации Ridge R2 = %.4f" % np.mean(cv_scoresR))

**Кросс валидация для Lasso** 

In [None]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler

# The Lasso alpha was tuned on MinMax-scaled features, while X is unscaled. Scaling
# inside a pipeline evaluates the model on the feature scale its alpha was chosen
# for, and refits the scaler on each training fold so no fold leaks into another.
# cross_val_score clones the pipeline, so modelL itself is not refitted.
lasso_cv_model = make_pipeline(MinMaxScaler(), modelL)
cv_scoresL = cross_val_score(lasso_cv_model, X, y, cv=5, scoring='r2')
print("Оценки кросс валидации Lasso:\n\t", "\n\t".join("%.4f" % x for x in cv_scoresL))
print("Средння кросс валидации Lasso R2 = %.4f" % np.mean(cv_scoresL))

In [None]:
# 5-fold cross-validated R2 of the random forest on the unscaled features.
cv_scores_forest = cross_val_score(ranmodel, X, y, cv=5, scoring='r2')
print("Оценки кросс валидации Ranmodel:\n\t", "\n\t".join("%.4f" % x for x in cv_scores_forest))
# The summary line names the random forest (it used to be labelled "Lasso").
print("Средняя кросс валидации Random Forest R2 = %.4f" % np.mean(cv_scores_forest))

In [None]:
# 5-fold cross-validated R2 of the XGBoost model on the unscaled features.
cv_scores_xgb = cross_val_score(xgb_model, X, y, cv=5, scoring='r2')
# Both lines name XGBoost (they used to be labelled "Ranmodel" and "Lasso").
print("Оценки кросс валидации XGBoost:\n\t", "\n\t".join("%.4f" % x for x in cv_scores_xgb))
print("Средняя кросс валидации XGBoost R2 = %.4f" % np.mean(cv_scores_xgb))

Предсказание по тестовому файлу, для которого значения y неизвестны.

In [None]:
# Random-forest predictions for the unlabeled test frame.
# NOTE(review): target_y is reassigned by the XGBoost prediction cell that follows,
# so these values are only displayed here.
target_y = ranmodel.predict(test_df)
target_y

In [None]:
# XGBoost predictions for the unlabeled test frame; this is the target_y that is exported.
target_y = xgb_model.predict(test_df)
target_y

In [None]:
# Pair every saved test Id with its predicted price.
output = pd.DataFrame({'Id': id_test}).assign(SalePrice=target_y)
output

In [None]:
# Write the Id/SalePrice pairs to a CSV file without the index column.
output.to_csv('Predictionpriceregmodels.csv', index=False)