In [None]:
import mpl_toolkits
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

%matplotlib inline
import matplotlib.pyplot as plt  # Matlab-style plotting
import seaborn as sns
color = sns.color_palette()
sns.set_style('darkgrid')

import warnings
def ignore_warn(*args, **kwargs):
    """No-op stand-in for ``warnings.warn``: accepts any arguments and returns None."""
    return None


# Silence the noisy sklearn/seaborn warnings by replacing the module-level hook.
# NOTE: this is process-wide; every later ``warnings.warn(...)`` call does nothing.
setattr(warnings, "warn", ignore_warn)


from scipy import stats
from scipy.stats import norm, skew #for some statistics

## Import Data

In [None]:
# Read the three competition CSV files from the Kaggle input directory.
# The absolute paths below only resolve inside a Kaggle kernel with this dataset attached.
train=pd.read_csv("/kaggle/input/tunisian-house-prices-prediction/train.csv")
test=pd.read_csv("/kaggle/input/tunisian-house-prices-prediction/test.csv")
Sample_submission=pd.read_csv("/kaggle/input/tunisian-house-prices-prediction/Sample_submission.csv")

In [None]:
test.head()

In [None]:
train.info()

The training set has a large number of missing values that we must deal with. We should probably remove the age feature because
it contains a lot of missing values.

In [None]:
train.describe()

From here, we may identify certain outliers in several features such as distance to capital and number of rooms and bathrooms that may provide incorrect information to our model. 
We may require certain plots to discover and deal with outliers.

## Data visualization 

### Dealing with outliers

In [None]:
train.boxplot(column =['price_tnd'], grid = False)

In [None]:
# Number of listings priced above 2,000,000 (printed as a one-element shape tuple)
n_above = int((train['price_tnd'] > 2_000_000).sum())
print((n_above,))

In [None]:
# Number of listings priced below 50,000 (printed as a one-element shape tuple)
n_below = int((train['price_tnd'] < 50_000).sum())
print((n_below,))

In [None]:
# Deleting outliers: remove the listings priced below 50,000.
# No upper cut is applied; very expensive listings are kept on purpose.
too_cheap = train['price_tnd'] < 50_000
Drop_X_train1 = train.drop(index=train.loc[too_cheap].index)

We note that the target is badly distributed (heavily skewed), but we can't delete the values above 0.2e+07 because such prices can exist in the real Tunisian market;
we can, however, delete those below 0.5e+05.

In [None]:
# Price against distance to the capital, used to spot implausible distances.
fig, ax = plt.subplots()
ax.scatter(x=Drop_X_train1['distance_to_capital'], y=Drop_X_train1['price_tnd'])
# Label each axis with the column that is actually plotted on it.
ax.set_ylabel('price_tnd', fontsize=13)
ax.set_xlabel('distance_to_capital', fontsize=13)
plt.show()


By car, the distance between Tunis and Tataouine (Tunisia's most distant city) is 536.8 kilometers. Any value well above this is considered an anomaly; in the next cell we use a cutoff of 600 km.

In [None]:
# Deleting outliers: remove the listings located more than 600 (distance_to_capital) from the capital.
too_far = Drop_X_train1['distance_to_capital'] > 600
Drop_X_train1 = Drop_X_train1.drop(index=Drop_X_train1.loc[too_far].index)

# Check the graphic again
fig, ax = plt.subplots()
ax.scatter(x=Drop_X_train1['distance_to_capital'], y=Drop_X_train1['price_tnd'])
# Label each axis with the column that is actually plotted on it.
ax.set_ylabel('price_tnd', fontsize=13)
ax.set_xlabel('distance_to_capital', fontsize=13)
plt.show()


In [None]:
# Price against number of rooms, used to spot implausible room counts.
fig, ax = plt.subplots()
ax.scatter(x=Drop_X_train1['room'], y=Drop_X_train1['price_tnd'])
# Label each axis with the column that is actually plotted on it.
ax.set_ylabel('price_tnd', fontsize=13)
ax.set_xlabel('room', fontsize=13)
plt.show()

In [None]:
# Deleting outliers: listings with more than 14 rooms that are priced below 5,000,000.
many_rooms_low_price = (Drop_X_train1['room'] > 14) & (Drop_X_train1['price_tnd'] < 5_000_000)
Drop_X_train1 = Drop_X_train1.drop(index=Drop_X_train1.loc[many_rooms_low_price].index)

# Re-plot to check the effect of the removal.
fig, ax = plt.subplots()
ax.scatter(x=Drop_X_train1['room'], y=Drop_X_train1['price_tnd'])
# Label each axis with the column that is actually plotted on it.
ax.set_ylabel('price_tnd', fontsize=13)
ax.set_xlabel('room', fontsize=13)
plt.show()

In [None]:
# Rooms against bathrooms: the two counts are expected to move together.
fig, ax = plt.subplots()
ax.scatter(x=Drop_X_train1['bathroom'], y=Drop_X_train1['room'])
ax.set_xlabel('bathroom ', fontsize=13)
ax.set_ylabel('room', fontsize=13)
plt.show()

A house's number of rooms and bathrooms should be roughly proportional, so we use this graph to look for outliers, for example listings where the number of bathrooms exceeds the number of rooms.

In [None]:
# Deleting outliers, rule 1: rows where room < bathroom + 1.
rooms_below_baths = Drop_X_train1['room'] < Drop_X_train1['bathroom'] + 1
Drop_X_train1 = Drop_X_train1.drop(index=Drop_X_train1.loc[rooms_below_baths].index)

# Rule 2 (evaluated on the frame left by rule 1): more than 10 rooms but fewer than 5 bathrooms.
many_rooms_few_baths = (Drop_X_train1['room'] > 10) & (Drop_X_train1['bathroom'] < 5)
Drop_X_train1 = Drop_X_train1.drop(index=Drop_X_train1.loc[many_rooms_few_baths].index)



In [None]:
# Compare the 'pieces' column with the sum of rooms and bathrooms.
rooms_plus_bathrooms = Drop_X_train1['room'] + Drop_X_train1['bathroom']
fig, ax = plt.subplots()
ax.scatter(x=Drop_X_train1['pieces'], y=rooms_plus_bathrooms)
ax.set_xlabel('pieces ', fontsize=13)
ax.set_ylabel('room+bathroom', fontsize=13)
plt.show()

In [None]:
Drop_X_train1.boxplot(column =['Area'], grid = False)

There are a lot of outliers in the Area feature, so eliminating them would result in a significant loss of data. Instead, we consider another option: binning the Area feature into categories.

In [None]:
# Bin Area into five ordered categories: (0,100], (100,250], (250,500], (500,1000], (1000,inf).
area_bin_edges = [0., 100, 250, 500, 1000, np.inf]
area_bin_labels = [1, 2, 3, 4, 5]
Drop_X_train1["Area_Cat"] = pd.cut(
    Drop_X_train1["Area"],
    bins=area_bin_edges,
    labels=area_bin_labels,
)

In [None]:
Drop_X_train1["Area_Cat"].hist()

### Handling Text and Categorical Attributes

In [None]:
# Get list of categorical variables
s = (Drop_X_train1.dtypes == 'object')
object_cols = list(s[s].index)

print("Categorical variables:")
print(object_cols)

In [None]:
Drop_X_train1.describe(include="O")

In [None]:
print(Drop_X_train1.city.nunique())
print(Drop_X_train1.city.unique())

##### Location

In [None]:
print(Drop_X_train1.location.nunique())
print(Drop_X_train1.location.unique())

In [None]:
# Bar chart of the 60 most frequent locations, each bar annotated with its share of all listings.
fig, ax = plt.subplots(figsize=(20, 10))
total = float(len(Drop_X_train1["location"]))
top_locations = Drop_X_train1.location.value_counts().index[:60]
sns.countplot(data=Drop_X_train1, y='location', order=top_locations, ax=ax)
for p in ax.patches:
    # Horizontal bars: the bar width is the count for that location.
    percentage = '{:.1f}%'.format(100 * p.get_width() / total)
    x = p.get_x() + p.get_width() + 0.02
    y = p.get_y() + p.get_height() / 2
    ax.annotate(percentage, (x, y))

# No figure is opened after show(), so no stray empty figure is rendered.
plt.show()


In [None]:
# Rank locations by mean price (highest first) and plot the prices 40 locations at a time.
location_price = (
    Drop_X_train1.groupby(['location'])['price_tnd']
    .mean()
    .sort_values(ascending=False)
)

PAGE = 40
for start in range(0, len(location_price), PAGE):
    page_locations = location_price.index[start:start + PAGE]
    on_page = Drop_X_train1.location.isin(page_locations)
    sns.catplot(data=Drop_X_train1[on_page], x="price_tnd", y="location", order=page_locations, jitter=False)


This graph is very interesting: it demonstrates how much location matters here. Tunisia can be divided into three types of living environments: luxurious, normal, and low-value. However, some locations have few listings and one or two very expensive homes, which can distort these results. Consequently, using the mean or the median alone to categorize these sites is not ideal, so I experimented with the median multiplied by the number of listings.

In [None]:
# Rank locations by (median price x number of listings) and plot 40 locations at a time.
location_counts = Drop_X_train1['location'].value_counts()
location_price = (
    Drop_X_train1.groupby(['location'])['price_tnd']
    .describe()[['25%', '50%', '75%']]
    .assign(counts=location_counts)
    .assign(combined=lambda quartiles: quartiles['50%'] * quartiles['counts'])
    .sort_values('combined', ascending=False)
)

PAGE = 40
for start in range(0, len(location_price), PAGE):
    page_locations = location_price.index[start:start + PAGE]
    on_page = Drop_X_train1.location.isin(page_locations)
    sns.catplot(data=Drop_X_train1[on_page], x="price_tnd", y="location", order=page_locations, jitter=False)



That is really promising. As the plots show, the first locations are well known in Tunisia for their luxurious, high-value properties, so we may encode this attribute by binning it into four bins: luxurious places, high-value locations, average locations, and low-value locations.

In [None]:
Drop_X_train1[Drop_X_train1["location"]=="Ksar Hellal"]

There is a home in Ksar Hellal (Monastir) listed for 19000000 TND. I initially thought it was an outlier, but I checked online and the listing does exist on mubawab.tn. Still, I believe we should look at the target distribution.
- We can also see NaN values in the city column. In this case, I think we may replace the missing city with the governorate or the location. We could also remove this feature, because we already have the location and the governorate.

##### governorate

In [None]:
#category column
print(Drop_X_train1.governorate.nunique())
print(Drop_X_train1.governorate.unique())

In [None]:
plt.figure(figsize=(10, 6))
sns.countplot(data = Drop_X_train1, y = 'governorate', order = Drop_X_train1.governorate.value_counts().index)

In [None]:
# Order governorates by (mean price x number of listings), highest first, and plot their prices.
mean_price = Drop_X_train1.groupby(['governorate'])['price_tnd'].mean()
location_counts = Drop_X_train1.governorate.value_counts()
mean_price_times_location_counts = (mean_price * location_counts).sort_values(ascending=False)
sns.catplot(
    data=Drop_X_train1,
    x="price_tnd",
    y="governorate",
    order=mean_price_times_location_counts.index,
    jitter=False,
)

The governorate feature is an important categorical variable that can have a significant impact on the price of the house. Target Encoding can be used to capture the relationship between the governorate and the house price by replacing the categorical values with their average target value (house price in this case). This encoding method allows the model to capture the information contained in the categorical variables while maintaining the interpretability of the model.

Overall, target encoding is an effective method to handle categorical variables and improve the predictive performance of a model. By considering both the frequency of a governorate and its average target value, target encoding provides a more robust representation of the relationship between the governorate and the house price, which can improve the accuracy of the prediction.

In [None]:
#category column
print(Drop_X_train1.age.nunique())
print(Drop_X_train1.age.unique())

In [None]:
# Order age categories by (mean price x number of listings), highest first, and plot their prices.
mean_price_age = Drop_X_train1.groupby(['age'])['price_tnd'].mean()
age_counts = Drop_X_train1.age.value_counts()
mean_price_age_counts = (mean_price_age * age_counts).sort_values(ascending=False)
sns.catplot(
    data=Drop_X_train1,
    x="price_tnd",
    y="age",
    order=mean_price_age_counts.index,
    jitter=False,
)

We may divide this feature into new and old houses, but the age feature has 2716 instances. So we'll have to experiment with both dropping and encoding this feature to determine which produces the best results.

#### Features distribution and missing values

In [None]:
# Get names of columns with missing values
cols_with_missing = [col for col in Drop_X_train1.columns
                     if Drop_X_train1[col].isnull().any()]
print(Drop_X_train1.isnull().sum())

In [None]:
# !pip install missingno
import missingno as msno
import matplotlib.pyplot as plt



In [None]:
# Visualize missing values using missingno
msno.matrix(Drop_X_train1)
plt.show()

In [None]:
msno.heatmap(Drop_X_train1, cmap='YlGnBu')
plt.show()


In [None]:
# Histograms of the listing state and the amenity indicator columns.
columns = [
    'state', 'garage', 'garden', 'concierge',
    'beach_view', 'mountain_view', 'pool', 'elevator',
    'furnished', 'equipped_kitchen', 'central_heating', 'air_conditioning',
]
Drop_X_train1.loc[:, columns].hist(figsize=(16, 10));

These binary variables indicate the presence of additional features in the house. We should remove the garden feature because it contains only zeros. We may also add a feature that is the sum of all of these indicators. Depending on how important each indicator is for the model, we may weight it with a coefficient in that sum.

In [None]:
# Create a figure and axis for the plot
fig, ax = plt.subplots(4, 3, figsize=(18, 15))

# Flatten the axis array to make it easier to loop through
ax = ax.flatten()

# Define the variables of interest
variables = ['state', 'garage', 'garden', 'concierge', 'beach_view', 'mountain_view', 'pool', 'elevator', 'furnished', 'equipped_kitchen', 'central_heating', 'air_conditioning']

# Loop through each variable and plot its effect on price_tnd
for i, var in enumerate(variables):
    sns.boxplot(x=var, y="price_tnd", data=Drop_X_train1, ax=ax[i])

# Display the plot
plt.show()


We can't see the distribution of the target clearly in these plots.

#### Target Variable

In [None]:
# Distribution of the target with a fitted normal curve, followed by a Q-Q plot.
prices = Drop_X_train1['price_tnd']

# Parameters (mean, standard deviation) of the normal distribution fitted to the target
(mu, sigma) = norm.fit(prices)
print( '\n mu = {:.2f} and sigma = {:.2f}\n'.format(mu, sigma))

# sns.histplot replaces the deprecated sns.distplot; stat="density" puts the
# histogram on the same scale as the fitted pdf drawn on top of it.
fig, ax = plt.subplots()
sns.histplot(prices, stat="density", kde=True, ax=ax)
grid = np.linspace(*ax.get_xlim(), 200)
# Raw string: '\m' and '\s' are invalid escape sequences in a normal string literal.
ax.plot(grid, norm.pdf(grid, mu, sigma), color="black",
        label=r'Normal dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f} )'.format(mu, sigma))
ax.legend(loc='best')
ax.set_ylabel('Frequency')
ax.set_title('price_tnd distribution')

#Get also the QQ-plot
fig = plt.figure()
res = stats.probplot(prices, plot=plt)
plt.show()

As we can see, the target distribution is strongly skewed: most prices are concentrated at the low end, with a long tail of expensive listings. One approach in this case is to apply the log function to the target. To avoid problems with zero values, we use log(1 + x).

In [None]:
#We use the numpy fuction log1p which  applies log(1+x) to all elements of the column
Drop_X_train1["price_tnd"] = np.log1p(Drop_X_train1["price_tnd"])

#Check the new distribution 
sns.distplot(Drop_X_train1['price_tnd'] , fit=norm);

# Get the fitted parameters used by the function
(mu, sigma) = norm.fit(Drop_X_train1['price_tnd'])
print( '\n mu = {:.2f} and sigma = {:.2f}\n'.format(mu, sigma))

#Now plot the distribution
plt.legend(['Normal dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f} )'.format(mu, sigma)],
            loc='best')
plt.ylabel('Frequency')
plt.title('SalePrice distribution')

#Get also the QQ-plot
fig = plt.figure()
res = stats.probplot(Drop_X_train1['price_tnd'], plot=plt)
plt.show()

In [None]:
Drop_X_train1.boxplot(column =['price_tnd'], grid = False)

In [None]:
# Create a figure and axis for the plot
fig, ax = plt.subplots(4, 3, figsize=(18, 15))

# Flatten the axis array to make it easier to loop through
ax = ax.flatten()

# Define the variables of interest
variables = ['state', 'garage', 'garden', 'concierge', 'beach_view', 'mountain_view', 'pool', 'elevator', 'furnished', 'equipped_kitchen', 'central_heating', 'air_conditioning']

# Loop through each variable and plot its effect on price_tnd
for i, var in enumerate(variables):
    sns.boxplot(x=var, y="price_tnd", data=Drop_X_train1, ax=ax[i])

# Display the plot
plt.show()


This is far better. We can now see how important each feature is.

- **state**: the state of a listing, given as one of 3 values: 1 indicates a normal state, 2 indicates it requires renovation, and 0 indicates it is brand new. We can observe that listings with state 0 have a higher price.
- **pool**: the pool also appears interesting, as it tells us about how luxurious (and therefore how expensive) the house is.
- An equipped kitchen, central heating, air conditioning, furniture, and a garage also matter, each to a different degree.
- However, we can see that mountain view, elevator, and concierge do not give us useful information about the house price.

We may generate a total of these features by assigning a coefficient to each feature to indicate its relevance to the target.

In [None]:
sns.boxplot(x="Area_Cat", y="price_tnd", data=Drop_X_train1)
plt.show()

As you can see, the area category gives the model a very good indication of the price of the property.

In [None]:
#Correlation map to see how features are correlated with SalePrice
corrmat = Drop_X_train1.corr()
plt.subplots(figsize=(12,9))
sns.heatmap(corrmat, vmax=0.9, square=True)

In [None]:
# most correlated features
corrmat = Drop_X_train1.corr()
top_corr_features = corrmat.index[abs(corrmat["price_tnd"])>0.15]
plt.figure(figsize=(10,10))
g = sns.heatmap(Drop_X_train1[top_corr_features.drop("Id")].corr(),annot=True,cmap="RdYlGn")

In [None]:
Drop_X_train1.columns

In [None]:
Num_features=['Area', 'pieces','room', 'bathroom','price_tnd']

In [None]:
# Listings plotted by longitude/latitude and coloured by price_tnd.
Drop_X_train1.plot.scatter(
    x="long",
    y="latt",
    c="price_tnd",
    cmap=plt.get_cmap("jet"),
    colorbar=True,
    alpha=1,
    figsize=(6, 8),
)

This looks like Map of Tunisia  all right, but other than that it is hard to see any particular
pattern.


**Let's put an end to visualisation. It is now time to process the data.**

## Data Cleaning 

In [None]:
train_copy = train.copy()
test_copy = test.copy()

In [None]:
test.columns

#### Deleting outliers

In [None]:
# train_copy = train_copy.drop(train_copy[(train_copy['room']>14)  & (train_copy['price_tnd']<0.50000e+07)].index)
# train_copy = train_copy.drop(train_copy[(train_copy['room']<train_copy['bathroom']+1)].index)
# train_copy = train_copy.drop(train_copy[(train_copy['room']>10)  & (train_copy['bathroom']<5)].index)


#### Missing values

In [None]:
# Remove the 'age' column from the training copy.
train_copy = train_copy.drop(columns="age")

In [None]:
# Remove the 'age' column from the test copy as well.
test_copy = test_copy.drop(columns="age")

In [None]:
# Drop the instances with missing values in each of latt,Long and Area
train_copy=train_copy.dropna(subset=['latt','long','Area'])

In [None]:
# Replace missing values in the city feature with the corresponding location.
# Assign the result back instead of calling fillna(inplace=True) on the selected
# column: that chained in-place form does not update train_copy under pandas Copy-on-Write.
train_copy['city'] = train_copy['city'].fillna(train_copy['location'])


##### fills missing values in the "pieces", "room", and "bathroom" features by imputing the mean value of the corresponding feature for each unique value of the "Area" feature. 

In [None]:
unique_areas = train_copy['Area'].unique()

In [None]:
# For every distinct Area value, the mean of pieces / room / bathroom over the rows with that Area.
# A mean is NaN when all rows of that Area have the column missing.
rows_by_area = {area: train_copy[train_copy['Area'] == area] for area in unique_areas}

pieces_mean = {area: rows['pieces'].mean() for area, rows in rows_by_area.items()}
rooms_mean = {area: rows['room'].mean() for area, rows in rows_by_area.items()}
bathrooms_mean = {area: rows['bathroom'].mean() for area, rows in rows_by_area.items()}

In [None]:
# First pass: fill each missing pieces / room / bathroom value with the mean of that
# column among the rows sharing the same Area (vectorised instead of iterrows + .at).
# Values stay NaN where the per-Area mean itself is NaN.
area_of_row = train_copy['Area']
for column, means_by_area in (
    ('pieces', pieces_mean),
    ('room', rooms_mean),
    ('bathroom', bathrooms_mean),
):
    train_copy[column] = train_copy[column].fillna(area_of_row.map(means_by_area))
    

In [None]:
# Second pass for values still missing: borrow from a related column of the same Area
# (vectorised instead of iterrows + .at):
#   pieces   <- mean number of rooms
#   room     <- mean number of pieces
#   bathroom <- half the mean number of rooms
area_of_row = train_copy['Area']
train_copy['pieces'] = train_copy['pieces'].fillna(area_of_row.map(rooms_mean))
train_copy['room'] = train_copy['room'].fillna(area_of_row.map(pieces_mean))
train_copy['bathroom'] = train_copy['bathroom'].fillna(area_of_row.map(rooms_mean) / 2)
    

In [None]:
# The imputed means are fractional; round the three count columns.
for count_col in ('bathroom', 'room', 'pieces'):
    train_copy[count_col] = train_copy[count_col].round()

##### Using K-means to fill the missing values of state

In [None]:
from sklearn.cluster import KMeans

# Fill the missing state values with the column mean, then snap every value to one of
# three clusters. Assigning back (rather than fillna(inplace=True) on the selected
# column) keeps this working under pandas Copy-on-Write.
state_filled = train_copy["state"].fillna(train_copy["state"].mean())
state_values = state_filled.to_numpy().reshape(-1, 1)

# random_state makes the clustering reproducible across runs.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=0).fit(state_values)

# K-means cluster ids are arbitrary. Relabel each cluster by the rank of its centre
# (lowest centre -> 0, highest -> 2) so the resulting state keeps its ordinal meaning.
center_rank = np.argsort(np.argsort(kmeans.cluster_centers_.ravel()))
train_copy["state"] = center_rank[kmeans.predict(state_values)]



In [None]:
# Forward-fill whatever is still missing (each gap takes the value from the previous row).
# DataFrame.ffill() replaces the deprecated fillna(method="ffill").
train_copy = train_copy.ffill()

In [None]:
train_copy["state"].unique()

In [None]:
train_copy.info()

#### Categorical variables

In [None]:
# Per-location price quartiles, listing counts, and the score used for encoding:
# combined = median price x number of listings (sorted highest first).
location_counts = train_copy['location'].value_counts()
location_price = (
    train_copy.groupby(['location'])['price_tnd']
    .describe()[['25%', '50%', '75%']]
    .assign(counts=location_counts)
    .assign(combined=lambda quartiles: quartiles['50%'] * quartiles['counts'])
    .sort_values('combined', ascending=False)
)

# Encode each training row's location as log(1 + combined score of that location).
train_copy['location_encoded'] = np.log1p(train_copy['location'].map(location_price['combined']))



# # Create a numerical value to represent the order of the locations
# location_order = location_price['combined'].argsort().argsort()

# # Define the bin edges
# bin_edges = [0, 101, 151, 201 , 225, 245, 265, 285,300,315, 325, 332,338,341, 342]

# # Create the bins based on the order of the locations
# location_price['bins'] = pd.cut(location_order, bin_edges, labels=False, right=False)



# # Create a mapping from location to bin number
# location_to_bin = dict(zip(location_price.index, location_price['bins']))

# # Map the locations in the original dataframe to their respective bins
# train_copy['location_bin'] = train_copy['location'].map(location_to_bin)


In [None]:
# Per-governorate score: mean price x number of listings (sorted highest first).
mean_price = train_copy.groupby(['governorate'])['price_tnd'].mean()
location_counts = train_copy.governorate.value_counts()
mean_price_times_location_counts = (mean_price * location_counts).sort_values(ascending=False)

# Encode each training row's governorate as log(1 + score of that governorate).
train_copy['governorate_encoded'] = np.log1p(
    train_copy['governorate'].map(mean_price_times_location_counts)
)

In [None]:
# Per-city price quartiles, listing counts, and the score used for encoding:
# combined = median price x number of listings (sorted highest first).
City_counts = train_copy['city'].value_counts()
City_price = (
    train_copy.groupby(['city'])['price_tnd']
    .describe()[['25%', '50%', '75%']]
    .assign(counts=City_counts)
    .assign(combined=lambda quartiles: quartiles['50%'] * quartiles['counts'])
    .sort_values('combined', ascending=False)
)

# Encode each training row's city as log(1 + combined score of that city).
train_copy['city_encoded'] = np.log1p(train_copy['city'].map(City_price['combined']))

In [None]:
# # Create a mapping from location to bin number
# city_to_bin = dict(zip(City_price.index, City_price['bins']))

# # Map the locations in the original dataframe to their respective bins
# train_copy['city_bin'] = train_copy['city'].map(city_to_bin)

# # Create a mapping from location to bin number
# city_to_bin = dict(zip(City_price.index, City_price['bins']))

# # Map the locations in the original dataframe to their respective bins
# train_copy['city_bin'] = train_copy['city'].map(city_to_bin)


In [None]:
# The raw text columns are no longer needed once their encoded versions exist.
bad_cols = ['governorate', 'location', 'city']
train_copy = train_copy.drop(columns=bad_cols)

In [None]:
# Apply the encodings learned on the training set to the test set, using the same
# log(1 + x) transform. A category that never appears in the training set maps to NaN.

# Training rows had their missing city replaced by the location before the city scores
# were computed; do the same here so those test rows do not end up with a NaN city_encoded.
test_city = test_copy['city'].fillna(test_copy['location'])

test_copy['location_encoded'] = np.log1p(test_copy['location'].map(location_price['combined']))
test_copy['governorate_encoded'] = np.log1p(test_copy['governorate'].map(mean_price_times_location_counts))
test_copy['city_encoded'] = np.log1p(test_city.map(City_price['combined']))



In [None]:
# Drop the raw text columns from the test copy too.
test_copy = test_copy.drop(columns=bad_cols)

In [None]:
test_copy.info()

#### Feature engineering 

In [None]:
train_copy.drop("garden",inplace=True, axis=1)


In [None]:
# Bin Area into five ordered categories: (0,100], (100,250], (250,500], (500,1000], (1000,inf).
area_bin_edges = [0., 100, 250, 500, 1000, np.inf]
area_bin_labels = [1, 2, 3, 4, 5]
train_copy["Area_Cat"] = pd.cut(
    train_copy["Area"],
    bins=area_bin_edges,
    labels=area_bin_labels,
)

In [None]:
#We use the numpy fuction log1p which  applies log(1+x) to all elements of the column
train_copy["price_tnd"] = np.log1p(train_copy["price_tnd"])

In [None]:
# Drop 'garden' and bin Area into the five categories
# (0,100], (100,250], (250,500], (500,1000], (1000,inf).
test_copy = test_copy.drop(columns="garden")
area_bin_edges = [0., 100, 250, 500, 1000, np.inf]
area_bin_labels = [1, 2, 3, 4, 5]
test_copy["Area_Cat"] = pd.cut(
    test_copy["Area"],
    bins=area_bin_edges,
    labels=area_bin_labels,
)


In [None]:
# Amenity score for the test set: number of amenities present minus the state value.
Plus_features = [
    'garage', 'beach_view', 'mountain_view', 'pool',
    'furnished', 'equipped_kitchen', 'central_heating', 'air_conditioning',
]
amenity_total = test_copy[Plus_features].sum(axis=1)
test_copy["Plus_features"] = amenity_total - test_copy["state"]
test_copy[Plus_features + ["Plus_features"]].head(10)


In [None]:
# Amenity score for the training set: number of amenities present minus the state value.
Plus_features = [
    'garage', 'beach_view', 'mountain_view', 'pool',
    'furnished', 'equipped_kitchen', 'central_heating', 'air_conditioning',
]
amenity_total = train_copy[Plus_features].sum(axis=1)
train_copy["Plus_features"] = amenity_total - train_copy["state"]
train_copy[Plus_features + ["Plus_features"]].head(10)

##### Scaling  numerical features

In [None]:
train_copy.columns

In [None]:
# Every column whose dtype is not 'object' is treated as a numerical feature here.
non_object = train_copy.dtypes != "object"
numeric_feats = non_object[non_object].index

# Skewness of each of those columns (missing values ignored), most positive first.
skewed_feats = (
    train_copy[numeric_feats]
    .apply(lambda column: skew(column.dropna()))
    .sort_values(ascending=False)
)
print("\nSkew in numerical features: \n")
skewness = skewed_feats.to_frame(name='Skew')
skewness.head(20)

In [None]:
# Skewness of every numerical column (missing values ignored), in column order.
skewed_feats = pd.Series(
    {feature: skew(train_copy[feature].dropna()) for feature in numeric_feats}
)

# Histogram (with KDE) of those skewness values.
sns.histplot(data=skewed_feats, bins=50, kde=True)

In [None]:
# Keep only the features whose absolute skewness is greater than 1.
is_highly_skewed = skewness["Skew"].abs() > 1
skewness = skewness[is_highly_skewed]
print("There are {} skewed numerical features to Box Cox transform".format(skewness.shape[0]))

from scipy.special import boxcox1p
skewed_features = skewness.index

In [None]:
# Box-Cox transform (of 1 + x, lambda = 0.15) of the highly skewed features,
# applied identically to the training and the test frame.
lam = 0.15
for feat in skewed_features:
    # Skip columns that exist only in the training frame (such as the target), which
    # would raise KeyError on test_copy, and non-numeric columns (such as the
    # categorical Area_Cat), which boxcox1p cannot process.
    if feat not in test_copy.columns or not pd.api.types.is_numeric_dtype(train_copy[feat]):
        continue
    train_copy[feat] = boxcox1p(train_copy[feat], lam)
    test_copy[feat] = boxcox1p(test_copy[feat], lam)
    
#all_data[skewed_features] = np.log1p(all_data[skewed_features])

In [None]:
train_copy

In [None]:
# Num_Col=[ 'Area', 'latt','long', 'distance_to_capital', 'diag_coord']
# from sklearn.preprocessing import StandardScaler
# sc = StandardScaler()
# train_copy[Num_Col]=sc.fit_transform(train_copy[Num_Col])
# test_copy[Num_Col]=sc.fit_transform(test_copy[Num_Col])

In [None]:
test_copy

In [None]:
# Correlation map to see how the features are correlated with price_tnd.
# Restrict to numeric/bool columns so DataFrame.corr() never meets a non-numeric
# column (for example the categorical Area_Cat).
numeric_part = train_copy.select_dtypes(include=["number", "bool"])
corrmat = numeric_part.corr()
plt.subplots(figsize=(12,9))
sns.heatmap(corrmat, vmax=0.9, square=True)

In [None]:
# Heatmap restricted to the features whose |correlation| with price_tnd exceeds 0.15.
corrmat = train_copy.select_dtypes(include=["number", "bool"]).corr()
top_corr_features = corrmat.index[abs(corrmat["price_tnd"])>0.15]
plt.figure(figsize=(10,10))
# errors="ignore": "Id" is only present here when it passes the 0.15 threshold,
# and Index.drop raises KeyError for a missing label by default.
selected = top_corr_features.drop("Id", errors="ignore")
g = sns.heatmap(train_copy[selected].corr(),annot=True,cmap="RdYlGn")

In [None]:
target_col = "price_tnd"

# Correlation of every numeric/bool feature with the target. Non-numeric columns are
# excluded up front so DataFrame.corr() does not fail on them.
numeric_part = train_copy.select_dtypes(include=["number", "bool"])
correlations = numeric_part.corr()[target_col]

# The target's correlation with itself carries no information.
correlations = correlations.drop(target_col)

# Print each feature name with its correlation coefficient, to two decimals.
print("Correlations with target:\n")
for feat, corr in correlations.items():
    print(f"{feat}: {corr:.2f}")