# Data Exploration Rental Prediction

### General Information

<p>
    The business use case is:<br>
    <ul>
        <li>Predict the rental pricing in Germany</li>
        <li>Which features are the most important for a rental object</li>
        <li>How important is the geo location for the rental object</li>
    </ul>
</p>

### General Code

In [1]:
#Import 
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
import urllib.parse
import time
from sklearn import preprocessing
from sklearn.preprocessing import MaxAbsScaler

In [2]:
#Load the rental listings and make pandas show every column of a frame
pd.set_option('display.max_columns', None)
path = "immo_data.csv"
df = pd.read_csv(path)

In [3]:
#Functions for the Code

#Creates a correlation matrix heatmap
def correlation_matrix(df):
    """Draw an annotated heatmap of the pairwise column correlations of ``df``.

    Only numeric and boolean columns take part in the correlation; every
    other column (for example free-text columns) is left out.

    Parameters:
        df: DataFrame whose columns are correlated.

    Returns:
        None. The heatmap is drawn on a new 20x20 inch matplotlib figure.
    """
    # Select the usable columns explicitly: since pandas 2.0, DataFrame.corr()
    # no longer drops non-numeric columns on its own and raises instead.
    corr = df.select_dtypes(include=["number", "bool"]).corr()
    plt.subplots(figsize=(20, 20))
    sns.heatmap(corr, square=True, fmt='.2f', annot=True)
    
#Prints the distinct values and, for numeric data, the mean of one Dataframe Attribute
def set_maker(df, attribute):
    """Print the distinct values of ``df[attribute]`` and its mean if numeric.

    Parameters:
        df: DataFrame that holds the column.
        attribute: Name of the column to inspect.

    Returns:
        None. Everything is reported through ``print``.
    """
    column = df[attribute]
    attribute_set = set(column.unique().tolist())
    print(attribute_set)
    # Decide by the column dtype instead of by the type of one sample value:
    # the old test ``type(x) == int or float`` was always true, and indexing
    # the second unique value failed for columns with a single distinct value.
    if pd.api.types.is_numeric_dtype(column):
        print("The mean of this Attribute is {}".format(column.mean()))
    else:
        print("Sorry there is no mean to figute out.")

#Drops the rows in which one attribute has exactly the given value
def drop_inappropriate(df, attribute, value):
    """Return ``df`` without the rows where ``df[attribute] == value``.

    The number of matching rows and the shape of the result are printed.
    The frame passed in is not modified; a new frame is returned.
    """
    matching = df[df[attribute] == value]
    print("There are {} inappropriate Data Points with the Value {}".format(len(matching), value))
    cleaned = df.drop(matching.index)
    print("The Datapiont have been Droped, the shape if the Dataframe is now {}".format(cleaned.shape))
    return cleaned

#Drops the rows in which one attribute is greater than or equal to the given value
def drop_inappropriate_bigger(df, attribute, value):
    """Return ``df`` without the rows where ``df[attribute] >= value``.

    The comparison is inclusive: rows equal to ``value`` are dropped as well.
    The number of matching rows and the shape of the result are printed.
    The frame passed in is not modified; a new frame is returned.
    """
    too_big = df[df[attribute] >= value]
    print("There are {} inappropriate Data Points with the Value bigger than {}".format(len(too_big), value))
    cleaned = df.drop(too_big.index)
    print("The Datapiont have been Droped, the shape if the Dataframe is now {}".format(cleaned.shape))
    return cleaned

#Drops the rows in which one attribute is smaller than or equal to the given value
def drop_inappropriate_smaller(df, attribute, value):
    """Return ``df`` without the rows where ``df[attribute] <= value``.

    The comparison is inclusive: rows equal to ``value`` are dropped as well.
    The number of matching rows and the shape of the result are printed.
    The frame passed in is not modified; a new frame is returned.
    """
    too_small = df[df[attribute] <= value]
    print("There are {} inappropriate Data Points with the Value smaller than {}".format(len(too_small), value))
    cleaned = df.drop(too_small.index)
    print("The Datapiont have been Droped, the shape if the Dataframe is now {}".format(cleaned.shape))
    return cleaned

#Returns all column names that hold categorical (object dtype) data
def get_categorical_cols(df):
    """Return the names of the columns of ``df`` whose dtype is ``object``.

    The names are returned as a list, in the column order of ``df``.
    """
    return [name for name in df.columns if df[name].dtype == 'object']
        
#Creates a dict that maps each feature name to its weight in a fitted model
def get_weight_dict(df, target_var, model):
    """Map every feature column to the weight the fitted ``model`` gives it.

    Parameters:
        df: DataFrame the model was trained on, including the target column.
            Its column order, without the target, must match the feature
            order used for fitting.
        target_var: Name of the target column; it is excluded from the result.
        model: Fitted estimator exposing ``feature_importances_`` or ``coef_``.

    Returns:
        dict of ``{column name: weight}``, ordered from highest to lowest weight.

    Raises:
        ValueError: if ``target_var`` is not a column of ``df``.
    """
    columns = list(df.columns)
    columns.remove(target_var)
    # Pick the attribute by what the model offers. The old test
    # ``model == tree or random_forest`` was always true and depended on
    # notebook globals that do not exist before the tree cells have run.
    if hasattr(model, "feature_importances_"):
        weights = model.feature_importances_
    else:
        weights = model.coef_
    weights_dictionary = dict(zip(columns, weights))
    return dict(sorted(weights_dictionary.items(), key=lambda item: item[1], reverse=True))
 
#Draws histograms for the whole Dataframe or for a selection of it
def pd_hist(df, bins, attribute=False):
    """Call ``hist`` on ``df`` or, if ``attribute`` is truthy, on ``df[attribute]``.

    Parameters:
        df: DataFrame to plot.
        bins: Passed on as the ``bins`` argument of ``hist``.
        attribute: Optional selector used to index ``df``; the default
            ``False`` plots the whole frame.

    Returns:
        None. The plot is requested with a figure size of 20x20 inches.
    """
    target = df[attribute] if attribute else df
    target.hist(bins=bins, figsize=(20, 20))

#Feature-scales the whole Dataframe
def feature_scaling(df):
    """Scale every column of ``df`` with a freshly fitted ``MaxAbsScaler``.

    Parameters:
        df: DataFrame to scale; all of its columns are passed to the scaler.

    Returns:
        A new DataFrame with the scaled values, the same column names and
        the same row index as ``df``.
    """
    scaler = MaxAbsScaler()
    # Fit and transform in one step.
    scaled_data = scaler.fit_transform(df)
    # Carry the original index over: without it the result gets a fresh
    # 0..n-1 index and can no longer be aligned with the rows of ``df``
    # once rows have been dropped from it.
    return pd.DataFrame(scaled_data, columns=df.columns, index=df.index)

#Replaces the categorical columns by integer codes
def change_categorical(df):
    """Label-encode every object-dtype column of ``df``.

    The columns reported by ``get_categorical_cols`` are removed from the
    frame, each one is passed through ``LabelEncoder.fit_transform`` and the
    encoded columns are appended to the right of the remaining columns.

    Returns:
        A new DataFrame: the non-categorical columns first, followed by the
        encoded categorical columns.
    """
    categorical_names = get_categorical_cols(df)
    # The encoder is re-fitted for every column by fit_transform.
    encoded = df[categorical_names].apply(preprocessing.LabelEncoder().fit_transform)
    remaining = df.drop(columns=categorical_names)
    return pd.concat([remaining, encoded], axis=1)

#Distribution plot of one column, split up by a second column
def displot(df, x, y):
    """Show a seaborn ``displot`` with a KDE curve.

    Note the argument roles: ``df[y]`` is passed as the plotted ``x``
    variable and ``df[x]`` is passed as the ``col`` (facet) variable.

    Parameters:
        df: DataFrame that holds both columns.
        x: Name of the column used for ``col``.
        y: Name of the column whose distribution is drawn.
    """
    plotted_values = df[y]
    facet_values = df[x]
    sns.displot(data=df, x=plotted_values, col=facet_values, kde=True)
    plt.show()
   

### First Look into the Data

In [None]:
# Print a summary of the raw, uncleaned frame
df.info()

In [None]:
# Descriptive statistics of the raw, uncleaned frame
df.describe()

In [None]:
# Correlation heatmap of the raw data, before any cleaning
correlation_matrix(df)

## 1. Data Cleaning

### Handling Null Data

In [None]:
#Inspect the share of null values per column (0.0 = none missing, 1.0 = all missing)
df.isna().sum()/len(df)

In [None]:
#List the attributes in which more than 50% of the values are null
df.columns[((df.isna().sum()/len(df)) > 0.50)]

In [None]:
#Looking deeper into each Attribute with more than 50% Null Data
    #set_maker(df,"noParkSpaces")
    #df['noParkSpaces'].value_counts()

<br>
<ul>
    <li>telekomHybridUploadSpeed = 83% null (too many nulls to fill with the mean, and no real correlation to the price)
    <li>noParkSpaces = 65% (will be replaced with 0)
    <li>heatingCosts = 68% (is known from previous owners and will be set to the mean, because in a test it brought 1-2% of accuracy)
    <li>energyEfficiencyClass = 71% (too little correlation and too many missing values)
    <li>lastRefurbish = 70% (a lot of wrong data and a high rate of NaN, will be cut)
    <li>electricityBasePrice = 82% (outdated since 2020 and many missing values, will be cut)
    <li>electricityKwhPrice = 82% (cut out: small range of values and too many NaN)
</ul>
<p> The only attributes from this list that stay are noParkSpaces and heatingCosts; both are below the 69% cut-off used in the next cell.
    <br>
</p>

In [None]:
#Drop the columns in which more than 69% of the values are null
#(the cut-off is 0.69, not 0.50, so columns with up to 69% nulls are kept)
null_share = df.isna().sum() / len(df)
df = df.drop(columns=df.columns[null_share > 0.69])
df.columns

### Handling Inappropriate Data

In [None]:
#Parking spaces are important for an apartment, so a listing that has some would presumably mention them;
#a missing value is therefore treated as "no parking space": set noParkSpaces from NaN to 0
df["noParkSpaces"] = df["noParkSpaces"].fillna(value=0)

In [None]:
#Drop the data points whose "baseRent" is exactly 0.0
df = drop_inappropriate(df,"baseRent", 0.0)

In [None]:
#Drop the data points whose "livingSpace" is exactly 0.0
df = drop_inappropriate(df,"livingSpace", 0.0)

In [None]:
#Drop the data points whose "floor" is 50 or higher (the helper compares with >=)
#Assumption of the analysis: there are no rental objects on such high floors
df = drop_inappropriate_bigger(df,"floor", 50)

In [None]:
#Drop the data points whose "numberOfFloors" is 50 or higher (the helper compares with >=)
#Assumption of the analysis: there are no rental objects in buildings that tall
df = drop_inappropriate_bigger(df,"numberOfFloors", 50)

In [None]:
#Drop the data points whose "noParkSpaces" is 30 or higher (the helper compares with >=)
df = drop_inappropriate_bigger(df,"noParkSpaces", 30)

In [None]:
#Keep only service charges strictly between 0 and 1000:
#values >= 1000 and values <= 0 are dropped (both helpers compare inclusively)
df = drop_inappropriate_bigger(df,"serviceCharge", 1000)
df = drop_inappropriate_smaller(df,"serviceCharge", 0)

In [None]:
#Too big for the client: drop flats with a "livingSpace" of 500 or more
df = drop_inappropriate_bigger(df,"livingSpace", 500)

In [None]:
#Too big for the client: drop flats with 20 or more rooms
df = drop_inappropriate_bigger(df,"noRooms", 20)

In [None]:
#Drop the data points whose "baseRent" is 10.000€ or more
#Too expensive for the client
df = drop_inappropriate_bigger(df,"baseRent", 10000)

In [None]:
#Heating costs of 300 or more are considered not realistic and are dropped
df = drop_inappropriate_bigger(df,"heatingCosts", 300)

In [None]:
#Keep only construction years strictly between 1900 and 2025:
#1900 and older is too old for the client, 2025 and later is dropped as well
df = drop_inappropriate_smaller(df,"yearConstructed", 1900) # drops <= 1900
df = drop_inappropriate_bigger(df,"yearConstructed", 2025) # drops >= 2025

In [None]:
#The floor of a flat matters as long as it is not 0 (ground floor), so every NaN is replaced with 0,
#because filling with the mean or median would not make sense.
#Open question: some listings use 1 for the ground floor and some use 0 - this is not handled yet.
df["floor"] = df["floor"].fillna(value=0)

In [None]:
#petsAllowed is turned into numbers so that all methods can work with it:
#a missing value counts as 'no'; no -> 0, negotiable -> 5, yes -> 10
pets_codes = {'no': 0, 'negotiable': 5, 'yes': 10}
df["petsAllowed"] = df["petsAllowed"].fillna(value='no').replace(pets_codes)

### Drop Data That Is Not Useful

<h4> Explanation why they are not useful</h4>
    <ul>
        <li>description = Not needed because it holds no direct information on the rental price; could be explored in a side project
        <li>livingSpaceRange = Is represented in other attributes.
        <li>scoutId = Just an internal Immoscout ID
        <li>street = Not important
        <li>streetPlain = Same as the street name
        <li>houseNumber = Not important
        <li>date = Just the scraping date.
        <li>facilities = Same as description
        <li>totalRent = Is just a combination of rent and service charges, still has no correlation
        <li>telekomUploadSpeed = Has no correlation and makes no sense to keep, since everyone can choose the speed they want to pay for
    </ul>

In [None]:
#Drop the attributes that are not useful for the prediction
unused_columns = [
    'description',
    'livingSpaceRange',
    'scoutId',
    'street',
    'streetPlain',
    'houseNumber',
    'date',
    'facilities',
    'totalRent',
    'telekomUploadSpeed',
]
df = df.drop(columns=unused_columns)

### Fill Numeric Null with mean

In [None]:
#Inspect the mean of every numeric column
#(public API instead of the private DataFrame._get_numeric_data helper)
df.mean(numeric_only=True)

In [None]:
#Fill the NaN of every numeric column with the mean of that column;
#columns without a numeric mean (e.g. text columns) are left untouched.
#Uses the public API instead of the private DataFrame._get_numeric_data helper.
df.fillna(df.mean(numeric_only=True),inplace = True)

In [None]:
#Check the remaining null values: number of NaN per column after the fill
#(the previous code printed the column means again instead of the null counts)
df.isna().sum()

### The categorical data will be handled in the prediction part because the data is easier to visualize this way

## 2. Data Visualization

### General Visualization

In [None]:
# Histograms of the cleaned frame with 100 bins each
pd_hist(df,100)

In [None]:
# Correlation heatmap again, this time on the cleaned data
correlation_matrix(df)

In [None]:
# Columns that are left after cleaning
df.columns

### Development of rental prices in relation to the variables with the highest correlations

<ul>
    <li>Living Space 
    <li> Service Charge 

In [None]:
# baseRent over livingSpace; small, semi-transparent markers (alpha 0.3, size 20)
sns.regplot(x='livingSpace', y='baseRent', data=df, scatter_kws={"alpha":0.3,"s":20})

In [None]:
# baseRent over serviceCharge; small, semi-transparent markers (alpha 0.3, size 20)
sns.regplot(x = 'serviceCharge', y= 'baseRent', data=df, scatter_kws={"alpha":0.3,"s":20})

### Rental prices in comparison of the German states

In [None]:
# Bar plot of baseRent per value of regio1 on a 20x10 inch figure
plt.figure(figsize=(20,10))

# ci = None switches the error bars off.
# NOTE(review): newer seaborn releases may prefer errorbar=None over ci - confirm against the installed version.
sns.barplot(x=df.regio1, y=df.baseRent, ci = None)
# Limit the y axis to the range 25..1250 and turn the x labels upright so they do not overlap
plt.ylim([25, 1250])
plt.xticks(rotation='vertical')
plt.show()

## 3. Data Prediction

### Handling Categorical Data

In [None]:
#Replace every categorical (object dtype) column by integer labels
df = change_categorical(df) 

### Additional (especially needed for KNN)

In [None]:
#Replace the boolean columns by integers, otherwise the feature scaling would not work
bool_list = ["newlyConst", "balcony", "hasKitchen", "cellar", "lift" , "garden" ]
df[bool_list] = df[bool_list].astype(int)

In [None]:
# Scale every column of the frame with feature_scaling (MaxAbsScaler).
# NOTE(review): this also scales the target baseRent and the postcode geo_plz - confirm that this is intended.
df = feature_scaling(df)

### General prediction preparation

In [None]:
#import the sklearn libarys 
from sklearn.linear_model import LinearRegression
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics
from sklearn.metrics import f1_score
import math

In [None]:
# Target: the base rent. Taking np.log() of it improved the prediction score,
# but no use case was found for that, so the plain value is used.
y = df['baseRent']

# Features: every column except the target
X = df.drop(columns=['baseRent'])

# Shuffled split, 33% of the rows are held back for testing.
# NOTE(review): no random_state is set, so the split differs between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True, test_size=0.33)

### Linear Models

In [None]:
# Fit an ordinary linear regression on the training data
lineReg = LinearRegression()
lineReg.fit(X_train, y_train)

prediction = lineReg.predict(X_test)

print('Score: ', lineReg.score(X_test, y_test))

# Plot the predicted values and the true values in two separate figures.
# The old code plotted the undefined name `predict`, which raised a NameError.
plt.plot(prediction)
plt.show()
plt.plot(y_test)
plt.show()

print("Weights:")
get_weight_dict(df, 'baseRent', lineReg)

In [None]:
# Ridge regression with alpha = 0.001
reg = linear_model.Ridge(alpha=.001)
reg.fit(X_train, y_train)
print('Score: ', reg.score(X_test, y_test))

# Predicted and true values drawn into the same figure
ridge_prediction = reg.predict(X_test)
plt.plot(ridge_prediction)
plt.plot(y_test)
plt.show()

print("Weights:")
get_weight_dict(df, 'baseRent', reg)

In [None]:
# Linear model trained with stochastic gradient descent, alpha = 0.001
SGD = linear_model.SGDRegressor(alpha=0.001)
SGD.fit(X_train, y_train)
print('Score: ', SGD.score(X_test, y_test))

# Predicted and true values drawn into the same figure
sgd_prediction = SGD.predict(X_test)
plt.plot(sgd_prediction)
plt.plot(y_test)
plt.show()

print("Weights:")
get_weight_dict(df, 'baseRent', SGD)

### Tree Models

In [None]:
# Decision tree regressor, limited to a depth of 10
tree = DecisionTreeRegressor(max_depth=10)
tree.fit(X_train, y_train)

print('Score: ', tree.score(X_test, y_test))

# Predicted and true values drawn into the same figure
tree_prediction = tree.predict(X_test)
plt.plot(tree_prediction)
plt.plot(y_test)
plt.show()

print("Weights:")
get_weight_dict(df, 'baseRent', tree)

In [None]:
# Random forest regressor, trees limited to a depth of 15, fixed random_state
random_forest = RandomForestRegressor(random_state=0, max_depth=15)
random_forest.fit(X_train, y_train)
print('Score: ', random_forest.score(X_test, y_test))

# Predicted and true values drawn into the same figure
forest_prediction = random_forest.predict(X_test)
plt.plot(forest_prediction)
plt.plot(y_test)
plt.show()

print("Weights:")
get_weight_dict(df, 'baseRent', random_forest)

### KNN Model

In [None]:
# K-nearest-neighbours regressor that looks at the 3 nearest neighbours
knn_model = KNeighborsRegressor(n_neighbors=3)
knn_model.fit(X_train, y_train)

print('Score: ', knn_model.score(X_test, y_test))

# Predicted and true values drawn into the same figure
knn_prediction = knn_model.predict(X_test)
plt.plot(knn_prediction)
plt.plot(y_test)
plt.show()

## 4. Map Visualization

In [None]:
#Load the location table that is used to look up coordinates by postcode
path_plz = "plz_geocoord.csv"
df_plz = pd.read_csv(path_plz)
pd.set_option('display.max_columns', None)
# Rename the columns to plz / lat / lon
# (assumes the file has exactly three columns in this order - TODO confirm)
df_plz.columns = ['plz', 'lat', 'lon']
# Use the postcode as the index of the lookup table
df_plz = df_plz.set_index('plz')

In [None]:
# Attach lat/lon to every listing and measure how long the lookup takes
t0 = time.time()

# df_plz is indexed by 'plz' and has no 'geo_plz' key, so on='geo_plz' raised
# a KeyError. Match the geo_plz column of df against the index of df_plz instead.
# NOTE(review): df went through feature_scaling in the prediction part, so
# geo_plz holds scaled values at this point and will not match real postcodes;
# the merge needs the unscaled postcodes to return rows.
df = df.merge(df_plz, how='inner', left_on='geo_plz', right_index=True)

t1 = time.time()
total = t1-t0

print("It took {} Sec. to get {} Lat and Lon Data Pionts".format(total,df.shape[0]))

In [None]:
# Read the shapefile geomap/vg2500_bld.shp and plot it
# (presumably the outlines of the German federal states - verify against the file)
import geopandas as gpd
fp = "geomap/vg2500_bld.shp"
map_df = gpd.read_file(fp)
# head() is not the last expression of the cell, so its result is not displayed
map_df.head()
map_df.plot()

In [None]:
# Scatter plot of the listings: longitude on the x axis, latitude on the y axis
plt.scatter(x=df['lon'], y=df['lat'])
plt.show()