## Basic Visualizations
This notebook computes some basic statistics and plots to get a better understanding of the dataset. All results are based on the preprocessed data.

In [None]:
import pandas as pd
import numpy as np 
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the preprocessed listings from CSV.
dataset = pd.read_csv("../data/assignment_preprocessed.csv")
# Snapshot of the column index as loaded; a later cell slices it by position
# to choose the columns that must not contain missing values.
columns = dataset.columns

In [None]:
# preview the first five rows
dataset.head(5)

In [None]:
# dataset.shape is a (number of rows, number of columns) tuple
print('Shape of Dataset:', dataset.shape)

In [None]:
# data type of each column
dataset.dtypes

In [None]:
# count the missing (null) entries in each column
dataset.isnull().sum()

In [None]:
# Number of rows that repeat an earlier row once the identifier columns
# ('id' and 'agent_id') are left out of the comparison.
feature_columns = dataset.columns.difference(['id', 'agent_id'])
dataset[feature_columns].duplicated().sum()

In [None]:
# summary statistics of the dataset as loaded
dataset.describe()

In [None]:
# number of distinct values in each column
dataset.nunique()

In [None]:
# Percentage of missing values in each column.
null_percentage = dataset.isnull().sum() / len(dataset) * 100
# Columns in which more than 80% of the entries are missing are removed.
columns_to_drop = null_percentage[null_percentage > 80].index.tolist()
dataset.drop(columns_to_drop, axis=1, inplace=True)
# Rows must be complete in the columns found at positions 0-9 and 13-17 of
# the frame as it was loaded. `columns` was captured before the drop above,
# so keep only the names that still exist; passing a removed column to
# dropna(subset=...) would raise KeyError.
required_columns = [
    name
    for name in columns[0:10].append(columns[13:18])
    if name in dataset.columns
]
dataset.dropna(subset=required_columns, inplace=True)

In [None]:
# Inspect one row. iloc is positional, so this is the row at position 7378
# of the current frame, not the row whose index label is 7378.
dataset.iloc[7378]

In [None]:
# numeric columns only, leaving out 'id', 'agent_id' and 'ranking_score'
dataset[dataset.columns.difference(['id', 'agent_id', 'ranking_score'])].select_dtypes(include=np.number)

In [None]:
# every numeric column of the dataset
dataset.select_dtypes(include=np.number)

In [None]:
# Show the frame without the row labelled 262. The result is only displayed:
# without inplace=True or an assignment, `dataset` itself is left unchanged.
dataset.drop([262])

In [None]:
# Rows whose value lies more than this many standard deviations away from
# the column mean are treated as outliers.
threshold = 3

# Handle the numeric columns one after another; each pass sees only the rows
# that survived the previous passes.
for col in dataset.select_dtypes(include=np.number):
    print(col)
    values = dataset[col]
    # Absolute z-score using the population standard deviation (ddof=0).
    # pandas skips missing values in mean()/std(), so one NaN no longer turns
    # the whole column's scores into NaN; NaN scores compare False below.
    z = ((values - values.mean()) / values.std(ddof=0)).abs()
    # Select the outliers by index label. DataFrame.drop works on labels, and
    # after rows have been removed the labels no longer match row positions.
    outlier_labels = z[z > threshold].index
    dataset.drop(index=outlier_labels, inplace=True)

# save the cleaned dataset
# dataset.to_csv('cleaned.csv', index=False)

In [None]:
# summary statistics of the dataset in its current state
dataset.describe()

In [None]:
# number of listings for each value of 'subtype'
dataset['subtype'].value_counts()

In [None]:
# horizontal bar chart: number of listings per subtype
subtype_counts = dataset['subtype'].value_counts()
ax = subtype_counts.plot.barh(figsize=(8, 8))
# write each bar's count next to it
ax.bar_label(ax.containers[0])
ax.set_xlabel("Count", labelpad=12)
ax.set_ylabel("Subtypes", labelpad=12)
ax.set_title("Count of Listings of each Subtype", y=1.02);

In [None]:
# horizontal bar chart: number of listings per value of 'geography_name'
area_counts = dataset['geography_name'].value_counts()
ax = area_counts.plot.barh(figsize=(8, 8))
# write each bar's count next to it
ax.bar_label(ax.containers[0])
ax.set_xlabel("Count", labelpad=12)
ax.set_ylabel("Area in Athens", labelpad=12)
ax.set_title("Count of Listings in each Area of Athens", y=1.02);

In [None]:
# Assign each listing to the decade of its construction year. The result is
# kept in the standalone Series `decade`; it is not added to `dataset`.
years = dataset['year_of_construction']
# Widen the observed range to full decades so that both the earliest and the
# latest year fall inside a bin.
first_decade = int(years.min()) // 10 * 10
last_decade = int(years.max()) // 10 * 10
decade_edges = list(range(first_decade, last_decade + 11, 10))
# pd.cut needs exactly one label per bin, i.e. one fewer than the edges.
decade_labels = [f"{start}s" for start in decade_edges[:-1]]
# right=False makes each bin [start, start + 10), so e.g. 1990 -> "1990s".
decade = pd.cut(years, bins=decade_edges, labels=decade_labels, right=False)

# # group by decade and count the number of rows in each group
# grouped_df = df.groupby('decade').size().reset_index(name='count')

# print(grouped_df)

In [None]:
# scatter plot for two numerical variables
# NOTE(review): `df` is not defined in any visible cell (the frame used
# elsewhere is `dataset`), and 'column_name1'/'column_name2' look like
# placeholders - replace them with real names before running.
plt.scatter(df['column_name1'], df['column_name2'])

In [None]:
# NOTE(review): template cell - `df` is not defined in any visible cell (the
# frame used elsewhere is `dataset`) and every 'column_name*' looks like a
# placeholder. As written, 'column_name1'/'column_name2' are dropped first
# and then scaled at the end, so the real names must differ per step.

# drop columns with high null values or low variance
df.drop(['column_name1', 'column_name2'], axis=1, inplace=True)

# fill null values with mean or mode
# Assign the result back instead of calling fillna(inplace=True) on the
# selected column: the in-place call may act on a temporary copy and leave
# `df` unchanged.
df['column_name'] = df['column_name'].fillna(df['column_name'].mean())

# create dummy variables for categorical columns
df = pd.get_dummies(df, columns=['column_name'])

# scale numerical columns
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
df[['column_name1', 'column_name2']] = scaler.fit_transform(df[['column_name1', 'column_name2']])