In [None]:

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for filename in (
    os.path.join(dirname, entry)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for entry in filenames
):
    print(filename)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load only the three feature columns and the target column from the CSV file.
filecsv = "/kaggle/input/harga-real-estate/harga_real_estate.csv"
selected_columns = ['X1_Usia_Rumah', 'X2_jarak_MRT', 'X3_jumlah_toko', 'Y_harga_unit']
df = pd.read_csv(filecsv, usecols=selected_columns)

# Preview the first rows to confirm the load worked.
df.head()

In [None]:
# Show the first 5 rows of the data.
# Independent variables (x): X1_Usia_Rumah, X2_jarak_MRT, X3_jumlah_toko.
# Dependent variable (y): Y_harga_unit.
df.head()

In [None]:
# Check the number of rows and columns in the data.
# NOTE(review): the original note claimed 6 columns and 22909 rows, but the
# loading cell selects only 4 columns via usecols — confirm both figures
# against this cell's output.

df.shape

In [None]:
# Inspect the DataFrame summary: entry counts, column dtypes, memory usage, etc.
# NOTE(review): the original note states every column is already numeric —
# confirm against the dtypes printed by this cell.

df.info()

In [None]:
# Show the statistical description of the data: mean, quartiles, standard deviation, etc.

df.describe()

In [None]:
# Count the missing values in each column.
# NOTE(review): the original note says the data has no missing values —
# confirm that every count below is zero.

df.isna().sum()

In [None]:
# Univariate analysis of X1_Usia_Rumah: value counts (left) and boxplot (right).
# NOTE(review): countplot draws one bar per distinct value; if this column is
# continuous a histogram may be more readable — confirm.

f, (ax_count, ax_box) = plt.subplots(1, 2, figsize=(12, 4))

# Pass the Series with the explicit `x=` keyword: newer seaborn releases treat
# the first positional argument as `data`, not as the variable to count.
sns.countplot(x=df['X1_Usia_Rumah'], ax=ax_count)

ax_box.boxplot(df['X1_Usia_Rumah'])
plt.show()

In [None]:
# Univariate analysis of X2_jarak_MRT: value counts (left) and boxplot (right).
# NOTE(review): countplot draws one bar per distinct value; if this column is
# continuous a histogram may be more readable — confirm.

f, (ax_count, ax_box) = plt.subplots(1, 2, figsize=(12, 4))

# Pass the Series with the explicit `x=` keyword: newer seaborn releases treat
# the first positional argument as `data`, not as the variable to count.
sns.countplot(x=df['X2_jarak_MRT'], ax=ax_count)

ax_box.boxplot(df['X2_jarak_MRT'])
plt.show()

In [None]:
# Univariate analysis of X2_jarak_MRT: kernel density estimate (left) and boxplot (right).
f, (ax_kde, ax_box) = plt.subplots(1, 2, figsize=(12, 4))

# Left panel: density curve of the column.
df['X2_jarak_MRT'].plot.kde(ax=ax_kde)

# Right panel: boxplot of the same column.
ax_box.boxplot(df['X2_jarak_MRT'])
plt.show()

In [None]:
# Univariate analysis of X3_jumlah_toko: value counts (left) and boxplot (right).

f, (ax_count, ax_box) = plt.subplots(1, 2, figsize=(12, 4))

# Pass the Series with the explicit `x=` keyword: newer seaborn releases treat
# the first positional argument as `data`, not as the variable to count.
sns.countplot(x=df['X3_jumlah_toko'], ax=ax_count)

ax_box.boxplot(df['X3_jumlah_toko'])
plt.show()

In [None]:
# Univariate analysis of Y_harga_unit: kernel density estimate (left) and boxplot (right).

f, (ax_kde, ax_box) = plt.subplots(1, 2, figsize=(12, 4))

# Left panel: density curve of the target column.
df['Y_harga_unit'].plot.kde(ax=ax_kde)

# Right panel: boxplot of the same column.
ax_box.boxplot(df['Y_harga_unit'])
plt.show()

In [None]:
# Univariate analysis of Y_harga_unit: value counts (left) and boxplot (right).
# NOTE(review): countplot draws one bar per distinct value; if this column is
# continuous a histogram may be more readable — confirm.

f, (ax_count, ax_box) = plt.subplots(1, 2, figsize=(12, 4))

# Pass the Series with the explicit `x=` keyword: newer seaborn releases treat
# the first positional argument as `data`, not as the variable to count.
sns.countplot(x=df['Y_harga_unit'], ax=ax_count)

ax_box.boxplot(df['Y_harga_unit'])
plt.show()


In [None]:
# Bivariate analysis: plot each independent variable against the dependent variable.
#
# - pairplot is a figure-level function that creates its own figure, so no
#   separate plt.figure() call is made (it would only leave an empty figure).
# - The panel size is set with `height`; the older `size` keyword was renamed
#   and is rejected by current seaborn releases.
# - The target is not listed in x_vars: plotting Y_harga_unit against itself
#   carries no information.

sns.pairplot(
    data=df,
    x_vars=['X1_Usia_Rumah', 'X2_jarak_MRT', 'X3_jumlah_toko'],
    y_vars=['Y_harga_unit'],
    height=5,
    aspect=0.75,
)
plt.show()

In [None]:
# Show the pairwise correlation between the independent variables and the
# dependent variable, shaded by magnitude and rounded to 2 decimals.
# Styler.format(precision=...) replaces Styler.set_precision, which was
# deprecated and later removed from pandas.

df.corr().style.background_gradient().format(precision=2)

In [None]:
# Step 1: separate the target series (y) from the feature matrix (x).
target_column = 'Y_harga_unit'
y = df[target_column]
x = df.drop(columns=[target_column])

# Step 2: split the data into training and testing sets with an 80:20 ratio,
# using a fixed random_state so the split is repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, random_state=4, test_size=0.2
)

# Steps 3 and 4: create the linear regression object and train it on the
# training portion of the split (fit returns the fitted estimator itself).
lin_reg = LinearRegression().fit(x_train, y_train)

# Step 5: print the slopes/coefficients (m) followed by the intercept (b).
for fitted_value in (lin_reg.coef_, lin_reg.intercept_):
    print(fitted_value)

In [None]:
# Step 6: evaluate the model on the held-out testing split.
# For LinearRegression, score() returns the coefficient of determination (R^2),
# not a classification accuracy.

lin_reg.score(x_test, y_test)

In [None]:
# Pair each feature name with its fitted coefficient and show them as a table.
coef_dict = dict(features=x.columns, coef_value=lin_reg.coef_)
coef = pd.DataFrame(coef_dict, columns=list(coef_dict))
coef