# We are dealing with a data set of California housing prices.
## The business goal for this dataset is to predict house prices.

In [None]:
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from IPython.display import HTML, display

import statsmodels.api as sm
from statsmodels.formula.api import ols
from statsmodels.sandbox.regression.predstd import wls_prediction_std
from sklearn import linear_model
from sklearn.preprocessing import OneHotEncoder


In [None]:
# Load the housing CSV from the relative input directory into a DataFrame.
data = pd.read_csv (r'../input/california-housing-prices/housing.csv')

In [None]:
# Use a beige axes background for every matplotlib plot created from here on.
plt.rcParams['axes.facecolor'] = 'Beige'

# Exploratory Data Analysis (EDA)


## Take a quick look at the data:

In [None]:
# Show the last rows of the frame (DataFrame.tail defaults to five).
data.tail()

## Check column number, types:

In [None]:
# Summarize column names, non-null counts and dtypes.
data.info()

## 10 Columns

In [None]:
# List the column labels.
data.columns

In [None]:
# One 25-bin histogram per numeric column; the trailing ';' suppresses the
# cell's text output so only the figure is shown.
data.hist(bins=25,figsize=(20,10));

## Visualizing Geographical Data


In [None]:
# Scatter of district locations; the low alpha makes dense areas stand out.
data.plot(kind="scatter",x="longitude",y="latitude",alpha=0.1)

In [None]:
# Category frequencies, most common first; the same order is used for the bars.
ocean_values = data["ocean_proximity"].value_counts()
plt.figure(figsize=(10, 6))
sns.countplot(x="ocean_proximity", data=data, order=ocean_values.index)

# Annotate each bar with its share of all rows.
# Iterate over the counts positionally: the Series is indexed by category
# label, so an integer lookup like `ocean_values[i]` relies on a deprecated
# positional fallback in pandas.
total_rows = data.shape[0]
for i, count in enumerate(ocean_values.to_numpy()):
    strt = '{:0.2f}%'.format(100 * count / total_rows)
    # Place the label slightly above the top of bar `i`.
    plt.text(i, count + 100, strt, ha='center', color='black', fontsize=14)

In [None]:
# Geographic scatter: marker size follows population (scaled down by 100),
# marker colour follows the median house value on the "jet" colormap.
marker_sizes = data["population"] / 100
data.plot(
    kind="scatter",
    x="longitude",
    y="latitude",
    alpha=0.4,
    s=marker_sizes,
    label="population",
    c="median_house_value",
    cmap=plt.get_cmap("jet"),
    colorbar=True,
)
plt.legend()

## Understanding our data:
### We use correlation matrix 

In [None]:
# Pairwise correlations between the numeric columns.
# numeric_only=True skips the text column `ocean_proximity` (it is only
# encoded later in the notebook); without it DataFrame.corr raises on
# pandas >= 2.0 when it meets a non-numeric column.
corr_matrix = data.corr(numeric_only=True)
corr_matrix

### But Plotting a correlation plot is actually clearer 

In [None]:
# Annotated heatmap of the correlation matrix.
plt.figure(figsize=(10, 8))
# numeric_only=True: `ocean_proximity` is still text at this point and would
# make DataFrame.corr raise on pandas >= 2.0.
sns.heatmap(data.corr(numeric_only=True), annot=True, cmap="YlGnBu")

### How to read the correlation plot: each cell measures the linear relationship between two variables — strongly positive (above 0.5), strongly negative (below -0.5), or little to no linear relationship (close to zero).
#### Understanding the correlation between columns helps you build a better model, because including less important columns can introduce bias or degrade the model.
#### For example: housing median age has almost no correlation with the other columns, but it is important data that should be kept. Longitude and latitude also show no correlation with the other columns, but dropping them now would corrupt the data, so we first complete the missing values and then drop them.

## Is there missing data in our dataset?

In [None]:
# True if at least one cell anywhere in the frame is null.
print(data.isnull().values.any())

## But where and how much is the missing data?

In [None]:
# Heatmap of the boolean null mask: highlighted cells are missing values.
missing_mask = data.isnull()
plt.figure(figsize=(10, 5))
sns.heatmap(missing_mask)

#### Total bedrooms column has some missing data

In [None]:
# Count and percentage of missing values per column, worst columns first.
null_mask = data.isnull()
total = null_mask.sum().sort_values(ascending=False)
# The mean of the boolean mask is the share of missing rows per column. This
# divides by the total row count rather than by the non-null count of an
# unrelated column, and scales to a real percentage to match the column label.
percent = (100 * null_mask.mean()).sort_values(ascending=False)
missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
missing_data.head(20)

### We have 207 missing values in the total_bedrooms column

## Cleaning the data:
### We will remove any rows that are entirely null, as they would corrupt the data

In [None]:
# Drop rows in which every column is null.
# dropna returns a new frame, so the result has to be assigned back;
# printing it alone would leave `data` unchanged.
data = data.dropna(axis=0, how='all')
print(data)

### But if some rows have a lot of missing values, we should remove them: rows with more than 2 missing values are dropped, because filling in more than 2 values in a row could corrupt the data. We use thresh = 8 (a row must have at least 8 non-null values to be kept).

In [None]:
# Keep only rows that have at least 8 non-null values.
# The filtered frame is assigned back so the removal actually takes effect;
# printing the result alone would discard it.
data = data.dropna(axis=0, thresh=8)
print(data.tail())

## Before completing the missing values, we check the distribution to decide the better representation of central tendency:

In [None]:
# Box plot of total_bedrooms to inspect its spread and outliers.
plt.figure(figsize = (10,5))
sns.boxplot(x = 'total_bedrooms',palette = 'cool', data=data)

In [None]:
# Histogram of total_bedrooms to inspect the shape of its distribution.
sns.histplot(x= 'total_bedrooms',data=data)
plt.show()


#### It's positively (right) skewed with a lot of outliers, so we use the median as the measure of central tendency.

## filling missing data:
### We should check that all values are numerical.

In [None]:
# Inspect dtypes to see which columns are not yet numeric.
data.info()

### Ocean proximity is a categorical column that needs to be converted to numerical.
#### We use encoding method

In [None]:
# Replace each ocean_proximity category with an integer code.
# NOTE(review): the codes put an arbitrary order on a nominal feature, and the
# linear model fitted later will treat them as magnitudes; confirm this is intended.
le = LabelEncoder()
encoded_proximity = le.fit_transform(data['ocean_proximity'])
data['ocean_proximity'] = encoded_proximity
# Frequency of each integer code.
data['ocean_proximity'].value_counts()

In [None]:
# Re-inspect dtypes after encoding ocean_proximity.
data.info()

## Filling missing using imputer

In [None]:
# Fill missing total_bedrooms values with the column median using
# scikit-learn's SimpleImputer (the pandas alternative would be
# `fillna` with the column median).
imputer = SimpleImputer(missing_values=np.nan, strategy='median')
bedrooms = data[['total_bedrooms']]
# fit() returns the imputer itself, so it can be chained with transform().
data['total_bedrooms'] = imputer.fit(bedrooms).transform(bedrooms)
data

### We check if there's a missing data still:

In [None]:
# Re-check after imputation: True if any null cell remains.
print(data.isnull().values.any())

### We remove longitude and latitude, as their correlations are small.

In [None]:
# Remove the two geographic coordinate columns, then show what is left.
geo_columns = ['longitude', 'latitude']
data = data.drop(columns=geo_columns)
data.columns

# Outliers:

### To find outliers we use describe 

In [None]:
# Summary statistics, transposed so each column of `data` becomes a row.
data.describe().T

In [None]:
# One box per column of the frame, to spot columns with extreme values.
plt.figure(figsize = (15,8))
sns.boxplot(palette = 'cool', data=data)

### We found outliers, which are easier to locate using scatter plots

In [None]:
# Scatter of median_income against population with a fitted regression line.
sns.regplot(x='population', y= 'median_income', data = data)

In [None]:
# Scatter of total_bedrooms against total_rooms.
sns.scatterplot(x='total_rooms', y= 'total_bedrooms', data = data)

In [None]:
# Scatter of median_house_value against median_income.
sns.scatterplot(x='median_income', y= 'median_house_value', data = data)

In [None]:
# Scatter of households against median_income.
sns.scatterplot(x='median_income', y= 'households', data = data)

In [None]:
# Remove rows that exceed any of the hand-picked outlier cut-offs.
# A row is dropped when at least one of the conditions below is true.
outlier_mask = (
    (data['total_rooms'] > 16000)
    | (data['total_bedrooms'] > 2500)
    | (data['population'] > 7000)
    | (data['households'] > 2500)
    | (data['median_income'] > 12)
)
data = data.loc[~outlier_mask].copy()

In [None]:
# Scatter of total_rooms against population with a fitted regression line,
# drawn after the outlier rows were removed.
sns.regplot(x='population', y= 'total_rooms', data = data)

In [None]:
# Per-column box plot again, now that the outlier rows have been removed.
plt.figure(figsize = (15,8))
sns.boxplot(palette = 'cool', data=data)

## Feature engineering:
### We could add 3 more derived columns (currently disabled)


In [None]:
#data["rooms_per_household"]=data["total_rooms"]/data["households"]
#data["bedrooms_per_room"]=data["total_bedrooms"]/data["total_rooms"]
#data["population_per_household"]=data["population"]/data["households"] 

## Splitting data 

## Splitting:

In [None]:
# Ordinary least-squares linear regression model; it is fitted further below.
reg = linear_model.LinearRegression()

### We rearrange columns: 
    

In [None]:
# Put the feature columns first and the target column last.
feature_columns = [
    'housing_median_age',
    'total_rooms',
    'total_bedrooms',
    'population',
    'households',
    'median_income',
    'ocean_proximity',
]
data = data[feature_columns + ["median_house_value"]]
data

### To create the proper comparison we drop house value column from x and make it y

In [None]:
# Target vector: the house value column as a NumPy array.
y = data['median_house_value'].to_numpy()
# Feature matrix: every remaining column as a NumPy array.
x = data.drop(columns='median_house_value').to_numpy()

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 25% of the rows for testing; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.25,
    random_state=42,
)

In [None]:
# Shape of the training feature matrix.
x_train.shape

In [None]:
# Shape of the training target vector.
y_train.shape

# Scaling
### Make sure none of the columns are categorical

In [None]:
# Confirm the dtypes of the columns before scaling.
data.info()

In [None]:
from sklearn.preprocessing import RobustScaler

#### We use the robust scaler to reduce the effect of outliers

In [None]:
# Scale the features with statistics learned from the training split only.
ro_scaler = RobustScaler()
x_train = ro_scaler.fit_transform(x_train)
# Reuse the scaler fitted on the training data. Calling fit_transform here
# would refit it on the test set, so train and test would be scaled
# differently and test-set information would leak into preprocessing.
x_test = ro_scaler.transform(x_test)

In [None]:
# Shape of the training feature matrix after scaling.
x_train.shape

## Linear regression

In [None]:
# Fit the linear model, then report its score on the same training data.
# fit() returns the estimator itself, so the two calls can be chained.
reg.fit(x_train, y_train).score(x_train, y_train)


In [None]:
# Score of the fitted model on the held-out test split.
reg.score(x_test , y_test)

In [None]:
# Fitted coefficients, one per feature column.
reg.coef_

In [None]:
# Label each fitted coefficient with its feature name. The last column of
# `data` is the target, so it is excluded from the index.
feature_names = data.columns[:-1]
pd.DataFrame(reg.coef_, index=feature_names, columns=['Coeficient'])

In [None]:
# Predict on the test split and put actual and predicted values side by side.
y_pred =reg.predict(x_test)
# NOTE(review): this rebinds `data` from the housing frame to the comparison
# table; earlier cells that read `data` will not work if re-run after this one.
data = pd.DataFrame({"Y_test": y_test , "Y_pred" : y_pred})
data.head(10)

In [None]:
# Line plot of the first 50 rows of `data`, one line per column.
plt.figure(figsize=(10, 8))
first_rows = data.iloc[:50]
plt.plot(first_rows)
plt.legend(["Actual", "Predicted"])

# Evaluation:

In [None]:
from sklearn.metrics import mean_absolute_error,r2_score,mean_squared_error
import numpy as np
def run_experiment(model):
    """Fit ``model`` on the training split and print test-set regression metrics.

    Reads the module-level ``x_train``, ``y_train``, ``x_test`` and ``y_test``.
    Prints R^2, mean absolute error and root mean squared error; returns nothing.

    Args:
        model: an estimator exposing ``fit(X, y)`` and ``predict(X)``.
    """
    model.fit(x_train, y_train)
    predictions = model.predict(x_test)

    r2 = r2_score(y_test, predictions)
    print("R^2 : ", r2)

    mae = mean_absolute_error(y_test, predictions)
    print("MAE :", mae)

    # RMSE is the square root of the mean squared error.
    rmse = np.sqrt(mean_squared_error(y_test, predictions))
    print("RMSE:", rmse)

In [None]:
# Evaluate a fresh ordinary least-squares model with run_experiment.
from sklearn.linear_model import LinearRegression
model = LinearRegression()
run_experiment(model)

### Evaluating another regression model

In [None]:
# Evaluate a linear model trained with stochastic gradient descent.
from sklearn.linear_model import SGDRegressor
# Fixed seed: SGD shuffles the training data between epochs, so an unseeded
# model reports different scores on every run. 42 matches the seed used for
# the train/test split.
model = SGDRegressor(random_state=42)
run_experiment(model)

In [None]:
def run_experiment(model):
    """Fit ``model``, plot predicted vs. actual test values and print metrics.

    The target is a continuous house value, so classification tools such as a
    confusion matrix, precision, recall, F1 and accuracy do not apply. This
    version uses the regression equivalents: a predicted-vs-actual scatter
    plot followed by R^2, MAE and RMSE.

    Reads the module-level ``x_train``, ``y_train``, ``x_test`` and ``y_test``.
    Returns nothing.

    Args:
        model: an estimator exposing ``fit(X, y)`` and ``predict(X)``.
    """
    # Imported locally so this cell does not depend on an earlier import cell.
    from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

    model.fit(x_train, y_train)

    y_pred = model.predict(x_test)

    # Predicted vs. actual: points on the red diagonal are perfect predictions.
    plt.figure(figsize=(8, 8))
    plt.scatter(y_test, y_pred, alpha=0.3)
    low = min(np.min(y_test), np.min(y_pred))
    high = max(np.max(y_test), np.max(y_pred))
    plt.plot([low, high], [low, high], color='red')
    plt.xlabel('Actual')
    plt.ylabel('Predicted')
    plt.show()

    print('R^2: %.3f' % r2_score(y_test, y_pred))
    print('MAE: %.3f' % mean_absolute_error(y_test, y_pred))
    print('RMSE: %.3f' % np.sqrt(mean_squared_error(y_test, y_pred)))