In [None]:
import pandas as pd
import numpy as np

In [None]:
df = pd.read_csv("BHP.csv")

In [None]:
df.head()

In [None]:
df.shape

In [None]:
df.info()

In [None]:
# Show the frequency table of every column, each followed by a divider line
for column_name, column in df.items():
    print(column.value_counts(), "*" * 20, sep="\n")

In [None]:
df.isna().sum()

In [None]:
# Discard the columns that are not used further in this notebook
df.drop(
    ['area_type', 'availability', 'society', 'balcony'],
    axis='columns',
    inplace=True,
)

In [None]:
df.describe()

In [None]:
df.head()

In [None]:
df.info()

In [None]:
# Inspect the rows that have no location recorded
missing_value = df.loc[df['location'].isna()]
print(missing_value)

In [None]:
# Replace missing locations with a fixed label.
# NOTE(review): the fill value is a hard-coded literal (note the two spaces
# between the words) — presumably copied from an existing category; confirm.
df['location'] = df['location'].fillna('Sarjapur  Road')

In [None]:
df['location'].isna().sum()

In [None]:
# Inspect the rows that have no size recorded
missing_value = df.loc[df['size'].isna()]
print(missing_value)

In [None]:
# Replace missing sizes with a fixed label.
# NOTE(review): '2 BHK' is a hard-coded literal — presumably the most common
# size seen in the value_counts output; confirm against the data.
df['size'] = df['size'].fillna('2 BHK')

In [None]:
df['size'].isna().sum()

In [None]:
# Inspect the rows that have no bathroom count recorded
missing_value = df.loc[df['bath'].isna()]
print(missing_value)

In [None]:
# Impute missing bathroom counts with the column median
df['bath'] = df['bath'].where(df['bath'].notna(), df['bath'].median())

In [None]:
df['bath'].isna().sum()

In [None]:
df.info()

In [None]:
# Build the integer `bhk` column from the first whitespace-separated token of `size`
df['bhk'] = df['size'].str.split().str[0].astype(int)

In [None]:
df.head()

In [None]:
#  Fixing the total_sqft column
df['total_sqft'].unique()

In [None]:
def convert_range(x):
    """Convert a raw ``total_sqft`` entry to a single float.

    Parameters
    ----------
    x : str or number
        Either a plain number (``"1200"``) or a range written as
        ``"low - high"``.

    Returns
    -------
    float or None
        The number itself, the midpoint of a range, or ``None`` when the
        entry cannot be parsed as either (for example ``"34.46Sq. Meter"``).
    """
    if isinstance(x, str):
        bounds = x.split("-")
        if len(bounds) == 2:
            try:
                return (float(bounds[0]) + float(bounds[1])) / 2
            except ValueError:
                # Not a numeric range (e.g. "-5", "1e-3" or a range with
                # unit suffixes); fall through and try it as a plain number.
                pass

    try:
        return float(x)
    except (TypeError, ValueError):
        return None

In [None]:
df['total_sqft'] = df['total_sqft'].apply(convert_range)

In [None]:
df['total_sqft'].unique()

In [None]:
# Add a price-per-square-foot column; `price` is scaled by 100000 before dividing
# (presumably converting lakhs to rupees — confirm the unit of `price`)
df['price_per_sqft'] = df['price'].mul(100000).div(df['total_sqft'])

In [None]:
df.describe()

In [None]:
# Trim surrounding whitespace from location names, then count rows per location
df['location'] = df['location'].map(str.strip)
location_count = df['location'].value_counts()

In [None]:
# Locations that occur in at most 10 rows
location_count_under_10 = location_count.loc[location_count.le(10)]
location_count_under_10

In [None]:
# Collapse every location listed in `location_count_under_10` into one 'other' bucket
df['location'] = df['location'].where(
    ~df['location'].isin(location_count_under_10.index),
    'other',
)

In [None]:
df['location'].value_counts()

In [None]:
# Removing outliers: keep only rows with at least 300 sqft per bedroom

df = df.loc[df['total_sqft'].div(df['bhk']).ge(300)]
df.describe()

In [None]:
def remove_outliers_sqft(df):
    """Drop rows whose price per sqft is far from their location's mean.

    For every ``location`` group, only rows whose ``price_per_sqft`` lies in
    the half-open interval ``(mean - std, mean + std]`` of that group are
    kept.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``location`` and ``price_per_sqft`` columns.

    Returns
    -------
    pandas.DataFrame
        The surviving rows of all locations, in group order, with a fresh
        0..n-1 index. An empty DataFrame is returned when ``df`` has no groups.
    """
    kept_groups = []
    for _, subdf in df.groupby('location'):
        m = np.mean(subdf.price_per_sqft)
        st = np.std(subdf.price_per_sqft)
        within_band = (subdf.price_per_sqft > (m - st)) & (subdf.price_per_sqft <= (m + st))
        kept_groups.append(subdf[within_band])

    if not kept_groups:
        return pd.DataFrame()
    # Concatenate once instead of growing a frame inside the loop.
    return pd.concat(kept_groups, ignore_index=True)
# Apply the per-location price-per-sqft filter, then re-check the summary statistics
df = remove_outliers_sqft(df)
df.describe()

In [None]:
def bhk_outlier_remover(df):
    """Drop rows priced below the mean of the next-smaller BHK in the same location.

    Within each ``location``, a row with ``bhk = n`` is removed when its
    ``price_per_sqft`` is lower than the mean ``price_per_sqft`` of the
    ``bhk = n - 1`` rows of that location, provided that smaller group has
    more than 5 rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``location``, ``bhk`` and ``price_per_sqft`` columns.
        Rows are dropped by index label, so the index is assumed to be unique.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` without the excluded rows; the index is not reset.
    """
    exclude_indices = []
    for _, location_df in df.groupby('location'):
        bhk_groups = list(location_df.groupby('bhk'))

        # First pass: per-BHK statistics for this location.
        bhk_stats = {
            bhk: {
                'mean': np.mean(bhk_df.price_per_sqft),
                'count': bhk_df.shape[0],
            }
            for bhk, bhk_df in bhk_groups
        }

        # Second pass: compare each BHK group with the stats of the group one
        # bedroom smaller. The dict must not be shadowed by the loop variable,
        # otherwise the lookup never finds anything and no row is excluded.
        for bhk, bhk_df in bhk_groups:
            stats = bhk_stats.get(bhk - 1)
            if stats and stats['count'] > 5:
                cheaper = bhk_df.index[bhk_df.price_per_sqft < stats['mean']]
                exclude_indices.extend(cheaper)

    return df.drop(index=exclude_indices)

In [None]:
df = bhk_outlier_remover(df)

In [None]:
df.shape

In [None]:
# Remove the helper columns that are no longer needed
df.drop(['size', 'price_per_sqft'], axis='columns', inplace=True)

In [None]:
df

In [None]:
# Save the cleaned data. index=False keeps the (gappy, meaningless) row index
# from being written as an extra unnamed column.
df.to_csv('Cleaned_data.csv', index=False)

In [None]:
# Separate the target (`price`) from the feature columns
y = df['price']
X = df.drop('price', axis=1)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.metrics import r2_score

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state =2)

In [None]:
print(X_train.shape)
print(X_test.shape)

Applying Linear Regression


In [None]:
# One-hot encode `location`; all other columns pass through unchanged.
# sparse_threshold=0 makes the column transformer always return a dense array,
# which the StandardScaler step of the pipelines needs. The encoder's `sparse=`
# keyword is not used: it was deprecated in scikit-learn 1.2 and removed in 1.4.
column_trans = make_column_transformer(
    (OneHotEncoder(), ['location']),
    remainder='passthrough',
    sparse_threshold=0,
)
scaler = StandardScaler()
lr = LinearRegression()

In [None]:
pipe = make_pipeline(column_trans, scaler, lr)

In [None]:
pipe.fit(X_train,y_train)

In [None]:
y_pred_lr = pipe.predict(X_test)

In [None]:
r2_score(y_test,y_pred_lr)

Lasso Regression


In [None]:
lasso = Lasso()

In [None]:
pipe = make_pipeline(column_trans, scaler, lasso)

In [None]:
pipe.fit(X_train, y_train)

In [None]:
y_pred_lasso = pipe.predict(X_test)

In [None]:
r2_score(y_test,y_pred_lasso)

Ridge Regression


In [None]:
ridge = Ridge()

In [None]:
pipe = make_pipeline(column_trans, scaler, ridge)

In [None]:
pipe.fit(X_train, y_train)

In [None]:
y_pred_ridge = pipe.predict(X_test)

In [None]:
r2_score(y_test, y_pred_ridge)

In [None]:
import pickle

In [None]:
# Persist the fitted pipeline; the context manager guarantees the file is
# flushed and closed even if pickling fails.
with open('RidgeModel.pkl', 'wb') as model_file:
    pickle.dump(pipe, model_file)

In [None]:
# Reload the saved pipeline. NOTE: unpickling can execute arbitrary code, so
# only load model files that were produced by a trusted source.
with open('RidgeModel.pkl', 'rb') as model_file:
    pipe = pickle.load(model_file)

# Prepare input data with correct columns
input_data = pd.DataFrame([{
    'location': '1st Phase JP Nagar',
    'total_sqft': 1500,
    'bath': 2,
    'bhk': 2
}])

# Predict
result = pipe.predict(input_data)


In [None]:
result

In [None]:
import sklearn
print(sklearn.__version__) 

In [None]:
!pip install --upgrade scikit-learn


In [None]:
import sklearn
print(sklearn.__version__)

In [None]:
import sys
!{sys.executable} -m pip install scikit-learn
