In [None]:
#Import important libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')
from scipy import stats
import sklearn
from sklearn import preprocessing
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression, Ridge  
from sklearn.metrics import mean_absolute_error
from sklearn.pipeline import make_pipeline
import warnings
warnings.filterwarnings('ignore')


from scipy import stats
from scipy.stats import norm, skew

In [None]:
# Load the raw train and test CSV files into DataFrames.
train_data, test_data = (
    pd.read_csv(csv_path, low_memory=False)
    for csv_path in ('train_house.csv', 'test_house.csv')
)

In [None]:
# Preview the first rows of the raw training data.
train_data.head()

In [None]:
# Print the column summary (dtypes, non-null counts) of the raw test data.
test_data.info()

## Data Exploration

In [None]:
def wrangle(datapath, missing_threshold=0.2, max_unique=25, target_col="SalePrice"):
    """Read a CSV file and reduce it to its low-cardinality features.

    Steps, in order:
      1. Drop every column whose share of missing values is above
         ``missing_threshold``.
      2. Keep only the rows whose "MSSubClass" lies between that column's
         10% and 90% quantiles (bounds included).
      3. Keep the columns with fewer than ``max_unique`` distinct non-null
         values, plus ``target_col`` when the file contains it.

    Parameters
    ----------
    datapath : str or path-like
        Location of the CSV file, passed to ``pd.read_csv``.
    missing_threshold : float, default 0.2
        Columns with a larger fraction of NaN values are dropped.
    max_unique : int, default 25
        Exclusive upper bound on distinct values for a feature to be kept.
    target_col : str, default "SalePrice"
        Name of the target column. It is never treated as a feature.

    Returns
    -------
    pandas.DataFrame
        The selected feature columns in file order, followed by the target
        column as the last column if it exists in the file.

    Notes
    -----
    The row trimming in step 2 is applied to whatever file is passed in, so
    rows are removed from a test file as well.
    """
    df = pd.read_csv(datapath)

    # Share of missing values per column, computed once.
    missing_share = df.isna().mean()
    sparse_cols = missing_share[missing_share > missing_threshold].index.tolist()
    df = df.drop(columns=sparse_cols)

    # Trim the bottom and top 10% of rows in terms of "MSSubClass".
    low, high = df["MSSubClass"].quantile([0.1, 0.9])
    df = df[df["MSSubClass"].between(low, high)]

    # The target is checked by name first, so it can never be mistaken for a
    # low-cardinality feature and always ends up as the last column.
    feature_cols = [
        col for col in df.columns
        if col != target_col and df[col].nunique() < max_unique
    ]
    target = [target_col] if target_col in df.columns else []

    # .copy() hands the caller an independent frame that is safe to modify.
    return df[feature_cols + target].copy()

In [None]:
# Apply the same wrangling step to the train file and then the test file.
train, test = map(wrangle, ('train_house.csv', 'test_house.csv'))

In [None]:
# Preview the first rows of the wrangled train set.
train.head()

In [None]:
# Preview the first rows of the wrangled test set.
test.head()

In [None]:
# Shape, column summary and number of distinct column names of the train set.
print(train.shape)
# info() prints its report itself and returns None; wrapping it in print()
# would add a stray "None" line to the output.
train.info()
print(train.columns.nunique())

In [None]:
# Shape, column summary and number of distinct column names of the test set.
print(test.shape)
# info() prints its report itself and returns None; wrapping it in print()
# would add a stray "None" line to the output.
test.info()
print(test.columns.nunique())

In [None]:
# Separate the target from the training features.
y_train = train["SalePrice"]
train = train.drop(columns="SalePrice")

# Keep only the features common to both datasets, in the same order.
# Both frames are restricted (not just train) so that the model is fitted on
# and later asked to predict from the same set of columns.
common_cols = [col for col in train.columns if col in test.columns]
train = train[common_cols].copy()
test = test[common_cols].copy()

In [None]:
# Cardinality of each categorical (object-typed) train feature, as a table.
# NaN is counted as one distinct value.
categorical_data_train = train.select_dtypes(include='object').copy()

count = train[categorical_data_train.columns].nunique(dropna=False).tolist()
data_tuples = list(zip(categorical_data_train.columns, count))
data = pd.DataFrame(data_tuples, columns=['Features','Number of distinct values'])
data

In [None]:
# Names of the categorical train features with more than five distinct values.
large_unique_value_train = [
    name
    for name in categorical_data_train.columns
    if len(categorical_data_train[name].unique()) > 5
]
print(large_unique_value_train)

In [None]:
# Collapse the high-cardinality categorical train features: the five most
# frequent labels are kept, every other entry becomes "Others".
for val in large_unique_value_train:
    top_5_train = train[val].value_counts().head(5)
    is_frequent = train[val].isin(top_5_train.index)
    train[val] = train[val].where(is_frequent, "Others")
    

In [None]:
# Cardinality of each int64 train feature, as a table.
int_data_train = train.select_dtypes(include='int64').copy()

count = train[int_data_train.columns].nunique(dropna=False).tolist()
data_tuple = list(zip(int_data_train.columns, count))
data = pd.DataFrame(data_tuple, columns=['Features','Number of distinct values'])
data

In [None]:
# Names of the int64 train features with more than five distinct values.
large_unique_int = [
    name
    for name in int_data_train.columns
    if len(int_data_train[name].unique()) > 5
]
print(large_unique_int)

In [None]:
# Collapse the high-cardinality int64 train features: the five most frequent
# values are kept, every other entry is replaced by 0.
for val in large_unique_int:
    top_5 = train[val].value_counts().head(5)
    is_frequent = train[val].isin(top_5.index)
    train[val] = train[val].where(is_frequent, 0)

In [None]:
# Cardinality of each categorical (object-typed) test feature, as a table.
# NaN is counted as one distinct value.
categorical_data_test = test.select_dtypes(include='object').copy()

count = test[categorical_data_test.columns].nunique(dropna=False).tolist()
data_tuples = list(zip(categorical_data_test.columns, count))
data = pd.DataFrame(data_tuples, columns=['Features','Number of distinct values'])
data

In [None]:
# Names of the categorical test features with more than five distinct values.
large_unique_value_test = [
    name
    for name in categorical_data_test.columns
    if len(categorical_data_test[name].unique()) > 5
]
print(large_unique_value_test)

In [None]:
# Collapse the high-cardinality categorical test features: the five most
# frequent labels are kept, every other entry becomes "Others".
# NOTE(review): the frequent labels are derived from the test set itself, so
# they are not guaranteed to match the labels kept in train.
for val in large_unique_value_test:
    top_5_test = test[val].value_counts().head(5)
    is_frequent = test[val].isin(top_5_test.index)
    test[val] = test[val].where(is_frequent, "Others")
    

In [None]:
# Cardinality of each int64 test feature, as a table.
int_data_test = test.select_dtypes(include='int64').copy()

count = test[int_data_test.columns].nunique(dropna=False).tolist()
data_tuple = list(zip(int_data_test.columns, count))
data = pd.DataFrame(data_tuple, columns=['Features','Number of distinct values'])
data

In [None]:
# Names of the int64 test features with more than five distinct values.
large_unique_int_test = [
    name
    for name in int_data_test.columns
    if len(int_data_test[name].unique()) > 5
]
print(large_unique_int_test)

In [None]:
# Collapse the high-cardinality int64 test features: the five most frequent
# values are kept, every other entry is replaced by 0.
for val in large_unique_int_test:
    top_5 = test[val].value_counts().head(5)
    is_frequent = test[val].isin(top_5.index)
    test[val] = test[val].where(is_frequent, 0)

In [None]:
# Summary statistics of the train features at this stage of the cleaning.
train.describe()

In [None]:
# Names of the train columns that contain at least one missing value.
has_missing = train.isna().any()
missing_col = train.columns[has_missing].tolist()

In [None]:
# Names of the test columns that contain at least one missing value.
# Stored under its own name so it does not overwrite the train list in
# `missing_col`, which the train imputation loop iterates over.
missing_col_test = test.columns[test.isna().any()].tolist()

In [None]:
 #fill NAN with mean value.
# The columns to impute are taken from `train` itself. Numeric columns get
# their mean; other columns get their most frequent value, because a mean is
# undefined for labels. fillna(value='mean') would write the literal string
# 'mean' into the cells instead of a statistic.
for col in train.columns[train.isna().any()]:
    if pd.api.types.is_numeric_dtype(train[col]):
        fill_value = train[col].mean()
    else:
        modes = train[col].mode()
        if modes.empty:
            # Column holds only NaN: there is nothing to impute from.
            continue
        fill_value = modes.iloc[0]
    train[col] = train[col].fillna(fill_value)
    

In [None]:
# Replace every categorical train feature by integer codes; each column gets
# its own freshly fitted LabelEncoder.
train_copy = train.copy()
for c in categorical_data_train:
    label_encoder = LabelEncoder()
    column_values = list(train_copy[c].values)
    train_copy[c] = label_encoder.fit_transform(column_values)

train = train_copy.copy()
train.info()

In [None]:
# Print the column summary of the encoded train set.
train.info()

In [None]:
# Replace every categorical test feature by integer codes; each column gets
# its own freshly fitted LabelEncoder.
# NOTE(review): these encoders are fitted on the test values only, so a code
# here need not stand for the same label as the same code in train.
test_copy = test.copy()
for c in categorical_data_test:
    label_encoder = LabelEncoder()
    column_values = list(test_copy[c].values)
    test_copy[c] = label_encoder.fit_transform(column_values)

# Rows that still contain a missing value are removed from the test set.
test = test_copy.copy().dropna()
test.info()

In [None]:
#train = clean_train(train)
#test = clean_train(test)

In [None]:
# Build a Ridge regression model and fit it on the train features and target.
# fit() returns the estimator itself, so building and fitting are chained.
model = Ridge().fit(train, y_train)
model

In [None]:
# Print the column summary of the test set that is passed to the model.
test.info()

In [None]:
# Predict on the prepared test set. Rows were removed from `test` earlier
# (quantile trimming, dropna) without resetting its index, so the predictions
# reuse that index to stay traceable to the rows they belong to.
y_test_pred = pd.Series(model.predict(test), index=test.index, name="SalePrice")
y_test_pred.head()

In [None]:
# Pair each model coefficient with its feature name and keep the 15
# coefficients with the largest absolute value (sorted ascending by |value|).
features = train.columns
coefficients = model.coef_
all_coefficients = pd.Series(coefficients, index=features)
feat_imp = all_coefficients.sort_values(key=abs).tail(15)
feat_imp

In [None]:
# Horizontal bar chart of the 15 most influential coefficients of the model.
ax = feat_imp.plot(kind="barh")
# Label axes
ax.set_xlabel("Importance [USD]")
ax.set_ylabel("Feature")
# The data and model are about houses (train_house.csv, model_house.pkl),
# so the title refers to house prices.
ax.set_title("Feature Importances for House Price");

In [None]:
# Save the fitted model for future use with pickle.
import pickle

# The context manager closes the file handle even if pickling fails.
with open('model_house.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)