In [1]:
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import PowerTransformer, StandardScaler
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression 
from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score
import warnings
from sklearn.linear_model import Lasso
from sklearn.model_selection import cross_val_score, KFold
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import f_classif
from sklearn.feature_selection import SelectKBest

In [2]:
df_train = pd.read_csv('train.csv')
df_train.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [3]:
def plot(df):
    l=[]
    s=0
    for i in df.columns:
        c=df[i].isnull().sum()
        if c>0:
            s+=1
            l.append(i)
    fig, axis = plt.subplots(figsize=(10, 3*s),nrows=s, ncols=2)
    axis=axis.flatten()
    for i in range(len(l)):
        if df[l[i]].nunique()<6:
            sns.kdeplot(df,x='SalePrice',hue=l[i],ax=axis[2*i+1])
        sns.scatterplot(df, x=l[i],y='SalePrice', ax=axis[2*i])
    plt.tight_layout()
# plot(df)

In [4]:
def get_duplicate_columns(df):
    
    duplicate_columns = {}
    seen_columns = {}

    for column in df.columns:
        current_column = df[column]

        # Convert column data to bytes
        try:
            current_column_hash = current_column.values.tobytes()
        except AttributeError:
            current_column_hash = current_column.to_string().encode()

        if current_column_hash in seen_columns:
            if seen_columns[current_column_hash] in duplicate_columns:
                duplicate_columns[seen_columns[current_column_hash]].append(column)
            else:
                duplicate_columns[seen_columns[current_column_hash]] = [column]
        else:
            seen_columns[current_column_hash] = column

    return duplicate_columns


In [5]:
pt = PowerTransformer()
sc = StandardScaler()
def BsmtExposure_map(v):
    if v=='Gd':
        return 4
    elif v== 'Av':
        return 3
    elif v=='Mn':
        return 2
    elif v=='No':
        return 1
    else:
        return 0
def BsmtFinType_map(f):
    if f=='GLQ':
        return 6
    elif f=='ALQ':
        return 5
    elif f=='BLQ':
        return 4
    elif f=='Rec':
        return 3
    elif f=='LwQ':
        return 2
    elif f=='Unf':
        return 1
    else:
        return 0
    
def garage_finish_map(v):
    if v=='Fin':
        return 3
    elif v== 'RFn':
        return 2
    elif v=='Unf':
        return 1
    else:
        return 0

def map_fill(f):
    if f=='Ex':
        return 5
    elif f=='Gd':
        return 4
    elif f=='TA':
        return 3
    elif f=='Fa':
        return 2
    elif f=='Po':
        return 1
    else:
        return 0
def Alley_map(v):
    if v=='Pave':
        return 2
    elif v == 'Grvl':
        return 1
    else:
        return 0

def clean(df,test=True):
    df.drop(['PoolQC','MiscFeature', 'Fence','GarageCars',
             'GarageCond','Utilities','Id',
              'Condition1', 'Exterior1st','Neighborhood'], 
            axis=1, inplace = True, errors= 'ignore')
    df['BsmtQual'] = df['BsmtQual'].apply(map_fill)
    df['BsmtCond'] = df['BsmtCond'].apply(map_fill)
    df['FireplaceQu']= df['FireplaceQu'].apply(map_fill)
    df['GarageQual'] = df['GarageQual'].apply(map_fill)
    df['ExterQual'] = df['ExterQual'].apply(map_fill)
    df['ExterCond'] = df['ExterCond'].apply(map_fill)
    df['HeatingQC'] = df['HeatingQC'].apply(map_fill)
    df['KitchenQual'] = df['KitchenQual'].apply(map_fill)
    df['BsmtFinType2'] = df['BsmtFinType2'].apply(BsmtFinType_map)
    df['BsmtFinType1'] = df['BsmtFinType1'].apply(BsmtFinType_map)
    df['BsmtExposure'] = df['BsmtExposure'].apply(BsmtExposure_map)
    df['GarageFinish']= df['GarageFinish'].apply(garage_finish_map)
    df.LotFrontage.interpolate('linear',inplace=True)
    df.GarageYrBlt.interpolate('linear',inplace=True)
    df.MasVnrArea.interpolate('linear',inplace=True)
    df.LotFrontage.interpolate('linear',inplace=True)
    df = pd.get_dummies(df, columns=['GarageType','Alley', 'MasVnrType'], 
                        drop_first=False, dtype='int')
    df = pd.get_dummies(df, columns=['MSZoning', 'LotShape', 'LandContour',
                                     'LotConfig','LandSlope', 
                                    'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle',
                                    'RoofMatl', 'Exterior2nd', 'Foundation',
                                    'Heating', 'CentralAir', 'Electrical', 'Functional', 
                                     'PavedDrive', 'SaleType','SaleCondition','Street'],
                        drop_first = True, dtype='int')
    # y= df['SalePrice']
    column = df.columns
    
    X=df.drop('SalePrice',errors='ignore', axis=1 ) 
    cont = X.select_dtypes(include =['int','float'])
    l_cont =[]
    for i in cont.columns:
        n = cont[i].nunique()
        if n is n>10:
            l_cont.append(i)
    if test:
        df[l_cont]=pt.fit_transform(df[l_cont])
    df = pd.DataFrame(df,columns = column)
    df=df.drop('SalePrice',errors='ignore', axis=1 )
    duplicate_columns = get_duplicate_columns(df)
    for one_list in duplicate_columns.values():
        df.drop(columns=one_list,inplace=True)
    sel = VarianceThreshold(threshold=0.05)
    sel.fit(df)
    columns = df.columns[sel.get_support()]
    df = sel.transform(df)
    df = pd.DataFrame(df, columns=columns)
    corr_matrix = df.corr()
    columns = corr_matrix.columns
    # Create an empty list to keep track of columns to drop
    columns_to_drop = []
    # Loop over the columns
    for i in range(len(columns)):
        for j in range(i + 1, len(columns)):
            # Access the cell of the DataFrame
            if corr_matrix.loc[columns[i], columns[j]] > 0.95:
                columns_to_drop.append(columns[j])
    columns_to_drop = set(columns_to_drop)
    df.drop(columns = columns_to_drop, axis = 1, inplace=True)
    return df

In [6]:
import warnings
warnings.simplefilter('ignore')

In [7]:
# X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=34)
df_train = pd.read_csv('train.csv')
y = df_train['SalePrice']
X = clean(df_train)
# X['SalePrice']
df_test = pd.read_csv('test.csv')
id = df_test['Id']
test = clean(df_test)

In [8]:
naot_present_in_test = list(set(test.columns)^set(X_normal.columns))
X=X.drop(not_present_in_test,axis=1,errors='ignore')
test = test.drop(not_present_in_test,axis=1,errors='ignore')
lr = LinearRegression()
kfold = KFold(n_splits=5, shuffle=True,)
mae_score = cross_val_score(lr, X, y, cv=kfold, scoring='neg_mean_absolute_error')
mse_score = cross_val_score(lr, X, y, cv=kfold, scoring='neg_mean_squared_error')
r2_score = cross_val_score(lr, X, y, cv=kfold, scoring='r2')
print('mean_squared_error', -mse_score.mean())
print('absolute_error', -mae_score.mean())
print('r2_score', r2_score.mean())

NameError: name 'X_normal' is not defined

In [9]:
df_train = pd.read_csv('train.csv')
Q1= np.percentile(df_train['SalePrice'],25)
Q3= np.percentile(df_train['SalePrice'],75)
IQR = Q3-Q1
min1 = Q1-(3/2)*IQR
max1= Q3+(3/2)*IQR
df_normal = df_train[(df_train.SalePrice<max1)]
y_normal = df_normal['SalePrice']
X_normal = clean(df_normal)
print('X_normal shape',X_normal.shape)
X_normal= X_normal.drop('SalePrice',errors='ignore', axis=1 ) 
not_present_in_test = list(set(test.columns)^set(X_normal.columns))
X_normal=X_normal.drop(not_present_in_test,axis=1,errors='ignore')
print('X_normal shape just before traing',X_normal.shape)
lr = LinearRegression()
kfold = KFold(n_splits=5, shuffle=True,)
mae_score = cross_val_score(lr, X_normal, y_normal, cv=kfold, scoring='neg_mean_absolute_error')
mse_score = cross_val_score(lr, X_normal, y_normal, cv=kfold, scoring='neg_mean_squared_error')
r2_score = cross_val_score(lr, X_normal, y_normal, cv=kfold, scoring='r2')
print('mean_squared_error', -mse_score.mean())
print('absolute_error', -mae_score.mean())
print('r2_score', r2_score.mean())

X_normal shape (1399, 76)
X_normal shape just before traing (1399, 75)
mean_squared_error 611988514.0329378
absolute_error 17003.368611234786
r2_score 0.8203342074261236


In [10]:
df_outliers = df_train[(df_train.SalePrice>=max1)]
y_out = df_outliers['SalePrice']
df_outliers = clean(df_outliers,test=False)
X_out = df_outliers.drop(not_present_in_test,axis=1,errors='ignore')
X_out= X_out.drop('SalePrice',errors='ignore', axis=1 ) 
print('X_out shape just before traing', X_out.shape)
lr = LinearRegression()
kfold = KFold(n_splits=5, shuffle=True,)
mae_score = cross_val_score(lr, X_out, y_out, cv=kfold, scoring='neg_mean_absolute_error')
mse_score = cross_val_score(lr, X_out, y_out, cv=kfold, scoring='neg_mean_squared_error')
r2_score = cross_val_score(lr, X_out, y_out, cv=kfold, scoring='r2')
print('mean_squared_error_out', -mse_score.mean())
print('absolute_error_out', -mae_score.mean())
print('r2_score_out', r2_score.mean())

X_out shape just before traing (61, 62)
mean_squared_error_out 70522650273.85545
absolute_error_out 148222.89087786665
r2_score_out -4.394958420988902


In [17]:
X_normal.shape

(1399, 75)

In [18]:
lr.fit(X,y)
print('test data shape after cleaning',test.shape)
test=test.drop(not_present_in_test,axis=1,errors='ignore')
test.fillna(value = test.mean(axis=0),inplace=True)
final = lr.predict(test)
final_df = pd.DataFrame({'Id':id,'SalePrice':final})
final_df.to_csv('final_df.csv',index=False)
f = pd.read_csv('final_df.csv')
f.head()

test data shape after cleaning (1459, 75)


Unnamed: 0,Id,SalePrice
0,1461,122828.021617
1,1462,179932.312407
2,1463,180349.208021
3,1464,193521.092117
4,1465,195659.975856


In [3]:
from sklearn.tree import DecisionTreeRegressor

In [20]:
naot_present_in_test = list(set(test.columns)^set(X_normal.columns))
X=X.drop(not_present_in_test,axis=1,errors='ignore')
test = test.drop(not_present_in_test,axis=1,errors='ignore')
lr = DecisionTreeRegressor(min_samples_leaf=10)
kfold = KFold(n_splits=5, shuffle=True,)
mae_score = cross_val_score(lr, X, y, cv=kfold, scoring='neg_mean_absolute_error')
mse_score = cross_val_score(lr, X, y, cv=kfold, scoring='neg_mean_squared_error')
r2_score = cross_val_score(lr, X, y, cv=kfold, scoring='r2')
print('mean_squared_error', -mse_score.mean())
print('absolute_error', -mae_score.mean())
print('r2_score', r2_score.mean())

mean_squared_error 1478912601.909028
absolute_error 25336.73340292829
r2_score 0.7309126958128338
