In [72]:
import pandas as pd

In [73]:
df_train = pd.read_csv('data/train.csv')
df_test = pd.read_csv('data/test.csv')

# Exploration

In [74]:
df_train.columns

Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
       'GarageCond', 'PavedDrive

In [75]:
x_train = df_train[['CentralAir', 'Utilities', 'GarageCars', 'ExterQual', 'Foundation', 'KitchenQual']]
y_train = df_train['SalePrice']
x_train

Unnamed: 0,CentralAir,Utilities,GarageCars,ExterQual,Foundation,KitchenQual
0,Y,AllPub,2,Gd,PConc,Gd
1,Y,AllPub,2,TA,CBlock,TA
2,Y,AllPub,2,Gd,PConc,Gd
3,Y,AllPub,3,TA,BrkTil,Gd
4,Y,AllPub,3,Gd,PConc,Gd
...,...,...,...,...,...,...
1455,Y,AllPub,2,TA,PConc,TA
1456,Y,AllPub,2,TA,CBlock,TA
1457,Y,AllPub,1,Ex,Stone,Gd
1458,Y,AllPub,1,TA,CBlock,Gd


# Preprocessing

## Dummy values

In [76]:
def get_dummies(df, col_name, list_values, prefix):
    for value in list_values:
        new_col_name = prefix + value
        df[new_col_name] =(df[col_name] == value).astype(int)
    del df[col_name]
    return df

In [77]:
x_train = get_dummies(x_train, 'Foundation', ['BrkTil', 'CBlock', 'PConc', 'Slab', 'Stone', 'Wood'], "Is_")
x_train = get_dummies(x_train, 'CentralAir', ['Y','N'], "Is_")
x_train = get_dummies(x_train, 'Utilities', ['AllPub', 'NoSewr', 'NoSeWa', 'ELO'], "As_")
x_train

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


Unnamed: 0,GarageCars,ExterQual,KitchenQual,Is_BrkTil,Is_CBlock,Is_PConc,Is_Slab,Is_Stone,Is_Wood,Is_Y,Is_N,As_AllPub,As_NoSewr,As_NoSeWa,As_ELO
0,2,Gd,Gd,0,0,1,0,0,0,1,0,1,0,0,0
1,2,TA,TA,0,1,0,0,0,0,1,0,1,0,0,0
2,2,Gd,Gd,0,0,1,0,0,0,1,0,1,0,0,0
3,3,TA,Gd,1,0,0,0,0,0,1,0,1,0,0,0
4,3,Gd,Gd,0,0,1,0,0,0,1,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,2,TA,TA,0,0,1,0,0,0,1,0,1,0,0,0
1456,2,TA,TA,0,1,0,0,0,0,1,0,1,0,0,0
1457,1,Ex,Gd,0,0,0,0,1,0,1,0,1,0,0,0
1458,1,TA,Gd,0,1,0,0,0,0,1,0,1,0,0,0


## Enum str => int

In [78]:
qualityDict = {"Ex" : 4, "Gd" : 3, "TA" : 2, "Fa" : 1, "Po" : 0}
def get_qualitative(df, col_name, qualiDict):
    df[col_name] = df[col_name].map(qualityDict)
    return df

In [54]:
x_train = get_qualitative(x_train, 'KitchenQual', qualityDict)
x_train = get_qualitative(x_train, 'ExterQual', qualityDict)
x_train

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,GarageCars,ExterQual,KitchenQual,Is_BrkTil,Is_CBlock,Is_PConc,Is_Slab,Is_Stone,Is_Wood,Is_Y,Is_N,As_AllPub,As_NoSewr,As_NoSeWa,As_ELO
0,2,3,3,0,0,1,0,0,0,1,0,1,0,0,0
1,2,2,2,0,1,0,0,0,0,1,0,1,0,0,0
2,2,3,3,0,0,1,0,0,0,1,0,1,0,0,0
3,3,2,3,1,0,0,0,0,0,1,0,1,0,0,0
4,3,3,3,0,0,1,0,0,0,1,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,2,2,2,0,0,1,0,0,0,1,0,1,0,0,0
1456,2,2,2,0,1,0,0,0,0,1,0,1,0,0,0
1457,1,4,3,0,0,0,0,1,0,1,0,1,0,0,0
1458,1,2,3,0,1,0,0,0,0,1,0,1,0,0,0


# Model

In [79]:
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression(random_state = 0)
clf.fit(x_train, y_train)

ValueError: could not convert string to float: 'TA'

In [56]:
y_train_pred = clf.predict(x_train)
print(y_train_pred)

[230000 140000 230000 ... 266500 110000 160000]


In [71]:
from sklearn.metrics import accuracy_score
accuracy_score(y_train, y_train_pred)

0.04589041095890411