In [5]:
import pandas as pd

In [6]:
# Load the raw home-price table (town, area, price) from the CSV file.
homeprices_original_df = pd.read_csv(filepath_or_buffer="homeprices.csv")

In [7]:
homeprices_original_df.head()

Unnamed: 0,town,area,price
0,monroe township,2600,550000
1,monroe township,3000,565000
2,monroe township,3200,610000
3,monroe township,3600,680000
4,monroe township,4000,725000


In [8]:
# Label encoding maps each category string to an integer code.
# One-hot encoding creates one 0/1 indicator column per category.

In [9]:
homeprices_original_df.info() 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13 entries, 0 to 12
Data columns (total 3 columns):
town     13 non-null object
area     13 non-null int64
price    13 non-null int64
dtypes: int64(2), object(1)
memory usage: 392.0+ bytes


In [10]:
# One-hot encode only the `town` column; the result holds one indicator column per town.
newDf = pd.get_dummies(homeprices_original_df.loc[:, ["town"]])

In [11]:
newDf.head()

Unnamed: 0,town_monroe township,town_robinsville,town_west windsor
0,1,0,0
1,1,0,0
2,1,0,0
3,1,0,0
4,1,0,0


In [12]:
newDf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13 entries, 0 to 12
Data columns (total 3 columns):
town_monroe township    13 non-null uint8
town_robinsville        13 non-null uint8
town_west windsor       13 non-null uint8
dtypes: uint8(3)
memory usage: 119.0 bytes


In [13]:
# Attach the indicator columns to the main frame under the names that
# get_dummies generated. Assigning a DataFrame to a list of column labels pairs
# the labels with the source columns by position, so a hand-typed list would
# silently mislabel the data if the dummy column order or set ever differed.
# Taking the labels from `newDf.columns` keeps names and data in step.
homeprices_original_df[newDf.columns] = newDf

In [14]:
homeprices_original_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13 entries, 0 to 12
Data columns (total 6 columns):
town                    13 non-null object
area                    13 non-null int64
price                   13 non-null int64
town_monroe township    13 non-null uint8
town_robinsville        13 non-null uint8
town_west windsor       13 non-null uint8
dtypes: int64(2), object(1), uint8(3)
memory usage: 431.0+ bytes


In [15]:
homeprices_original_df

Unnamed: 0,town,area,price,town_monroe township,town_robinsville,town_west windsor
0,monroe township,2600,550000,1,0,0
1,monroe township,3000,565000,1,0,0
2,monroe township,3200,610000,1,0,0
3,monroe township,3600,680000,1,0,0
4,monroe township,4000,725000,1,0,0
5,west windsor,2600,585000,0,0,1
6,west windsor,2800,615000,0,0,1
7,west windsor,3300,650000,0,0,1
8,west windsor,3600,710000,0,0,1
9,robinsville,2600,575000,0,1,0


In [16]:
# The raw string column is no longer needed once its indicator columns exist.
homeprices_original_df.drop(labels="town", axis=1, inplace=True)

In [17]:
# Remove the west windsor indicator: a row with 0 in both remaining town
# columns is a west windsor row, so the town is still identifiable.
homeprices_original_df.drop(labels="town_west windsor", axis=1, inplace=True)

In [18]:
homeprices_original_df.head()

Unnamed: 0,area,price,town_monroe township,town_robinsville
0,2600,550000,1,0
1,3000,565000,1,0
2,3200,610000,1,0
3,3600,680000,1,0
4,4000,725000,1,0


In [19]:
homeprices_original_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13 entries, 0 to 12
Data columns (total 4 columns):
area                    13 non-null int64
price                   13 non-null int64
town_monroe township    13 non-null uint8
town_robinsville        13 non-null uint8
dtypes: int64(2), uint8(2)
memory usage: 314.0 bytes


In [20]:
homeprices_original_df.columns

Index(['area', 'price', 'town_monroe township', 'town_robinsville'], dtype='object')

In [21]:
from sklearn import linear_model

In [22]:
# Create an unfitted LinearRegression estimator with default settings.
model = linear_model.LinearRegression()

In [23]:
# Feature matrix: area plus the two remaining town indicator columns, in this order.
x_train = homeprices_original_df.loc[
    :, ["area", "town_monroe township", "town_robinsville"]
].values

In [24]:
# Target values; selecting with a list keeps the array two-dimensional (one column wide).
y_train = homeprices_original_df.loc[:, ["price"]].values

In [25]:
# Fit the regression on the full table (no held-out split is made in this notebook).
model.fit(X=x_train, y=y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [26]:
model.coef_

array([[   126.89744141, -40013.97548914, -14327.56396474]])

In [27]:
model.intercept_

array([249790.36766293])

In [28]:
# Predict on the same rows the model was fitted on; the result is stored, not displayed.
y_predicted = model.predict(x_train)

In [29]:
# Feature order matches x_train: [area, town_monroe township, town_robinsville].
# This row is area 2600 in monroe township.
model.predict([[2600,1,0]]) # actual is 550000

array([[539709.7398409]])

In [30]:
# For a regressor, score() returns R^2 (coefficient of determination), not accuracy.
# It is computed here on the same rows used for fitting, so it is not a held-out estimate.
model.score(x_train, y_train) # R^2 on the training data

0.9573929037221873

In [31]:
# Now check how to do one-hot encoding using sklearn preprocessing.

In [32]:
# Reload from disk to start again from the untouched three-column table.
homeprices_original_df = pd.read_csv(filepath_or_buffer="homeprices.csv")

In [33]:
homeprices_original_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13 entries, 0 to 12
Data columns (total 3 columns):
town     13 non-null object
area     13 non-null int64
price    13 non-null int64
dtypes: int64(2), object(1)
memory usage: 392.0+ bytes


In [34]:
homeprices_original_df.head()

Unnamed: 0,town,area,price
0,monroe township,2600,550000
1,monroe township,3000,565000
2,monroe township,3200,610000
3,monroe township,3600,680000
4,monroe township,4000,725000


In [35]:
from sklearn.preprocessing import OneHotEncoder

In [36]:
# When instantiating the one-hot encoder, specify which columns are categorical.

In [37]:
# Legacy configuration: treat column index 0 (`town` in the reloaded frame) as the
# categorical feature to expand.
# NOTE(review): `categorical_features` is believed to be deprecated since scikit-learn
# 0.20 and removed in 0.22 — confirm against the installed version before re-running.
oneHotEncoder = OneHotEncoder(categorical_features=[0]) 

In [38]:
type(oneHotEncoder)

sklearn.preprocessing._encoders.OneHotEncoder

In [39]:
# This cell is expected to fail: with the encoder configured above, fitting on the
# raw frame raises a ValueError on the string `town` column (see the output below).
# The following cells therefore label-encode `town` to integers first.
# NOTE(review): newer scikit-learn releases can reportedly one-hot encode string
# columns directly — confirm for the installed version.
onehotencoded_data = oneHotEncoder.fit_transform(homeprices_original_df).toarray()



ValueError: could not convert string to float: 'monroe township'

In [40]:
from sklearn.preprocessing import LabelEncoder

In [41]:
# Encoder used below to turn each distinct town name into an integer code.
# NOTE(review): the name is misspelled ("Enoder"); it is kept because a later cell references it.
labelEnoder = LabelEncoder()

In [42]:
# Label-encode the town names into integer codes (one code per distinct town).
# Pass the 1-D `town` Series rather than the (n, 1) array from `[["town"]].values`:
# LabelEncoder expects a 1-D input, and the 2-D column vector triggered the
# `column_or_1d` conversion warning while producing the same codes.
homeprices_original_df["encoded_town"] = labelEnoder.fit_transform(homeprices_original_df["town"])

  y = column_or_1d(y, warn=True)


In [43]:
homeprices_original_df

Unnamed: 0,town,area,price,encoded_town
0,monroe township,2600,550000,0
1,monroe township,3000,565000,0
2,monroe township,3200,610000,0
3,monroe township,3600,680000,0
4,monroe township,4000,725000,0
5,west windsor,2600,585000,2
6,west windsor,2800,615000,2
7,west windsor,3300,650000,2
8,west windsor,3600,710000,2
9,robinsville,2600,575000,1


In [44]:
# Rebind the name to a default-configured encoder (no `categorical_features`),
# replacing the instance whose fit failed earlier.
oneHotEncoder = OneHotEncoder()

In [49]:
# One-hot encode the integer town codes and densify the result into a plain array
# with one column per code.
encoded_data = (
    oneHotEncoder
    .fit_transform(homeprices_original_df.loc[:, ["encoded_town"]])
    .toarray()
)

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


In [51]:
encoded_data

array([[1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.]])

In [53]:
# Column 0 of the one-hot array corresponds to encoded_town == 0, i.e. "monroe township".
homeprices_original_df["monroe township.encoded"] = encoded_data[:,0]

In [54]:
homeprices_original_df

Unnamed: 0,town,area,price,encoded_town,monroe township.encoded
0,monroe township,2600,550000,0,1.0
1,monroe township,3000,565000,0,1.0
2,monroe township,3200,610000,0,1.0
3,monroe township,3600,680000,0,1.0
4,monroe township,4000,725000,0,1.0
5,west windsor,2600,585000,2,0.0
6,west windsor,2800,615000,2,0.0
7,west windsor,3300,650000,2,0.0
8,west windsor,3600,710000,2,0.0
9,robinsville,2600,575000,1,0.0


In [58]:
# Column 2 of the one-hot array corresponds to encoded_town == 2, i.e. "west windsor".
homeprices_original_df["west windsor.encoded"] = encoded_data[:,2]

In [59]:
# Column 1 of the one-hot array corresponds to encoded_town == 1, i.e. "robinsville".
homeprices_original_df["robinsville.encoded"] = encoded_data[:,1]

In [60]:
homeprices_original_df

Unnamed: 0,town,area,price,encoded_town,monroe township.encoded,west windsor.encoded,robinsville.encoded
0,monroe township,2600,550000,0,1.0,0.0,0.0
1,monroe township,3000,565000,0,1.0,0.0,0.0
2,monroe township,3200,610000,0,1.0,0.0,0.0
3,monroe township,3600,680000,0,1.0,0.0,0.0
4,monroe township,4000,725000,0,1.0,0.0,0.0
5,west windsor,2600,585000,2,0.0,1.0,0.0
6,west windsor,2800,615000,2,0.0,1.0,0.0
7,west windsor,3300,650000,2,0.0,1.0,0.0
8,west windsor,3600,710000,2,0.0,1.0,0.0
9,robinsville,2600,575000,1,0.0,0.0,1.0


In [63]:
# Remove the robinsville indicator: rows where both remaining indicators are 0
# are the robinsville rows.
homeprices_original_df.drop(labels="robinsville.encoded", axis=1, inplace=True)

In [64]:
homeprices_original_df

Unnamed: 0,town,area,price,encoded_town,monroe township.encoded,west windsor.encoded
0,monroe township,2600,550000,0,1.0,0.0
1,monroe township,3000,565000,0,1.0,0.0
2,monroe township,3200,610000,0,1.0,0.0
3,monroe township,3600,680000,0,1.0,0.0
4,monroe township,4000,725000,0,1.0,0.0
5,west windsor,2600,585000,2,0.0,1.0
6,west windsor,2800,615000,2,0.0,1.0
7,west windsor,3300,650000,2,0.0,1.0
8,west windsor,3600,710000,2,0.0,1.0
9,robinsville,2600,575000,1,0.0,0.0


In [65]:
# Fresh estimator for the second fit, replacing the model trained on the get_dummies columns.
model = linear_model.LinearRegression()

In [67]:
# Fit on area plus the two retained indicator columns; price stays a one-column 2-D target.
model.fit(
    X=homeprices_original_df.loc[
        :, ["area", "monroe township.encoded", "west windsor.encoded"]
    ].values,
    y=homeprices_original_df.loc[:, ["price"]].values,
)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [68]:
# Feature order: [area, monroe township.encoded, west windsor.encoded].
# This row is area 2600 in monroe township.
model.predict([[2600,1,0]]) # actual is 550000

array([[539709.73984091]])

In [69]:
# Feature order: [area, monroe township.encoded, west windsor.encoded].
# This row is area 2600 in west windsor.
model.predict([[2600,0,1]]) # actual is 585000

array([[579723.71533005]])

In [70]:
# Both indicators are 0, which denotes robinsville (the town whose column was dropped).
model.predict([[2600,0,0]]) # actual is 575000

array([[565396.15136531]])

In [71]:
# Score the second model on the same rows and columns it was fitted on.
model.score(
    X=homeprices_original_df.loc[
        :, ["area", "monroe township.encoded", "west windsor.encoded"]
    ].values,
    y=homeprices_original_df.loc[:, ["price"]].values,
)

0.9573929037221873