In [1]:
import pandas as pd
import numpy as np
from sklearn import linear_model

In [2]:
df=pd.DataFrame({
    "town":["monroe","monroe","west windsor","west windsor","robbinsvile","robbinsvile"],
    "area":[2600,3000,2600,2800,2600,2900],
    "price":[550000,565000,585000,615000,575000,600000]
})
df

Unnamed: 0,town,area,price
0,monroe,2600,550000
1,monroe,3000,565000
2,west windsor,2600,585000
3,west windsor,2800,615000
4,robbinsvile,2600,575000
5,robbinsvile,2900,600000


In [3]:
dummies=pd.get_dummies(df.town)
dummies

Unnamed: 0,monroe,robbinsvile,west windsor
0,1,0,0
1,1,0,0
2,0,0,1
3,0,0,1
4,0,1,0
5,0,1,0


In [4]:
merged=pd.concat([df,dummies],axis=1)
merged

Unnamed: 0,town,area,price,monroe,robbinsvile,west windsor
0,monroe,2600,550000,1,0,0
1,monroe,3000,565000,1,0,0
2,west windsor,2600,585000,0,0,1
3,west windsor,2800,615000,0,0,1
4,robbinsvile,2600,575000,0,1,0
5,robbinsvile,2900,600000,0,1,0


In [5]:
final=merged.drop(["town","west windsor"],axis=1)
final

Unnamed: 0,area,price,monroe,robbinsvile
0,2600,550000,1,0
1,3000,565000,1,0
2,2600,585000,0,0
3,2800,615000,0,0
4,2600,575000,0,1
5,2900,600000,0,1


In [6]:
reg=linear_model.LogisticRegression()
reg

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [7]:
x=final.drop("price",axis="columns")
x

Unnamed: 0,area,monroe,robbinsvile
0,2600,1,0
1,3000,1,0
2,2600,0,0
3,2800,0,0
4,2600,0,1
5,2900,0,1


In [8]:
y=final.price
y

0    550000
1    565000
2    585000
3    615000
4    575000
5    600000
Name: price, dtype: int64

In [9]:
reg.fit(x,y)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [10]:
reg.predict([[2800,0,1]])

array([600000], dtype=int64)

In [11]:
reg.predict([[3200,0,0]])

array([615000], dtype=int64)

In [12]:
reg.score(x,y)

0.9211748915961173

In [13]:
df

Unnamed: 0,town,area,price
0,monroe,2600,550000
1,monroe,3000,565000
2,west windsor,2600,585000
3,west windsor,2800,615000
4,robbinsvile,2600,575000
5,robbinsvile,2900,600000


In [14]:
from sklearn.preprocessing import LabelEncoder
le=LabelEncoder()

In [15]:
dfle=df
dfle.town=le.fit_transform(dfle.town)
dfle

Unnamed: 0,town,area,price
0,0,2600,550000
1,0,3000,565000
2,2,2600,585000
3,2,2800,615000
4,1,2600,575000
5,1,2900,600000


In [16]:
x=dfle[["town","area"]].values
x

array([[   0, 2600],
       [   0, 3000],
       [   2, 2600],
       [   2, 2800],
       [   1, 2600],
       [   1, 2900]], dtype=int64)

In [17]:
y=dfle.price
y

0    550000
1    565000
2    585000
3    615000
4    575000
5    600000
Name: price, dtype: int64

In [18]:
from sklearn.preprocessing import OneHotEncoder
ohe=OneHotEncoder(categorical_features=[0])

In [19]:
x=ohe.fit_transform(x).toarray()
x

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


array([[1.0e+00, 0.0e+00, 0.0e+00, 2.6e+03],
       [1.0e+00, 0.0e+00, 0.0e+00, 3.0e+03],
       [0.0e+00, 0.0e+00, 1.0e+00, 2.6e+03],
       [0.0e+00, 0.0e+00, 1.0e+00, 2.8e+03],
       [0.0e+00, 1.0e+00, 0.0e+00, 2.6e+03],
       [0.0e+00, 1.0e+00, 0.0e+00, 2.9e+03]])

In [20]:
x=x[:,1:]
x

array([[0.0e+00, 0.0e+00, 2.6e+03],
       [0.0e+00, 0.0e+00, 3.0e+03],
       [0.0e+00, 1.0e+00, 2.6e+03],
       [0.0e+00, 1.0e+00, 2.8e+03],
       [1.0e+00, 0.0e+00, 2.6e+03],
       [1.0e+00, 0.0e+00, 2.9e+03]])

In [21]:
reg.fit(x,y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [22]:
reg.predict([[0,1,3200]])

array([633620.68965517])