# Machine Learning

### Dummy Variables & One Hot Encoding

In [1]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer

In [2]:
# Load the listings; the cells below rely on 'town', 'area' and 'price' columns.
df = pd.read_csv('006.csv')

# Build one indicator (dummy) column per distinct town.
dummies = pd.get_dummies(df['town'])

# Merge the original frame and the indicator columns side by side.
merged = pd.concat([df, dummies], axis=1)

# Remove the raw 'town' column together with one indicator ('west windsor').
# Keeping every indicator would make them perfectly collinear
# (the dummy variable trap).
final_df = merged.drop(columns=['town', 'west windsor'])

print(final_df)

    area   price  monroe township  robinsville
0   2600  550000                1            0
1   3000  565000                1            0
2   3200  610000                1            0
3   3600  680000                1            0
4   4000  725000                1            0
5   2600  585000                0            0
6   2800  615000                0            0
7   3300  650000                0            0
8   3600  710000                0            0
9   2600  575000                0            1
10  2900  600000                0            1
11  3100  620000                0            1
12  3600  695000                0            1


In [3]:
# Fit a linear regression of price on every remaining column of final_df
# (area plus the town indicator columns).
model = LinearRegression()
model.fit(X=final_df.drop(columns=['price']), y=final_df['price'])

LinearRegression()

In [4]:
# Prediction
# The model was fitted on a DataFrame, so the query rows are wrapped in a
# DataFrame that reuses the training column names. This keeps the inputs
# aligned with the fitted features and avoids scikit-learn's
# "X does not have valid feature names" warning.
# Values follow the training column order: [area, monroe township, robinsville].
train_features = final_df.drop('price', axis='columns')
predict1 = model.predict(pd.DataFrame([[2800, 0, 1]], columns=train_features.columns))
predict2 = model.predict(pd.DataFrame([[3400, 0, 0]], columns=train_features.columns))

# Model score: R^2 computed on the training data itself
# (a goodness-of-fit figure, not a held-out accuracy).
model_score = model.score(train_features, final_df.price)

print("Prediction 1 = ", predict1)
print("Prediction 2 = ", predict2)
print("Score = ", model_score)

Prediction 1 =  [590775.63964739]
Prediction 2 =  [681241.66845839]
Score =  0.9573929037221873


## Using OneHotEncoding and ColumnTransformer

#### Prefer this approach in ML workflows

In [5]:
# Label Encoding
# LabelEncoder maps each distinct town name to an integer code;
# identical names receive the same code.

LE = LabelEncoder()
# Work on a copy: a plain `dfle = df` only aliases the frame, so the encoding
# below would also overwrite the town names in `df`.
dfle = df.copy()
dfle['town'] = LE.fit_transform(dfle['town'])
print(dfle)

    town  area   price
0      0  2600  550000
1      0  3000  565000
2      0  3200  610000
3      0  3600  680000
4      0  4000  725000
5      2  2600  585000
6      2  2800  615000
7      2  3300  650000
8      2  3600  710000
9      1  2600  575000
10     1  2900  600000
11     1  3100  620000
12     1  3600  695000


In [6]:
# Feature matrix as a 2D array with columns [town code, area]
x = dfle[['town', 'area']].values

# One-hot encode the column at index 0 (the town code); the remaining
# column (area) is passed through unchanged after the encoded columns.
ct = ColumnTransformer(
    [('one_hot_encoder', OneHotEncoder(categories='auto'), [0])],
    remainder='passthrough'
)
x = ct.fit_transform(x)

# Handle the dummy variable trap: drop the first one-hot column
x = x[:, 1:]

# Training the model (this refits the existing `model` object on the
# one-hot encoded features)
model.fit(x, dfle.price)

LinearRegression()

In [9]:
# Re-check the refitted model on the same two queries as before.
# Row layout after dropping the first one-hot column:
# [town code 1, town code 2, area]; both zeros means town code 0.
predict1, predict2 = (
    model.predict([row]) for row in ([1, 0, 2800], [0, 1, 3400])
)
score = model.score(X=x, y=dfle['price'])
print(predict1, predict2, score)

[590775.63964739] [681241.6684584] 0.9573929037221874
