In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
sns.set()

%matplotlib inline
mpl.rcParams['figure.figsize']=(10.0, 6)
mpl.rcParams['font.size']=14
mpl.rcParams['savefig.dpi']=90
mpl.rcParams['figure.subplot.bottom']=.1

In [26]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

# Categorical Features

In [28]:
# Toy housing listings: two numeric fields ('price', 'rooms') plus one
# string-valued categorical field ('neighborhood'), one dict per listing.
data = [
    dict(price=price, rooms=rooms, neighborhood=neighborhood)
    for price, rooms, neighborhood in (
        (850000, 4, 'Queen Anne'),
        (700000, 3, 'Fremont'),
        (650000, 3, 'Wallingford'),
        (600000, 2, 'Fremont'),
    )
]

# Tabular view of the raw records, before any encoding is applied.
categorical_df = pd.DataFrame(data)
categorical_df

Unnamed: 0,neighborhood,price,rooms
0,Queen Anne,850000,4
1,Fremont,700000,3
2,Wallingford,650000,3
3,Fremont,600000,2


In [39]:
# One-hot encode the string-valued 'neighborhood' field; numeric fields pass
# through unchanged. sparse=False returns a dense array that can be wrapped
# directly in a DataFrame.
vec = DictVectorizer(sparse=False)
d = vec.fit_transform(data)
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# is its replacement and yields the same column labels.
one_hot_df = pd.DataFrame(d, columns=vec.get_feature_names_out())
one_hot_df

Unnamed: 0,neighborhood=Fremont,neighborhood=Queen Anne,neighborhood=Wallingford,price,rooms
0,0.0,1.0,0.0,850000.0,4.0
1,1.0,0.0,0.0,700000.0,3.0
2,0.0,0.0,1.0,650000.0,3.0
3,1.0,0.0,0.0,600000.0,2.0


# Text Features

In [24]:
# Small corpus of short phrases reused by the text-vectorizer examples.
sample = ['problem of evil', 'evil queen', 'horizon problem', 'horibble boss',
          'beautiful queen', 'the mask', 'evil mask']

# TF-IDF weighting: one row per phrase, one column per vocabulary term.
vec = TfidfVectorizer()
d = vec.fit_transform(sample)
# fit_transform returns a sparse matrix, so densify it for display.
# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out().
df = pd.DataFrame(d.toarray(), columns=vec.get_feature_names_out())
df

Unnamed: 0,beautiful,boss,evil,horibble,horizon,mask,of,problem,queen,the
0,0.0,0.0,0.479185,0.0,0.0,0.0,0.675356,0.560603,0.0,0.0
1,0.0,0.0,0.64975,0.0,0.0,0.0,0.0,0.0,0.760148,0.0
2,0.0,0.0,0.0,0.0,0.769449,0.0,0.0,0.638709,0.0,0.0
3,0.0,0.707107,0.0,0.707107,0.0,0.0,0.0,0.0,0.0,0.0
4,0.769449,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.638709,0.0
5,0.0,0.0,0.0,0.0,0.0,0.638709,0.0,0.0,0.0,0.769449
6,0.0,0.0,0.64975,0.0,0.0,0.760148,0.0,0.0,0.0,0.0


In [40]:
# Raw term counts for the same corpus: one row per phrase, one column per
# vocabulary term.
vec = CountVectorizer()
d = vec.fit_transform(sample)
# fit_transform returns a sparse matrix, so densify it for display.
# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out().
df = pd.DataFrame(d.toarray(), columns=vec.get_feature_names_out())
df

Unnamed: 0,beautiful,boss,evil,horibble,horizon,mask,of,problem,queen,the
0,0,0,1,0,0,0,1,1,0,0
1,0,0,1,0,0,0,0,0,1,0
2,0,0,0,0,1,0,0,1,0,0
3,0,1,0,1,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,1,0
5,0,0,0,0,0,1,0,0,0,1
6,0,0,1,0,0,1,0,0,0,0


# Imputation of Missing Data

In [59]:
from numpy import nan
from sklearn.impute import SimpleImputer

# `sklearn.preprocessing.Imputer` was removed in scikit-learn 0.22 in favour of
# `sklearn.impute.SimpleImputer`. Keep the old name bound so any cell still
# written against `Imputer(strategy=...)` continues to run.
Imputer = SimpleImputer

# Feature matrix with two missing entries (one in column 0, one in column 1).
x = np.array([[ nan, 0,   3  ],
              [ 3,   7,   9  ],
              [ 3,   5,   2  ],
              [ 4,   nan, 6  ],
              [ 8,   8,   1  ]])
# Regression targets, one per row of x.
y = np.array([14.0, 16.0, -1.0,  8.0, -5.0])

# Replace each NaN with the mean of the observed values in its column.
imp = SimpleImputer(strategy='mean')
imp.fit_transform(x)

array([[ 4.5,  0. ,  3. ],
       [ 3. ,  7. ,  9. ],
       [ 3. ,  5. ,  2. ],
       [ 4. ,  5. ,  6. ],
       [ 8. ,  8. ,  1. ]])

# Feature Pipelines

In [70]:
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.linear_model import LinearRegression
# r2_score is the metric actually used below; it was previously never imported,
# so this cell raised NameError on a fresh kernel.
from sklearn.metrics import accuracy_score, r2_score

# Chain the preprocessing and the model into a single estimator:
#   1. fill missing values with the column mean,
#   2. expand to degree-2 polynomial features,
#   3. scale features (replacement for the removed `normalize=True` option of
#      LinearRegression, as recommended by the scikit-learn deprecation notice),
#   4. fit an ordinary least-squares regression.
pipeline = make_pipeline(
    SimpleImputer(strategy='mean'),
    PolynomialFeatures(degree=2),
    StandardScaler(with_mean=False),
    LinearRegression()
)

# R^2 on the training data itself (no held-out set is used here).
r2_score(y, pipeline.fit(x, y).predict(x))

1.0