Step 1. Create a random numerical DataFrame

In [1]:
import numpy as np
import pandas as pd 
# Fixed seed so that "Restart Kernel -> Run All" rebuilds exactly the same
# table every time; an unseeded draw gives different numbers on each run.
SEED = 42
rng = np.random.default_rng(SEED)

# 10 rows x 4 columns of standard-normal samples, columns labelled A-D.
df = pd.DataFrame(rng.standard_normal((10, 4)), columns=["A", "B", "C", "D"])
df


Unnamed: 0,A,B,C,D
0,0.133871,-0.086118,1.169958,-0.617075
1,0.243578,-0.026799,0.23064,-0.704815
2,-1.083293,-1.820893,-0.404267,-1.867016
3,-1.261265,-0.696269,1.731437,-0.330824
4,-0.708383,0.138312,-2.50516,-2.889554
5,-1.172892,-2.025789,0.317631,0.574441
6,-0.282633,0.389787,0.109218,0.29228
7,-0.638847,-0.216694,0.274795,0.053104
8,-1.371074,1.466612,-2.069003,1.044805
9,1.389805,0.149972,-0.124442,-0.858169


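Step 2. A DataFrame is compatible with basic NumPy functions: they are applied element-wise, and the square root of a negative entry is NaN (shown as an empty cell in the output below)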

In [2]:
# NumPy ufuncs can be called on a DataFrame directly and are applied
# element-wise. Each result is bound to a name because a cell only renders its
# last expression; a bare `np.exp(df)` in the middle would be thrown away.
df_exp = np.exp(df)  # element-wise exponential

# Negative entries have no real square root and come back as NaN. That is the
# behaviour being demonstrated, so the "invalid value" RuntimeWarning NumPy may
# raise for those entries is silenced for this one call only.
with np.errstate(invalid="ignore"):
    df_sqrt = np.sqrt(df)  # element-wise square root

# Rendered result of the cell; inspect df_exp in a cell of its own if needed.
df_sqrt

Unnamed: 0,A,B,C,D
0,0.365884,,1.081646,
1,0.493537,,0.48025,
2,,,,
3,,,1.315841,
4,,0.371904,,
5,,,0.563588,0.757919
6,,0.624329,0.330482,0.540629
7,,,0.524209,0.230443
8,,1.211038,,1.022157
9,1.1789,0.387262,,


Step 3. Convert a DataFrame to NumPy arrays



In [3]:
# Convert every row of the DataFrame into one 2-D NumPy array.
df_all = df.to_numpy()
# A bare `df_all.shape` here would never be rendered (only the last expression
# of a cell is), so the expectation is checked explicitly instead.
assert df_all.shape == df.shape

# Convert only the rows at positions 4, 5 and 6; .iloc makes it explicit that
# the slice is positional rather than label-based.
df_part = df.iloc[4:7].to_numpy()
df_part.shape

(3, 4)

Step 4. Apply a simple scikit-learn estimator to a NumPy array (here we use linear regression, which is one of the most common regression models)


In [4]:
from sklearn.linear_model import LinearRegression
# Toy training set: four samples with two features each.
X = np.array([[1, 1], [1, 2], [2, 2], [2, 3]])
# Targets follow the exact linear rule y = 1 * x_0 + 2 * x_1 + 3.
y = np.dot(X, np.array([1, 2])) + 3

# Train a linear regressor.
reg = LinearRegression().fit(X, y)

# score() returns the coefficient of determination (R^2) of the prediction on
# the given data. It is stored and checked rather than left as a bare
# expression, which would be discarded because it is not the last line of the
# cell. The targets are an exact linear function of X, so the fit is perfect.
r2 = reg.score(X, y)
assert np.isclose(r2, 1.0), f"expected a perfect fit, got R^2 = {r2}"

# Predict a new sample: 1 * 3 + 2 * 5 + 3 = 16.
reg.predict(np.array([[3, 5]]))

array([16.])

Step 5. Save and load the model

In [5]:
import pickle

# Persist the fitted regressor as a pickle file in the current directory.
pkl_filename = "regressor.pkl"
with open(pkl_filename, mode="wb") as sink:
    pickle.dump(reg, sink)

# Read the pickle back into a second object.
# Note: unpickling can execute arbitrary code, so only load files you trust
# (here it is the file written a few lines above).
with open(pkl_filename, mode="rb") as source:
    reg_checkpoint = pickle.load(source)

# The restored model should give the same prediction as the original one.
new_sample = np.array([[3, 5]])
reg_checkpoint.predict(new_sample)


array([16.])