In [None]:
# Star import: the operator classes and helpers used unqualified in this
# notebook (e.g. useLocalEnv below) are presumably provided by this package.
from pyalink.alink import *
# Start a local execution environment.
# NOTE(review): the argument 1 is presumably the parallelism — confirm.
useLocalEnv(1)

# NOTE(review): ROOT_DIR is not defined in this cell; presumably it is
# exported by the star import of utils — confirm.
from utils import *
import os
import pandas as pd

# Dataset directory: ROOT_DIR + "father_son" + the OS path separator.
# ROOT_DIR is concatenated directly, so it must already end with a separator.
DATA_DIR = ROOT_DIR + "father_son" + os.sep

# Name of the raw data file, resolved relative to DATA_DIR when it is read.
ORIGIN_FILE = "Pearson.txt";


In [None]:
# Batch source over the raw data file: tab-delimited, two double columns
# (father, son), with the first line of the file skipped.
source = (
    CsvSourceBatchOp()
    .setFilePath(DATA_DIR + ORIGIN_FILE)
    .setSchemaStr("father double, son double")
    .setFieldDelimiter("\t")
    .setIgnoreFirstLine(True)
)

# Preview the first five rows of the source.
source.firstN(5).print();

In [None]:
import matplotlib.pyplot as plt

# Collect the batch source into a local DataFrame; later plotting cells
# reuse df_source.
df_source = source.collectToDataframe()

# Scatter plot of the two columns. Axis labels and a title are set so the
# figure can be read on its own when the notebook is skimmed.
fig, ax = plt.subplots(figsize=(8, 8))
ax.scatter(df_source['father'], df_source['son'], color='blue', s=2)
ax.set_xlabel('father')
ax.set_ylabel('son')
ax.set_title('son vs. father')
plt.show()

In [None]:
# Register a statistics printout for the whole source; as a lazy operation
# it is expected to produce its output once the batch job is executed.
source.lazyPrintStatistics()

# Run the batch job so that the registered lazy output is emitted.
BatchOperator.execute()

In [None]:
# Derive a reference column plus_one = father + 1 and collect the result
# into a local DataFrame; a later plotting cell reuses df_plus_one.
df_plus_one = (
    source
    .select("father, son, father+1 AS plus_one")
    .collectToDataframe()
)

# Overlay the reference line on the raw scatter (df_source comes from an
# earlier cell). Each series gets a legend entry and the axes are labelled
# so the two layers can be told apart without reading the code.
fig, ax = plt.subplots(figsize=(8, 8))
ax.scatter(df_source['father'], df_source['son'],
           color='blue', s=2, label='observations')
ax.plot(df_plus_one['father'], df_plus_one['plus_one'],
        color='grey', linewidth=2, label='father + 1')
ax.set_xlabel('father')
ax.set_ylabel('son')
ax.legend()
plt.show()

In [None]:
# Register a statistics printout for two slices of the data, each selected
# by a filter on the father column and labelled with its own title.
for condition, title in [
    ("father>=71.5 AND father<72.5", "father 72"),
    ("father>=64.5 AND father<65.5", "father 65"),
]:
    source.filter(condition).lazyPrintStatistics(title)

# Run the batch job so that both registered printouts are emitted.
BatchOperator.execute()

In [None]:
# Train a linear regression with "father" as the only feature column and
# "son" as the label column.
linear_model = (
    LinearRegTrainBatchOp()
    .setFeatureCols(["father"])
    .setLabelCol("son")
    .linkFrom(source)
)

# Register printouts of the training info and of the fitted model info.
linear_model.lazyPrintTrainInfo()
linear_model.lazyPrintModelInfo()

# Apply the trained model back to the source; predictions are written to a
# new column named "linear_reg".
linear_reg = (
    LinearRegPredictBatchOp()
    .setPredictionCol("linear_reg")
    .linkFrom(linear_model, source)
)

# Register a printout of the first five prediction rows.
linear_reg.lazyPrint(5)

# Run the batch job so that all registered lazy output is emitted.
BatchOperator.execute();

In [None]:
# Collect the prediction result into a local DataFrame for plotting.
df_linear_reg = linear_reg.collectToDataframe()

# Compare the raw scatter, the father + 1 reference line and the fitted
# regression line (df_source and df_plus_one come from earlier cells).
# Legend entries and axis labels identify each layer without reading the code.
fig, ax = plt.subplots(figsize=(8, 8))
ax.scatter(df_source['father'], df_source['son'],
           color='blue', s=2, label='observations')
ax.plot(df_plus_one['father'], df_plus_one['plus_one'],
        color='grey', linewidth=2, label='father + 1')
ax.plot(df_linear_reg['father'], df_linear_reg['linear_reg'],
        color='red', linewidth=2, label='linear regression')
ax.set_xlabel('father')
ax.set_ylabel('son')
ax.legend()
plt.show()