# In-Memory Regression

In [1]:
import pandas as pd
import numpy as np

def reg(pdDFrame, dependent):
    """Fit an ordinary least-squares regression with an intercept term.

    Every column of ``pdDFrame`` other than ``dependent`` is used as a
    predictor. The coefficients are obtained from the normal equations,
    ``B = (X'X)^{-1} X'y``, with the whole data set held in memory.

    Parameters
    ----------
    pdDFrame : pandas.DataFrame
        Data containing the response column and all predictor columns.
    dependent : str
        Name of the response (dependent) column.

    Returns
    -------
    pandas.DataFrame
        Two columns: ``"variable"`` (``'intercept'`` first, followed by the
        predictor names in column order) and ``"Estimate"`` (the fitted
        coefficients).

    Raises
    ------
    KeyError
        If ``dependent`` is not a column of ``pdDFrame``.
    numpy.linalg.LinAlgError
        If ``X'X`` is singular and cannot be inverted.
    """
    # Split the column labels into the response and the predictors.
    columns = pdDFrame.keys()
    predictors = columns[columns != dependent]

    # Pull both parts out as plain NumPy arrays.
    response = pdDFrame[dependent].to_numpy()
    design = pdDFrame[predictors].to_numpy()

    # Prepend a column of ones so the model includes an intercept.
    n_obs = design.shape[0]
    design = np.concatenate((np.ones((n_obs, 1)), design), axis=1)
    labels = predictors.insert(0, 'intercept')

    # Normal equations: B = (X'X)^{-1} X'y
    gram = np.dot(design.T, design)            # X'X
    gram_inv = np.linalg.inv(gram)             # (X'X)^{-1}
    moment = np.dot(design.T, response)        # X'y
    beta = np.dot(gram_inv, moment)

    return pd.DataFrame({"variable": labels, "Estimate": beta})

# Load the MLB data set; a field containing only "." is read as a missing value.
data = pd.read_csv("./MLB.csv", sep=',', na_values=".", encoding='utf-8')
# Regress "attendance" on every other column and print the coefficient table.
print(reg(data,"attendance"))

       variable       Estimate
0     intercept -104229.182238
1   runs.scored    2745.525726
2          wins    2599.805807
3  games.behind  -16036.985339


# Scikit-Learn Regression

In [2]:
import pandas as pd
from sklearn.linear_model import LinearRegression

# Data loading: read the MLB CSV, treating a field containing only "." as missing
data = pd.read_csv("./MLB.csv", sep=',', na_values=".", encoding='utf-8')

def sklearn_reg(pdDFrame, dependent):
    """Fit a linear regression with scikit-learn and tabulate the estimates.

    Every column of ``pdDFrame`` other than ``dependent`` is used as a
    predictor, wherever the response column sits in the frame.

    Parameters
    ----------
    pdDFrame : pandas.DataFrame
        Data containing the response column and all predictor columns.
    dependent : str
        Name of the response (dependent) column.

    Returns
    -------
    pandas.DataFrame
        Two columns, ``"variable"`` and ``"Estimate"``, with the intercept
        row first and one row per predictor after it. The index is not
        reset after concatenation, so the intercept row and the first
        coefficient row both carry the label 0.

    Raises
    ------
    KeyError
        If ``dependent`` is not a column of ``pdDFrame``.
    """
    # Target and input variable setting.
    # Select predictors by name rather than by position: slicing off the last
    # column would silently use the wrong variables whenever the response is
    # not the final column.
    X = pdDFrame.drop(columns=[dependent])
    y = pdDFrame[dependent]

    # Regression fitting
    model = LinearRegression().fit(X, y)

    # Estimates: one row per predictor, plus a separate intercept row
    coef = pd.DataFrame({"variable": X.columns, "Estimate": model.coef_})
    intercept = pd.DataFrame({"variable": ["intercept"], "Estimate": [model.intercept_]})

    return pd.concat([intercept, coef])

# `data` was already loaded from ./MLB.csv earlier in this cell and has not
# been modified since, so it is reused here instead of re-reading the file.
print(sklearn_reg(data,"attendance"))

       variable       Estimate
0     intercept -104229.182238
0   runs.scored    2745.525726
1          wins    2599.805807
2  games.behind  -16036.985339


# Out-of-Core Regression

In [3]:
import pandas as pd
import numpy as np
import csv

def outofCore_Stat(file, dependent):
    """Read a CSV header and set up accumulators for an out-of-core regression.

    Only the header row is read. The ``X'X`` and ``X'y`` accumulators are
    created with the right shapes but are not yet filled or returned; the
    function currently always returns 0.

    Parameters
    ----------
    file : str
        Path of a UTF-8 encoded CSV file whose first row holds the column names.
    dependent : str
        Name of the response (dependent) column.

    Returns
    -------
    int
        Always 0 (placeholder).

    Raises
    ------
    StopIteration
        If the file has no header row (it is empty).
    """
    # The context manager closes the file even when reading the header fails.
    # newline="" is the mode the csv module recommends for files it parses.
    with open(file, "r", encoding="utf-8", newline="") as f:
        rdr = csv.reader(f)
        varNames = np.array(next(rdr))

    # Positions of the response column and of the predictor columns.
    idxY = np.where(varNames == dependent)     # not used yet
    idxX = np.where(varNames != dependent)

    # Predictor labels, with the intercept placed first.
    xNames = varNames[idxX]
    xNames = np.append("intercept", xNames)

    k = len(xNames)

    # Zero-initialised accumulators for X'X (k x k) and X'y (k); not used yet.
    xpx = np.zeros((k,k))
    xpy = np.zeros(k)

    return 0

# Path of the CSV file that outofCore_Stat opens and reads itself.
file = "./MLB.csv"
# Displays the function's return value as the cell output.
outofCore_Stat(file, "attendance")

0