# In-Memory Regression

In [1]:
import pandas as pd
import numpy as np

def reg(pdDFrame, dependent):
    """Fit an ordinary least squares regression with an intercept, in memory.

    Every column of ``pdDFrame`` other than ``dependent`` is used as a
    regressor, and a column of ones is prepended so the model has an
    intercept.  The coefficients are the solution ``b`` of the normal
    equations ``(X'X) b = X'y``.

    Parameters
    ----------
    pdDFrame : pandas.DataFrame
        Data set holding the response and all regressors.  The columns are
        converted with ``to_numpy()`` and used in matrix products, so they
        are presumably expected to be numeric and free of missing values.
    dependent : str
        Name of the response column.

    Returns
    -------
    pandas.DataFrame
        Two columns, ``variable`` and ``Estimate``.  The first row is the
        intercept; the remaining rows follow the column order of
        ``pdDFrame``.

    Raises
    ------
    KeyError
        If ``dependent`` is not a column of ``pdDFrame``.
    numpy.linalg.LinAlgError
        If ``X'X`` is singular (for example, perfectly collinear regressors).
    """
    varName = pdDFrame.keys()                 # column names
    y = pdDFrame[dependent].to_numpy()        # response as an ndarray

    xName = varName[varName != dependent]     # every column except the response
    X = pdDFrame[xName].to_numpy()            # regressors as an ndarray

    n = X.shape[0]                            # number of observations

    intercept = np.ones((n, 1))               # (n, 1) column of ones
    X = np.append(intercept, X, axis=1)       # design matrix with intercept first

    xName = xName.insert(0, 'intercept')      # label for the intercept column

    XpX = X.T.dot(X)                          # X'X
    Xpy = X.T.dot(y)                          # X'y

    # Solve (X'X) b = X'y directly instead of forming (X'X)^{-1}: same
    # solution, but more accurate and cheaper than an explicit inverse.
    parm = np.linalg.solve(XpX, Xpy)

    return pd.DataFrame({"variable": xName, "Estimate": parm})

# Read the MLB data set (a lone "." is parsed as a missing value) and print
# the in-memory OLS estimates with attendance as the response.
data = pd.read_csv(
    "./MLB.csv",
    sep=',',
    na_values=".",
    encoding='utf-8',
)
print(reg(pdDFrame=data, dependent="attendance"))

       variable       Estimate
0     intercept -104229.182238
1   runs.scored    2745.525726
2          wins    2599.805807
3  games.behind  -16036.985339


# Scikit-Learn Regression

In [2]:
import pandas as pd
from sklearn.linear_model import LinearRegression

# Data Loading: read the MLB table, parsing a lone "." as a missing value
data = pd.read_csv(
    "./MLB.csv",
    sep=',',
    na_values=".",
    encoding='utf-8',
)

def sklearn_reg(pdDFrame, dependent):
    """Fit a linear regression with scikit-learn and tabulate the estimates.

    Parameters
    ----------
    pdDFrame : pandas.DataFrame
        Data set holding the response and all regressors.
    dependent : str
        Name of the response column.  Every other column is used as a
        regressor, wherever the response sits in the frame.

    Returns
    -------
    pandas.DataFrame
        Two columns, ``variable`` and ``Estimate``.  The first row is the
        intercept, followed by one row per regressor in column order, with a
        unique 0..k index.

    Raises
    ------
    KeyError
        If ``dependent`` is not a column of ``pdDFrame``.
    """
    # Target and input variable setting.  The regressors are selected by
    # dropping the response *by name*; slicing off the last column would only
    # be correct when the response happens to be the last column.
    X = pdDFrame.drop(columns=[dependent])
    y = pdDFrame[dependent]

    # Regression fitting (named `model` so it does not shadow a `reg` function)
    model = LinearRegression().fit(X, y)

    # Estimator table
    coef = pd.DataFrame({"variable": X.columns, "Estimate": model.coef_})
    intercept = pd.DataFrame({"variable": ["intercept"], "Estimate": [model.intercept_]})

    # ignore_index avoids the duplicate 0 label that plain concatenation of
    # the two frames would produce.
    return pd.concat([intercept, coef], ignore_index=True)

# `data` was already loaded from ./MLB.csv at the top of this cell, so it is
# reused here instead of reading the same file a second time.
print(sklearn_reg(data, "attendance"))

       variable       Estimate
0     intercept -104229.182238
0   runs.scored    2745.525726
1          wins    2599.805807
2  games.behind  -16036.985339


# Out-of-Core Regression

In [3]:
import pandas as pd
import numpy as np
import csv

def outofCore_Stat(file, dependent):
    """Prepare the accumulators for an out-of-core regression over a CSV file.

    Only the header row of ``file`` is read.  From it the function derives
    the regressor labels (every column except ``dependent``, with an
    intercept label prepended) and allocates zero-filled ``X'X`` and ``X'y``
    accumulators of the matching size.  The data rows are not processed and
    none of these objects is returned, so the function is currently a
    scaffold.

    Parameters
    ----------
    file : str
        Path to a CSV file whose first row holds the column names; it is
        opened as UTF-8 text.
    dependent : str
        Name of the response column.

    Returns
    -------
    int
        Always ``0``.

    Raises
    ------
    StopIteration
        If the file has no header row (it is empty).
    """
    # `with` guarantees the handle is closed even when reading the header
    # fails; newline="" is what the csv module requires of its input file.
    with open(file, "r", encoding="utf-8", newline="") as f:
        rdr = csv.reader(f)
        varNames = np.array(next(rdr))                 # header row -> column names

    idxY = np.where(varNames == dependent)             # position of the response column
    idxX = np.where(varNames != dependent)             # positions of the regressors

    xNames = varNames[idxX]
    xNames = np.append("intercept", xNames)            # intercept label goes first

    k = len(xNames)                                    # number of coefficients

    # Zero-initialised accumulators; presumably meant to be filled row by row
    # once the data loop is written.
    xpx = np.zeros((k, k))
    xpy = np.zeros(k)

    return 0

# Run the out-of-core routine on the MLB file.  The call is the last
# expression of the cell, so its return value is what the cell displays.
file = "./MLB.csv"
outofCore_Stat(file=file, dependent="attendance")

0

In [4]:
import urllib
import requests, io, os
import numpy as np
import tarfile, zipfile, gzip
def unzip_from_UCI(UCI_url, dest=''):
    """Download a zip archive and extract its CSV members.

    Parameters
    ----------
    UCI_url : str
        URL of the zip archive.
    dest : str, optional
        Sub-directory of the current working directory to extract into.
        The default ``''`` extracts into the working directory itself.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    zipfile.BadZipFile
        If the downloaded payload is not a valid zip archive.
    """
    response = requests.get(UCI_url)
    # Fail on an HTTP error here; otherwise an error page would be handed to
    # ZipFile and surface as a misleading "not a zip file" error.
    response.raise_for_status()

    target_dir = os.path.join(os.getcwd(), dest)
    # The archive lives entirely in memory; `with` releases it when done.
    with zipfile.ZipFile(io.BytesIO(response.content)) as z:
        print('Extracting in %s' % target_dir)
        for name in z.namelist():
            # Only members whose name contains '.csv' are extracted.
            if '.csv' in name:
                print('\tunzipping %s' % name)
                z.extract(name, path=target_dir)

In [5]:
def gzip_from_UCI(UCI_url, dest=''):
    """Download a gzip-compressed file and write it out decompressed.

    The output file name is the last path component of ``UCI_url`` with its
    final three characters (the ``.gz`` suffix) removed.

    Parameters
    ----------
    UCI_url : str
        URL of the ``.gz`` file.
    dest : str, optional
        Sub-directory of the current working directory to write into; it is
        created if missing.  The default ``''`` writes into the working
        directory itself.
    """
    # `import urllib` alone does not load the `request` submodule; import it
    # explicitly so this function does not depend on another import having
    # loaded it as a side effect.
    import urllib.request

    with urllib.request.urlopen(UCI_url) as response:
        compressed_file = io.BytesIO(response.read())

    filename = UCI_url.split('/')[-1][:-3]             # strip the trailing ".gz"

    # Honour `dest` (it used to be accepted but ignored).
    target_dir = os.path.join(os.getcwd(), dest)
    os.makedirs(target_dir, exist_ok=True)

    with gzip.GzipFile(fileobj=compressed_file) as decompressed_file, \
            open(os.path.join(target_dir, filename), 'wb') as outfile:
        outfile.write(decompressed_file.read())
        print('File %s decompressed' % filename)
In [6]:
# Fetch the two UCI data sets: the bike-sharing zip archive (its CSV members
# are extracted into ./bikesharing) and the gzip-compressed covtype file.
UCI_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00275/Bike-Sharing-Dataset.zip'
unzip_from_UCI(UCI_url=UCI_url, dest='bikesharing')

UCI_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/covtype/covtype.data.gz'
gzip_from_UCI(UCI_url=UCI_url)

Extracting in D:\Bachelor of Statistics\2019 - 02\[응용통계학과] 빅데이터 분석 활용\bikesharing
	unzipping day.csv
	unzipping hour.csv
File covtype.data decompressed


In [7]:
import pandas as pd
# Load the daily bike-sharing table; the bare name on the last line makes the
# notebook display the frame.
day = pd.read_csv(filepath_or_buffer="./bikesharing/day.csv")
day

Unnamed: 0,instant,dteday,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
0,1,2011-01-01,1,0,1,0,6,0,2,0.344167,0.363625,0.805833,0.160446,331,654,985
1,2,2011-01-02,1,0,1,0,0,0,2,0.363478,0.353739,0.696087,0.248539,131,670,801
2,3,2011-01-03,1,0,1,0,1,1,1,0.196364,0.189405,0.437273,0.248309,120,1229,1349
3,4,2011-01-04,1,0,1,0,2,1,1,0.200000,0.212122,0.590435,0.160296,108,1454,1562
4,5,2011-01-05,1,0,1,0,3,1,1,0.226957,0.229270,0.436957,0.186900,82,1518,1600
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
726,727,2012-12-27,1,1,12,0,4,1,2,0.254167,0.226642,0.652917,0.350133,247,1867,2114
727,728,2012-12-28,1,1,12,0,5,1,2,0.253333,0.255046,0.590000,0.155471,644,2451,3095
728,729,2012-12-29,1,1,12,0,6,0,2,0.253333,0.242400,0.752917,0.124383,159,1182,1341
729,730,2012-12-30,1,1,12,0,0,0,1,0.255833,0.231700,0.483333,0.350754,364,1432,1796


In [8]:
import pandas as pd
# Load the hourly bike-sharing table; the bare name on the last line makes
# the notebook display the frame.
hour = pd.read_csv(filepath_or_buffer="./bikesharing/hour.csv")
hour

Unnamed: 0,instant,dteday,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
0,1,2011-01-01,1,0,1,0,0,6,0,1,0.24,0.2879,0.81,0.0000,3,13,16
1,2,2011-01-01,1,0,1,1,0,6,0,1,0.22,0.2727,0.80,0.0000,8,32,40
2,3,2011-01-01,1,0,1,2,0,6,0,1,0.22,0.2727,0.80,0.0000,5,27,32
3,4,2011-01-01,1,0,1,3,0,6,0,1,0.24,0.2879,0.75,0.0000,3,10,13
4,5,2011-01-01,1,0,1,4,0,6,0,1,0.24,0.2879,0.75,0.0000,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17374,17375,2012-12-31,1,1,12,19,0,1,1,2,0.26,0.2576,0.60,0.1642,11,108,119
17375,17376,2012-12-31,1,1,12,20,0,1,1,2,0.26,0.2576,0.60,0.1642,8,81,89
17376,17377,2012-12-31,1,1,12,21,0,1,1,1,0.26,0.2576,0.60,0.1642,7,83,90
17377,17378,2012-12-31,1,1,12,22,0,1,1,1,0.26,0.2727,0.56,0.1343,13,48,61
