In [2]:
import numpy as np
import pandas as pd
import urllib.request as req
from sklearn import linear_model
import matplotlib.pyplot as plt
%matplotlib inline
from scipy import stats

def average(list):
    return np.sum(list)/len(list)
# 分散を求める関数
def variance(list):
    mu = average(list)
    diff = np.array(list) - mu
    return np.sum(np.square(diff))/len(list)
# 標準偏差を求める関数
def standard_deviation(list):
    return np.sqrt(variance(list))
# 共分散 = 偏差積の平均 （偏差値、ではありません。偏差積、です）
def covariance(list1, list2):
    mu1,mu2 = average(list1), average(list2)
    diff1, diff2 = np.array(list1) - mu1, np.array(list2) - mu2
    return np.sum(diff1 * diff2)/len(list1)
# 相関係数 = 共分散を list1, list2 の標準偏差で割ったもの
def correlation(list1, list2):
    return covariance(list1,list2)/(standard_deviation(list1)*standard_deviation(list2))

# 回帰直線の傾き＝相関係数＊（（yの標準偏差）／（xの標準偏差））を求める関数を作ってください。
def a_fit(xlist, ylist):
    return correlation(xlist,ylist) * (standard_deviation(ylist)/standard_deviation(xlist))
# y切片＝yの平均－（傾き＊xの平均）を求める関数を作ってください。
def b_fit(xlist, ylist):
    return average(ylist) - (a_fit(xlist,ylist) * average(xlist))
# a の影響を除いた、b と y の偏相関係数 partial correlation coefficient を求める関数
def partial_correlation(a, b, y):
    def cor(m,n):
        return correlation(m,n)
    return (cor(b,y) - (cor(a,y) * cor(a,b)))/np.sqrt((1-cor(a,y)**2)*(1-cor(a,b)**2))

def standard_partial_regression(a, b, y):
    rby = correlation(b,y)
    rab = correlation(a,b)
    ray = correlation(a,y)
    return (rby - (ray * rab))/ (1 - rab**2)
def multiple_correlation(a, b, y):
    rby = correlation(b,y)
    rab = correlation(a,b)
    ray = correlation(a,y)
    return np.sqrt((ray**2 + rby**2 - 2 * ray * rby * rab) / (1 - rab**2))



# カイ２乗 and 検定

In [53]:
#1 jigen
significance = 0.05
def d1chi2(o,e):
    #o = [17, 10, 6, 7, 15, 5] # 実測値（観測度数）
    #e = [10, 10, 10, 10, 10, 10] # 理論値（期待度数）

    chi2, p = stats.chisquare(o, f_exp = e)

    print( "chi2 値は %(chi2)s" %locals() )
    print( "確率は %(p)s" %locals() )

    if p < significance:
        print("有意水準 %(significance)s で、有意な差があります" %locals())
    else:
        print("有意水準 %(significance)s で、有意な差がありません" %locals())

#2 jigen
def d2chi2(observed):
    significance = 0.05 # 有意水準
    #observed = [[24,18], [8, 18]] # 実測値（観測度数）
    expected = stats.contingency.expected_freq(observed) # 理論値（期待度数）
    o = [item for sublist in observed for item in sublist] # 平滑化した実測値（観測度数）
    e = [item for sublist in expected for item in sublist] # 平滑化した理論値（期待度数）
    dof = (len(observed) - 1) * (len(observed[0]) - 1) # 自由度

    chi2, p = stats.chisquare(o, f_exp = e, ddof = dof)

    print( "chi2 値は %(chi2)s" %locals() )
    print( "p 値は %(p)s" %locals() )

    if p < significance:
        print("有意水準 %(significance)s で、有意な差があります" %locals())
    else:
        print("有意水準 %(significance)s で、有意な差がありません" %locals())


def tnashi(momo,sakura):
    t, p = stats.ttest_ind(momo,sakura)

    print( "t 値は %(t).3f" %locals() )
    print( "確率は %(p)s" %locals() )

    if p < significance:
        print("有意水準 %(significance)s で、有意な差があります" %locals())
    else:
        print("有意水準 %(significance)s で、有意な差がありません" %locals())
        
def tari(bef,aft):
    t, p = stats.ttest_rel(bef,aft)

    print( "t 値は %(t).3f" %locals() )
    print( "確率は %(p)s" %locals() )

    if p < significance:
        print("有意水準 %(significance)s で、有意な差があります" %locals())
    else:
        print("有意水準 %(significance)s で、有意な差がありません" %locals())

def fkentei(ori, old, new):
    f, p = stats.f_oneway(ori,old,new)

    print( "f 値は %(f)s" %locals() )
    print( "確率は %(p)s" %locals() )

    if p < significance:
        print("有意水準 %(significance)s で、有意な差があります" %locals())
    else:
        print("有意水準 %(significance)s で、有意な差がありません" %locals())

In [47]:
def linear_regress_taiou(X,y,name):
    model = linear_model.LinearRegression()
    model.fit(X,y)
    X_col = X.columns
    coef = model.coef_
    intercept = model.intercept_
    out = '' + name + '='
    for i,j in zip(X_col, coef):
        out += '{:.6f}*{}+'.format(j,i)
    out += '{:.6f}'.format(intercept)
    print(out)

In [48]:
url = 'https://raw.githubusercontent.com/maskot1977/ipython_notebook/master/toydata/entrance_exam2.txt'
req.urlretrieve(url,'data.txt')
df = pd.read_csv('data.txt','\s+',usecols=range(1,4))

In [49]:
df.columns = ['test', 'grade', 'high']
X = df.drop('grade',1)
y = df['grade']

In [50]:
X.columns

Index(['test', 'high'], dtype='object')

In [51]:
linear_regress_taiou(X,y,'grade')

grade=0.006935*test+0.128121*high+-2.094111


In [54]:
d2chi2([[24,18], [8, 18]])

chi2 値は 4.48351648352
p 値は 0.106271489049
有意水準 0.05 で、有意な差がありません
